├── src
│   └── docstore
│       ├── __init__.py
│       ├── static
│       │   ├── stacks.ico
│       │   ├── specktre.png
│       │   ├── natural_paper.png
│       │   ├── generic_document.png
│       │   ├── stacks.svg
│       │   └── style.css
│       ├── templates
│       │   ├── _title_attribution.html
│       │   ├── _head.html
│       │   ├── _tag_list.html
│       │   ├── _meta_info.html
│       │   └── index.html
│       ├── git.py
│       ├── merging.py
│       ├── downloads.py
│       ├── tag_cloud.py
│       ├── file_normalisation.py
│       ├── text_utils.py
│       ├── tint_colors.py
│       ├── models.py
│       ├── thumbnails.py
│       ├── tag_list.py
│       ├── server.py
│       ├── documents.py
│       └── cli.py
├── .dockerignore
├── .gitignore
├── tests
│   ├── stubs
│   │   ├── smartypants.pyi
│   │   └── wcag_contrast_ratio.pyi
│   ├── files
│   │   ├── cluster.png
│   │   ├── snakes.pdf
│   │   ├── Newtons_cradle.gif
│   │   ├── cluster_segment.png
│   │   ├── Rotating_earth_(large).gif
│   │   ├── Rotating_earth_(large)_singleframe.gif
│   │   └── credits.txt
│   ├── test_tag_list.py
│   ├── conftest.py
│   ├── test_downloads.py
│   ├── test_tint_colors.py
│   ├── test_models.py
│   ├── test_thumbnails.py
│   ├── test_merging.py
│   ├── test_text_utils.py
│   ├── test_file_normalisation.py
│   ├── test_documents.py
│   ├── test_cli.py
│   └── test_server.py
├── docstore.png
├── migrations
│   ├── exceptions.py
│   ├── from_2-0-0_to_2-1-0.py
│   └── from_2-1-0_to_2-2-0.py
├── docs
│   ├── quick_look.png
│   ├── thumbnails.png
│   ├── tint_colors.png
│   ├── previewing-the-files.md
│   ├── storing-the-metadata.md
│   └── storing-the-files.md
├── .gitattributes
├── dev_requirements.in
├── requirements.in
├── .github
│   ├── dependabot.yml
│   ├── workflows
│   │   └── test.yml
│   └── install-github-bin
├── pyproject.toml
├── requirements.txt
├── LICENSE
├── dev_requirements.txt
└── README.md

/src/docstore/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | **/*.pyc
2 | .hypothesis
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .tox
2 | *.egg-info
3 | *.pyc
4 | .coverage
5 | 
--------------------------------------------------------------------------------
/tests/stubs/smartypants.pyi:
--------------------------------------------------------------------------------
1 | def smartypants(s: str) -> str: ...
2 | -------------------------------------------------------------------------------- /docstore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docstore.png -------------------------------------------------------------------------------- /migrations/exceptions.py: -------------------------------------------------------------------------------- 1 | class IncorrectSchemaError(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /docs/quick_look.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docs/quick_look.png -------------------------------------------------------------------------------- /docs/thumbnails.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docs/thumbnails.png -------------------------------------------------------------------------------- /docs/tint_colors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docs/tint_colors.png -------------------------------------------------------------------------------- /tests/files/cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/cluster.png -------------------------------------------------------------------------------- /tests/files/snakes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/snakes.pdf -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | requirements.txt linguist-generated=true 2 | dev_requirements.txt linguist-generated=true 3 | -------------------------------------------------------------------------------- /src/docstore/static/stacks.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/stacks.ico -------------------------------------------------------------------------------- /tests/files/Newtons_cradle.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/Newtons_cradle.gif -------------------------------------------------------------------------------- /tests/files/cluster_segment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/cluster_segment.png -------------------------------------------------------------------------------- /src/docstore/static/specktre.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/specktre.png -------------------------------------------------------------------------------- /src/docstore/static/natural_paper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/natural_paper.png 
-------------------------------------------------------------------------------- /tests/files/Rotating_earth_(large).gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/Rotating_earth_(large).gif -------------------------------------------------------------------------------- /src/docstore/static/generic_document.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/generic_document.png -------------------------------------------------------------------------------- /tests/files/Rotating_earth_(large)_singleframe.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/Rotating_earth_(large)_singleframe.gif -------------------------------------------------------------------------------- /tests/test_tag_list.py: -------------------------------------------------------------------------------- 1 | from docstore.tag_list import render_tag_list 2 | 3 | 4 | def test_empty_render_tag_list() -> None: 5 | assert render_tag_list({}) == [] 6 | -------------------------------------------------------------------------------- /tests/stubs/wcag_contrast_ratio.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | Color: typing.TypeAlias = tuple[float, float, float] 4 | 5 | def rgb(color1: Color, color2: Color) -> float: ... 6 | -------------------------------------------------------------------------------- /dev_requirements.in: -------------------------------------------------------------------------------- 1 | -e file:. 
2 | -r requirements.txt 3 | 4 | bs4 5 | pytest 6 | pytest-cov 7 | coverage 8 | mypy 9 | ruff 10 | types-beautifulsoup4 11 | types-tqdm 12 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | attrs>=20.2.0 2 | cattrs>=1.1.1 3 | click>=7.1.2 4 | hyperlink>=21.0.0 5 | Flask>=1.1.2 6 | Pillow 7 | rapidfuzz 8 | smartypants>=2.0.1 9 | Unidecode>=1.1.1 10 | wcag_contrast_ratio>=0.9 11 | -------------------------------------------------------------------------------- /tests/files/credits.txt: -------------------------------------------------------------------------------- 1 | Newtons_cradle.gif 2 | https://en.wikipedia.org/wiki/File:Newtons_cradle_animation_book_2.gif 3 | 4 | Rotating_earth_(large).gif 5 | https://en.wikipedia.org/wiki/File:Rotating_earth_(large).gif 6 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def root(tmpdir: pathlib.Path) -> pathlib.Path: 9 | os.makedirs(str(tmpdir / "root")) 10 | return pathlib.Path(str(tmpdir / "root")) 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | time: "09:00" 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | time: "09:00" 13 | -------------------------------------------------------------------------------- /src/docstore/templates/_title_attribution.html: -------------------------------------------------------------------------------- 1 | {%- if doc|tags_with_prefix(prefix + ":") -%} 2 | , {{ prefix }} 3 | {% for t in doc|tags_with_prefix(prefix + ":") -%} 4 | {%- if t not in request_tags %}{% endif -%} 5 | {{ t | replace(prefix + ":", "") }} 6 | {%- if t not in request_tags -%}{% endif %} 7 | {%- if not loop.last -%}, {% endif -%} 8 | {%- endfor -%} 9 | {% endif %} 10 | -------------------------------------------------------------------------------- /src/docstore/git.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import subprocess 4 | 5 | 6 | @functools.lru_cache() 7 | def current_commit() -> str: 8 | """ 9 | Returns the commit of the current docstore version. 
10 | """ 11 | return ( 12 | subprocess.check_output( 13 | ["git", "rev-parse", "HEAD"], cwd=os.path.dirname(os.path.abspath(__file__)) 14 | ) 15 | .strip() 16 | .decode("utf8")[:7] 17 | ) 18 | -------------------------------------------------------------------------------- /src/docstore/templates/_head.html: -------------------------------------------------------------------------------- 1 | 2 | {% if request_tags %} 3 | tagged with {% for t in request_tags %}{{ t }}{% if not loop.last %}, {% endif %}{% endfor %} — 4 | {% endif %} 5 | docstore{% if title %}/{{ title | smartypants | safe }}{% endif %} 6 | 7 | 8 | 9 | 10 | 11 | 16 | -------------------------------------------------------------------------------- /src/docstore/templates/_tag_list.html: -------------------------------------------------------------------------------- 1 | {% for entry in tag_tally|render_tag_list %} 2 | {% if entry['type'] == 'html_literal' %} 3 | {{ entry['value'] | safe }} 4 | {% elif entry['type'] == 'tag_link' %} 5 | {% set t = entry['name'] %} 6 | {% if t in request_tags %} 7 | {{ entry['display_name'] }} 8 | {% else %} 9 | {{ entry['display_name'] }} 10 | {% endif %} ({{ entry['count'] }}) 11 | {% elif entry['type'] == 'tag_text' %} 12 | {{ entry['display_name'] }} 13 | {% else %} 14 | {{ entry }} 15 | {% endif %} 16 | {% endfor %} 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "docstore" 3 | version = "2.0.0" 4 | 5 | [project.scripts] 6 | docstore = "docstore.cli:main" 7 | 8 | [tool.setuptools.packages.find] 9 | where = ["src"] 10 | 11 | [tool.setuptools.package-data] 12 | analytics = ["static/*", "templates/*"] 13 | 14 | [tool.coverage.run] 15 | branch = true 16 | source = [ 17 | "docstore", 18 | "tests", 19 | ] 20 | 21 | [tool.coverage.report] 22 | show_missing = true 23 | skip_covered = true 24 | fail_under = 90 25 | # fail_under = 100 26 | 27 | # [tool.pytest.ini_options] 28 | # filterwarnings = ["error"] 29 | 30 | [tool.mypy] 31 | mypy_path = "src" 32 | strict = true 33 | -------------------------------------------------------------------------------- /tests/test_downloads.py: -------------------------------------------------------------------------------- 1 | from email.message import Message 2 | 3 | from docstore.downloads import guess_filename 4 | 5 | 6 | def test_guess_filename_with_no_content_disposition() -> None: 7 | msg = Message() 8 | assert guess_filename("https://i.org/example.png", headers=msg) == "example.png" 9 | 10 | 11 | def test_guess_filename_with_content_disposition() -> None: 12 | msg = Message() 13 | msg.add_header("Content-Disposition", "attachment", filename="MyExample.png") 14 | assert guess_filename("https://i.org/example.png", headers=msg) == "MyExample.png" 15 | 16 | 17 | def test_guess_filename_with_content_disposition_but_no_filename() -> None: 18 | msg = Message() 19 | msg.add_header("Content-Disposition", "attachment") 20 | assert guess_filename("https://i.org/example.png", headers=msg) == "example.png" 21 | -------------------------------------------------------------------------------- /src/docstore/merging.py: -------------------------------------------------------------------------------- 1 | from .models import Document 2 | from .text_utils import common_prefix 3 | 4 | 5 | def get_title_candidates(documents: list[Document]) -> list[str]: 6 | title_candidates = [] 7 | 8 | for doc in documents: 9 | if 
doc.title not in title_candidates: 10 | title_candidates.append(doc.title) 11 | 12 | guessed_title = common_prefix(title_candidates) 13 | 14 | if guessed_title and guessed_title not in title_candidates: 15 | title_candidates.insert(0, guessed_title) 16 | 17 | return title_candidates 18 | 19 | 20 | def get_union_of_tags(documents: list[Document]) -> list[str]: 21 | """ 22 | Get a list of every tag on any document in ``documents``. 23 | """ 24 | tags = [] 25 | 26 | for doc in documents: 27 | for t in doc.tags: 28 | if t not in tags: 29 | tags.append(t) 30 | 31 | return tags 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile requirements.in --output-file requirements.txt 3 | attrs==23.2.0 4 | # via 5 | # -r requirements.in 6 | # cattrs 7 | blinker==1.8.2 8 | # via flask 9 | cattrs==23.2.3 10 | # via -r requirements.in 11 | click==8.1.7 12 | # via 13 | # -r requirements.in 14 | # flask 15 | flask==3.0.3 16 | # via -r requirements.in 17 | hyperlink==21.0.0 18 | # via -r requirements.in 19 | idna==3.7 20 | # via hyperlink 21 | itsdangerous==2.2.0 22 | # via flask 23 | jinja2==3.1.4 24 | # via flask 25 | markupsafe==2.1.5 26 | # via 27 | # jinja2 28 | # werkzeug 29 | pillow==10.4.0 30 | # via -r requirements.in 31 | rapidfuzz==3.9.4 32 | # via -r requirements.in 33 | smartypants==2.0.1 34 | # via -r requirements.in 35 | unidecode==1.3.8 36 | # via -r requirements.in 37 | wcag-contrast-ratio==0.9 38 | # via -r requirements.in 39 | werkzeug==3.0.3 40 | # via flask 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Alex Chan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /src/docstore/downloads.py: -------------------------------------------------------------------------------- 1 | import cgi 2 | from email.message import Message 3 | import os 4 | import pathlib 5 | from urllib.request import urlretrieve 6 | from urllib.parse import urlparse 7 | 8 | 9 | def guess_filename(url: str, headers: Message) -> str: 10 | """ 11 | Given a URL and the HTTP response headers, guess the final name of this file. 
12 | """ 13 | fallback = os.path.basename(urlparse(url).path) 14 | 15 | try: 16 | _, params = cgi.parse_header(headers["Content-Disposition"]) 17 | except TypeError: 18 | return fallback 19 | 20 | try: 21 | return params["filename"] 22 | except (KeyError, TypeError): 23 | return fallback 24 | 25 | 26 | def download_file(url: str) -> pathlib.Path: # pragma: no cover 27 | """ 28 | Download a file from a URL. Returns the path to the downloaded file. 29 | """ 30 | tmp_path, headers = urlretrieve(url) 31 | 32 | filename = guess_filename(url=url, headers=headers) 33 | 34 | out_path = os.path.join(os.path.dirname(tmp_path), filename) 35 | os.rename(tmp_path, out_path) 36 | 37 | return pathlib.Path(out_path) 38 | -------------------------------------------------------------------------------- /src/docstore/tag_cloud.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | 4 | class TagCloud: 5 | def __init__(self, tag_tally: dict[str, int]): 6 | self.tag_tally = tag_tally 7 | self.lowest_weight = min(tag_tally.values()) 8 | self.highest_weight = max(tag_tally.values()) 9 | self.range = (self.highest_weight - self.lowest_weight) or 1 10 | 11 | self.font_size_start = 10 12 | self.font_size_end = 24 13 | self.font_incr = (self.font_size_end - self.font_size_start) / self.range 14 | 15 | self.greyscale_start = 170 16 | self.greyscale_end = 70 17 | self.greyscale_incr = (self.greyscale_end - self.greyscale_start) / self.range 18 | 19 | @functools.lru_cache() 20 | def get_style(self, tag_count: int) -> str: 21 | weighting = tag_count - self.lowest_weight 22 | font_size = self.font_size_start + weighting * self.font_incr 23 | color = int(self.greyscale_start + weighting * self.greyscale_incr) 24 | return "font-size: %fpt; color: rgb(%d, %d, %d)" % ( 25 | font_size, 26 | color, 27 | color, 28 | color, 29 | ) 30 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | test: 14 | # Note: this project uses some macOS-specific tools like `qlmanage`, so 15 | # the tests need to run on macOS. 16 | # 17 | # This is different from my other Python projects, which usually run on 18 | # `ubuntu-latest`. 19 | runs-on: macos-latest 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.12" 28 | cache: 'pip' 29 | cache-dependency-path: 'dev_requirements.txt' 30 | 31 | - name: Install dependencies 32 | run: | 33 | brew install ffmpeg 34 | .github/install-github-bin alexwlchan/dominant_colours 35 | pip install -r dev_requirements.txt 36 | 37 | - name: Run linting 38 | run: | 39 | ruff check . 40 | ruff format --check . 
41 | 
42 |       - name: Check types
43 |         run: mypy src tests
44 | 
45 |       - name: Run tests
46 |         run: |
47 |           coverage run -m pytest tests
48 |           coverage report
49 | 
--------------------------------------------------------------------------------
/tests/test_tint_colors.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from docstore.thumbnails import create_thumbnail
4 | from docstore.tint_colors import (
5 |     Color,
6 |     choose_tint_color_from_dominant_colors,
7 |     choose_tint_color,
8 | )
9 | 
10 | 
11 | def test_choose_tint_color() -> None:
12 |     thumbnail_path = create_thumbnail("tests/files/Newtons_cradle.gif")
13 | 
14 |     tint_color = choose_tint_color(
15 |         thumbnail_path=thumbnail_path, file_path="tests/files/Newtons_cradle.gif"
16 |     )
17 |     assert all(0.4 <= c <= 0.5 for c in tint_color), tint_color
18 | 
19 | 
20 | @pytest.mark.parametrize(
21 |     "dominant_color, background_color, expected_tint",
22 |     [
23 |         ((1, 1, 1), (1, 1, 1), (0, 0, 0)),
24 |         ((0.9, 0.9, 0.9), (1, 1, 1), (0, 0, 0)),
25 |         ((0, 0, 0), (0, 0, 0), (1, 1, 1)),
26 |     ],
27 | )
28 | def test_selects_black_or_white_if_insufficient_contrast(
29 |     dominant_color: Color,
30 |     background_color: Color,
31 |     expected_tint: Color,
32 | ) -> None:
33 |     assert (
34 |         choose_tint_color_from_dominant_colors(
35 |             dominant_colors=[dominant_color], background_color=background_color
36 |         )
37 |         == expected_tint
38 |     )
39 | 
--------------------------------------------------------------------------------
/src/docstore/file_normalisation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import secrets
3 | import shutil
4 | 
5 | from docstore.text_utils import slugify
6 | 
7 | 
8 | def normalised_filename_copy(*, src: str, dst: str) -> str:
9 |     """
10 |     Copies a file from ``src`` to ``dst``.
11 | 
12 |     This copy function applies two normalisation steps:
13 | 
14 |     - It removes non-ASCII characters and spaces
15 |     - It appends a random hex value before the filename extension
16 |       if there are multiple files with the same name
17 | 
18 |     This copy function tries to be "safe". In particular, if there's
19 |     already a file at ``dst``, it refuses to overwrite it. Instead,
20 |     it appends a random identifier to ``dst`` and copies to that instead.
21 | 
22 |     e.g. if you pass dst=``Statement.pdf``, it might create files like
23 |     ``Statement.pdf``, ``Statement_1c5e.pdf``, ``Statement_3fc9.pdf``
24 | 
25 |     Returns the name of the final file.
26 | 27 | """ 28 | out_dir, filename = os.path.split(dst) 29 | 30 | os.makedirs(out_dir, exist_ok=True) 31 | 32 | name, ext = os.path.splitext(filename) 33 | name = slugify(name) 34 | 35 | out_path = os.path.join(out_dir, name + ext) 36 | 37 | while True: 38 | try: 39 | with open(out_path, "xb") as out_file: 40 | with open(src, "rb") as infile: 41 | shutil.copyfileobj(infile, out_file) 42 | except FileExistsError: 43 | out_path = os.path.join(out_dir, name + "_" + secrets.token_hex(2) + ext) 44 | else: 45 | return out_path 46 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import typing 3 | import uuid 4 | 5 | import pytest 6 | 7 | from docstore.models import Dimensions, Document, File, Thumbnail, from_json, to_json 8 | 9 | 10 | def is_recent(ds: datetime.datetime) -> bool: 11 | return (datetime.datetime.now() - ds).seconds < 2 12 | 13 | 14 | def test_document_defaults() -> None: 15 | d1 = Document(title="My test document") 16 | assert uuid.UUID(d1.id) 17 | assert is_recent(d1.date_saved) 18 | assert d1.tags == [] 19 | assert d1.files == [] 20 | 21 | d2 = Document(title="A different document") 22 | assert d1.id != d2.id 23 | 24 | 25 | def test_file_defaults() -> None: 26 | f = File( 27 | filename="cats.jpg", 28 | path="files/c/cats.jpg", 29 | size=100, 30 | checksum="sha256:123", 31 | thumbnail=Thumbnail( 32 | path="thumbnails/c/cats.jpg", 33 | dimensions=Dimensions(400, 300), 34 | tint_color="#ffffff", 35 | ), 36 | ) 37 | uuid.UUID(f.id) 38 | assert is_recent(f.date_saved) 39 | 40 | 41 | def test_can_serialise_document_to_json() -> None: 42 | f = File( 43 | filename="cats.jpg", 44 | path="files/c/cats.jpg", 45 | size=100, 46 | checksum="sha256:123", 47 | thumbnail=Thumbnail( 48 | path="thumbnails/c/cats.jpg", 49 | dimensions=Dimensions(400, 300), 50 | tint_color="#ffffff", 51 | ), 52 | ) 53 | 54 | documents = [Document(title="Another test document", files=[f])] 55 | assert from_json(to_json(documents)) == documents 56 | 57 | 58 | @pytest.mark.parametrize("documents", [[1, 2, 3], {"a", "b", "c"}]) 59 | def test_to_json_with_bad_list_is_typeerror(documents: typing.Any) -> None: 60 | with pytest.raises(TypeError, match=r"Expected type List\[Document\]!"): 61 | to_json(documents) 62 | -------------------------------------------------------------------------------- /migrations/from_2-0-0_to_2-1-0.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | DB schema migration: v2.0.0 ~> v2.1.0 4 | 5 | * Convert the document tree from a list of flat documents to a dict with some 6 | top-level metadata. 7 | * Record the dimension on Thumbnail instances. 8 | 9 | """ 10 | 11 | import datetime 12 | import json 13 | import os 14 | import shutil 15 | import sys 16 | 17 | import cattr 18 | import tqdm 19 | 20 | from docstore.git import current_commit 21 | from docstore.thumbnails import get_dimensions 22 | from exceptions import IncorrectSchemaError 23 | 24 | OLD_DB_SCHEMA = "v2.0.0" 25 | NEW_DB_SCHEMA = "v2.1.0" 26 | 27 | 28 | if __name__ == "__main__": 29 | try: 30 | root = sys.argv[1] 31 | except IndexError: 32 | root = "." 
33 | 34 | documents_path = os.path.join(root, "documents.json") 35 | backup_path = os.path.join(root, f"documents.{OLD_DB_SCHEMA}.json.bak") 36 | 37 | documents = json.load(open(documents_path)) 38 | 39 | if not isinstance(documents, list): 40 | raise IncorrectSchemaError( 41 | f"The docstore instance at {root} doesn't look like {OLD_DB_SCHEMA}" 42 | ) 43 | 44 | assert not os.path.exists(backup_path) 45 | shutil.copyfile(documents_path, backup_path) 46 | 47 | # Create the new top-level structure 48 | new_structure = { 49 | "docstore": { 50 | "db_schema": NEW_DB_SCHEMA, 51 | "commit": current_commit(), 52 | "last_modified": datetime.datetime.now().isoformat(), 53 | }, 54 | "documents": documents, 55 | } 56 | 57 | # Backfill the thumbnail dimensions 58 | for doc in tqdm.tqdm(documents): 59 | for f in doc["files"]: 60 | dimensions = get_dimensions(os.path.join(root, f["thumbnail"]["path"])) 61 | f["thumbnail"]["dimensions"] = cattr.unstructure(dimensions) 62 | 63 | # Write the new database 64 | with open(documents_path, "w") as outfile: 65 | outfile.write(json.dumps(new_structure, indent=2, sort_keys=True)) 66 | -------------------------------------------------------------------------------- /src/docstore/static/stacks.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_thumbnails.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from PIL import Image 4 | import pytest 5 | 6 | from docstore.thumbnails import create_thumbnail, get_dimensions 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "filename", ["Newtons_cradle.gif", "Rotating_earth_(large).gif"] 11 | ) 12 | def test_creates_thumbnail_of_animated_gif(filename: str) -> None: 13 | path = create_thumbnail(f"tests/files/{filename}", max_size=400) 14 | assert path.endswith(".mp4") 15 | 16 | 17 | def test_creates_thumbnail_of_single_frame_gif() -> None: 18 | path = create_thumbnail( 19 | "tests/files/Rotating_earth_(large)_singleframe.gif", max_size=400 20 | ) 21 | assert path.endswith(".png") 22 | 23 | im = Image.open(path) 24 | assert im.size == (400, 400) 25 | 26 | 27 | def test_creates_thumbnail_of_png() -> None: 28 | path = create_thumbnail("tests/files/cluster.png", max_size=250) 29 | assert path.endswith("/cluster.png") 30 | 31 | im = Image.open(path) 32 | assert im.size == (250, 162) 33 | 34 | 35 | def test_creates_thumbnail_of_pdf() -> None: 36 | path = create_thumbnail("tests/files/snakes.pdf", max_size=350) 37 | assert path.endswith("/snakes.pdf.png") 38 | 39 | im = Image.open(path) 40 | assert im.size == (247, 350) 41 | 42 | 43 | def test_creates_thumbnail_if_no_quicklook_plugin_available( 44 | tmpdir: pathlib.Path, 45 | ) -> None: 46 | path = str(tmpdir / "sqlite.db") 47 | 48 | with open(path, "wb") as outfile: 49 | outfile.write(b"SQLite format 3\x00") 50 | 51 | path = create_thumbnail(path) 52 | 53 | 54 | def test_gets_dimensions_of_an_image() -> None: 55 | dimensions = get_dimensions("tests/files/cluster.png") 56 | assert dimensions.width == 500 57 | assert dimensions.height == 325 58 | 59 | 60 | def test_gets_dimensions_of_a_video() -> None: 61 | thumbnail_path = create_thumbnail("tests/files/Newtons_cradle.gif") 62 | 63 | dimensions = get_dimensions(thumbnail_path) 64 | assert dimensions.width == 400 65 | assert dimensions.height == 300 66 | -------------------------------------------------------------------------------- 
/src/docstore/text_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import re 4 | import sys 5 | 6 | from unidecode import unidecode 7 | 8 | 9 | def common_prefix(values: list[str]) -> str: 10 | prefix = os.path.commonprefix(values).strip() 11 | 12 | prefix = prefix.strip("()").strip() 13 | if prefix.lower().endswith("(part"): 14 | prefix = prefix[: -len("(part")].strip() 15 | 16 | if prefix.lower().endswith("- part"): 17 | prefix = prefix[: -len("- part")].strip() 18 | 19 | return prefix 20 | 21 | 22 | def slugify(u: str) -> str: 23 | """ 24 | Convert Unicode string into blog slug. 25 | 26 | Based on http://www.leancrew.com/all-this/2014/10/asciifying/ 27 | 28 | """ 29 | u = re.sub("[–—/:;,._]", "-", u) # replace separating punctuation 30 | a = unidecode(u).lower() # best ASCII substitutions, lowercased 31 | a = re.sub(r"[^a-z0-9 -]", "", a) # delete any other characters 32 | a = a.replace(" ", "-") # spaces to hyphens 33 | a = re.sub(r"-+", "-", a) # condense repeated hyphens 34 | return a 35 | 36 | 37 | def pretty_date(d: datetime.datetime, now: datetime.datetime) -> str: 38 | delta = now - d 39 | if delta.total_seconds() < 120: 40 | return "just now" 41 | elif delta.total_seconds() < 60 * 60: 42 | return f"{int(delta.seconds / 60)} minutes ago" 43 | elif d.date() == now.date(): 44 | return "earlier today" 45 | elif d.date() == now.date() - datetime.timedelta(days=1): 46 | return "yesterday" 47 | else: 48 | for days in range(2, 8): 49 | if d.date() == now.date() - datetime.timedelta(days=days): 50 | return f"{days} days ago" 51 | return d.strftime("%-d %b %Y") 52 | 53 | 54 | def hostname(url: str) -> str: 55 | """ 56 | Returns a guess for the hostname of a URL to display in the tag. 57 | """ 58 | try: 59 | return url.split("/")[2] 60 | except IndexError: 61 | print(f"Unable to detect hostname of URL: {url}", file=sys.stderr) 62 | return url 63 | -------------------------------------------------------------------------------- /migrations/from_2-1-0_to_2-2-0.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | DB schema migration: v2.1.0 ~> v2.2.0 4 | 5 | * Record the tint color on Thumbnail instances. 6 | 7 | """ 8 | 9 | import datetime 10 | import filecmp 11 | import json 12 | import os 13 | import shutil 14 | import sys 15 | 16 | import tqdm 17 | 18 | from docstore.git import current_commit 19 | from docstore.tint_colors import choose_tint_color 20 | 21 | OLD_DB_SCHEMA = "v2.1.0" 22 | NEW_DB_SCHEMA = "v2.2.0" 23 | 24 | 25 | if __name__ == "__main__": 26 | try: 27 | root = sys.argv[1] 28 | except IndexError: 29 | root = "." 
30 | 
31 |     documents_path = os.path.join(root, "documents.json")
32 |     backup_path = os.path.join(root, f"documents.{OLD_DB_SCHEMA}.json.bak")
33 | 
34 |     documents = json.load(open(documents_path))
35 |     assert documents["docstore"]["db_schema"] == OLD_DB_SCHEMA
36 | 
37 |     # Backfill the thumbnail tint colours
38 |     for doc in tqdm.tqdm(documents["documents"]):
39 |         for f in doc["files"]:
40 |             tint_color = choose_tint_color(
41 |                 thumbnail_path=os.path.join(root, f["thumbnail"]["path"]),
42 |                 file_path=os.path.join(root, f["path"]),
43 |             )
44 | 
45 |             hex_tint_color = "#%02x%02x%02x" % tuple(
46 |                 int(component * 255) for component in tint_color
47 |             )
48 | 
49 |             f["thumbnail"]["tint_color"] = hex_tint_color
50 | 
51 |     new_output = {
52 |         "docstore": {
53 |             "db_schema": NEW_DB_SCHEMA,
54 |             "commit": current_commit(),
55 |             "last_modified": datetime.datetime.now().isoformat(),
56 |         },
57 |         "documents": documents["documents"],
58 |     }
59 | 
60 |     if os.path.exists(backup_path) and not filecmp.cmp(
61 |         backup_path, documents_path, shallow=False
62 |     ):
63 |         raise RuntimeError("Have you already started a migration of this version?")
64 | 
65 |     shutil.copyfile(documents_path, backup_path)
66 | 
67 |     # Write the new database
68 |     with open(documents_path, "w") as outfile:
69 |         outfile.write(json.dumps(new_output, indent=2, sort_keys=True))
70 | 
--------------------------------------------------------------------------------
/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | #    uv pip compile dev_requirements.in --output-file dev_requirements.txt
3 | -e file:.
4 |     # via -r dev_requirements.in
5 | attrs==23.2.0
6 |     # via
7 |     #   -r requirements.txt
8 |     #   cattrs
9 | beautifulsoup4==4.12.3
10 |     # via bs4
11 | blinker==1.8.2
12 |     # via
13 |     #   -r requirements.txt
14 |     #   flask
15 | bs4==0.0.2
16 |     # via -r dev_requirements.in
17 | cattrs==23.2.3
18 |     # via -r requirements.txt
19 | click==8.1.7
20 |     # via
21 |     #   -r requirements.txt
22 |     #   flask
23 | coverage[toml]==7.5.3
24 |     # via
25 |     #   -r dev_requirements.in
26 |     #   pytest-cov
27 | flask==3.0.3
28 |     # via -r requirements.txt
29 | hyperlink==21.0.0
30 |     # via -r requirements.txt
31 | idna==3.7
32 |     # via
33 |     #   -r requirements.txt
34 |     #   hyperlink
35 | iniconfig==2.0.0
36 |     # via pytest
37 | itsdangerous==2.2.0
38 |     # via
39 |     #   -r requirements.txt
40 |     #   flask
41 | jinja2==3.1.4
42 |     # via
43 |     #   -r requirements.txt
44 |     #   flask
45 | markupsafe==2.1.5
46 |     # via
47 |     #   -r requirements.txt
48 |     #   jinja2
49 |     #   werkzeug
50 | mypy==1.10.1
51 |     # via -r dev_requirements.in
52 | mypy-extensions==1.0.0
53 |     # via mypy
54 | packaging==24.1
55 |     # via pytest
56 | pillow==10.4.0
57 |     # via -r requirements.txt
58 | pluggy==1.5.0
59 |     # via pytest
60 | pytest==8.2.2
61 |     # via
62 |     #   -r dev_requirements.in
63 |     #   pytest-cov
64 | pytest-cov==5.0.0
65 |     # via -r dev_requirements.in
66 | rapidfuzz==3.9.4
67 |     # via -r requirements.txt
68 | ruff==0.5.1
69 |     # via -r dev_requirements.in
70 | smartypants==2.0.1
71 |     # via -r requirements.txt
72 | soupsieve==2.5
73 |     # via beautifulsoup4
74 | types-beautifulsoup4==4.12.0.20240511
75 |     # via -r dev_requirements.in
76 | types-html5lib==1.1.11.20240228
77 |     # via types-beautifulsoup4
78 | types-tqdm==4.66.0.20240417
79 |     # via -r dev_requirements.in
80 | typing-extensions==4.12.2
81 |     # via mypy
82 | unidecode==1.3.8
83 |     # via -r requirements.txt
84 | wcag-contrast-ratio==0.9
85 |     # via -r requirements.txt
86 | 
werkzeug==3.0.3 87 | # via 88 | # -r requirements.txt 89 | # flask 90 | -------------------------------------------------------------------------------- /src/docstore/static/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | background: url('/static/natural_paper.png'); 3 | font: 14pt Helvetica; 4 | line-height: 1.45em; 5 | margin: 0; 6 | padding: 0; 7 | } 8 | 9 | aside { 10 | background: url('/static/specktre.png'); 11 | background-size: auto 100%; 12 | padding: 6px 0px 2px 0px; 13 | border-bottom: 2px solid #aaa; 14 | font: 20pt "Nanum Brush Script"; 15 | font-weight: bold; 16 | box-shadow: 0 3px 6px rgba(0, 0, 0, 0.3) 17 | } 18 | 19 | td { 20 | line-height: 1.45em; 21 | font-size: 10pt; 22 | } 23 | 24 | main, #aside_inner { 25 | max-width: 940px; 26 | margin-left: auto; 27 | margin-right: auto; 28 | padding-left: 20px; 29 | padding-right: 20px; 30 | } 31 | 32 | .thumbnail { 33 | padding-right: 15px; 34 | } 35 | 36 | .thumbnail img { 37 | width: 100%; 38 | } 39 | 40 | h2 { 41 | margin-bottom: 0.5em; 42 | line-height: 1.25em; 43 | } 44 | 45 | table { 46 | width: 100%; 47 | } 48 | 49 | table hr { 50 | margin-left: 1em; 51 | margin-right: 1em; 52 | } 53 | 54 | hr { 55 | border: none; 56 | height: 1px; 57 | background: rgba(128, 128, 128, 0.5); 58 | } 59 | 60 | .thumbnail a:hover { 61 | background: #606060; 62 | } 63 | 64 | .thumbnail a img:hover { 65 | opacity: 0.8; 66 | } 67 | 68 | a { 69 | color: #606060; 70 | } 71 | 72 | a:hover { 73 | background: rgba(128, 128, 128, 0.3); 74 | } 75 | 76 | a.remove_tag { 77 | color: #d01c11; 78 | text-decoration: none; 79 | } 80 | 81 | a.remove_tag:hover { 82 | background: rgba(208, 28, 17, 0.3); 83 | } 84 | 85 | .meta_info { 86 | background: white; 87 | border: 2px solid #aaa; 88 | padding: 12px 15px; 89 | margin: 1em -14px; 90 | border-radius: 8px; 91 | font-size: 13pt; 92 | box-shadow: 0 3px 6px rgba(0, 0, 0, 0.2); 93 | } 94 | 95 | .meta_info ul { 96 | line-height: 1.4em; 97 | } 98 | 99 | #tag_cloud { 100 | text-align: justify; 101 | } 102 | 103 | #tag_cloud .tag { 104 | display: inline-block; 105 | } 106 | 107 | a.disabled { 108 | pointer-events: none; 109 | cursor: default; 110 | text-decoration: none; 111 | color: black; 112 | } 113 | 114 | .doc_id { 115 | font-weight: normal; 116 | display: none; 117 | color: #999; 118 | } 119 | 120 | h2:hover .doc_id { 121 | display: inline-block; 122 | } 123 | 124 | .sortBy { 125 | float: right; 126 | margin-bottom: 0; 127 | } 128 | -------------------------------------------------------------------------------- /tests/test_merging.py: -------------------------------------------------------------------------------- 1 | from docstore.merging import get_title_candidates, get_union_of_tags 2 | from docstore.models import Document 3 | 4 | 5 | class TestGetTitleCandidates: 6 | def test_single_document_is_title(self) -> None: 7 | doc = Document(title="Title 1") 8 | assert get_title_candidates([doc]) == ["Title 1"] 9 | 10 | def test_multiples_document_are_title_and_common_prefix(self) -> None: 11 | doc1 = Document(title="My document 1") 12 | doc2 = Document(title="My document 2") 13 | assert get_title_candidates([doc1, doc2]) == [ 14 | "My document", 15 | "My document 1", 16 | "My document 2", 17 | ] 18 | 19 | def test_does_not_double_add_common_prefix(self) -> None: 20 | doc1 = Document(title="My document") 21 | doc2 = Document(title="My document 2") 22 | assert get_title_candidates([doc1, doc2]) == ["My document", "My document 2"] 23 | 24 | def 
test_does_not_double_add_title(self) -> None: 25 | doc1 = Document(title="My document") 26 | doc2 = Document(title="My document") 27 | assert get_title_candidates([doc1, doc2]) == ["My document"] 28 | 29 | def test_does_not_add_empty_prefix(self) -> None: 30 | doc1 = Document(title="My document") 31 | doc2 = Document(title="Another document") 32 | assert get_title_candidates([doc1, doc2]) == ["My document", "Another document"] 33 | 34 | 35 | class TestGetUnionOfTags: 36 | def create_document_with_tags(self, tags: list[str]) -> Document: 37 | return Document(title="A test document", tags=tags) 38 | 39 | def test_tags_on_one_document_are_tags(self) -> None: 40 | doc = self.create_document_with_tags(tags=["tag1", "tag2", "tag3"]) 41 | assert get_union_of_tags([doc]) == ["tag1", "tag2", "tag3"] 42 | 43 | def test_get_tags_on_multiple_documents_with_no_overlap(self) -> None: 44 | doc1 = self.create_document_with_tags(tags=["tag1"]) 45 | doc2 = self.create_document_with_tags(tags=["tag2"]) 46 | doc3 = self.create_document_with_tags(tags=["tag3"]) 47 | assert get_union_of_tags([doc1, doc2, doc3]) == ["tag1", "tag2", "tag3"] 48 | 49 | def test_union_tags_deduplicates(self) -> None: 50 | doc1 = self.create_document_with_tags(tags=["tag1", "tag2"]) 51 | doc2 = self.create_document_with_tags(tags=["tag3", "tag2"]) 52 | doc3 = self.create_document_with_tags(tags=["tag3"]) 53 | assert get_union_of_tags([doc1, doc2, doc3]) == ["tag1", "tag2", "tag3"] 54 | -------------------------------------------------------------------------------- /tests/test_text_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pytest 4 | 5 | from docstore.text_utils import common_prefix, hostname, pretty_date, slugify 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "values, expected_prefix", 10 | [ 11 | (["My document"], "My document"), 12 | (["My document", "A different document"], ""), 13 | (["My document (1)", "My document (2)"], "My document"), 14 | (["My document (part 1)", "My document (part 2)"], "My document"), 15 | (["My document - part ", "My document - part 2"], "My document"), 16 | ], 17 | ) 18 | def test_common_prefix(values: list[str], expected_prefix: str) -> None: 19 | assert common_prefix(values) == expected_prefix 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "u, expected_slug", 24 | [ 25 | ("abc", "abc"), 26 | ("a:b", "a-b"), 27 | ("Çingleton", "cingleton"), 28 | ("a b", "a-b"), 29 | ("a_b", "a-b"), 30 | ("a b", "a-b"), 31 | ], 32 | ) 33 | def test_slugify(u: str, expected_slug: str) -> None: 34 | assert slugify(u) == expected_slug 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "d, now, expected_str", 39 | [ 40 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 1, 1, 11), "just now"), 41 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 1, 2, 59), "just now"), 42 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 1, 3, 1), "2 minutes ago"), 43 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 3, 1, 1), "earlier today"), 44 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 2, 1, 1, 1), "yesterday"), 45 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 6, 1, 1, 1), "5 days ago"), 46 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 12, 1, 1, 1), "1 Jan 2001"), 47 | (datetime(2001, 1, 1, 13, 0, 0), datetime(2001, 1, 3, 12, 0, 0), "2 days ago"), 48 | ], 49 | ) 50 | def test_pretty_date(d: datetime, now: datetime, expected_str: str) -> None: 51 | assert pretty_date(d=d, now=now) == expected_str 52 | 53 | 54 | 
@pytest.mark.parametrize( 55 | "url, expected_hostname", 56 | [ 57 | ("https://example.org/path/to/file.pdf", "example.org"), 58 | # This really appeared in the source_url of a docstore instance migrated 59 | # from v1, and caused a 500 error in the app. It's weird, but shouldn't cause 60 | # the app to crash. 61 | ("magic", "magic"), 62 | ], 63 | ) 64 | def test_hostname(url: str, expected_hostname: str) -> None: 65 | assert hostname(url) == expected_hostname 66 | -------------------------------------------------------------------------------- /tests/test_file_normalisation.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import os 3 | import pathlib 4 | 5 | from docstore.file_normalisation import normalised_filename_copy 6 | 7 | 8 | def test_copies_a_file(tmp_path: pathlib.Path) -> None: 9 | src = tmp_path / "src.txt" 10 | dst = tmp_path / "dst.txt" 11 | 12 | src.write_text("Hello world") 13 | assert not dst.exists() 14 | 15 | normalised_filename_copy(src=str(src), dst=str(dst)) 16 | 17 | assert dst.read_text() == "Hello world" 18 | 19 | 20 | def test_creates_intermediate_directories(tmp_path: pathlib.Path) -> None: 21 | src = tmp_path / "src.txt" 22 | dst = tmp_path / "1" / "2" / "3" / "dst.txt" 23 | 24 | src.write_text("Hello world") 25 | assert not dst.exists() 26 | 27 | normalised_filename_copy(src=str(src), dst=str(dst)) 28 | 29 | assert dst.read_text() == "Hello world" 30 | 31 | 32 | def test_copies_multiple_files_to_the_same_dst(tmp_path: pathlib.Path) -> None: 33 | src1 = tmp_path / "src1.txt" 34 | src2 = tmp_path / "src2.txt" 35 | src3 = tmp_path / "src3.txt" 36 | 37 | dst = tmp_path / "dst.txt" 38 | 39 | src1.write_text("Hello world") 40 | src2.write_text("Bonjour le monde") 41 | src3.write_text("Hallo Welt") 42 | 43 | normalised_filename_copy(src=str(src1), dst=str(dst)) 44 | normalised_filename_copy(src=str(src2), dst=str(dst)) 45 | normalised_filename_copy(src=str(src3), dst=str(dst)) 46 | 47 | assert len([f for f in os.listdir(tmp_path) if "dst" in f]) == 3 48 | 49 | dst_contents = { 50 | open(os.path.join(tmp_path, f)).read() 51 | for f in os.listdir(tmp_path) 52 | if "dst" in f 53 | } 54 | 55 | assert dst_contents == {"Hello world", "Bonjour le monde", "Hallo Welt"} 56 | 57 | 58 | def test_copies_multiple_files_concurrently(tmp_path: pathlib.Path) -> None: 59 | src1 = tmp_path / "src1.txt" 60 | src2 = tmp_path / "src2.txt" 61 | src3 = tmp_path / "src3.txt" 62 | 63 | dst = tmp_path / "dst.txt" 64 | 65 | src1.write_text("Hello world") 66 | src2.write_text("Bonjour le monde") 67 | src3.write_text("Hallo Welt") 68 | 69 | with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: 70 | futures = { 71 | executor.submit(normalised_filename_copy, src=str(s), dst=str(dst)) 72 | for s in (src1, src2, src3) 73 | } 74 | concurrent.futures.wait(futures) 75 | 76 | assert len([f for f in os.listdir(tmp_path) if "dst" in f]) == 3 77 | 78 | dst_contents = { 79 | open(os.path.join(tmp_path, f)).read() 80 | for f in os.listdir(tmp_path) 81 | if "dst" in f 82 | } 83 | 84 | assert dst_contents == {"Hello world", "Bonjour le monde", "Hallo Welt"} 85 | -------------------------------------------------------------------------------- /src/docstore/templates/_meta_info.html: -------------------------------------------------------------------------------- 1 | {% if documents|length <= page_end %} 2 | {% set next_url = "#" %} 3 | {% else %} 4 | {% set next_url = query_string|set_page(page + 1) %} 5 | {% endif %} 6 | 7 | 
{% if page == 1 %} 8 | {% set prev_url = "#" %} 9 | {% else %} 10 | {% set prev_url = query_string|set_page(page - 1) %} 11 | {% endif %} 12 | 13 |
14 | {% if documents|length == 0 %} 15 | no documents found! 16 | {% else %} 17 | showing document{% if page_start != page_end %}s{% endif %} {{ page_start }}{% if page_start != page_end %}–{{ page_end }}{% endif %} of {{ documents|length }}. 18 | 19 | {% if (prev_url != "#") or (next_url != "#") %} 20 | « prev 21 | • 22 | next » 23 | {% endif %} 24 | 25 |
26 | 27 | 38 |
39 | 40 | 52 | {% endif %} 53 | 54 | {% if (include_tags and tag_tally) or request_tags %} 55 |
56 | {% endif %} 57 | 58 | {% if include_tags and tag_tally %} 59 | {% if tag_tally|length > 15 %} 60 |
tag list 61 | {% include "_tag_list.html" %} 62 |
63 | 64 |
tag cloud 65 | {% endif %} 66 | 67 | {% set tag_cloud = TagCloud(tag_tally) %} 68 | 69 |
70 | {% for t, count in tag_tally.items()|sort() %} 71 | {% if t in request_tags %} 72 | {{ t }} 73 | {% else %} 74 | {{ t }} 75 | {% endif %} 76 | {% endfor %} 77 |
78 | 79 | {% if tag_tally|length > 15 %} 80 |
81 | {% endif %} 82 | 83 | {% if request_tags %} 84 |
85 | {% endif %} 86 | {% endif %} 87 | 88 | {% if request_tags %} 89 | filtering to tag{% if request_tags|length > 1 %}s{% endif %} 90 | {% for t in request_tags %} 91 | {{ t }} [x] 92 | {% endfor %} 93 | {% endif %} 94 |
95 | 
--------------------------------------------------------------------------------
/src/docstore/tint_colors.py:
--------------------------------------------------------------------------------
1 | import colorsys
2 | import subprocess
3 | import typing
4 | 
5 | import wcag_contrast_ratio as contrast
6 | 
7 | 
8 | Color: typing.TypeAlias = tuple[float, float, float]
9 | 
10 | 
11 | def choose_tint_color_from_dominant_colors(
12 |     dominant_colors: list[Color], background_color: Color
13 | ) -> Color:
14 |     """
15 |     Given a set of dominant colors (say, from a k-means algorithm) and the
16 |     background against which they'll be displayed, choose a tint color.
17 | 
18 |     Both ``dominant_colors`` and ``background_color`` should be tuples in [0,1].
19 |     """
20 |     # The minimum contrast ratio for text and background to meet WCAG AA
21 |     # is 4.5:1, so discard any dominant colours with a lower contrast.
22 |     sufficient_contrast_colors: list[Color] = [
23 |         typing.cast(Color, tuple(col))
24 |         for col in dominant_colors
25 |         if contrast.rgb(col, background_color) >= 4.5
26 |     ]
27 | 
28 |     # If none of the dominant colours meet WCAG AA with the background,
29 |     # try again with black and white -- every colour in the RGB space
30 |     # has a contrast ratio of 4.5:1 with at least one of these, so we'll
31 |     # get a tint colour, even if it's not a good one.
32 |     #
33 |     # Note: you could modify the dominant colours until one of them
34 |     # has sufficient contrast, but that's omitted here because it adds
35 |     # a lot of complexity for a relatively unusual case.
36 |     if not sufficient_contrast_colors:
37 |         return choose_tint_color_from_dominant_colors(
38 |             dominant_colors=dominant_colors + [(0, 0, 0), (1, 1, 1)],
39 |             background_color=background_color,
40 |         )
41 | 
42 |     # Of the colors with sufficient contrast, pick the one with the
43 |     # highest saturation. This is meant to optimise for colors that are
44 |     # more colourful/interesting than simple greys and browns.
45 |     hsv_candidates: dict[Color, Color] = {
46 |         rgb_col: colorsys.rgb_to_hsv(*rgb_col) for rgb_col in sufficient_contrast_colors
47 |     }
48 | 
49 |     return max(hsv_candidates, key=lambda rgb_col: hsv_candidates[rgb_col][1])
50 | 
51 | 
52 | def from_hex(hs: str | bytes) -> Color:
53 |     """
54 |     Returns an RGB tuple from a hex string, e.g. #ff0102 -> (255, 1, 2)
55 |     """
56 |     return int(hs[1:3], 16), int(hs[3:5], 16), int(hs[5:7], 16)
57 | 
58 | 
59 | def choose_tint_color_for_file(path: str) -> Color:
60 |     """
61 |     Returns the tint colour for a file.
62 |     """
63 |     background_color = (1, 1, 1)
64 | 
65 |     cmd = ["dominant_colours", "--no-palette", "--max-colours=12", path]
66 | 
67 |     dominant_colors = [
68 |         from_hex(line) for line in subprocess.check_output(cmd).splitlines()
69 |     ]
70 | 
71 |     colors = [(r / 255, g / 255, b / 255) for r, g, b in dominant_colors]
72 | 
73 |     return choose_tint_color_from_dominant_colors(
74 |         dominant_colors=colors, background_color=background_color
75 |     )
76 | 
77 | 
78 | def choose_tint_color(*, thumbnail_path: str, file_path: str) -> Color:
79 |     # In general, we use the thumbnail to choose the tint color. The thumbnail
80 |     # is what the tint color will usually appear next to. However, thumbnails
81 |     # for animated GIFs are MP4 videos rather than images, so we need to go to
82 |     # the original image to get the tint color.
83 |     if file_path.endswith((".jpg", ".jpeg", ".gif", ".png")):
84 |         return choose_tint_color_for_file(file_path)
85 |     else:
86 |         return choose_tint_color_for_file(thumbnail_path)
87 | 
--------------------------------------------------------------------------------
/docs/previewing-the-files.md:
--------------------------------------------------------------------------------
1 | # Previewing the files
2 | 
3 | As part of the web app, I create thumbnail images for every file.
4 | This makes it easier for me to find the file I'm looking for -- often I can spot it before I finish reading the text.
5 | 
6 | ![A web app showing a list of files, with a small thumbnail on the left-hand side next to each file.](thumbnails.png)
7 | 
8 | I've tried a couple of approaches for rendering file thumbnails.
9 | You can see the current implementation in [thumbnails.py][thumbnails.py].
10 | 
11 | [thumbnails.py]: https://github.com/alexwlchan/docstore/blob/main/src/docstore/thumbnails.py
12 | 
13 | 
14 | 
15 | ## Using pdftocairo to preview PDFs
16 | 
17 | Initially I was using [pdftocairo][pdftocairo] to render thumbnails, with a command like:
18 | 
19 | ```
20 | pdftocairo document.pdf \
21 |     -jpeg \
22 |     -singlefile \
23 |     -scale-to-x 400
24 | ```
25 | 
26 | This creates a JPEG thumbnail of `document.pdf` that's 400 pixels wide.
27 | 
28 | But it occasionally had issues, especially with PDFs I'd downloaded from my bank or HR system – for example, if it was missing a font, it rendered a completely blank thumbnail.
29 | That's not useful!
30 | 
31 | [pdftocairo]: https://www.mankier.com/1/pdftocairo
32 | 
33 | 
34 | 
35 | ## Using Quick Look to preview arbitrary files
36 | 
37 | As I started to store more files in docstore besides scanned PDFs, I needed to write more thumbnailing rules.
38 | For a while I tried to maintain that code myself, but I've switched to using [Quick Look][ql].
39 | This is a file previewing tool built into macOS: when looking at a file in the Finder, you can press space to see a preview of the file:
40 | 
41 | ![A Finder window with a Quick Look preview for a Keynote file titled "Using Python to organise my physical paper"](quick_look.png)
42 | 
43 | Quick Look exposes a command-line interface for creating thumbnails:
44 | 
45 | ```
46 | qlmanage -t -s 400 document.pdf
47 | ```
48 | 
49 | This creates a PNG thumbnail of `document.pdf` that's 400 pixels wide.
50 | 
51 | By default Quick Look supports a wide variety of file types, and it's pluggable – developers can write [Quick Look generators][ql_generators] to create previews if they have a custom file format.
52 | This means I let macOS handle the thumbnailing and don't have to worry about it in docstore.
53 | 
54 | [ql]: https://en.wikipedia.org/wiki/Quick_Look
55 | [ql_generators]: https://developer.apple.com/design/human-interface-guidelines/macos/system-capabilities/quick-look/
56 | 
57 | 
58 | 
59 | ## Using FFmpeg to preview animated GIFs
60 | 
61 | I have a handful of animated GIFs in docstore, and I want to get animated previews.
62 | For this I've found the best approach is to use [FFmpeg](https://ffmpeg.org/) to create a small video file that autoplays:
63 | 
64 | ```
65 | ffmpeg -i animated.gif \
66 |     -movflags faststart \
67 |     -pix_fmt yuv420p \
68 |     -vf scale=400:400 \
69 |     out.mp4
70 | ```
71 | 
72 | This is the same approach Twitter use for displaying GIFs.
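One detail that's easy to miss: the `yuv420p` pixel format only accepts frames with even dimensions.
docstore scales the longest edge of the GIF down to the target size, then rounds both dimensions down to even numbers before invoking FFmpeg (see `_create_gif_thumbnail_from_ffmpeg` in [thumbnails.py][thumbnails.py]).
A rough sketch of that calculation, using Pillow to read the source size (the helper name here is illustrative, not part of docstore):

```
from PIL import Image

def even_scaled_size(path: str, max_size: int = 400) -> tuple[int, int]:
    # Scale the longest edge down to max_size, preserving the aspect ratio...
    im = Image.open(path)
    if im.width > im.height and im.width >= max_size:
        width, height = max_size, int(im.height * max_size / im.width)
    else:
        width, height = int(im.width * max_size / im.height), max_size

    # ...then round both dimensions down to even values for yuv420p.
    return (width // 2) * 2, (height // 2) * 2
```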
73 | 
74 | 
75 | 
76 | ## Extracting tint colours with *k*-means colouring and Pillow
77 | 
78 | In the web app, there are links to the right-hand side of each thumbnail: to filter to other files with the same tag, or to the URL where I downloaded each file.
79 | I want to use a colour from the thumbnail to tint these links, just because it looks pretty:
80 | 
81 | ![A series of four colourful book covers, with metadata links to their right in matching colours.](tint_colors.png)
82 | 
83 | 
84 | For this, I use a technique called [*k*-means colouring][kmeans].
85 | I've written a [separate blog post][blog] about exactly how this works.
86 | 
87 | [kmeans]: https://en.wikipedia.org/wiki/K-means_clustering
88 | [blog]: https://alexwlchan.net/2019/08/finding-tint-colours-with-k-means/
89 | 
--------------------------------------------------------------------------------
/src/docstore/models.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import json
3 | import typing
4 | import uuid
5 | 
6 | import attr
7 | import cattr
8 | 
9 | from docstore.git import current_commit
10 | 
11 | 
12 | DB_SCHEMA = "v2.2.0"
13 | 
14 | 
15 | def _convert_to_datetime(d: datetime.datetime | str) -> datetime.datetime:
16 |     if isinstance(d, datetime.datetime):
17 |         return d
18 |     else:
19 |         return datetime.datetime.fromisoformat(d)
20 | 
21 | 
22 | def _convert_to_thumbnail(t: typing.Any) -> "Thumbnail":
23 |     if isinstance(t, Thumbnail):
24 |         return t
25 |     else:
26 |         return Thumbnail(**t)
27 | 
28 | 
29 | def _convert_to_dimensions(d: typing.Any) -> "Dimensions":
30 |     if isinstance(d, Dimensions):
31 |         return d
32 |     else:
33 |         return Dimensions(**d)
34 | 
35 | 
36 | def _convert_to_file(f_list: list[typing.Any]) -> "list[File]":
37 |     return [f if isinstance(f, File) else File(**f) for f in f_list]
38 | 
39 | 
40 | @attr.s
41 | class Dimensions:
42 |     width: int = attr.ib()
43 |     height: int = attr.ib()
44 | 
45 | 
46 | @attr.s
47 | class Thumbnail:
48 |     path: str = attr.ib()
49 |     dimensions: Dimensions = attr.ib(converter=_convert_to_dimensions)
50 |     tint_color: str = attr.ib()
51 | 
52 | 
53 | @attr.s
54 | class File:
55 |     filename: str = attr.ib(converter=str)
56 |     path: str = attr.ib()
57 |     size: int = attr.ib()
58 |     checksum: str = attr.ib()
59 |     thumbnail: Thumbnail = attr.ib(converter=_convert_to_thumbnail)
60 |     source_url: str | None = attr.ib(default=None)
61 |     date_saved: datetime.datetime = attr.ib(
62 |         factory=datetime.datetime.now, converter=_convert_to_datetime
63 |     )
64 |     id: str = attr.ib(default=attr.Factory(lambda: str(uuid.uuid4())))
65 | 
66 | 
67 | @attr.s
68 | class Document:
69 |     title: str = attr.ib()
70 |     id: str = attr.ib(default=attr.Factory(lambda: str(uuid.uuid4())))
71 |     date_saved: datetime.datetime = attr.ib(
72 |         factory=datetime.datetime.now, converter=_convert_to_datetime
73 |     )
74 |     tags: list[str] = attr.ib(factory=list)
75 |     files: list[File] = attr.ib(factory=list, converter=_convert_to_file)
76 | 
77 | 
78 | class DocstoreEncoder(json.JSONEncoder):
79 |     def default(self, obj: typing.Any) -> typing.Any:
80 |         if isinstance(obj, datetime.datetime):
81 |             return obj.isoformat()
82 |         else:  # pragma: no cover
83 |             return super().default(obj)
84 | 
85 | 
86 | def to_json(documents: list[Document]) -> str:
87 |     """
88 |     Returns a JSON string containing all the documents.
89 |     """
90 |     if not isinstance(documents, list) or not all(
91 |         isinstance(d, Document) for d in documents
92 |     ):
93 |         raise TypeError("Expected type List[Document]!")
94 | 
95 |     # Use the same order that's used to serve the documents; Python's sort()
96 |     # function goes faster if the documents are already in the right order.
97 |     documents = sorted(documents, key=lambda d: d.date_saved, reverse=True)
98 | 
99 |     return json.dumps(
100 |         {
101 |             "docstore": {
102 |                 "db_schema": DB_SCHEMA,
103 |                 "commit": current_commit(),
104 |                 "last_modified": datetime.datetime.now().isoformat(),
105 |             },
106 |             "documents": cattr.unstructure(documents),
107 |         },
108 |         indent=2,
109 |         sort_keys=True,
110 |         cls=DocstoreEncoder,
111 |     )
112 | 
113 | 
114 | def from_json(json_string: str) -> list[Document]:
115 |     """
116 |     Parses a JSON string containing all the documents.
117 |     """
118 |     parsed_structure = json.loads(json_string)
119 |     assert parsed_structure["docstore"]["db_schema"] == DB_SCHEMA
120 |     return cattr.structure(parsed_structure["documents"], list[Document])
121 | 
-------------------------------------------------------------------------------- /src/docstore/thumbnails.py: --------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import subprocess
4 | import sys
5 | import tempfile
6 | 
7 | from PIL import Image, UnidentifiedImageError
8 | 
9 | from docstore.models import Dimensions
10 | 
11 | 
12 | def _is_animated_gif(path: str) -> bool:
13 |     """
14 |     Returns True if the file at ``path`` is an animated GIF.
15 |     """
16 |     try:
17 |         im = Image.open(path)
18 |     except UnidentifiedImageError:
19 |         # Not an image
20 |         return False
21 |     else:
22 |         return getattr(im, "is_animated", False)
23 | 
24 | 
25 | def _create_gif_thumbnail_from_ffmpeg(*, path: str, max_size: int, out_dir: str) -> str:
26 |     im = Image.open(path)
27 | 
28 |     if im.width > im.height and im.width >= max_size:
29 |         width, height = (max_size, int(im.height * max_size / im.width))
30 |     else:
31 |         width, height = (int(im.width * max_size / im.height), max_size)
32 | 
33 |     # The yuv420p encoder requires even values
34 |     width, height = (int(width / 2) * 2, int(height / 2) * 2)
35 | 
36 |     out_path = os.path.join(out_dir, os.path.basename(path) + ".mp4")
37 | 
38 |     subprocess.check_call(
39 |         [
40 |             "ffmpeg",
41 |             "-i",
42 |             path,
43 |             "-movflags",
44 |             "faststart",
45 |             "-pix_fmt",
46 |             "yuv420p",
47 |             "-vf",
48 |             f"scale={width}:{height}",
49 |             out_path,
50 |         ],
51 |         stdout=subprocess.DEVNULL,
52 |     )
53 | 
54 |     return out_path
55 | 
56 | 
57 | def _create_thumbnail_from_quick_look(*, path: str, max_size: int, out_dir: str) -> str:
58 |     try:
59 |         subprocess.check_call(
60 |             ["qlmanage", "-t", path, "-s", f"{max_size}x{max_size}", "-o", out_dir],
61 |             stdout=subprocess.DEVNULL,
62 |             stderr=subprocess.DEVNULL,
63 |             timeout=5,
64 |         )
65 |     except subprocess.TimeoutExpired:
66 |         # It's possible for something to go wrong with the Quick Look
67 |         # process where it just hangs and doesn't create a thumbnail.
68 |         # If so, just continue without creating the thumbnail.
69 | pass 70 | 71 | try: 72 | result = os.path.join(out_dir, os.listdir(out_dir)[0]) 73 | except IndexError: 74 | print(f"Quick Look could not create a thumbnail for {path}", file=sys.stderr) 75 | result = os.path.join(out_dir, "generic_document.png") 76 | shutil.copyfile( 77 | src=os.path.join( 78 | os.path.dirname(os.path.abspath(__file__)), 79 | "static/generic_document.png", 80 | ), 81 | dst=result, 82 | ) 83 | 84 | if result.endswith(".png.png"): 85 | os.rename(result, result.replace(".png.png", ".png")) 86 | result = result.replace(".png.png", ".png") 87 | 88 | return result 89 | 90 | 91 | def create_thumbnail(path: str, *, max_size: int = 400) -> str: 92 | """ 93 | Creates a thumbnail of the file at ``path``. 94 | 95 | Returns the path to the new file. 96 | """ 97 | if _is_animated_gif(path): 98 | return _create_gif_thumbnail_from_ffmpeg( 99 | path=path, max_size=max_size, out_dir=tempfile.mkdtemp() 100 | ) 101 | else: 102 | return _create_thumbnail_from_quick_look( 103 | path=path, max_size=max_size, out_dir=tempfile.mkdtemp() 104 | ) 105 | 106 | 107 | def get_dimensions(path: str) -> Dimensions: 108 | """ 109 | Returns the (width, height) of a given path. 110 | """ 111 | if path.endswith(".png"): # image thumbnail 112 | im = Image.open(path) 113 | return Dimensions(width=im.width, height=im.height) 114 | 115 | elif path.endswith(".mp4"): # video thumbnail 116 | # See https://stackoverflow.com/a/29585066/1558022 117 | output = subprocess.check_output( 118 | [ 119 | "ffprobe", 120 | "-v", 121 | "error", 122 | "-show_entries", 123 | "stream=width,height", 124 | "-of", 125 | "csv=p=0:s=x", 126 | os.path.abspath(path), 127 | ] 128 | ) 129 | width, height = output.strip().split(b"x") 130 | return Dimensions(width=int(width), height=int(height)) 131 | 132 | else: # pragma: no cover 133 | raise ValueError(f"Unrecognised thumbnail type: {path}") 134 | -------------------------------------------------------------------------------- /docs/storing-the-metadata.md: -------------------------------------------------------------------------------- 1 | # Storing the metadata 2 | 3 | I store a certain amount of metadata alongside each file, including: 4 | 5 | * The original filename 6 | * When I saved it 7 | * A one-line human-readable description 8 | * What tags I'm using 9 | 10 | This document explains how I model the metadata and how I serialise it to disk. 11 | 12 | 13 | 14 | ## Modelling the metadata with attrs 15 | 16 | I use the [attrs library][attrs] to define my metadata models. 17 | The library provides a couple of decorators that let you define data classes without writing all the usual boilerplate. 18 | 19 | For example: 20 | 21 | ```pycon 22 | >>> import attr 23 | 24 | >>> @attr.s 25 | ... class Document: 26 | ... path = attr.ib() 27 | ... tags = attr.ib() 28 | ``` 29 | 30 | This defines a class called `Document` with two attributes `path` and `tags`. 31 | It gives me a constructor, and makes both those attributes available for reading/writing: 32 | 33 | ```pycon 34 | >>> doc = Document( 35 | ... path="scanned_doc.pdf", 36 | ... tags=["home", "bills", "acme energy"] 37 | ... ) 38 | 39 | >>> doc.path 40 | "scanned_doc.pdf" 41 | 42 | >>> doc.tags 43 | ["home", "bills", "acme energy"] 44 | 45 | >>> doc.tags.append("utilities:electricity") 46 | >>> doc.tags 47 | ["home", "bills", "acme energy", "utilities:electricity"] 48 | ``` 49 | 50 | The attrs library defines commonly used methods on the class, saving you from writing that boilerplate yourself. 
51 | For example, it includes a nice repr() of objects:
52 | 
53 | ```pycon
54 | >>> repr(doc)
55 | Document(path="scanned_doc.pdf", tags=["home", "bills", "acme energy", "utilities:electricity"])
56 | ```
57 | 
58 | That repr() can be eval()'d to get back the same value, and attrs provides methods for equality (shown below) and hashing:
59 | 
60 | ```pycon
61 | >>> eval(repr(doc)) == doc
62 | True
63 | 
64 | >>> doc == Document(path="cat.jpg", tags=["pets"])
65 | False
66 | ```
67 | 
68 | If this looks similar to the [dataclasses module][dataclasses] in the Python standard library, it's because attrs was a direct inspiration for dataclasses.
69 | I was using attrs before dataclasses existed and I've never been persuaded to switch.
70 | 
71 | Using attrs allows me to write short, compact models for my metadata.
72 | The entire model definition is less than 40 lines: [see models.py](https://github.com/alexwlchan/docstore/blob/a4b7972d147b538bbf48792566d55eeaea24e32a/src/docstore/models.py#L40-L71) for my model implementation.
73 | 
74 | [attrs]: https://www.attrs.org/en/stable/
75 | [dataclasses]: https://docs.python.org/3/library/dataclasses.html
76 | 
77 | 
78 | 
79 | ## Using JSON as a database
80 | 
81 | You can serialise an attrs model to a Python dict:
82 | 
83 | ```pycon
84 | >>> attr.asdict(doc)
85 | {"path": "scanned_doc.pdf", "tags": ["home", "bills", "acme energy", "utilities:electricity"]}
86 | ```
87 | 
88 | This looks pretty close to JSON, and I save all the metadata into a standalone JSON file that lives in the top-level directory of a docstore instance.
89 | 
90 | There are several reasons I like JSON for storing my docstore metadata:
91 | 
92 | - It maps very closely to data structures in Python.
93 |   I don't have to deal with any complex serialisation code.
94 | 
95 | - JSON is a simple format with parsing libraries in lots of languages.
96 |   Even if I lost all the code for docstore, I could still use the metadata.
97 | 
98 | - JSON is plain text, so it's easy to edit.
99 |   If I want to edit some metadata, I can open the metadata file in any text editor and make changes.
100 |   This means I didn't have to put any editing-related code in docstore itself.
101 | 
102 | I only have a few thousand files, so the performance impact of reading/writing all the JSON every time is minimal.
103 | You shouldn't use JSON for large data sets, but for small data sets it's absolutely fine.
104 | 
105 | 
106 | 
107 | ## Serialising attrs models to JSON and back
108 | 
109 | To save attrs models as JSON, or to read JSON as attrs models, I use the [cattrs library][cattrs].
110 | This provides a pair of functions to go in both directions:
111 | 
112 | ```pycon
113 | >>> cattr.unstructure(doc)
114 | {"path": "scanned_doc.pdf", "tags": ["home", "bills", "acme energy", "utilities:electricity"]}
115 | 
116 | >>> cattr.structure(
117 | ...     {"path": "cat.jpg", "tags": ["pets"]},
118 | ...     Document)
119 | Document(path="cat.jpg", tags=["pets"])
120 | ```
121 | 
122 | It has all the logic for doing validation, handling errors, and converting everything to the right type – so I don't have to write any custom serialisation code in docstore.
123 | 
124 | [cattrs]: https://cattrs.readthedocs.io/en/latest/
-------------------------------------------------------------------------------- /.github/install-github-bin: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Install one of my Rust binaries from GitHub.
3 | #
4 | # This was originally written to pull just my Rust binaries, but it
5 | # turns out it can work reasonably well for other projects. This allows
6 | # me to bypass Homebrew, which is generally nicer.
7 | #
8 | # For my CLI tools written in Rust, I use a GitHub Action [1]
9 | # to compile binaries and add them to a release on GitHub.
10 | #
11 | # These are the "canonical" versions, rather than recompiling on
12 | # each machine and having slightly different versions because I
13 | # was using different checkouts of the code.
14 | #
15 | # This script gets the latest version of a binary from GitHub and
16 | # adds it to my $PATH.
17 | #
18 | # This script isn't meant to be run directly (hence the underscore) --
19 | # I have other scripts that invoke it (e.g. install-vfd), so I can find
20 | # those scripts with autocomplete in my shell, even on a new machine.
21 | #
22 | # TODO: I publish macOS/Windows/Linux binaries, but this script only
23 | # fetches the macOS binary. In theory, it could detect which OS it's
24 | # running on and pick the right binary.
25 | #
26 | # [1]: https://github.com/taiki-e/upload-rust-binary-action
27 | 
28 | set -o errexit
29 | set -o nounset
30 | 
31 | if (( "$#" != 1 ))
32 | then
33 |   echo "Usage: _install-rust-bin <REPO_NAME>" >&2
34 |   exit 1
35 | fi
36 | 
37 | REPO_NAME="$1"
38 | RELEASES_API_URL="https://api.github.com/repos/$REPO_NAME/releases/latest"
39 | 
40 | # Fetch the asset URL using the GitHub Releases API [2].
41 | #
42 | # For the purposes of this script, these are the interesting bits of the
43 | # API response that we want to pay attention to:
44 | #
45 | #     {
46 | #       "assets": [
47 | #         {
48 | #           "name": "vfd-x86_64-apple-darwin.tar.gz",
49 | #           "url": "https://api.github.com/repos/alexwlchan/books.alexwlchan.net/releases/assets/64229966",
50 | #           ...
51 | #         },
52 | #         {
53 | #           "name": "vfd-x86_64-pc-windows-msvc.zip",
54 | #           "url": "https://api.github.com/repos/alexwlchan/books.alexwlchan.net/releases/assets/64229889",
55 | #           ...
56 | #         },
57 | #         {
58 | #           "name": "vfd-x86_64-unknown-linux-gnu.tar.gz",
59 | #           "url": "https://api.github.com/repos/alexwlchan/books.alexwlchan.net/releases/assets/64229611",
60 | #           ...
61 | #         }
62 | #       ],
63 | #     }
64 | #
65 | # [2]: https://docs.github.com/en/rest/releases/releases#get-the-latest-release
66 | #
67 | # Note: this will filter out ARM binaries because I'm not running on
68 | # Apple Silicon yet; this will need updating eventually.
69 | #
70 | ASSET_URL=$(curl --silent "$RELEASES_API_URL" \
71 |   | jq -r '.assets | .[] | select(.name | contains("darwin")) | select(.name | contains("arm") | not) | select(.name | contains("aarch64") | not) | .url' \
72 |   | grep -v arm64
73 | )
74 | 
75 | if [[ "$ASSET_URL" == "" ]]
76 | then
77 |   echo "No macOS download available for the latest version! Is it still building?" >&2
78 |   exit 1
79 | fi
80 | 
81 | # Download and unpack the asset using the GitHub Release Assets API [3].
82 | #
83 | # We supply the headers required by the GitHub API, and the `--location`
84 | # flag causes curl to follow redirects.
85 | #
86 | # Note: this assumes the binary is packaged as a tar.gz. The Windows
87 | # binaries are zipped instead of tar.gz-ed, so if you want to support
88 | # Windows, inspect the "content_type" field in the Releases API response.
89 | #
90 | # [3]: https://docs.github.com/en/rest/releases/assets#get-a-release-asset
91 | 
92 | cd "$(mktemp -d)"
93 | 
94 | curl \
95 |   --header "Accept: application/octet-stream" \
96 |   --location \
97 |   --silent \
98 |   "$ASSET_URL" > "asset.tar.gz"
99 | 
100 | # Identify the name of the binary, which may be different from the repo name.
101 | #
102 | # We list all the files in the asset package, which should contain a single
103 | # file, and assume that's the name of the binary.
104 | ASSET_FILES=$(tar --list --file "asset.tar.gz")
105 | 
106 | if [[ "$REPO_NAME" == "BurntSushi/ripgrep" ]]
107 | then
108 |   BINARY_PATH=$(echo "$ASSET_FILES" | grep "/rg$")
109 |   BINARY_NAME="rg"
110 | else
111 |   if (( $(echo "$ASSET_FILES" | wc -l) != 1 ))
112 |   then
113 |     echo "Release asset doesn't contain exactly 1 file; not sure what to do:" >&2
114 |     echo "$ASSET_FILES" >&2
115 |     exit 1
116 |   fi
117 | 
118 |   BINARY_PATH="$(echo "$ASSET_FILES" | head -n 1)"
119 |   BINARY_NAME="$BINARY_PATH"
120 | fi
121 | 
122 | # Now actually extract the binary, make it executable, and add it to the PATH.
123 | tar --extract --gunzip --file "asset.tar.gz"
124 | 
125 | chmod +x "$BINARY_PATH"
126 | sudo mv "$BINARY_PATH" /usr/local/bin
127 | 
128 | which "$BINARY_NAME"
129 | "$BINARY_NAME" --version
130 | 
-------------------------------------------------------------------------------- /src/docstore/tag_list.py: --------------------------------------------------------------------------------
1 | """
2 | Given a series of tags, arrange them into a hierarchy. For example:
3 |   - seasons
4 |     - autumn
5 |     - summer
6 |   - trees
7 |     - ash
8 |     - oak
9 |     - yew
10 |       - ancient
11 | This is based on
12 | https://github.com/dreamwidth/dw-free/blob/6ec1e146d3c464e506a77913f0abf0d51a944f95/styles/core2.s2#L4126
13 | """
14 | 
15 | import typing
16 | 
17 | 
18 | class HtmlLiteral(typing.TypedDict):
19 |     type: typing.Literal["html_literal"]
20 |     value: str
21 | 
22 | 
23 | class TagLink(typing.TypedDict):
24 |     type: typing.Literal["tag_link"]
25 |     name: str
26 |     count: int
27 |     display_name: str
28 | 
29 | 
30 | class TagText(typing.TypedDict):
31 |     type: typing.Literal["tag_text"]
32 |     display_name: str
33 | 
34 | 
35 | def render_tag_list(tag_tally: dict[str, int]) -> list[HtmlLiteral | TagLink | TagText]:
36 |     if not tag_tally:
37 |         return []
38 | 
39 |     prev_tags: list[str] = []
40 |     tag_list_pos = 0
41 |     tier_elements: list[HtmlLiteral | TagLink | TagText] = []
42 |     levels_to_close = 0
43 | 
44 |     result: list[HtmlLiteral | TagLink | TagText] = []
45 | 
46 |     for name, count in sorted(tag_tally.items()):
47 |         tags = name.split(":")
48 | 
49 |         pos = 0
50 |         show_lower_tiers = False
51 | 
52 |         for tier in tags:
53 |             # If we're on a tag's last tier and this tag isn't already selected,
54 |             # we need to return a link to the tag, otherwise plain text is returned.
55 |             if len(tags) == pos + 1:
56 |                 tier_elements = [
57 |                     {
58 |                         "type": "tag_link",
59 |                         "name": name,
60 |                         "count": count,
61 |                         "display_name": tier.lstrip("_"),
62 |                     }
63 |                 ]
64 |             else:
65 |                 tier_elements = [{"type": "tag_text", "display_name": tier}]
66 | 
67 |             # Prev tag has fewer tiers than the current tag.
68 |             if len(prev_tags) < pos + 1:
69 |                 result.append({"type": "html_literal", "value": "<ul><li>"})
70 |                 result.extend(tier_elements)
71 |                 levels_to_close += 1
72 | 
73 |             # Prev tag has the same or more tiers than the current tag.
74 |             elif tags[pos] != prev_tags[pos] or show_lower_tiers:
75 |                 # This tier is different from the previous tag's tier at the
76 |                 # same level, so close the lists that were opened for the
77 |                 # previous tag's lower tiers.
78 |                 i = levels_to_close
79 | 
80 |                 while i > pos + 1:
81 |                     result.append(
82 |                         {
83 |                             "type": "html_literal",
84 |                             "value": "</li></ul>",
85 |                         }
86 |                     )
87 |                     levels_to_close -= 1
88 |                     i -= 1
89 | 
90 |                 # If we just closed some lists, that means that any lower
91 |                 # tiers in this tag need to be explicitly displayed, even
92 |                 # if they match the same-level tier of the previous tag
93 |                 show_lower_tiers = True
94 | 
95 |                 if levels_to_close <= pos:
96 |                     # This is the first tier at this level, so open list
97 |                     result.append({"type": "html_literal", "value": "<ul><li>"})
98 |                     result.extend(tier_elements)
99 |                     levels_to_close += 1
100 |                 else:
101 |                     # The higher tiers are unchanged, so close the previous
102 |                     # tag and open a new item at the same level.
103 |                     result.append({"type": "html_literal", "value": "</li><li>"})
104 |                     result.extend(tier_elements)
105 | 
106 |             pos += 1
107 | 
108 |         prev_tags = tags
109 |         tag_list_pos += 1
110 | 
111 |     # We've now rendered every tag, so we need to close any lists that
112 |     # are still open.
113 |     #
114 |     # Note: because we return early when the tag tally is empty, there
115 |     # should always be at least one open list by the time we get here,
116 |     # so the "else" branch should be unreachable.
117 | 
118 |     if levels_to_close > 0:
119 |         # Close each open level in turn: every level that's still open
120 |         # needs both its list item and its list closing, working from
121 |         # the innermost list outwards.
122 |         while levels_to_close > 0:
123 |             result.append({"type": "html_literal", "value": "</li></ul>"})
124 |             levels_to_close -= 1
125 |     else:  # pragma: no cover
126 |         # I haven't been able to find a test case that triggers this
127 |         # particular branch, so I'm excluding it from coverage.
128 |         # If it does come up, come back and add a test for this line!
129 |         assert 0
130 | 
131 |     return result
-------------------------------------------------------------------------------- /src/docstore/server.py: --------------------------------------------------------------------------------
1 | import collections
2 | import datetime
3 | import functools
4 | import hashlib
5 | import os
6 | import pathlib
7 | import secrets
8 | import typing
9 | import urllib.parse
10 | from urllib.parse import parse_qsl, urlparse, urlencode
11 | 
12 | from flask import (
13 |     Flask,
14 |     Response as FlaskResponse,
15 |     make_response,
16 |     render_template,
17 |     request,
18 |     send_file,
19 |     send_from_directory,
20 | )
21 | import hyperlink
22 | import smartypants
23 | from werkzeug.middleware.profiler import ProfilerMiddleware
24 | 
25 | from .documents import find_original_filename, read_documents
26 | from .models import Document
27 | from .tag_cloud import TagCloud
28 | from .tag_list import render_tag_list
29 | from .text_utils import hostname, pretty_date
30 | 
31 | 
32 | def tags_with_prefix(document: Document, prefix: str) -> list[str]:
33 |     return [t for t in document.tags if t.startswith(prefix)]
34 | 
35 | 
36 | def tags_without_prefix(document: Document, prefix: str) -> list[str]:
37 |     return [t for t in document.tags if not t.startswith(prefix)]
38 | 
39 | 
40 | def url_without_sortby(u: str) -> str:
41 |     url = hyperlink.URL.from_text(u)
42 |     return str(url.remove("sortBy"))
43 | 
44 | 
45 | def serve_file(*, root: pathlib.Path, shard: str, filename: str) -> FlaskResponse:
46 |     """
47 |     Serves a file which has been saved in docstore.
48 | 
49 |     This adds the Content-Disposition header to the response, so files
50 |     are downloaded with the original filename they were uploaded as,
51 |     rather than the normalised filename.
52 | 53 | """ 54 | path = os.path.abspath(os.path.join(root, "files", shard, filename)) 55 | response = make_response(send_file(path)) 56 | 57 | original_filename = find_original_filename(root, path=path) 58 | 59 | # See https://stackoverflow.com/a/49481671/1558022 for UTF-8 encoding 60 | encoded_filename = urllib.parse.quote(original_filename, encoding="utf-8") 61 | response.headers["Content-Disposition"] = f"filename*=utf-8''{encoded_filename}" 62 | 63 | return response 64 | 65 | 66 | def create_app(title: str, root: pathlib.Path, thumbnail_width: int) -> Flask: 67 | app = Flask(__name__) 68 | 69 | app.config["THUMBNAIL_WIDTH"] = thumbnail_width 70 | 71 | app.jinja_env.trim_blocks = True 72 | app.jinja_env.lstrip_blocks = True 73 | 74 | app.jinja_env.filters["hostname"] = hostname 75 | app.jinja_env.filters["pretty_date"] = lambda d: pretty_date( 76 | d, now=datetime.datetime.now() 77 | ) 78 | app.jinja_env.filters["render_tag_list"] = render_tag_list 79 | app.jinja_env.filters["smartypants"] = smartypants.smartypants 80 | app.jinja_env.filters["url_without_sortby"] = url_without_sortby 81 | 82 | app.jinja_env.filters["tags_with_prefix"] = tags_with_prefix 83 | app.jinja_env.filters["tags_without_prefix"] = tags_without_prefix 84 | 85 | @app.route("/") 86 | def list_documents() -> str: 87 | request_tags = set(request.args.getlist("tag")) 88 | documents = [ 89 | doc for doc in read_documents(root) if request_tags.issubset(set(doc.tags)) 90 | ] 91 | 92 | tag_tally: dict[str, int] = collections.Counter() 93 | for doc in documents: 94 | for t in doc.tags: 95 | tag_tally[t] += 1 96 | 97 | try: 98 | page = int(request.args["page"]) 99 | except KeyError: 100 | page = 1 101 | 102 | sort_by = request.args.get("sortBy", "date (newest first)") 103 | 104 | if sort_by.startswith("date"): 105 | sort_key = lambda d: d.date_saved # noqa 106 | elif sort_by.startswith("title"): 107 | sort_key = lambda d: d.title.lower() # noqa 108 | elif sort_by == "random": 109 | if page == 1: 110 | app.config["_RANDOM_SEED"] = secrets.token_bytes() 111 | seed = app.config["_RANDOM_SEED"] 112 | 113 | def sort_key(d: Document) -> str: 114 | h = hashlib.md5() 115 | h.update(d.id.encode("utf8")) 116 | h.update(seed) 117 | return h.hexdigest() 118 | else: 119 | raise ValueError(f"Unrecognised sortBy query parameter: {sort_by}") 120 | 121 | if sort_by in {"date (newest first)", "title (Z to A)"}: 122 | sort_reverse = True 123 | else: 124 | sort_reverse = False 125 | 126 | html = render_template( 127 | "index.html", 128 | documents=sorted(documents, key=sort_key, reverse=sort_reverse), 129 | request_tags=request_tags, 130 | query_string=tuple(parse_qsl(urlparse(request.url).query)), 131 | tag_tally=tag_tally, 132 | title=title, 133 | page=page, 134 | sort_by=sort_by, 135 | TagCloud=TagCloud, 136 | ) 137 | 138 | return html 139 | 140 | @app.route("/thumbnails//") 141 | def thumbnails(shard: str, filename: str) -> FlaskResponse: 142 | return send_from_directory( 143 | os.path.abspath(os.path.join(root, "thumbnails", shard)), filename 144 | ) 145 | 146 | app.add_url_rule( 147 | rule="/files//", 148 | view_func=lambda shard, filename: serve_file( 149 | root=root, shard=shard, filename=filename 150 | ), 151 | ) 152 | 153 | QueryString: typing.TypeAlias = list[tuple[str, str]] 154 | 155 | @app.template_filter("add_tag") 156 | @functools.lru_cache() 157 | def add_tag(query_string: QueryString, tag: str) -> str: 158 | return "?" 
+ urlencode(
159 |             [(k, v) for k, v in query_string if k != "page"] + [("tag", tag)]
160 |         )
161 | 
162 |     @app.template_filter("remove_tag")
163 |     def remove_tag(query_string: QueryString, tag: str) -> str:
164 |         return "?" + urlencode(
165 |             [(k, v) for k, v in query_string if (k, v) != ("tag", tag)]
166 |         )
167 | 
168 |     @app.template_filter("set_page")
169 |     @functools.lru_cache()
170 |     def set_page(query_string: QueryString, page: int) -> str:
171 |         pageless_qs = [(k, v) for k, v in query_string if k != "page"]
172 |         if page == 1:
173 |             return "?" + urlencode(pageless_qs)
174 |         else:
175 |             return "?" + urlencode(pageless_qs + [("page", page)])
176 | 
177 |     return app
178 | 
179 | 
180 | def run_profiler(app: Flask, *, host: str, port: int) -> None:  # pragma: no cover
181 |     app.config["PROFILE"] = True
182 |     app.wsgi_app = ProfilerMiddleware(app.wsgi_app, restrictions=[30])  # type: ignore
183 |     app.run(host=host, port=port, debug=True)
184 | 
185 | 
186 | def run_server(
187 |     app: Flask, *, host: str, port: int, debug: bool
188 | ) -> None:  # pragma: no cover
189 |     app.run(host=host, port=port, debug=debug)
-------------------------------------------------------------------------------- /docs/storing-the-files.md: --------------------------------------------------------------------------------
1 | # Storing the files
2 | 
3 | Part of the point of docstore is to abstract away the management of individual files.
4 | I don't want to worry about managing individual files and folders – I want the tool to do that for me.
5 | 
6 | This document explains a bit about how docstore manages my files.
7 | 
8 | 
9 | 
10 | ## Where the files are stored
11 | 
12 | I run docstore on my home computer, which shouldn't be accessible from the Internet.
13 | **The files are stored on the local disk, not in cloud storage.**
14 | 
15 | I use docstore to store files with private information: bank statements, medical letters, rental contracts, and more.
16 | If I uploaded them to a cloud storage service like S3, there's a risk I'd misconfigure the permissions and inadvertently make the files public.
17 | For me, the security of knowing they're not in the cloud outweighs the potential convenience of having remote access.
18 | 
19 | 
20 | 
21 | ## How the files are named / filename normalisation
22 | 
23 | Although my scanned documents have autogenerated filenames, sometimes I download documents that I want to save (e.g. electronic banking statements), which have … interesting filename choices.
24 | 
25 | These are real filenames I've received:
26 | 
27 | <table>
28 |   <tr>
29 |     <th>Filename</th>
30 |     <th>Comments</th>
31 |   </tr>
32 |   <tr>
33 |     <td>VolcanoPattern.pdf</td>
34 |     <td>10/10 great name.</td>
35 |   </tr>
36 |   <tr>
37 |     <td>Alex Chan_5312.pdf</td>
38 |     <td>Spaces in filenames cause nothing but trouble.</td>
39 |   </tr>
40 |   <tr>
41 |     <td>Statement.pdf</td>
42 |     <td>This is a bank statement with no context. I have dozens of files with identical names, covering different accounts and date ranges.</td>
43 |   </tr>
44 |   <tr>
45 |     <td>Alexander Chan›Payslip November 2014-2015.PDF</td>
46 |     <td>Special characters are annoying.</td>
47 |   </tr>
48 |   <tr>
49 |     <td>V5C:3 scrappage note.pdf</td>
50 |     <td>I have no idea how I created this file. This is the V5C/3 form, so at some point the slash has been converted to a colon – but both the colon and slash are used as path separators on macOS, and are best avoided.</td>
51 |   </tr>
52 | </table>
53 | 
54 | So I can't rely on the original filename: maybe it contains special characters, or I have different files with the same filename.
55 | The original filename is a useful piece of metadata that I want to keep, but I can't use it for saving files.
56 | 
57 | **I save files under a normalised version of their original filename.**
58 | I want to keep as close to the original filename as possible -- so no UUIDs.
59 | Then I save the original filename as a bit of metadata in the database.
60 | 
61 | The normalisation process has two steps:
62 | 
63 | - Creating an ASCII-safe filename using [Dr Drang's slugify() function](http://www.leancrew.com/all-this/2014/10/asciifying/).
64 |   This uses the [Unidecode](https://pypi.org/project/Unidecode/) and [re libraries](https://docs.python.org/3/library/re.html) to remove any non-ASCII characters and spaces.
65 | 
66 | - Appending a random hex value before the filename extension if there are multiple files with the same name.
67 |   This avoids saving two files with the same name.
68 |   e.g. `Statement.pdf`, `Statement_1c5e.pdf`, `Statement_3fc9.pdf`, …
69 | 
70 | For the exact implementation, see [file_normalisation.py](https://github.com/alexwlchan/docstore/blob/main/src/docstore/file_normalisation.py).
71 | 
72 | 
73 | 
74 | ## Ensuring I don't save two files with the same name / exclusive-open mode in Python
75 | 
76 | What if two processes try to save a file with the same name simultaneously?
77 | How do I ensure the normalisation kicks in and adds the random hex value to keep the files apart?
78 | 
79 | This is probably overkill: I'm the only person saving documents, and I can't do multiple things at once.
80 | But it was pretty easy to add, and it's a useful example of a less well-known feature in Python.
81 | 
82 | If you've used Python, you probably know how to [read and write files][python_rw]:
83 | 
84 | ```pycon
85 | >>> with open("greeting.txt", mode="w") as outfile:
86 | ...     outfile.write("Hello world!")
87 | 12
88 | 
89 | >>> with open("greeting.txt", mode="r") as infile:
90 | ...     print(infile.read())
91 | Hello world!
92 | ```
93 | 
94 | The `mode` argument tells Python whether you're writing (`w`) or reading (`r`).
95 | These are by far the most commonly used values.
96 | 
97 | What if you want to write to a file, but only if it doesn't exist yet?
98 | You could check if it exists first:
99 | 
100 | ```pycon
101 | >>> if not os.path.exists("important.txt"):
102 | ...     with open("important.txt", mode="w"):
103 | ...         
104 | ```
105 | 
106 | but this is risky – what if the file is created between the existence check and when you open it?
107 | 
108 | It's better to use mode `x`, which means **exclusive open**.
109 | You write as normal, but if the file already exists, the `open()` throws a FileExistsError:
110 | 
111 | ```pycon
112 | >>> with open("greeting.txt", mode="x") as outfile:
113 | ...     outfile.write("Hello world!")
114 | 12
115 | 
116 | >>> with open("greeting.txt", mode="x") as outfile:
117 | ...     outfile.write("Bonjour le monde!")
118 | Traceback (most recent call last):
119 |   File "<stdin>", line 1, in <module>
120 | FileExistsError: [Errno 17] File exists: 'greeting.txt'
121 | ```
122 | 
123 | This is enforced at the OS-level so it's a bit more robust.
124 | I use this to ensure I don't save two files with the same name – one will succeed, the other will throw a FileExistsError and get a random hex value inserted to distinguish it.
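
Put together, the save logic looks something like this minimal sketch (`write_with_unique_name` is a hypothetical name I'm using for illustration, not docstore's exact code):

```python
import os
import secrets


def write_with_unique_name(dst: str, data: bytes) -> str:
    """Write ``data`` to ``dst``, picking a new name if ``dst`` is taken."""
    while True:
        try:
            # mode "xb" = exclusive create, in binary mode -- this throws
            # FileExistsError if there's already a file at ``dst``
            with open(dst, mode="xb") as out_file:
                out_file.write(data)
            return dst
        except FileExistsError:
            # e.g. Statement.pdf -> Statement_1c5e.pdf
            name, ext = os.path.splitext(dst)
            dst = f"{name}_{secrets.token_hex(2)}{ext}"
```
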
125 | 
126 | For the exact implementation, see [file_normalisation.py](https://github.com/alexwlchan/docstore/blob/main/src/docstore/file_normalisation.py).
127 | 
128 | [python_rw]: https://docs.python.org/3/tutorial/inputoutput.html#tut-files
129 | 
130 | 
131 | 
132 | ## Downloading files with their original filename / the Content-Disposition header
133 | 
134 | When I download a file from the web app, I want to download it with the original filename -- not the normalised version.
135 | 
136 | For example, if I have an HTML link:
137 | 
138 | ```html
139 | <a href="files/b/beijing.pdf">Download my file</a>
140 | ```
141 | 
142 | then if I downloaded this link, my web browser would download a file named `beijing.pdf`.
143 | 
144 | But you can use the [Content-Disposition header][cd_header] to suggest to a browser that it should download a file with a different name.
145 | In particular, if the server returns the header:
146 | 
147 | ```
148 | Content-Disposition: attachment; filename="北京.pdf"
149 | ```
150 | 
151 | then the browser will download the file as `北京.pdf`.
152 | 
153 | For the exact implementation, see [`serve_file()` in `server.py`](https://github.com/alexwlchan/docstore/blob/7cb1cfd708c212af4dc0673dc8da372f7b8c79a4/src/docstore/server.py#L39-L57).
154 | 
155 | [cd_header]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition
-------------------------------------------------------------------------------- /src/docstore/documents.py: --------------------------------------------------------------------------------
1 | import datetime
2 | import hashlib
3 | import json
4 | import os
5 | import pathlib
6 | import shutil
7 | import typing
8 | 
9 | import cattr
10 | 
11 | from docstore.file_normalisation import normalised_filename_copy
12 | from docstore.models import (
13 |     DocstoreEncoder,
14 |     Document,
15 |     File,
16 |     Thumbnail,
17 |     from_json,
18 |     to_json,
19 | )
20 | from docstore.text_utils import slugify
21 | from docstore.thumbnails import create_thumbnail, get_dimensions
22 | from docstore.tint_colors import choose_tint_color
23 | 
24 | 
25 | def db_path(root: pathlib.Path) -> pathlib.Path:
26 |     """
27 |     Returns the path to the database.
28 |     """
29 |     return root / "documents.json"
30 | 
31 | 
32 | class CachedDocuments(typing.TypedDict):
33 |     last_modified: float
34 |     contents: list[Document]
35 | 
36 | 
37 | _cached_documents: CachedDocuments = {
38 |     "last_modified": -1,
39 |     "contents": [],
40 | }
41 | 
42 | 
43 | def read_documents(root: pathlib.Path) -> list[Document]:
44 |     """
45 |     Get a list of all the documents.
46 |     """
47 |     # JSON parsing is somewhat expensive. By caching the result rather than
48 |     # going to disk each time, we see a ~10x speedup in returning responses
49 |     # from the server.
50 | try: 51 | if ( 52 | _cached_documents["last_modified"] is not None 53 | and os.stat(db_path(root)).st_mtime <= _cached_documents["last_modified"] 54 | ): 55 | return _cached_documents["contents"] 56 | except FileNotFoundError: 57 | pass 58 | 59 | try: 60 | with open(db_path(root)) as infile: 61 | result = from_json(infile.read()) 62 | except FileNotFoundError: 63 | return [] 64 | 65 | _cached_documents["last_modified"] = os.stat(db_path(root)).st_mtime 66 | _cached_documents["contents"] = result 67 | 68 | return result 69 | 70 | 71 | def write_documents(*, root: pathlib.Path, documents: list[Document]) -> None: 72 | json_string = to_json(documents) 73 | 74 | os.makedirs(root, exist_ok=True) 75 | 76 | with open(db_path(root), "w") as out_file: 77 | out_file.write(json_string) 78 | 79 | 80 | def sha256(path: pathlib.Path) -> str: 81 | h = hashlib.sha256() 82 | with open(path, "rb") as infile: 83 | for byte_block in iter(lambda: infile.read(4096), b""): 84 | h.update(byte_block) 85 | 86 | return "sha256:%s" % h.hexdigest() 87 | 88 | 89 | def store_new_document( 90 | *, 91 | root: pathlib.Path, 92 | path: pathlib.Path, 93 | title: str, 94 | tags: list[str], 95 | source_url: str | None, 96 | date_saved: datetime.datetime, 97 | ) -> Document: 98 | filename = os.path.basename(path) 99 | 100 | # Files are sharded by the first letter of their filename, 101 | # e.g. "aardvark.png" is saved in "a/aardvark.png" 102 | shard = slugify(filename)[0].lower() 103 | 104 | dst = os.path.join(root, "files", shard, filename) 105 | 106 | out_path = normalised_filename_copy(src=str(path), dst=dst) 107 | 108 | thumbnail_path = create_thumbnail(out_path) 109 | thumbnail_name = os.path.basename(thumbnail_path) 110 | thumb_out_path = os.path.join(root, "thumbnails", thumbnail_name[0], thumbnail_name) 111 | os.makedirs(os.path.dirname(thumb_out_path), exist_ok=True) 112 | shutil.move(thumbnail_path, thumb_out_path) 113 | 114 | tint_color = choose_tint_color(thumbnail_path=thumb_out_path, file_path=out_path) 115 | 116 | hex_tint_color = "#%02x%02x%02x" % tuple( 117 | int(component * 255) for component in tint_color 118 | ) 119 | 120 | new_document = Document( 121 | title=title, 122 | date_saved=date_saved, 123 | tags=tags, 124 | files=[ 125 | File( 126 | filename=filename, 127 | path=os.path.relpath(out_path, root), 128 | size=os.stat(out_path).st_size, 129 | checksum=sha256(pathlib.Path(out_path)), 130 | source_url=source_url, 131 | thumbnail=Thumbnail( 132 | path=os.path.relpath(thumb_out_path, root), 133 | dimensions=get_dimensions(thumb_out_path), 134 | tint_color=hex_tint_color, 135 | ), 136 | date_saved=date_saved, 137 | ) 138 | ], 139 | ) 140 | 141 | documents = read_documents(root) 142 | documents.append(new_document) 143 | 144 | write_documents(root=root, documents=documents) 145 | 146 | # Don't delete the original file until it's been successfully recorded 147 | # and a thumbnail created. 148 | os.unlink(path) 149 | 150 | return new_document 151 | 152 | 153 | def pairwise_merge_documents( 154 | root: pathlib.Path, 155 | *, 156 | doc1: Document, 157 | doc2: Document, 158 | new_title: str, 159 | new_tags: list[str], 160 | ) -> Document: 161 | """ 162 | Merge the files on two documents together. 
163 | 164 | Before: 2 documents with 1 file each 165 | After: 1 document with 2 files 166 | """ 167 | documents = read_documents(root) 168 | assert doc2 in documents 169 | documents.remove(doc2) 170 | 171 | # Modify the copy of the document that's about to be written; this will 172 | # throw an error if the document has changed between starting and finishing 173 | # the merge. 174 | stored_doc1 = documents[documents.index(doc1)] 175 | 176 | stored_doc1.date_saved = min([stored_doc1.date_saved, doc2.date_saved]) 177 | stored_doc1.tags = new_tags 178 | stored_doc1.title = new_title 179 | stored_doc1.files.extend(doc2.files) 180 | write_documents(root=root, documents=documents) 181 | 182 | return stored_doc1 183 | 184 | 185 | def delete_document(root: pathlib.Path, *, doc_id: str) -> None: 186 | documents = read_documents(root) 187 | doc = [d for d in documents if d.id == doc_id][0] 188 | 189 | delete_dir = os.path.join(root, "deleted", doc.id) 190 | os.makedirs(delete_dir, exist_ok=True) 191 | 192 | for f in doc.files: 193 | os.rename( 194 | os.path.join(root, f.path), 195 | os.path.join(delete_dir, os.path.basename(f.path)), 196 | ) 197 | os.unlink(os.path.join(root, f.thumbnail.path)) 198 | 199 | with open(os.path.join(delete_dir, "document.json"), "w") as outfile: 200 | outfile.write( 201 | json.dumps( 202 | cattr.unstructure(doc), indent=2, sort_keys=True, cls=DocstoreEncoder 203 | ) 204 | ) 205 | 206 | documents = [d for d in documents if d.id != doc_id] 207 | write_documents(root=root, documents=documents) 208 | 209 | 210 | def find_original_filename(root: pathlib.Path, *, path: str) -> str: 211 | """ 212 | Returns the name of the original file stored in this path. 213 | """ 214 | documents = read_documents(root) 215 | for d in documents: 216 | for f in d.files: 217 | if f.path == os.path.relpath(path, root): 218 | return f.filename 219 | 220 | raise ValueError(f"Couldn't find file stored with path {path}") 221 | -------------------------------------------------------------------------------- /tests/test_documents.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | import pathlib 5 | import shutil 6 | 7 | from docstore.documents import ( 8 | delete_document, 9 | pairwise_merge_documents, 10 | read_documents, 11 | sha256, 12 | store_new_document, 13 | write_documents, 14 | ) 15 | from docstore.models import Dimensions, Document, File, Thumbnail 16 | 17 | 18 | def test_sha256() -> None: 19 | p = pathlib.Path("tests/files/cluster.png") 20 | 21 | assert ( 22 | sha256(p) 23 | == "sha256:683cbee0c2dda22b42fd92bda0f31e4b6b49cd8650a7924d72a14a30f11bfbe5" 24 | ) 25 | 26 | 27 | def test_read_blank_documents_is_empty(tmpdir: pathlib.Path) -> None: 28 | assert read_documents(tmpdir) == [] 29 | 30 | 31 | def test_can_write_and_read_documents(tmpdir: pathlib.Path) -> None: 32 | documents = [Document(title="My first document")] 33 | 34 | write_documents(root=tmpdir, documents=documents) 35 | 36 | # Repeat a couple of times so we hit the caching paths. 
37 |     for _ in range(3):
38 |         assert read_documents(tmpdir) == documents
39 | 
40 | 
41 | def test_can_merge_documents(tmpdir: pathlib.Path, root: pathlib.Path) -> None:
42 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster1.png")
43 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster2.png")
44 | 
45 |     doc1 = store_new_document(
46 |         root=root,
47 |         path=tmpdir / "cluster1.png",
48 |         title="My first document",
49 |         tags=["tag1"],
50 |         source_url="https://example.org/cluster1.png",
51 |         date_saved=datetime.datetime.now(),
52 |     )
53 |     doc2 = store_new_document(
54 |         root=root,
55 |         path=tmpdir / "cluster2.png",
56 |         title="My second document",
57 |         tags=["tag2"],
58 |         source_url="https://example.org/cluster2.png",
59 |         date_saved=datetime.datetime.now(),
60 |     )
61 | 
62 |     pairwise_merge_documents(
63 |         root=root,
64 |         doc1=doc1,
65 |         doc2=doc2,
66 |         new_title="My merged document",
67 |         new_tags=["tag1", "tag2", "new_merged_tag"],
68 |     )
69 | 
70 |     stored_documents = read_documents(root)
71 | 
72 |     assert stored_documents == [
73 |         Document(
74 |             id=doc1.id,
75 |             date_saved=doc1.date_saved,
76 |             files=doc1.files + doc2.files,
77 |             title="My merged document",
78 |             tags=["tag1", "tag2", "new_merged_tag"],
79 |         )
80 |     ]
81 | 
82 | 
83 | def test_merging_uses_earliest_date(tmpdir: pathlib.Path) -> None:
84 |     doc1 = Document(title="Doc1", date_saved=datetime.datetime(2010, 1, 1))
85 |     doc2 = Document(title="Doc2", date_saved=datetime.datetime(2002, 2, 2))
86 | 
87 |     write_documents(root=tmpdir, documents=[doc1, doc2])
88 | 
89 |     pairwise_merge_documents(
90 |         root=tmpdir,
91 |         doc1=doc1,
92 |         doc2=doc2,
93 |         new_title="DocMerged",
94 |         new_tags=[],
95 |     )
96 | 
97 |     stored_documents = read_documents(tmpdir)
98 | 
99 |     assert doc2.date_saved < doc1.date_saved
100 |     assert len(stored_documents) == 1
101 |     assert stored_documents[0].date_saved == doc2.date_saved
102 | 
103 | 
104 | def test_store_new_document(tmpdir: pathlib.Path) -> None:
105 |     root = tmpdir / "root"
106 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png")
107 | 
108 |     documents = read_documents(root)
109 |     assert len(documents) == 0
110 | 
111 |     now = datetime.datetime(2020, 2, 20)
112 | 
113 |     new_document = store_new_document(
114 |         root=root,
115 |         path=tmpdir / "My Cluster.png",
116 |         title="My cluster title",
117 |         tags=["tag1", "tag2", "tag3"],
118 |         source_url="https://example.org/cluster.png",
119 |         date_saved=now,
120 |     )
121 | 
122 |     assert not os.path.exists(tmpdir / "My Cluster.png")
123 | 
124 |     assert isinstance(new_document, Document)
125 |     assert new_document.title == "My cluster title"
126 |     assert new_document.date_saved == now
127 |     assert new_document.tags == ["tag1", "tag2", "tag3"]
128 | 
129 |     assert len(new_document.files) == 1
130 |     new_file = new_document.files[0]
131 |     assert isinstance(new_file, File)
132 |     assert new_file.filename == "My Cluster.png"
133 |     assert new_file.path == "files/m/my-cluster.png"
134 |     assert new_file.size == 41151
135 |     assert (
136 |         new_file.checksum
137 |         == "sha256:683cbee0c2dda22b42fd92bda0f31e4b6b49cd8650a7924d72a14a30f11bfbe5"
138 |     )
139 |     assert new_file.source_url == "https://example.org/cluster.png"
140 |     assert new_file.date_saved == now
141 | 
142 |     assert new_file.thumbnail == Thumbnail(
143 |         path="thumbnails/m/my-cluster.png",
144 |         dimensions=Dimensions(400, 260),
145 |         tint_color="#007f7f",
146 |     )
147 |     assert os.path.exists(root / new_file.thumbnail.path)
148 | 
149 |     assert read_documents(root) == [new_document]
150 | 
151 |     # Storing a second document gets us both documents, but with different names
152 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png")
153 |     new_document2 = store_new_document(
154 |         root=root,
155 |         path=tmpdir / "My Cluster.png",
156 |         title="My second cluster title",
157 |         tags=["tag1", "tag2", "tag3", "tag4"],
158 |         source_url="https://example.org/cluster2.png",
159 |         date_saved=now,
160 |     )
161 | 
162 |     assert isinstance(new_document2, Document)
163 |     new_file2 = new_document2.files[0]
164 |     assert new_file2.filename == "My Cluster.png"
165 |     assert new_file2.path != "files/m/my-cluster.png"
166 |     assert new_file2.path.startswith("files/m/my-cluster_")
167 |     assert new_file2.path.endswith(".png")
168 | 
169 |     assert read_documents(root) == [new_document, new_document2]
170 | 
171 |     assert len(os.listdir(root / "files" / "m")) == 2
172 |     assert len(os.listdir(root / "thumbnails" / "m")) == 2
173 | 
174 | 
175 | def test_deleting_document(tmpdir: pathlib.Path, root: pathlib.Path) -> None:
176 |     root = tmpdir / "root"
177 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster.png")
178 | 
179 |     doc1 = store_new_document(
180 |         root=root,
181 |         path=tmpdir / "cluster.png",
182 |         title="A document about to be deleted",
183 |         tags=[],
184 |         source_url="https://example.org/cluster.png",
185 |         date_saved=datetime.datetime.now(),
186 |     )
187 |     doc2 = Document(title="Doc1", date_saved=datetime.datetime(2010, 1, 1))
188 |     doc3 = Document(title="Doc2", date_saved=datetime.datetime(2002, 2, 2))
189 | 
190 |     write_documents(root=root, documents=[doc1, doc2, doc3])
191 | 
192 |     assert read_documents(root) == [doc1, doc2, doc3]
193 | 
194 |     delete_document(root, doc_id=doc1.id)
195 | 
196 |     assert read_documents(root) == [doc2, doc3]
197 | 
198 |     deleted_json_path = root / "deleted" / doc1.id / "document.json"
199 |     assert os.path.exists(deleted_json_path)
200 |     assert json.load(open(deleted_json_path))["id"] == doc1.id
201 |     assert not os.path.exists(root / "files" / "c" / "cluster.png")
202 |     assert os.path.exists(root / "deleted" / doc1.id / "cluster.png")
-------------------------------------------------------------------------------- /src/docstore/templates/index.html: --------------------------------------------------------------------------------
1 | 
2 | {% include "_head.html" %}
3 | 
4 | 
5 | 
10 | 
11 | 
12 | {% set page_size = 100 %} 13 | 14 | {% set page_start = (page - 1) * page_size + 1 %} 15 | {% set page_end = page_start + page_size - 1 %} 16 | 17 | {% if documents|length < page_end %} 18 | {% set page_end = documents|length %} 19 | {% endif %} 20 | 21 | {% set include_tags = True %} 22 | {% with placement="top" %} 23 | {% include "_meta_info.html" %} 24 | {% endwith %} 25 | 26 | 46 | 47 | 97 | 98 | {% for doc in documents[page_start - 1:page_end] %} 99 |
100 | 162 | 163 | 174 | 175 | 211 |
212 | {% endfor %} 213 | 214 | {% if page_end - page_start > 10 %} 215 | {% set include_tags = False %} 216 | {% with placement="bottom" %} 217 | {% include "_meta_info.html" %} 218 | {% endwith %} 219 | {% endif %} 220 |
221 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import pathlib 4 | import shutil 5 | import uuid 6 | 7 | from click.testing import CliRunner 8 | import pytest 9 | 10 | from docstore.cli import main 11 | from docstore.documents import read_documents, store_new_document, write_documents 12 | from docstore.models import Dimensions, Document, File, Thumbnail 13 | from test_models import is_recent 14 | 15 | 16 | class TestAdd: 17 | def test_stores_new_document( 18 | self, tmpdir: pathlib.Path, root: pathlib.Path 19 | ) -> None: 20 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png") 21 | 22 | runner = CliRunner() 23 | result = runner.invoke( 24 | main, 25 | [ 26 | f"--root={root}", 27 | "add", 28 | str(tmpdir / "My Cluster.png"), 29 | "--title", 30 | "My first document", 31 | "--tags", 32 | "tag1, tag2, tag3", 33 | ], 34 | ) 35 | 36 | assert result.exit_code == 0, result.output 37 | 38 | doc_id = result.output.strip() 39 | 40 | documents = read_documents(root) 41 | 42 | assert len(documents) == 1 43 | assert documents[0].id == doc_id 44 | assert documents[0].title == "My first document" 45 | assert documents[0].tags == ["tag1", "tag2", "tag3"] 46 | assert is_recent(documents[0].date_saved) 47 | 48 | assert len(documents[0].files) == 1 49 | f = documents[0].files[0] 50 | assert f.filename == "My Cluster.png" 51 | assert f.path == "files/m/my-cluster.png" 52 | assert f.source_url is None 53 | assert f.date_saved == documents[0].date_saved 54 | 55 | @pytest.mark.parametrize( 56 | "tag_arg, expected_tags", 57 | [ 58 | ("", []), 59 | ("tag with trailing whitespace ", ["tag with trailing whitespace"]), 60 | ( 61 | "multiple,comma,separated,tags", 62 | ["multiple", "comma", "separated", "tags"], 63 | ), 64 | ], 65 | ) 66 | def test_adds_tags_to_document( 67 | self, 68 | tmpdir: pathlib.Path, 69 | root: pathlib.Path, 70 | tag_arg: str, 71 | expected_tags: list[str], 72 | ) -> None: 73 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png") 74 | 75 | runner = CliRunner() 76 | result = runner.invoke( 77 | main, 78 | [ 79 | f"--root={root}", 80 | "add", 81 | str(tmpdir / "My Cluster.png"), 82 | "--title", 83 | "My second document", 84 | "--tags", 85 | tag_arg, 86 | ], 87 | ) 88 | 89 | assert result.exit_code == 0, result.output 90 | 91 | documents = read_documents(root) 92 | assert documents[0].tags == expected_tags 93 | 94 | @pytest.mark.parametrize( 95 | "source_url_arg, expected_source_url", 96 | [ 97 | ("", ""), 98 | ("https://example.org/cluster.png", "https://example.org/cluster.png"), 99 | ], 100 | ) 101 | def test_adds_source_url_to_file( 102 | self, 103 | tmpdir: pathlib.Path, 104 | root: pathlib.Path, 105 | source_url_arg: str, 106 | expected_source_url: str, 107 | ) -> None: 108 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png") 109 | 110 | runner = CliRunner() 111 | result = runner.invoke( 112 | main, 113 | [ 114 | f"--root={root}", 115 | "add", 116 | str(tmpdir / "My Cluster.png"), 117 | "--title", 118 | "My stored document", 119 | "--tags", 120 | "tag1, tag2, tag3", 121 | "--source_url", 122 | source_url_arg, 123 | ], 124 | ) 125 | 126 | assert result.exit_code == 0, result.output 127 | 128 | documents = read_documents(root) 129 | assert documents[0].files[0].source_url == expected_source_url 130 | 131 | 132 | class TestMerge: 133 | 
@pytest.mark.parametrize("doc_count", [1, 2, 3, 4]) 134 | def test_merges_documents_with_identical_metadata( 135 | self, root: pathlib.Path, doc_count: int 136 | ) -> None: 137 | documents = [ 138 | Document(title="My Document", tags=["tag1", "tag2", "tag3"]) 139 | for _ in range(doc_count) 140 | ] 141 | 142 | write_documents(root=root, documents=documents) 143 | 144 | runner = CliRunner() 145 | result = runner.invoke( 146 | main, [f"--root={root}", "merge", "--yes"] + [doc.id for doc in documents] 147 | ) 148 | assert result.exit_code == 0, result.output 149 | 150 | if doc_count > 1: 151 | assert "Using common title: My Document\n" in result.output 152 | assert "Using common tags: tag1, tag2, tag3\n" in result.output 153 | 154 | stored_documents = read_documents(root) 155 | 156 | assert len(stored_documents) == 1 157 | assert stored_documents[0].id == documents[0].id 158 | assert stored_documents[0].title == "My Document" 159 | assert stored_documents[0].tags == ["tag1", "tag2", "tag3"] 160 | 161 | def test_merges_documents_with_inferred_metadata(self, root: pathlib.Path) -> None: 162 | documents = [ 163 | Document(title=f"My Document {i}", tags=[f"tag{i}"]) for i in range(3) 164 | ] 165 | 166 | write_documents(root=root, documents=documents) 167 | 168 | runner = CliRunner() 169 | result = runner.invoke( 170 | main, [f"--root={root}", "merge", "--yes"] + [doc.id for doc in documents] 171 | ) 172 | assert result.exit_code == 0, result.output 173 | 174 | assert "Guessed title: My Document\n" in result.output 175 | assert "Guessed tags: tag0, tag1, tag2\n" in result.output 176 | 177 | stored_documents = read_documents(root) 178 | 179 | assert len(stored_documents) == 1 180 | assert stored_documents[0].id == documents[0].id 181 | assert stored_documents[0].title == "My Document" 182 | assert stored_documents[0].tags == ["tag0", "tag1", "tag2"] 183 | 184 | @pytest.mark.parametrize("doc_count", [1, 2, 3, 4]) 185 | def test_merging_combines_files(self, root: pathlib.Path, doc_count: int) -> None: 186 | shutil.copyfile(src="tests/files/cluster.png", dst=root / "cluster.png") 187 | documents = [ 188 | Document( 189 | title="My Document", 190 | tags=["tag"], 191 | files=[ 192 | File( 193 | filename=f"cluster{i}.png", 194 | path="cluster.png", 195 | size=100, 196 | checksum="sha256:123", 197 | thumbnail=Thumbnail( 198 | path="cluster.png", 199 | dimensions=Dimensions(400, 300), 200 | tint_color="#000000", 201 | ), 202 | ) 203 | ], 204 | ) 205 | for i in range(doc_count) 206 | ] 207 | 208 | write_documents(root=root, documents=documents) 209 | 210 | runner = CliRunner() 211 | result = runner.invoke( 212 | main, [f"--root={root}", "merge", "--yes"] + [doc.id for doc in documents] 213 | ) 214 | assert result.exit_code == 0, result.output 215 | 216 | stored_documents = read_documents(root) 217 | 218 | assert len(stored_documents) == 1 219 | assert len(stored_documents[0].files) == doc_count 220 | 221 | 222 | def test_deleting_document_through_cli( 223 | tmpdir: pathlib.Path, root: pathlib.Path 224 | ) -> None: 225 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster.png") 226 | 227 | doc1 = store_new_document( 228 | root=root, 229 | path=tmpdir / "cluster.png", 230 | title="A document about to be deleted", 231 | tags=[], 232 | source_url="https://example.org/cluster.png", 233 | date_saved=datetime.datetime.now(), 234 | ) 235 | doc2 = Document(title="Doc1", date_saved=datetime.datetime(2010, 1, 1)) 236 | doc3 = Document(title="Doc2", date_saved=datetime.datetime(2002, 2, 2)) 237 | 238 | 
write_documents(root=root, documents=[doc1, doc2, doc3]) 239 | 240 | assert read_documents(root) == [doc1, doc2, doc3] 241 | 242 | runner = CliRunner() 243 | result = runner.invoke(main, [f"--root={root}", "delete", doc1.id, doc2.id]) 244 | assert result.exit_code == 0, result.output 245 | 246 | assert read_documents(root) == [doc3] 247 | 248 | for deleted_doc in [doc1, doc2]: 249 | deleted_json_path = root / "deleted" / deleted_doc.id / "document.json" 250 | assert os.path.exists(deleted_json_path) 251 | 252 | 253 | def test_deleting_in_empty_instance_is_error(root: pathlib.Path) -> None: 254 | runner = CliRunner() 255 | result = runner.invoke(main, [f"--root={root}", "delete", str(uuid.uuid4())]) 256 | 257 | assert result.exit_code == 1, result.output 258 | assert result.output.strip() == f"There is no docstore instance at {root}!" 259 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | import datetime 3 | import re 4 | import pathlib 5 | import shutil 6 | import typing 7 | 8 | import bs4 9 | from flask.testing import FlaskClient 10 | import pytest 11 | 12 | from docstore.documents import store_new_document, write_documents 13 | from docstore.models import Document 14 | from docstore.server import create_app 15 | 16 | 17 | @pytest.fixture 18 | def client(root: pathlib.Path) -> Iterator[FlaskClient]: 19 | app = create_app(root=root, title="My test instance", thumbnail_width=200) 20 | app.config["TESTING"] = True 21 | 22 | with app.test_client() as client: 23 | yield client 24 | 25 | 26 | def test_empty_response(client: FlaskClient) -> None: 27 | resp = client.get("/") 28 | assert resp.status_code == 200 29 | assert b"no documents found!" 
in resp.data 30 | 31 | 32 | def test_shows_documents( 33 | tmpdir: pathlib.Path, root: pathlib.Path, client: FlaskClient 34 | ) -> None: 35 | for _ in range(3): 36 | shutil.copyfile("tests/files/cluster.png", str(tmpdir / "cluster.png")) 37 | store_new_document( 38 | root=root, 39 | path=tmpdir / "cluster.png", 40 | title="My test document", 41 | tags=["tag1", "tag2", "tag3"], 42 | source_url="https://example.org/cluster", 43 | date_saved=datetime.datetime.now(), 44 | ) 45 | 46 | resp = client.get("/") 47 | assert resp.status_code == 200 48 | assert resp.data.count(b"My test document") == 3 49 | assert b"date saved: just now" in resp.data 50 | 51 | # TODO: Detect this thumbnail URL from the page HTML 52 | resp = client.get("/thumbnails/c/cluster.png") 53 | assert resp.data[:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a" # PNG magic number 54 | 55 | resp = client.get("/files/c/cluster.png") 56 | assert resp.data == open("tests/files/cluster.png", "rb").read() 57 | 58 | 59 | def test_filters_documents_by_tag(root: pathlib.Path, client: FlaskClient) -> None: 60 | documents = [Document(title=f"Document {i}", tags=[f"tag{i}"]) for i in range(3)] 61 | write_documents(root=root, documents=documents) 62 | 63 | resp = client.get("/?tag=tag0") 64 | assert resp.status_code == 200 65 | assert b"Document 0" in resp.data 66 | assert b"Document 1" not in resp.data 67 | assert b"Document 2" not in resp.data 68 | 69 | 70 | def test_paginates_document(root: pathlib.Path, client: FlaskClient) -> None: 71 | documents = [Document(title=f"Document {i}") for i in range(200)] 72 | write_documents(root=root, documents=documents) 73 | 74 | resp = client.get("/") 75 | assert resp.status_code == 200 76 | 77 | # More recent documents appear first 78 | assert b"Document 199" in resp.data 79 | assert b"Document 100" in resp.data 80 | assert b"Document 99" not in resp.data 81 | 82 | assert "« prev" in resp.data.decode("utf8") 83 | assert "next »" in resp.data.decode("utf8") 84 | 85 | resp_page_2 = client.get("/?page=2") 86 | assert resp_page_2.status_code == 200 87 | assert b"Document 100" not in resp_page_2.data 88 | assert b"Document 99" in resp_page_2.data 89 | assert b"Document 0" in resp_page_2.data 90 | 91 | 92 | def test_documents_with_lots_of_tags(root: pathlib.Path, client: FlaskClient) -> None: 93 | documents = [Document(title=f"Document {i}", tags=[f"tag{i}"]) for i in range(200)] 94 | 95 | documents.extend( 96 | [ 97 | Document(title="Another document", tags=["nest0:tag1"]), 98 | Document(title="Another document", tags=["nest0:tag1:tagA"]), 99 | Document(title="Another document", tags=["nest0:tag1:tagB"]), 100 | Document(title="Another document", tags=["nest1:tag1"]), 101 | ] 102 | ) 103 | 104 | write_documents(root=root, documents=documents) 105 | 106 | resp = client.get("/") 107 | assert resp.status_code == 200 108 | 109 | assert b'
' in resp.data 110 | 111 | 112 | def tidy(html_str: typing.Any) -> str: 113 | assert isinstance(html_str, str) 114 | return re.sub(r"\s+", " ", html_str.strip()) 115 | 116 | 117 | class TestCase(typing.TypedDict): 118 | tags: list[str] 119 | expected_title: str 120 | urls: list[str] 121 | 122 | 123 | @pytest.mark.parametrize( 124 | "test_case", 125 | [ 126 | { 127 | "tags": ["by:John Smith"], 128 | "expected_title": "{title}, by John Smith ({doc_id})", 129 | "urls": ["/", "/?tag=by%3AJohn+Smith"], 130 | }, 131 | { 132 | "tags": ["by:John Smith", "by:Jane Doe"], 133 | "expected_title": "{title}, by John Smith, Jane Doe ({doc_id})", 134 | "urls": [ 135 | "/", 136 | "/?tag=by%3AJohn+Smith", 137 | "/?tag=by%3AJane+Doe", 138 | "/?tag=by%3AJane+Doe&tag=by%3AJohn+Smith", 139 | ], 140 | }, 141 | { 142 | "tags": ["from:ACME Corp"], 143 | "expected_title": "{title}, from ACME Corp ({doc_id})", 144 | "urls": ["/", "/?tag=from%3AACME+Corp"], 145 | }, 146 | { 147 | "tags": ["from:ACME Corp", "from:Widget Inc"], 148 | "expected_title": "{title}, from ACME Corp, Widget Inc ({doc_id})", 149 | "urls": [ 150 | "/", 151 | "/?tag=from%3AACME+Corp", 152 | "/?tag=from%3AWidget+Inc", 153 | "/?tag=from%3AACME+Corp&tag=from%3AWidget+Inc", 154 | ], 155 | }, 156 | { 157 | "tags": ["by:John Smith", "from:ACME Corp"], 158 | "expected_title": "{title}, by John Smith, from ACME Corp ({doc_id})", 159 | "urls": [ 160 | "/", 161 | "/?tag=by%3AJohn+Smith", 162 | "/?tag=from%3AACME+Corp", 163 | "/?tag=by%3AJohn+Smith&tag=from%3AACME+Corp", 164 | ], 165 | }, 166 | ], 167 | ) 168 | def test_shows_attribution_tags( 169 | root: pathlib.Path, client: FlaskClient, test_case: TestCase 170 | ) -> None: 171 | doc_tags = test_case["tags"] + ["tag1", "tag2"] 172 | 173 | doc = Document(title="My document", tags=doc_tags) 174 | write_documents(root=root, documents=[doc]) 175 | 176 | for url in test_case["urls"]: 177 | print(url) 178 | resp = client.get(url) 179 | assert resp.status_code == 200 180 | 181 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 182 | 183 | h2_title = soup.find("h2", attrs={"class": "title"}) 184 | assert h2_title is not None 185 | assert tidy(h2_title.text) == test_case["expected_title"].format( 186 | title=doc.title, doc_id=doc.id 187 | ) 188 | 189 | tags_list = soup.find("div", attrs={"class": "tags"}) 190 | assert tags_list is not None 191 | assert tidy(tags_list.text) == "tagged with: tag1 tag2" 192 | 193 | 194 | def test_links_attribution_tags(root: pathlib.Path, client: FlaskClient) -> None: 195 | doc = Document(title="My document", tags=["by:John Smith"]) 196 | write_documents(root=root, documents=[doc]) 197 | 198 | # If the tag is not selected, the attribution tag in the title is a link 199 | # that filters to the selected tag. 200 | resp = client.get("/") 201 | assert resp.status_code == 200 202 | 203 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 204 | 205 | h2_title = soup.find("h2", attrs={"class": "title"}) 206 | assert isinstance(h2_title, bs4.Tag) 207 | assert h2_title.find("a", attrs={"href": "?tag=by%3AJohn+Smith"}) is not None 208 | 209 | # If the tag is selected, the attribution tag in the title is regular text, 210 | # not a link. 
211 | resp = client.get("/?tag=by%3aJohn+Smith") 212 | assert resp.status_code == 200 213 | 214 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 215 | 216 | h2_title = soup.find("h2", attrs={"class": "title"}) 217 | assert h2_title is not None 218 | assert h2_title.find("a") is None 219 | 220 | 221 | def test_sets_thumbnail_width(client: FlaskClient) -> None: 222 | """ 223 | If the user sets a custom thumbnail width, the appropriate CSS style is 224 | added to the rendered page. 225 | """ 226 | client.application.config["THUMBNAIL_WIDTH"] = 100 227 | 228 | resp = client.get("/") 229 | 230 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 231 | 232 | style_tag = soup.find("style") 233 | assert isinstance(style_tag, bs4.Tag) 234 | assert tidy(style_tag.string) == ".thumbnail { width: 100px; }" 235 | 236 | 237 | def test_tags_are_sorted_alphabetically( 238 | root: pathlib.Path, client: FlaskClient 239 | ) -> None: 240 | doc = Document(title="My document", tags=["bulgaria", "austria", "croatia"]) 241 | write_documents(root=root, documents=[doc]) 242 | 243 | resp = client.get("/") 244 | 245 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 246 | 247 | tags_div = soup.find("div", attrs={"class": "tags"}) 248 | assert tags_div is not None 249 | assert tidy(tags_div.text) == "tagged with: austria bulgaria croatia" 250 | 251 | 252 | def test_gets_curly_quotes(root: pathlib.Path, client: FlaskClient) -> None: 253 | app = create_app(root=root, title="Isn't this a good title?", thumbnail_width=200) 254 | app.config["TESTING"] = True 255 | 256 | with app.test_client() as client: 257 | resp = client.get("/") 258 | 259 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 260 | 261 | title = soup.find("title") 262 | assert title is not None 263 | assert title.text.strip() == "docstore/Isn’t this a good title?" 264 | 265 | aside_inner = soup.find("div", attrs={"id": "aside_inner"}) 266 | assert aside_inner is not None 267 | assert aside_inner.text.strip() == "docstore/Isn’t this a good title?" 268 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # docstore 2 | 3 | docstore is a tool I wrote to help me manage my scanned documents and reference files. 4 | It uses [keyword tagging](https://en.wikipedia.org/wiki/Tag_(metadata)) to categorise files, and creates thumbnails to help identify files. 5 | 6 | It has two parts: 7 | 8 | * A CLI tool that lets me store new documents 9 | * A web app that lets me browse the documents I've already stored 10 | 11 | Here's an example of how I'd use the CLI tool to save a file: 12 | 13 | ``` 14 | docstore add ~/Desktop/'Contract of Employment.pdf' \ 15 | --source_url='https://email.example.com/message/1234' \ 16 | --title='2020-10: Contract of employment for ACME' \ 17 | --tags='employer:acme-corp, contract:employment' 18 | ``` 19 | 20 | Here's a screenshot of the web app: 21 | 22 | ![A screenshot of docstore](docstore.png) 23 | 24 | The web app allows me to filter by one or more tags, or to sort by title/date, to help me find the document I'm looking for. 25 | 26 | 27 | 28 | ## Usage 29 | 30 | Clone this repo and install the package locally: 31 | 32 | ```console 33 | $ git clone https://github.com/alexwlchan/docstore.git 34 | $ cd docstore 35 | $ pip3 install -e . 36 | ``` 37 | 38 | You can add files using `docstore add` and run the web app with `docstore serve`.
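For example, assuming a docstore collection in `~/Documents/docstore` and a file on the Desktop (both paths are just illustrations), the commands look something like:

```console
$ docstore --root ~/Documents/docstore add ~/Desktop/receipt.pdf
$ docstore --root ~/Documents/docstore serve
```

`docstore serve` binds to 127.0.0.1:3391 by default; run `docstore --help` to see the full list of commands and options.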
39 | 40 | Note that docstore is only intended for me to use -- it solves a specific problem that I have, and is designed around my exact needs. 41 | 42 | You're welcome to use it, but I'm unlikely to provide support or add features for other people. 43 | 44 | 45 | 46 | ## How it works: design and implementation notes 47 | 48 | I learnt a lot of stuff writing docstore, and the source code is public so other people can read it and see how it works. 49 | 50 | Everything is written in Python, with [Click][click] and [Flask][flask] being the core of the CLI and the web app. 51 | 52 | Because reading source code is a pretty inefficient way to learn, I have some documents that explain the key ideas: 53 | 54 | - [Storing the files](docs/storing-the-files.md) – where files are stored, what name they're stored under, ensuring I don't save two files with the same name 55 | - [Storing the metadata](docs/storing-the-metadata.md) – what metadata I store, how I model it, why I save it as JSON, how I serialise Python models to JSON and back 56 | - [Previewing the files](docs/previewing-the-files.md) – how I create file previews with Quick Look and FFmpeg, how I extract a tint colour from thumbnails for the web app 57 | 58 | [click]: https://palletsprojects.com/p/click/ 59 | [flask]: https://palletsprojects.com/p/flask/ 60 | 61 | 62 | 63 | ## Why I wrote it 64 | 65 | * **I prefer keyword tagging to files-and-folders as a way to organise files.** 66 | I'm a particular fan of how [Pinboard](https://pinboard.in/) does tagging, but I haven't found an app that stores files with Pinboard-like tagging. 67 | 68 | * **I want my documents stored locally.** 69 | My scanned paperwork in particular contains a lot of private information -- bank statements, medical letters, rental contracts, and more. 70 | I don't want to upload them to a cloud service and risk them being leaked. 71 | 72 | * **I'm very picky about how this sort of thing works.** 73 | I've tried a bunch of other apps and services for organising documents, but none of them were quite right. 74 | I found it easier to write my own tool than try to use something written by somebody else. 75 | 76 | It helps that my needs are quite simple -- the whole app is about a thousand lines of code, which is pretty manageable. 77 | 78 | 79 | 80 | ## Design principles 81 | 82 | * **My files and metadata should be portable.** 83 | All the data for a collection of files stored with docstore is kept in a single directory. 84 | That directory can be copied or synced to another machine, and I can start working with my files immediately -- no config or setup required. 85 | 86 | This is important for day-to-day utility, and for disaster recovery. 87 | If something happens to my main computer, I want to be able to get to my documents again (including the keyword tags for organisation) as quickly as possible. 88 | 89 | * **Use JSON as a database.** 90 | All the metadata about my documents is kept in a single JSON file. 91 | JSON is a simple, popular format with several advantages for me: 92 | 93 | - Lots of tools can read it. 94 | Pretty much every programming language has a JSON parser, so I'm guaranteed I'll be able to parse the metadata file for years to come. 95 | - I can edit JSON in a text editor. 96 | This saves me building editing features into docstore -- if I've made a typo or want to change something, I can edit the metadata JSON directly. 97 | - It maps directly to Python data structures (Python is what I use to write docstore).
98 | The serialisation and deserialisation isn't very complicated. 99 | 100 | If you were building an app that had to store a lot of documents or support multiple users, JSON would be a poor choice -- you'd want to use a proper database instead. 101 | My biggest docstore instance only has a few thousand files, and the cost of JSON parsing is negligible. 102 | 103 | * **A document can have multiple files.** 104 | 105 | This wasn't part of my original design, but I added it when I rewrote docstore in autumn 2020. 106 | This means that I can group files so they show up together. 107 | Examples of when I use this: 108 | 109 | - I have two scans of the same piece of paper 110 | - I have a scanned copy of a letter, and an electronic copy I was sent separately 111 | - I have multiple versions of a contract at different stages of signing 112 | 113 | Here's how a document is described in the JSON: 114 | 115 | ```json 116 | { 117 | "date_saved": "2020-10-03T16:30:08.471833", 118 | "files": [ 119 | { 120 | "checksum": "sha256:fe79444e61b9c009a22497a9878020da98f557476b7f993432bc94fa700e888a", 121 | "date_saved": "2020-10-03T16:30:08.471833", 122 | "filename": "Eldritchbot.pdf", 123 | "id": "331e2b59-fe82-48a4-8d59-f71b0f2ad7b3", 124 | "path": "files/e/eldritchbot.pdf", 125 | "size": 2215466, 126 | "source_url": "https://www.patreon.com/posts/visit-from-40137342", 127 | "thumbnail": { 128 | "path": "thumbnails/E/Eldritchbot.pdf.png" 129 | } 130 | }, 131 | { 132 | "checksum": "sha256:ebee96fbb3725e3c708388e6b3f446b933967849980aabb61c51a146942dc7f4", 133 | "date_saved": "2020-10-03T16:32:08.471833", 134 | "filename": "Eldritchbot.epub", 135 | "id": "00faef01-d3b4-4ff3-a226-770f652849e6", 136 | "path": "files/e/eldritchbot.epub", 137 | "size": 2215466, 138 | "source_url": "https://www.patreon.com/posts/visit-from-40137342", 139 | "thumbnail": { 140 | "path": "thumbnails/E/Eldritchbot.epub.png" 141 | } 142 | } 143 | ], 144 | "id": "9dd532c7-edf9-428a-9637-df9bb6030378", 145 | "tags": [ 146 | "smolrobots", 147 | "sci-fi", 148 | "by:Thomas Heasman-Hunt" 149 | ], 150 | "title": "A Visit from Eldritchbot" 151 | } 152 | ``` 153 | 154 | * **Stay close to the original filename.** 155 | 156 | As much as possible, I want docstore to use the original filename. 157 | This makes the underlying storage human-readable, and it means that if I lost the metadata, the files would still be somewhat useful. 158 | 159 | Here's what the underlying storage looks like: 160 | 161 | ``` 162 | docstore/ 163 | └── files/ 164 | ├── a/ 165 | │ ├── admin-renewal-cover-letter.html 166 | │ ├── advice-for-patients-and-visitors.pdf 167 | │ └── application-paperwork.pdf 168 | ├── b/ 169 | ├── c/ 170 | └── ... 171 | ``` 172 | 173 | docstore records the original filename in the metadata, and then does some normalisation before copying a file to its storage. 174 | The normalisation does a couple of things: 175 | 176 | * Remove any special characters and spaces. 177 | e.g. `alex.chan › payslip › january 2015–2016.pdf` becomes `alex-chan-payslip-january-2015-2016.pdf` 178 | 179 | * Lowercase the filename. 180 | e.g. `P60Certificate.pdf` becomes `p60certificate.pdf` 181 | 182 | * De-duplicate documents with the same name by adding some random hex to the end of the name. 183 | e.g. if I store two documents called `statement.pdf`, one will be stored as `statement.pdf` and the other as `statement_f97b.pdf`. 
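Sketched in Python, the idea looks something like this -- a simplified illustration of the rules above, not docstore's actual code, and the `normalised_filename` helper here is hypothetical:

```python
import os
import re
import secrets


def normalised_filename(filename: str, existing: set[str]) -> str:
    """A rough sketch of the normalisation rules described above."""
    name, ext = os.path.splitext(filename)

    # Replace runs of special characters and spaces with hyphens,
    # then lowercase everything.
    name = re.sub(r"[^a-zA-Z0-9]+", "-", name).strip("-").lower()
    result = name + ext.lower()

    # De-duplicate clashing names by appending some random hex.
    while result in existing:
        result = f"{name}_{secrets.token_hex(2)}{ext.lower()}"

    return result
```

e.g. `normalised_filename("P60Certificate.pdf", existing=set())` returns `p60certificate.pdf`, matching the example above.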
184 | 185 | This normalisation means I don't have to worry about whether my filesystem can cope with weird characters, or if I'm storing two different files with the same name. 186 | 187 | The thumbnails for each file use a similar filename, so it's easy to find the thumbnail that corresponds to a file (and vice versa). 188 | For example, if a document is stored as `p60-certificate.pdf`, the thumbnail is stored as `p60-certificate.pdf.png`. 189 | 190 | These normalised filenames aren't exposed through the web app – if I'm downloading a file, docstore sets a [`Content-Disposition` header](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition) that tells my browser to download it with the original filename. 191 | 192 | 193 | ## Technology 194 | 195 | * docstore is written in **Python**. 196 | The web app uses [**Flask**](https://pypi.org/project/Flask/), and the CLI uses [**Click**](https://pypi.org/project/click/). 197 | * I use [**attrs**](https://pypi.org/project/attrs/) for the internal models, and [**cattrs**](https://pypi.org/project/cattrs/) to serialise my internal models to JSON. 198 | * I use [macOS **Quick Look**](https://en.wikipedia.org/wiki/Quick_Look) and [**ffmpeg**](https://ffmpeg.org) to create thumbnails, and a [*k*-means clustering algorithm](https://alexwlchan.net/2019/08/finding-tint-colours-with-k-means/) to get the tint colour to go with the thumbnails. 199 | * The filename normalisation is based on the blog post ["ASCIIfying" by Dr. Drang](http://www.leancrew.com/all-this/2014/10/asciifying/). 200 | * The code for displaying tags in a list is based on [templates from Dreamwidth](https://github.com/dreamwidth/dw-free/blob/6ec1e146d3c464e506a77913f0abf0d51a944f95/styles/core2.s2#L4126-L4220). 201 | * The code for displaying a tag cloud is based on [jquery.tagcloud.js by addywaddy](https://github.com/addywaddy/jquery.tagcloud.js/). 202 | 203 | 204 | ## License 205 | 206 | MIT. 207 | -------------------------------------------------------------------------------- /src/docstore/cli.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | import datetime 3 | import functools 4 | import json 5 | import os 6 | import pathlib 7 | import sys 8 | import typing 9 | 10 | import click 11 | 12 | 13 | @click.group() 14 | @click.option( 15 | "--root", 16 | default=".", 17 | help="The root of the docstore database.", 18 | type=click.Path(), 19 | show_default=True, 20 | ) 21 | @click.pass_context 22 | def main(ctx, root): # type: ignore 23 | ctx.obj = pathlib.Path(root) 24 | 25 | 26 | def _require_existing_instance(inner): # type: ignore 27 | """ 28 | When you call ``docstore add``, most of the time you want to be adding 29 | documents to an existing instance, not creating a new instance. 30 | 31 | It's easy to get the directory wrong, so this decorator checks that you 32 | really meant to create a new instance, rather than add to an existing one. 33 | """ 34 | 35 | @functools.wraps(inner) 36 | def wrapper(*args, **kwargs): # type: ignore 37 | from docstore.documents import db_path 38 | 39 | root = click.get_current_context().obj 40 | 41 | if ( 42 | root == pathlib.Path(".")  # ctx.obj is a Path; comparing against the string "." is always False
43 | and not os.path.exists(db_path(pathlib.Path("."))) 44 | and not any(ag == "--root" or ag.startswith("--root=") for ag in sys.argv) 45 | ): # pragma: no cover 46 | click.echo( 47 | f"There is no existing docstore instance at {os.path.abspath('.')}", 48 | err=True, 49 | ) 50 | click.confirm("Do you want to create a new instance?", abort=True, err=True) 51 | 52 | return inner(*args, **kwargs) 53 | 54 | return wrapper 55 | 56 | 57 | @main.command(help="Run a docstore API server") 58 | @click.option( 59 | "--host", default="127.0.0.1", help="The interface to bind to.", show_default=True 60 | ) 61 | @click.option("--port", default=3391, help="The port to bind to.", show_default=True) 62 | @click.option("--title", default="", help="The title of the app.") 63 | @click.option( 64 | "--thumbnail_width", default=200, help="Thumbnail width (px).", show_default=True 65 | ) 66 | @click.option("--debug", default=False, is_flag=True, help="Run in debug mode.") 67 | @click.option("--profile", default=False, is_flag=True, help="Run a profiler.") 68 | @click.pass_obj 69 | def serve( 70 | root: pathlib.Path, 71 | host: str, 72 | port: int, 73 | debug: bool, 74 | profile: bool, 75 | title: str, 76 | thumbnail_width: int, 77 | ) -> None: # pragma: no cover 78 | from docstore.server import create_app, run_profiler, run_server 79 | 80 | app = create_app(root=root, title=title, thumbnail_width=thumbnail_width) 81 | 82 | if profile: 83 | run_profiler(app, host=host, port=port) 84 | else: 85 | run_server(app, host=host, port=port, debug=debug) 86 | 87 | 88 | def _add_document( 89 | root: pathlib.Path, 90 | path: pathlib.Path, 91 | title: str | None, 92 | tags: str | None, 93 | source_url: str | None, 94 | ) -> None: 95 | from docstore.documents import store_new_document 96 | 97 | document = store_new_document( 98 | root=root, 99 | path=path, 100 | title=title or "", 101 | tags=[t.strip() for t in (tags or "").split(",") if t.strip()], 102 | source_url=source_url, 103 | date_saved=datetime.datetime.now(), 104 | ) 105 | 106 | print(document.id) 107 | 108 | 109 | @main.command(help="Store a file in docstore") 110 | @click.argument("path", nargs=1, type=click.Path(), required=True) 111 | @click.option( 112 | "--title", 113 | help="The title of the file.", 114 | required=True, 115 | prompt="What is the title of the file?", 116 | ) 117 | @click.option( 118 | "--tags", 119 | help="The tags to apply to the file.", 120 | required=True, 121 | prompt="How should the file be tagged?", 122 | ) 123 | @click.option("--source_url", help="Where was this file downloaded from?") 124 | @click.pass_obj 125 | @_require_existing_instance # type: ignore 126 | def add(root, path, title, tags, source_url): 127 | return _add_document( 128 | root=root, path=path, title=title, tags=tags, source_url=source_url 129 | ) 130 | 131 | 132 | @main.command(help="Store a file on the web in docstore") 133 | @click.option( 134 | "--url", help="URL of the file to store.", required=True 135 | ) 136 | @click.option("--title", help="The title of the file.") 137 | @click.option("--tags", help="The tags to apply to the file.") 138 | @click.option("--source_url", help="Where was this file downloaded from?") 139 | @click.pass_obj 140 | @_require_existing_instance # type: ignore 141 | def add_from_url( 142 | root: pathlib.Path, 143 | url: str, 144 | title: str | None, 145 | tags: str | None, 146 | source_url: str | None, 147 | ) -> None: # pragma: no cover 148 | from docstore.downloads import download_file 149 | 150 | path = 
download_file(url) 151 | 152 | return _add_document( 153 | root=root, path=path, title=title, tags=tags, source_url=source_url 154 | ) 155 | 156 | 157 | @main.command(help="Migrate a V1 docstore") 158 | @click.option( 159 | "--v1_path", 160 | help="Path to the root of the V1 instance.", 161 | type=click.Path(path_type=pathlib.Path),  # pass a pathlib.Path, not a str, so the "/" joins below work 162 | required=True, 163 | ) 164 | @click.pass_obj 165 | def migrate(root: pathlib.Path, v1_path: pathlib.Path) -> None: # pragma: no cover 166 | documents = json.load(open(os.path.join(v1_path, "documents.json"))) 167 | 168 | for _, doc in documents.items(): 169 | stored_file_path = v1_path / "files" / doc["file_identifier"] 170 | 171 | try: 172 | filename_path = v1_path / "files" / doc["filename"] 173 | except KeyError: 174 | filename_path = stored_file_path 175 | 176 | if stored_file_path.exists(): 177 | os.rename(stored_file_path, filename_path) 178 | 179 | from docstore.documents import store_new_document 180 | 181 | store_new_document( 182 | root=root, 183 | path=filename_path, 184 | title=doc.get("title", ""), 185 | tags=doc.get("tags", []), 186 | source_url=doc.get("user_data", {}).get("source_url", ""), 187 | date_saved=datetime.datetime.fromisoformat(doc["date_created"]), 188 | ) 189 | print(doc.get("filename", os.path.basename(doc["file_identifier"]))) 190 | 191 | 192 | @main.command(help="Delete one or more documents") 193 | @click.argument("doc_ids", nargs=-1) 194 | @click.pass_obj 195 | def delete(root: pathlib.Path, doc_ids: list[str]) -> None: 196 | from docstore.documents import db_path, delete_document 197 | 198 | if not os.path.exists(db_path(root)): 199 | sys.exit(f"There is no docstore instance at {root}!") 200 | 201 | for d_id in doc_ids: 202 | delete_document(root=root, doc_id=d_id) 203 | print(d_id) 204 | 205 | 206 | @main.command(help="Verify your stored files") 207 | @click.pass_obj 208 | def verify(root: pathlib.Path) -> None: 209 | import collections 210 | from docstore.documents import read_documents, sha256 211 | import tqdm 212 | 213 | errors = collections.defaultdict(list) 214 | 215 | for doc in tqdm.tqdm(list(read_documents(root))): 216 | for f in doc.files: 217 | f_path = root / f.path 218 | if f.size != os.stat(f_path).st_size: 219 | errors[f.id].append( 220 | f"Size mismatch\n actual = {os.stat(f_path).st_size}\n expected = {f.size}" 221 | ) 222 | 223 | if f.checksum != sha256(f_path): 224 | errors[f.id].append( 225 | f"Checksum mismatch\n actual = {sha256(f_path)}\n expected = {f.checksum}" 226 | ) 227 | 228 | from pprint import pprint 229 | 230 | pprint(errors) 231 | 232 | 233 | @main.command(help="Merge the files on two or more documents") 234 | @click.argument("doc_ids", nargs=-1) 235 | @click.option("--yes", is_flag=True, help="Skip confirmation prompts.") 236 | @click.pass_obj 237 | def merge(root: pathlib.Path, doc_ids: list[str], yes: bool) -> None: 238 | if len(doc_ids) < 2:  # nothing to merge 239 | return 240 | 241 | from docstore.documents import read_documents 242 | 243 | documents = {d.id: d for d in read_documents(root)} 244 | 245 | documents_to_merge = [documents[d_id] for d_id in doc_ids] 246 | 247 | for doc in documents_to_merge: 248 | click.echo( 249 | f'{doc.id.split("-")[0]} {click.style(doc.title or "", fg="yellow")}' 250 | ) 251 | 252 | if not yes: # pragma: no cover 253 | click.confirm(f"Merge these {len(doc_ids)} documents?", abort=True) 254 | 255 | # What should the title of the merged document be?
256 | from docstore.merging import get_title_candidates 257 | 258 | title_candidates = get_title_candidates(documents_to_merge) 259 | 260 | if len(title_candidates) == 1: 261 | click.echo(f"Using common title: {click.style(title_candidates[0], fg='blue')}") 262 | new_title = title_candidates[0] 263 | else: 264 | print("") 265 | click.echo(f'Guessed title: {click.style(title_candidates[0], fg="blue")}') 266 | if yes or click.confirm("Use title?"): 267 | new_title = title_candidates[0] 268 | else: # pragma: no cover 269 | new_title = typing.cast( 270 | str, click.edit("\n".join(title_candidates)) 271 | ).strip() 272 | 273 | # What should the tags on the merged document be? 274 | from docstore.merging import get_union_of_tags 275 | 276 | all_tags = get_union_of_tags(documents_to_merge) 277 | 278 | print("") 279 | 280 | if all(doc.tags == all_tags for doc in documents_to_merge): 281 | click.echo(f"Using common tags: {click.style(', '.join(all_tags), fg='blue')}") 282 | new_tags = all_tags 283 | else: 284 | click.echo(f"Guessed tags: {click.style(', '.join(all_tags), fg='blue')}") 285 | if yes or click.confirm("Use tags?"): 286 | new_tags = all_tags 287 | else: # pragma: no cover 288 | new_tags = ( 289 | typing.cast(str, click.edit("\n".join(all_tags))).strip().splitlines() 290 | ) 291 | 292 | from docstore.documents import pairwise_merge_documents 293 | 294 | doc1 = documents[doc_ids[0]] 295 | for doc2_id in doc_ids[1:]: 296 | doc2 = documents[doc2_id] 297 | doc1 = pairwise_merge_documents( 298 | root=root, doc1=doc1, doc2=doc2, new_title=new_title, new_tags=new_tags 299 | ) 300 | 301 | 302 | def find_similar_pairs( 303 | tags: Iterable[str], *, required_similarity: int = 80 304 | ) -> Iterable[tuple[str, str]]: 305 | """ 306 | Find pairs of similar-looking tags in the collection ``tags``. 307 | 308 | Increase ``required_similarity`` for stricter matching (fewer results). 309 | """ 310 | import itertools 311 | 312 | from rapidfuzz import fuzz 313 | 314 | for t1, t2 in itertools.combinations(sorted(tags), 2): 315 | # Skip tags that share a namespace, e.g. utilities:gas, utilities:electricity 316 | if os.path.commonprefix([t1, t2]).endswith(":"): 317 | continue 318 | 319 | # Skip a tag and its own sub-tags, e.g. utilities, utilities:gas 320 | if t1.startswith(f"{t2}:") or t2.startswith(f"{t1}:"): 321 | continue 322 | 323 | if fuzz.ratio(t1, t2) > required_similarity: 324 | yield (t1, t2) 325 | 326 | 327 | @main.command(help="Show tags that might be similar") 328 | @click.pass_obj 329 | def show_similar_tags(root: pathlib.Path) -> None: 330 | import collections 331 | from docstore.documents import read_documents 332 | 333 | documents = read_documents(root) 334 | tags: dict[str, int] = collections.Counter() 335 | 336 | for doc in documents: 337 | for t in doc.tags: 338 | tags[t] += 1 339 | 340 | for t1, t2 in find_similar_pairs(set(tags)): 341 | print("%3d %s" % (tags[t1], t1)) 342 | print("%3d %s" % (tags[t2], t2)) 343 | print("") 344 | --------------------------------------------------------------------------------