├── src
│   └── docstore
│       ├── __init__.py
│       ├── static
│       │   ├── stacks.ico
│       │   ├── specktre.png
│       │   ├── natural_paper.png
│       │   ├── generic_document.png
│       │   ├── stacks.svg
│       │   └── style.css
│       ├── templates
│       │   ├── _title_attribution.html
│       │   ├── _head.html
│       │   ├── _tag_list.html
│       │   ├── _meta_info.html
│       │   └── index.html
│       ├── git.py
│       ├── merging.py
│       ├── downloads.py
│       ├── tag_cloud.py
│       ├── file_normalisation.py
│       ├── text_utils.py
│       ├── tint_colors.py
│       ├── models.py
│       ├── thumbnails.py
│       ├── tag_list.py
│       ├── server.py
│       ├── documents.py
│       └── cli.py
├── .dockerignore
├── .gitignore
├── tests
│   ├── stubs
│   │   ├── smartypants.pyi
│   │   └── wcag_contrast_ratio.pyi
│   ├── files
│   │   ├── cluster.png
│   │   ├── snakes.pdf
│   │   ├── Newtons_cradle.gif
│   │   ├── cluster_segment.png
│   │   ├── Rotating_earth_(large).gif
│   │   ├── Rotating_earth_(large)_singleframe.gif
│   │   └── credits.txt
│   ├── test_tag_list.py
│   ├── conftest.py
│   ├── test_downloads.py
│   ├── test_tint_colors.py
│   ├── test_models.py
│   ├── test_thumbnails.py
│   ├── test_merging.py
│   ├── test_text_utils.py
│   ├── test_file_normalisation.py
│   ├── test_documents.py
│   ├── test_cli.py
│   └── test_server.py
├── docstore.png
├── migrations
│   ├── exceptions.py
│   ├── from_2-0-0_to_2-1-0.py
│   └── from_2-1-0_to_2-2-0.py
├── docs
│   ├── quick_look.png
│   ├── thumbnails.png
│   ├── tint_colors.png
│   ├── previewing-the-files.md
│   ├── storing-the-metadata.md
│   └── storing-the-files.md
├── .gitattributes
├── dev_requirements.in
├── requirements.in
├── .github
│   ├── dependabot.yml
│   ├── workflows
│   │   └── test.yml
│   └── install-github-bin
├── pyproject.toml
├── requirements.txt
├── LICENSE
├── dev_requirements.txt
└── README.md

/src/docstore/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | **/*.pyc
2 | .hypothesis
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .tox
2 | *.egg-info
3 | *.pyc
4 | .coverage
5 | 
--------------------------------------------------------------------------------
/tests/stubs/smartypants.pyi:
--------------------------------------------------------------------------------
1 | def smartypants(s: str) -> str: ...
2 | -------------------------------------------------------------------------------- /docstore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docstore.png -------------------------------------------------------------------------------- /migrations/exceptions.py: -------------------------------------------------------------------------------- 1 | class IncorrectSchemaError(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /docs/quick_look.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docs/quick_look.png -------------------------------------------------------------------------------- /docs/thumbnails.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docs/thumbnails.png -------------------------------------------------------------------------------- /docs/tint_colors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/docs/tint_colors.png -------------------------------------------------------------------------------- /tests/files/cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/cluster.png -------------------------------------------------------------------------------- /tests/files/snakes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/snakes.pdf -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | requirements.txt linguist-generated=true 2 | dev_requirements.txt linguist-generated=true 3 | -------------------------------------------------------------------------------- /src/docstore/static/stacks.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/stacks.ico -------------------------------------------------------------------------------- /tests/files/Newtons_cradle.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/Newtons_cradle.gif -------------------------------------------------------------------------------- /tests/files/cluster_segment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/cluster_segment.png -------------------------------------------------------------------------------- /src/docstore/static/specktre.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/specktre.png -------------------------------------------------------------------------------- /src/docstore/static/natural_paper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/natural_paper.png 
-------------------------------------------------------------------------------- /tests/files/Rotating_earth_(large).gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/Rotating_earth_(large).gif -------------------------------------------------------------------------------- /src/docstore/static/generic_document.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/src/docstore/static/generic_document.png -------------------------------------------------------------------------------- /tests/files/Rotating_earth_(large)_singleframe.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexwlchan/docstore/main/tests/files/Rotating_earth_(large)_singleframe.gif -------------------------------------------------------------------------------- /tests/test_tag_list.py: -------------------------------------------------------------------------------- 1 | from docstore.tag_list import render_tag_list 2 | 3 | 4 | def test_empty_render_tag_list() -> None: 5 | assert render_tag_list({}) == [] 6 | -------------------------------------------------------------------------------- /tests/stubs/wcag_contrast_ratio.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | Color: typing.TypeAlias = tuple[float, float, float] 4 | 5 | def rgb(color1: Color, color2: Color) -> float: ... 6 | -------------------------------------------------------------------------------- /dev_requirements.in: -------------------------------------------------------------------------------- 1 | -e file:. 
2 | -r requirements.txt 3 | 4 | bs4 5 | pytest 6 | pytest-cov 7 | coverage 8 | mypy 9 | ruff 10 | types-beautifulsoup4 11 | types-tqdm 12 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | attrs>=20.2.0 2 | cattrs>=1.1.1 3 | click>=7.1.2 4 | hyperlink>=21.0.0 5 | Flask>=1.1.2 6 | Pillow 7 | rapidfuzz 8 | smartypants>=2.0.1 9 | Unidecode>=1.1.1 10 | wcag_contrast_ratio>=0.9 11 | -------------------------------------------------------------------------------- /tests/files/credits.txt: -------------------------------------------------------------------------------- 1 | Newtons_cradle.gif 2 | https://en.wikipedia.org/wiki/File:Newtons_cradle_animation_book_2.gif 3 | 4 | Rotating_earth_(large).gif 5 | https://en.wikipedia.org/wiki/File:Rotating_earth_(large).gif 6 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def root(tmpdir: pathlib.Path) -> pathlib.Path: 9 | os.makedirs(str(tmpdir / "root")) 10 | return pathlib.Path(str(tmpdir / "root")) 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | time: "09:00" 8 | - package-ecosystem: "pip" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | time: "09:00" 13 | -------------------------------------------------------------------------------- /src/docstore/templates/_title_attribution.html: -------------------------------------------------------------------------------- 1 | {%- if doc|tags_with_prefix(prefix + ":") -%} 2 | , {{ prefix }} 3 | {% for t in doc|tags_with_prefix(prefix + ":") -%} 4 | {%- if t not in request_tags %}{% endif -%} 5 | {{ t | replace(prefix + ":", "") }} 6 | {%- if t not in request_tags -%}{% endif %} 7 | {%- if not loop.last -%}, {% endif -%} 8 | {%- endfor -%} 9 | {% endif %} 10 | -------------------------------------------------------------------------------- /src/docstore/git.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import subprocess 4 | 5 | 6 | @functools.lru_cache() 7 | def current_commit() -> str: 8 | """ 9 | Returns the commit of the current docstore version. 
10 | """ 11 | return ( 12 | subprocess.check_output( 13 | ["git", "rev-parse", "HEAD"], cwd=os.path.dirname(os.path.abspath(__file__)) 14 | ) 15 | .strip() 16 | .decode("utf8")[:7] 17 | ) 18 | -------------------------------------------------------------------------------- /src/docstore/templates/_head.html: -------------------------------------------------------------------------------- 1 | 2 | {% if request_tags %} 3 | tagged with {% for t in request_tags %}{{ t }}{% if not loop.last %}, {% endif %}{% endfor %} — 4 | {% endif %} 5 | docstore{% if title %}/{{ title | smartypants | safe }}{% endif %} 6 | 7 | 8 | 9 | 10 | 11 | 16 | -------------------------------------------------------------------------------- /src/docstore/templates/_tag_list.html: -------------------------------------------------------------------------------- 1 | {% for entry in tag_tally|render_tag_list %} 2 | {% if entry['type'] == 'html_literal' %} 3 | {{ entry['value'] | safe }} 4 | {% elif entry['type'] == 'tag_link' %} 5 | {% set t = entry['name'] %} 6 | {% if t in request_tags %} 7 | {{ entry['display_name'] }} 8 | {% else %} 9 | {{ entry['display_name'] }} 10 | {% endif %} ({{ entry['count'] }}) 11 | {% elif entry['type'] == 'tag_text' %} 12 | {{ entry['display_name'] }} 13 | {% else %} 14 | {{ entry }} 15 | {% endif %} 16 | {% endfor %} 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "docstore" 3 | version = "2.0.0" 4 | 5 | [project.scripts] 6 | docstore = "docstore.cli:main" 7 | 8 | [tool.setuptools.packages.find] 9 | where = ["src"] 10 | 11 | [tool.setuptools.package-data] 12 | analytics = ["static/*", "templates/*"] 13 | 14 | [tool.coverage.run] 15 | branch = true 16 | source = [ 17 | "docstore", 18 | "tests", 19 | ] 20 | 21 | [tool.coverage.report] 22 | show_missing = true 23 | skip_covered = true 24 | fail_under = 90 25 | # fail_under = 100 26 | 27 | # [tool.pytest.ini_options] 28 | # filterwarnings = ["error"] 29 | 30 | [tool.mypy] 31 | mypy_path = "src" 32 | strict = true 33 | -------------------------------------------------------------------------------- /tests/test_downloads.py: -------------------------------------------------------------------------------- 1 | from email.message import Message 2 | 3 | from docstore.downloads import guess_filename 4 | 5 | 6 | def test_guess_filename_with_no_content_disposition() -> None: 7 | msg = Message() 8 | assert guess_filename("https://i.org/example.png", headers=msg) == "example.png" 9 | 10 | 11 | def test_guess_filename_with_content_disposition() -> None: 12 | msg = Message() 13 | msg.add_header("Content-Disposition", "attachment", filename="MyExample.png") 14 | assert guess_filename("https://i.org/example.png", headers=msg) == "MyExample.png" 15 | 16 | 17 | def test_guess_filename_with_content_disposition_but_no_filename() -> None: 18 | msg = Message() 19 | msg.add_header("Content-Disposition", "attachment") 20 | assert guess_filename("https://i.org/example.png", headers=msg) == "example.png" 21 | -------------------------------------------------------------------------------- /src/docstore/merging.py: -------------------------------------------------------------------------------- 1 | from .models import Document 2 | from .text_utils import common_prefix 3 | 4 | 5 | def get_title_candidates(documents: list[Document]) -> list[str]: 6 | title_candidates = [] 7 | 8 | for doc in documents: 9 | if 
doc.title not in title_candidates: 10 | title_candidates.append(doc.title) 11 | 12 | guessed_title = common_prefix(title_candidates) 13 | 14 | if guessed_title and guessed_title not in title_candidates: 15 | title_candidates.insert(0, guessed_title) 16 | 17 | return title_candidates 18 | 19 | 20 | def get_union_of_tags(documents: list[Document]) -> list[str]: 21 | """ 22 | Get a list of every tag on any document in ``documents``. 23 | """ 24 | tags = [] 25 | 26 | for doc in documents: 27 | for t in doc.tags: 28 | if t not in tags: 29 | tags.append(t) 30 | 31 | return tags 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile requirements.in --output-file requirements.txt 3 | attrs==23.2.0 4 | # via 5 | # -r requirements.in 6 | # cattrs 7 | blinker==1.8.2 8 | # via flask 9 | cattrs==23.2.3 10 | # via -r requirements.in 11 | click==8.1.7 12 | # via 13 | # -r requirements.in 14 | # flask 15 | flask==3.0.3 16 | # via -r requirements.in 17 | hyperlink==21.0.0 18 | # via -r requirements.in 19 | idna==3.7 20 | # via hyperlink 21 | itsdangerous==2.2.0 22 | # via flask 23 | jinja2==3.1.4 24 | # via flask 25 | markupsafe==2.1.5 26 | # via 27 | # jinja2 28 | # werkzeug 29 | pillow==10.4.0 30 | # via -r requirements.in 31 | rapidfuzz==3.9.4 32 | # via -r requirements.in 33 | smartypants==2.0.1 34 | # via -r requirements.in 35 | unidecode==1.3.8 36 | # via -r requirements.in 37 | wcag-contrast-ratio==0.9 38 | # via -r requirements.in 39 | werkzeug==3.0.3 40 | # via flask 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Alex Chan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /src/docstore/downloads.py: -------------------------------------------------------------------------------- 1 | import cgi 2 | from email.message import Message 3 | import os 4 | import pathlib 5 | from urllib.request import urlretrieve 6 | from urllib.parse import urlparse 7 | 8 | 9 | def guess_filename(url: str, headers: Message) -> str: 10 | """ 11 | Given a URL and the HTTP response headers, guess the final name of this file. 
12 | """ 13 | fallback = os.path.basename(urlparse(url).path) 14 | 15 | try: 16 | _, params = cgi.parse_header(headers["Content-Disposition"]) 17 | except TypeError: 18 | return fallback 19 | 20 | try: 21 | return params["filename"] 22 | except (KeyError, TypeError): 23 | return fallback 24 | 25 | 26 | def download_file(url: str) -> pathlib.Path: # pragma: no cover 27 | """ 28 | Download a file from a URL. Returns the path to the downloaded file. 29 | """ 30 | tmp_path, headers = urlretrieve(url) 31 | 32 | filename = guess_filename(url=url, headers=headers) 33 | 34 | out_path = os.path.join(os.path.dirname(tmp_path), filename) 35 | os.rename(tmp_path, out_path) 36 | 37 | return pathlib.Path(out_path) 38 | -------------------------------------------------------------------------------- /src/docstore/tag_cloud.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | 4 | class TagCloud: 5 | def __init__(self, tag_tally: dict[str, int]): 6 | self.tag_tally = tag_tally 7 | self.lowest_weight = min(tag_tally.values()) 8 | self.highest_weight = max(tag_tally.values()) 9 | self.range = (self.highest_weight - self.lowest_weight) or 1 10 | 11 | self.font_size_start = 10 12 | self.font_size_end = 24 13 | self.font_incr = (self.font_size_end - self.font_size_start) / self.range 14 | 15 | self.greyscale_start = 170 16 | self.greyscale_end = 70 17 | self.greyscale_incr = (self.greyscale_end - self.greyscale_start) / self.range 18 | 19 | @functools.lru_cache() 20 | def get_style(self, tag_count: int) -> str: 21 | weighting = tag_count - self.lowest_weight 22 | font_size = self.font_size_start + weighting * self.font_incr 23 | color = int(self.greyscale_start + weighting * self.greyscale_incr) 24 | return "font-size: %fpt; color: rgb(%d, %d, %d)" % ( 25 | font_size, 26 | color, 27 | color, 28 | color, 29 | ) 30 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | test: 14 | # Note: this project uses some macOS-specific tools like `qlmanage`, so 15 | # the tests need to run on macOS. 16 | # 17 | # This is different from my other Python projects, which usually run on 18 | # `ubuntu-latest`. 19 | runs-on: macos-latest 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.12" 28 | cache: 'pip' 29 | cache-dependency-path: 'dev_requirements.txt' 30 | 31 | - name: Install dependencies 32 | run: | 33 | brew install ffmpeg 34 | .github/install-github-bin alexwlchan/dominant_colours 35 | pip install -r dev_requirements.txt 36 | 37 | - name: Run linting 38 | run: | 39 | ruff check . 40 | ruff format --check . 
41 | 
42 |       - name: Check types
43 |         run: mypy src tests
44 | 
45 |       - name: Run tests
46 |         run: |
47 |           coverage run -m pytest tests
48 |           coverage report
49 | 
--------------------------------------------------------------------------------
/tests/test_tint_colors.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from docstore.thumbnails import create_thumbnail
4 | from docstore.tint_colors import (
5 |     Color,
6 |     choose_tint_color_from_dominant_colors,
7 |     choose_tint_color,
8 | )
9 | 
10 | 
11 | def test_choose_tint_color() -> None:
12 |     thumbnail_path = create_thumbnail("tests/files/Newtons_cradle.gif")
13 | 
14 |     tint_color = choose_tint_color(
15 |         thumbnail_path=thumbnail_path, file_path="tests/files/Newtons_cradle.gif"
16 |     )
17 |     assert all(0.4 <= c <= 0.5 for c in tint_color), tint_color
18 | 
19 | 
20 | @pytest.mark.parametrize(
21 |     "dominant_color, background_color, expected_tint",
22 |     [
23 |         ((1, 1, 1), (1, 1, 1), (0, 0, 0)),
24 |         ((0.9, 0.9, 0.9), (1, 1, 1), (0, 0, 0)),
25 |         ((0, 0, 0), (0, 0, 0), (1, 1, 1)),
26 |     ],
27 | )
28 | def test_selects_black_or_white_if_insufficient_contrast(
29 |     dominant_color: Color,
30 |     background_color: Color,
31 |     expected_tint: Color,
32 | ) -> None:
33 |     assert (
34 |         choose_tint_color_from_dominant_colors(
35 |             dominant_colors=[dominant_color], background_color=background_color
36 |         )
37 |         == expected_tint
38 |     )
39 | 
--------------------------------------------------------------------------------
/src/docstore/file_normalisation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import secrets
3 | import shutil
4 | 
5 | from docstore.text_utils import slugify
6 | 
7 | 
8 | def normalised_filename_copy(*, src: str, dst: str) -> str:
9 |     """
10 |     Copies a file from ``src`` to ``dst``.
11 | 
12 |     This copy function applies two normalisation steps:
13 | 
14 |     - It removes non-ASCII characters and spaces
15 |     - It appends a random hex value before the filename extension
16 |       if there are multiple files with the same name
17 | 
18 |     This copy function tries to be "safe". In particular, if there's
19 |     already a file at ``dst``, it refuses to overwrite it. Instead,
20 |     it appends a random identifier to ``dst`` and copies to that instead.
21 | 
22 |     e.g. if you pass dst=``Statement.pdf``, it might create files like
23 |     ``Statement.pdf``, ``Statement_1c5e.pdf``, ``Statement_3fc9.pdf``
24 | 
25 |     Returns the name of the final file.
26 | 27 | """ 28 | out_dir, filename = os.path.split(dst) 29 | 30 | os.makedirs(out_dir, exist_ok=True) 31 | 32 | name, ext = os.path.splitext(filename) 33 | name = slugify(name) 34 | 35 | out_path = os.path.join(out_dir, name + ext) 36 | 37 | while True: 38 | try: 39 | with open(out_path, "xb") as out_file: 40 | with open(src, "rb") as infile: 41 | shutil.copyfileobj(infile, out_file) 42 | except FileExistsError: 43 | out_path = os.path.join(out_dir, name + "_" + secrets.token_hex(2) + ext) 44 | else: 45 | return out_path 46 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import typing 3 | import uuid 4 | 5 | import pytest 6 | 7 | from docstore.models import Dimensions, Document, File, Thumbnail, from_json, to_json 8 | 9 | 10 | def is_recent(ds: datetime.datetime) -> bool: 11 | return (datetime.datetime.now() - ds).seconds < 2 12 | 13 | 14 | def test_document_defaults() -> None: 15 | d1 = Document(title="My test document") 16 | assert uuid.UUID(d1.id) 17 | assert is_recent(d1.date_saved) 18 | assert d1.tags == [] 19 | assert d1.files == [] 20 | 21 | d2 = Document(title="A different document") 22 | assert d1.id != d2.id 23 | 24 | 25 | def test_file_defaults() -> None: 26 | f = File( 27 | filename="cats.jpg", 28 | path="files/c/cats.jpg", 29 | size=100, 30 | checksum="sha256:123", 31 | thumbnail=Thumbnail( 32 | path="thumbnails/c/cats.jpg", 33 | dimensions=Dimensions(400, 300), 34 | tint_color="#ffffff", 35 | ), 36 | ) 37 | uuid.UUID(f.id) 38 | assert is_recent(f.date_saved) 39 | 40 | 41 | def test_can_serialise_document_to_json() -> None: 42 | f = File( 43 | filename="cats.jpg", 44 | path="files/c/cats.jpg", 45 | size=100, 46 | checksum="sha256:123", 47 | thumbnail=Thumbnail( 48 | path="thumbnails/c/cats.jpg", 49 | dimensions=Dimensions(400, 300), 50 | tint_color="#ffffff", 51 | ), 52 | ) 53 | 54 | documents = [Document(title="Another test document", files=[f])] 55 | assert from_json(to_json(documents)) == documents 56 | 57 | 58 | @pytest.mark.parametrize("documents", [[1, 2, 3], {"a", "b", "c"}]) 59 | def test_to_json_with_bad_list_is_typeerror(documents: typing.Any) -> None: 60 | with pytest.raises(TypeError, match=r"Expected type List\[Document\]!"): 61 | to_json(documents) 62 | -------------------------------------------------------------------------------- /migrations/from_2-0-0_to_2-1-0.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | DB schema migration: v2.0.0 ~> v2.1.0 4 | 5 | * Convert the document tree from a list of flat documents to a dict with some 6 | top-level metadata. 7 | * Record the dimension on Thumbnail instances. 8 | 9 | """ 10 | 11 | import datetime 12 | import json 13 | import os 14 | import shutil 15 | import sys 16 | 17 | import cattr 18 | import tqdm 19 | 20 | from docstore.git import current_commit 21 | from docstore.thumbnails import get_dimensions 22 | from exceptions import IncorrectSchemaError 23 | 24 | OLD_DB_SCHEMA = "v2.0.0" 25 | NEW_DB_SCHEMA = "v2.1.0" 26 | 27 | 28 | if __name__ == "__main__": 29 | try: 30 | root = sys.argv[1] 31 | except IndexError: 32 | root = "." 
33 | 34 | documents_path = os.path.join(root, "documents.json") 35 | backup_path = os.path.join(root, f"documents.{OLD_DB_SCHEMA}.json.bak") 36 | 37 | documents = json.load(open(documents_path)) 38 | 39 | if not isinstance(documents, list): 40 | raise IncorrectSchemaError( 41 | f"The docstore instance at {root} doesn't look like {OLD_DB_SCHEMA}" 42 | ) 43 | 44 | assert not os.path.exists(backup_path) 45 | shutil.copyfile(documents_path, backup_path) 46 | 47 | # Create the new top-level structure 48 | new_structure = { 49 | "docstore": { 50 | "db_schema": NEW_DB_SCHEMA, 51 | "commit": current_commit(), 52 | "last_modified": datetime.datetime.now().isoformat(), 53 | }, 54 | "documents": documents, 55 | } 56 | 57 | # Backfill the thumbnail dimensions 58 | for doc in tqdm.tqdm(documents): 59 | for f in doc["files"]: 60 | dimensions = get_dimensions(os.path.join(root, f["thumbnail"]["path"])) 61 | f["thumbnail"]["dimensions"] = cattr.unstructure(dimensions) 62 | 63 | # Write the new database 64 | with open(documents_path, "w") as outfile: 65 | outfile.write(json.dumps(new_structure, indent=2, sort_keys=True)) 66 | -------------------------------------------------------------------------------- /src/docstore/static/stacks.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_thumbnails.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from PIL import Image 4 | import pytest 5 | 6 | from docstore.thumbnails import create_thumbnail, get_dimensions 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "filename", ["Newtons_cradle.gif", "Rotating_earth_(large).gif"] 11 | ) 12 | def test_creates_thumbnail_of_animated_gif(filename: str) -> None: 13 | path = create_thumbnail(f"tests/files/{filename}", max_size=400) 14 | assert path.endswith(".mp4") 15 | 16 | 17 | def test_creates_thumbnail_of_single_frame_gif() -> None: 18 | path = create_thumbnail( 19 | "tests/files/Rotating_earth_(large)_singleframe.gif", max_size=400 20 | ) 21 | assert path.endswith(".png") 22 | 23 | im = Image.open(path) 24 | assert im.size == (400, 400) 25 | 26 | 27 | def test_creates_thumbnail_of_png() -> None: 28 | path = create_thumbnail("tests/files/cluster.png", max_size=250) 29 | assert path.endswith("/cluster.png") 30 | 31 | im = Image.open(path) 32 | assert im.size == (250, 162) 33 | 34 | 35 | def test_creates_thumbnail_of_pdf() -> None: 36 | path = create_thumbnail("tests/files/snakes.pdf", max_size=350) 37 | assert path.endswith("/snakes.pdf.png") 38 | 39 | im = Image.open(path) 40 | assert im.size == (247, 350) 41 | 42 | 43 | def test_creates_thumbnail_if_no_quicklook_plugin_available( 44 | tmpdir: pathlib.Path, 45 | ) -> None: 46 | path = str(tmpdir / "sqlite.db") 47 | 48 | with open(path, "wb") as outfile: 49 | outfile.write(b"SQLite format 3\x00") 50 | 51 | path = create_thumbnail(path) 52 | 53 | 54 | def test_gets_dimensions_of_an_image() -> None: 55 | dimensions = get_dimensions("tests/files/cluster.png") 56 | assert dimensions.width == 500 57 | assert dimensions.height == 325 58 | 59 | 60 | def test_gets_dimensions_of_a_video() -> None: 61 | thumbnail_path = create_thumbnail("tests/files/Newtons_cradle.gif") 62 | 63 | dimensions = get_dimensions(thumbnail_path) 64 | assert dimensions.width == 400 65 | assert dimensions.height == 300 66 | -------------------------------------------------------------------------------- 
/src/docstore/text_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import re 4 | import sys 5 | 6 | from unidecode import unidecode 7 | 8 | 9 | def common_prefix(values: list[str]) -> str: 10 | prefix = os.path.commonprefix(values).strip() 11 | 12 | prefix = prefix.strip("()").strip() 13 | if prefix.lower().endswith("(part"): 14 | prefix = prefix[: -len("(part")].strip() 15 | 16 | if prefix.lower().endswith("- part"): 17 | prefix = prefix[: -len("- part")].strip() 18 | 19 | return prefix 20 | 21 | 22 | def slugify(u: str) -> str: 23 | """ 24 | Convert Unicode string into blog slug. 25 | 26 | Based on http://www.leancrew.com/all-this/2014/10/asciifying/ 27 | 28 | """ 29 | u = re.sub("[–—/:;,._]", "-", u) # replace separating punctuation 30 | a = unidecode(u).lower() # best ASCII substitutions, lowercased 31 | a = re.sub(r"[^a-z0-9 -]", "", a) # delete any other characters 32 | a = a.replace(" ", "-") # spaces to hyphens 33 | a = re.sub(r"-+", "-", a) # condense repeated hyphens 34 | return a 35 | 36 | 37 | def pretty_date(d: datetime.datetime, now: datetime.datetime) -> str: 38 | delta = now - d 39 | if delta.total_seconds() < 120: 40 | return "just now" 41 | elif delta.total_seconds() < 60 * 60: 42 | return f"{int(delta.seconds / 60)} minutes ago" 43 | elif d.date() == now.date(): 44 | return "earlier today" 45 | elif d.date() == now.date() - datetime.timedelta(days=1): 46 | return "yesterday" 47 | else: 48 | for days in range(2, 8): 49 | if d.date() == now.date() - datetime.timedelta(days=days): 50 | return f"{days} days ago" 51 | return d.strftime("%-d %b %Y") 52 | 53 | 54 | def hostname(url: str) -> str: 55 | """ 56 | Returns a guess for the hostname of a URL to display in the tag. 57 | """ 58 | try: 59 | return url.split("/")[2] 60 | except IndexError: 61 | print(f"Unable to detect hostname of URL: {url}", file=sys.stderr) 62 | return url 63 | -------------------------------------------------------------------------------- /migrations/from_2-1-0_to_2-2-0.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | DB schema migration: v2.1.0 ~> v2.2.0 4 | 5 | * Record the tint color on Thumbnail instances. 6 | 7 | """ 8 | 9 | import datetime 10 | import filecmp 11 | import json 12 | import os 13 | import shutil 14 | import sys 15 | 16 | import tqdm 17 | 18 | from docstore.git import current_commit 19 | from docstore.tint_colors import choose_tint_color 20 | 21 | OLD_DB_SCHEMA = "v2.1.0" 22 | NEW_DB_SCHEMA = "v2.2.0" 23 | 24 | 25 | if __name__ == "__main__": 26 | try: 27 | root = sys.argv[1] 28 | except IndexError: 29 | root = "." 
30 | 
31 |     documents_path = os.path.join(root, "documents.json")
32 |     backup_path = os.path.join(root, f"documents.{OLD_DB_SCHEMA}.json.bak")
33 | 
34 |     documents = json.load(open(documents_path))
35 |     assert documents["docstore"]["db_schema"] == OLD_DB_SCHEMA
36 | 
37 |     # Backfill the thumbnail tint colours
38 |     for doc in tqdm.tqdm(documents["documents"]):
39 |         for f in doc["files"]:
40 |             tint_color = choose_tint_color(
41 |                 thumbnail_path=os.path.join(root, f["thumbnail"]["path"]),
42 |                 file_path=os.path.join(root, f["path"]),
43 |             )
44 | 
45 |             hex_tint_color = "#%02x%02x%02x" % tuple(
46 |                 int(component * 255) for component in tint_color
47 |             )
48 | 
49 |             f["thumbnail"]["tint_color"] = hex_tint_color
50 | 
51 |     new_output = {
52 |         "docstore": {
53 |             "db_schema": NEW_DB_SCHEMA,
54 |             "commit": current_commit(),
55 |             "last_modified": datetime.datetime.now().isoformat(),
56 |         },
57 |         "documents": documents["documents"],
58 |     }
59 | 
60 |     if os.path.exists(backup_path) and not filecmp.cmp(
61 |         backup_path, documents_path, shallow=False
62 |     ):
63 |         raise RuntimeError("Have you already started a migration of this version?")
64 | 
65 |     shutil.copyfile(documents_path, backup_path)
66 | 
67 |     # Write the new database
68 |     with open(documents_path, "w") as outfile:
69 |         outfile.write(json.dumps(new_output, indent=2, sort_keys=True))
70 | 
--------------------------------------------------------------------------------
/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | #    uv pip compile dev_requirements.in --output-file dev_requirements.txt
3 | -e file:.
4 |     # via -r dev_requirements.in
5 | attrs==23.2.0
6 |     # via
7 |     #   -r requirements.txt
8 |     #   cattrs
9 | beautifulsoup4==4.12.3
10 |     # via bs4
11 | blinker==1.8.2
12 |     # via
13 |     #   -r requirements.txt
14 |     #   flask
15 | bs4==0.0.2
16 |     # via -r dev_requirements.in
17 | cattrs==23.2.3
18 |     # via -r requirements.txt
19 | click==8.1.7
20 |     # via
21 |     #   -r requirements.txt
22 |     #   flask
23 | coverage[toml]==7.5.3
24 |     # via
25 |     #   -r dev_requirements.in
26 |     #   pytest-cov
27 | flask==3.0.3
28 |     # via -r requirements.txt
29 | hyperlink==21.0.0
30 |     # via -r requirements.txt
31 | idna==3.7
32 |     # via
33 |     #   -r requirements.txt
34 |     #   hyperlink
35 | iniconfig==2.0.0
36 |     # via pytest
37 | itsdangerous==2.2.0
38 |     # via
39 |     #   -r requirements.txt
40 |     #   flask
41 | jinja2==3.1.4
42 |     # via
43 |     #   -r requirements.txt
44 |     #   flask
45 | markupsafe==2.1.5
46 |     # via
47 |     #   -r requirements.txt
48 |     #   jinja2
49 |     #   werkzeug
50 | mypy==1.10.1
51 |     # via -r dev_requirements.in
52 | mypy-extensions==1.0.0
53 |     # via mypy
54 | packaging==24.1
55 |     # via pytest
56 | pillow==10.4.0
57 |     # via -r requirements.txt
58 | pluggy==1.5.0
59 |     # via pytest
60 | pytest==8.2.2
61 |     # via
62 |     #   -r dev_requirements.in
63 |     #   pytest-cov
64 | pytest-cov==5.0.0
65 |     # via -r dev_requirements.in
66 | rapidfuzz==3.9.4
67 |     # via -r requirements.txt
68 | ruff==0.5.1
69 |     # via -r dev_requirements.in
70 | smartypants==2.0.1
71 |     # via -r requirements.txt
72 | soupsieve==2.5
73 |     # via beautifulsoup4
74 | types-beautifulsoup4==4.12.0.20240511
75 |     # via -r dev_requirements.in
76 | types-html5lib==1.1.11.20240228
77 |     # via types-beautifulsoup4
78 | types-tqdm==4.66.0.20240417
79 |     # via -r dev_requirements.in
80 | typing-extensions==4.12.2
81 |     # via mypy
82 | unidecode==1.3.8
83 |     # via -r requirements.txt
84 | wcag-contrast-ratio==0.9
85 |     # via -r requirements.txt
86 | 
werkzeug==3.0.3 87 | # via 88 | # -r requirements.txt 89 | # flask 90 | -------------------------------------------------------------------------------- /src/docstore/static/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | background: url('/static/natural_paper.png'); 3 | font: 14pt Helvetica; 4 | line-height: 1.45em; 5 | margin: 0; 6 | padding: 0; 7 | } 8 | 9 | aside { 10 | background: url('/static/specktre.png'); 11 | background-size: auto 100%; 12 | padding: 6px 0px 2px 0px; 13 | border-bottom: 2px solid #aaa; 14 | font: 20pt "Nanum Brush Script"; 15 | font-weight: bold; 16 | box-shadow: 0 3px 6px rgba(0, 0, 0, 0.3) 17 | } 18 | 19 | td { 20 | line-height: 1.45em; 21 | font-size: 10pt; 22 | } 23 | 24 | main, #aside_inner { 25 | max-width: 940px; 26 | margin-left: auto; 27 | margin-right: auto; 28 | padding-left: 20px; 29 | padding-right: 20px; 30 | } 31 | 32 | .thumbnail { 33 | padding-right: 15px; 34 | } 35 | 36 | .thumbnail img { 37 | width: 100%; 38 | } 39 | 40 | h2 { 41 | margin-bottom: 0.5em; 42 | line-height: 1.25em; 43 | } 44 | 45 | table { 46 | width: 100%; 47 | } 48 | 49 | table hr { 50 | margin-left: 1em; 51 | margin-right: 1em; 52 | } 53 | 54 | hr { 55 | border: none; 56 | height: 1px; 57 | background: rgba(128, 128, 128, 0.5); 58 | } 59 | 60 | .thumbnail a:hover { 61 | background: #606060; 62 | } 63 | 64 | .thumbnail a img:hover { 65 | opacity: 0.8; 66 | } 67 | 68 | a { 69 | color: #606060; 70 | } 71 | 72 | a:hover { 73 | background: rgba(128, 128, 128, 0.3); 74 | } 75 | 76 | a.remove_tag { 77 | color: #d01c11; 78 | text-decoration: none; 79 | } 80 | 81 | a.remove_tag:hover { 82 | background: rgba(208, 28, 17, 0.3); 83 | } 84 | 85 | .meta_info { 86 | background: white; 87 | border: 2px solid #aaa; 88 | padding: 12px 15px; 89 | margin: 1em -14px; 90 | border-radius: 8px; 91 | font-size: 13pt; 92 | box-shadow: 0 3px 6px rgba(0, 0, 0, 0.2); 93 | } 94 | 95 | .meta_info ul { 96 | line-height: 1.4em; 97 | } 98 | 99 | #tag_cloud { 100 | text-align: justify; 101 | } 102 | 103 | #tag_cloud .tag { 104 | display: inline-block; 105 | } 106 | 107 | a.disabled { 108 | pointer-events: none; 109 | cursor: default; 110 | text-decoration: none; 111 | color: black; 112 | } 113 | 114 | .doc_id { 115 | font-weight: normal; 116 | display: none; 117 | color: #999; 118 | } 119 | 120 | h2:hover .doc_id { 121 | display: inline-block; 122 | } 123 | 124 | .sortBy { 125 | float: right; 126 | margin-bottom: 0; 127 | } 128 | -------------------------------------------------------------------------------- /tests/test_merging.py: -------------------------------------------------------------------------------- 1 | from docstore.merging import get_title_candidates, get_union_of_tags 2 | from docstore.models import Document 3 | 4 | 5 | class TestGetTitleCandidates: 6 | def test_single_document_is_title(self) -> None: 7 | doc = Document(title="Title 1") 8 | assert get_title_candidates([doc]) == ["Title 1"] 9 | 10 | def test_multiples_document_are_title_and_common_prefix(self) -> None: 11 | doc1 = Document(title="My document 1") 12 | doc2 = Document(title="My document 2") 13 | assert get_title_candidates([doc1, doc2]) == [ 14 | "My document", 15 | "My document 1", 16 | "My document 2", 17 | ] 18 | 19 | def test_does_not_double_add_common_prefix(self) -> None: 20 | doc1 = Document(title="My document") 21 | doc2 = Document(title="My document 2") 22 | assert get_title_candidates([doc1, doc2]) == ["My document", "My document 2"] 23 | 24 | def 
test_does_not_double_add_title(self) -> None: 25 | doc1 = Document(title="My document") 26 | doc2 = Document(title="My document") 27 | assert get_title_candidates([doc1, doc2]) == ["My document"] 28 | 29 | def test_does_not_add_empty_prefix(self) -> None: 30 | doc1 = Document(title="My document") 31 | doc2 = Document(title="Another document") 32 | assert get_title_candidates([doc1, doc2]) == ["My document", "Another document"] 33 | 34 | 35 | class TestGetUnionOfTags: 36 | def create_document_with_tags(self, tags: list[str]) -> Document: 37 | return Document(title="A test document", tags=tags) 38 | 39 | def test_tags_on_one_document_are_tags(self) -> None: 40 | doc = self.create_document_with_tags(tags=["tag1", "tag2", "tag3"]) 41 | assert get_union_of_tags([doc]) == ["tag1", "tag2", "tag3"] 42 | 43 | def test_get_tags_on_multiple_documents_with_no_overlap(self) -> None: 44 | doc1 = self.create_document_with_tags(tags=["tag1"]) 45 | doc2 = self.create_document_with_tags(tags=["tag2"]) 46 | doc3 = self.create_document_with_tags(tags=["tag3"]) 47 | assert get_union_of_tags([doc1, doc2, doc3]) == ["tag1", "tag2", "tag3"] 48 | 49 | def test_union_tags_deduplicates(self) -> None: 50 | doc1 = self.create_document_with_tags(tags=["tag1", "tag2"]) 51 | doc2 = self.create_document_with_tags(tags=["tag3", "tag2"]) 52 | doc3 = self.create_document_with_tags(tags=["tag3"]) 53 | assert get_union_of_tags([doc1, doc2, doc3]) == ["tag1", "tag2", "tag3"] 54 | -------------------------------------------------------------------------------- /tests/test_text_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pytest 4 | 5 | from docstore.text_utils import common_prefix, hostname, pretty_date, slugify 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "values, expected_prefix", 10 | [ 11 | (["My document"], "My document"), 12 | (["My document", "A different document"], ""), 13 | (["My document (1)", "My document (2)"], "My document"), 14 | (["My document (part 1)", "My document (part 2)"], "My document"), 15 | (["My document - part ", "My document - part 2"], "My document"), 16 | ], 17 | ) 18 | def test_common_prefix(values: list[str], expected_prefix: str) -> None: 19 | assert common_prefix(values) == expected_prefix 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "u, expected_slug", 24 | [ 25 | ("abc", "abc"), 26 | ("a:b", "a-b"), 27 | ("Çingleton", "cingleton"), 28 | ("a b", "a-b"), 29 | ("a_b", "a-b"), 30 | ("a b", "a-b"), 31 | ], 32 | ) 33 | def test_slugify(u: str, expected_slug: str) -> None: 34 | assert slugify(u) == expected_slug 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "d, now, expected_str", 39 | [ 40 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 1, 1, 11), "just now"), 41 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 1, 2, 59), "just now"), 42 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 1, 3, 1), "2 minutes ago"), 43 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 1, 3, 1, 1), "earlier today"), 44 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 2, 1, 1, 1), "yesterday"), 45 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 6, 1, 1, 1), "5 days ago"), 46 | (datetime(2001, 1, 1, 1, 1, 1), datetime(2001, 1, 12, 1, 1, 1), "1 Jan 2001"), 47 | (datetime(2001, 1, 1, 13, 0, 0), datetime(2001, 1, 3, 12, 0, 0), "2 days ago"), 48 | ], 49 | ) 50 | def test_pretty_date(d: datetime, now: datetime, expected_str: str) -> None: 51 | assert pretty_date(d=d, now=now) == expected_str 52 | 53 | 54 | 
@pytest.mark.parametrize( 55 | "url, expected_hostname", 56 | [ 57 | ("https://example.org/path/to/file.pdf", "example.org"), 58 | # This really appeared in the source_url of a docstore instance migrated 59 | # from v1, and caused a 500 error in the app. It's weird, but shouldn't cause 60 | # the app to crash. 61 | ("magic", "magic"), 62 | ], 63 | ) 64 | def test_hostname(url: str, expected_hostname: str) -> None: 65 | assert hostname(url) == expected_hostname 66 | -------------------------------------------------------------------------------- /tests/test_file_normalisation.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import os 3 | import pathlib 4 | 5 | from docstore.file_normalisation import normalised_filename_copy 6 | 7 | 8 | def test_copies_a_file(tmp_path: pathlib.Path) -> None: 9 | src = tmp_path / "src.txt" 10 | dst = tmp_path / "dst.txt" 11 | 12 | src.write_text("Hello world") 13 | assert not dst.exists() 14 | 15 | normalised_filename_copy(src=str(src), dst=str(dst)) 16 | 17 | assert dst.read_text() == "Hello world" 18 | 19 | 20 | def test_creates_intermediate_directories(tmp_path: pathlib.Path) -> None: 21 | src = tmp_path / "src.txt" 22 | dst = tmp_path / "1" / "2" / "3" / "dst.txt" 23 | 24 | src.write_text("Hello world") 25 | assert not dst.exists() 26 | 27 | normalised_filename_copy(src=str(src), dst=str(dst)) 28 | 29 | assert dst.read_text() == "Hello world" 30 | 31 | 32 | def test_copies_multiple_files_to_the_same_dst(tmp_path: pathlib.Path) -> None: 33 | src1 = tmp_path / "src1.txt" 34 | src2 = tmp_path / "src2.txt" 35 | src3 = tmp_path / "src3.txt" 36 | 37 | dst = tmp_path / "dst.txt" 38 | 39 | src1.write_text("Hello world") 40 | src2.write_text("Bonjour le monde") 41 | src3.write_text("Hallo Welt") 42 | 43 | normalised_filename_copy(src=str(src1), dst=str(dst)) 44 | normalised_filename_copy(src=str(src2), dst=str(dst)) 45 | normalised_filename_copy(src=str(src3), dst=str(dst)) 46 | 47 | assert len([f for f in os.listdir(tmp_path) if "dst" in f]) == 3 48 | 49 | dst_contents = { 50 | open(os.path.join(tmp_path, f)).read() 51 | for f in os.listdir(tmp_path) 52 | if "dst" in f 53 | } 54 | 55 | assert dst_contents == {"Hello world", "Bonjour le monde", "Hallo Welt"} 56 | 57 | 58 | def test_copies_multiple_files_concurrently(tmp_path: pathlib.Path) -> None: 59 | src1 = tmp_path / "src1.txt" 60 | src2 = tmp_path / "src2.txt" 61 | src3 = tmp_path / "src3.txt" 62 | 63 | dst = tmp_path / "dst.txt" 64 | 65 | src1.write_text("Hello world") 66 | src2.write_text("Bonjour le monde") 67 | src3.write_text("Hallo Welt") 68 | 69 | with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: 70 | futures = { 71 | executor.submit(normalised_filename_copy, src=str(s), dst=str(dst)) 72 | for s in (src1, src2, src3) 73 | } 74 | concurrent.futures.wait(futures) 75 | 76 | assert len([f for f in os.listdir(tmp_path) if "dst" in f]) == 3 77 | 78 | dst_contents = { 79 | open(os.path.join(tmp_path, f)).read() 80 | for f in os.listdir(tmp_path) 81 | if "dst" in f 82 | } 83 | 84 | assert dst_contents == {"Hello world", "Bonjour le monde", "Hallo Welt"} 85 | -------------------------------------------------------------------------------- /src/docstore/templates/_meta_info.html: -------------------------------------------------------------------------------- 1 | {% if documents|length <= page_end %} 2 | {% set next_url = "#" %} 3 | {% else %} 4 | {% set next_url = query_string|set_page(page + 1) %} 5 | {% endif %} 6 | 7 | 
{% if page == 1 %} 8 | {% set prev_url = "#" %} 9 | {% else %} 10 | {% set prev_url = query_string|set_page(page - 1) %} 11 | {% endif %} 12 | 13 |
14 | {% if documents|length == 0 %} 15 | no documents found! 16 | {% else %} 17 | showing document{% if page_start != page_end %}s{% endif %} {{ page_start }}{% if page_start != page_end %}–{{ page_end }}{% endif %} of {{ documents|length }}. 18 | 19 | {% if (prev_url != "#") or (next_url != "#") %} 20 | « prev 21 | • 22 | next » 23 | {% endif %} 24 | 25 |
26 | 27 | 38 |
39 | 40 | 52 | {% endif %} 53 | 54 | {% if (include_tags and tag_tally) or request_tags %} 55 |
56 | {% endif %} 57 | 58 | {% if include_tags and tag_tally %} 59 | {% if tag_tally|length > 15 %} 60 |
tag list 61 | {% include "_tag_list.html" %} 62 |
63 | 64 |
tag cloud 65 | {% endif %} 66 | 67 | {% set tag_cloud = TagCloud(tag_tally) %} 68 | 69 |
70 | {% for t, count in tag_tally.items()|sort() %} 71 | {% if t in request_tags %} 72 | {{ t }} 73 | {% else %} 74 | {{ t }} 75 | {% endif %} 76 | {% endfor %} 77 |
78 | 79 | {% if tag_tally|length > 15 %} 80 |
81 | {% endif %} 82 | 83 | {% if request_tags %} 84 |
85 | {% endif %} 86 | {% endif %} 87 | 88 | {% if request_tags %} 89 | filtering to tag{% if request_tags|length > 1 %}s{% endif %} 90 | {% for t in request_tags %} 91 | {{ t }} [x] 92 | {% endfor %} 93 | {% endif %} 94 |
95 | 
--------------------------------------------------------------------------------
/src/docstore/tint_colors.py:
--------------------------------------------------------------------------------
1 | import colorsys
2 | import subprocess
3 | import typing
4 | 
5 | import wcag_contrast_ratio as contrast
6 | 
7 | 
8 | Color: typing.TypeAlias = tuple[float, float, float]
9 | 
10 | 
11 | def choose_tint_color_from_dominant_colors(
12 |     dominant_colors: list[Color], background_color: Color
13 | ) -> Color:
14 |     """
15 |     Given a set of dominant colors (say, from a k-means algorithm) and the
16 |     background against which they'll be displayed, choose a tint color.
17 | 
18 |     Both ``dominant_colors`` and ``background_color`` should be tuples in [0,1].
19 |     """
20 |     # The minimum contrast ratio for text and background to meet WCAG AA
21 |     # is 4.5:1, so discard any dominant colours with a lower contrast.
22 |     sufficient_contrast_colors: list[Color] = [
23 |         typing.cast(Color, tuple(col))
24 |         for col in dominant_colors
25 |         if contrast.rgb(col, background_color) >= 4.5
26 |     ]
27 | 
28 |     # If none of the dominant colours meet WCAG AA with the background,
29 |     # try again with black and white -- every colour in the RGB space
30 |     # has a contrast ratio of 4.5:1 with at least one of these, so we'll
31 |     # get a tint colour, even if it's not a good one.
32 |     #
33 |     # Note: you could modify the dominant colours until one of them
34 |     # has sufficient contrast, but that's omitted here because it adds
35 |     # a lot of complexity for a relatively unusual case.
36 |     if not sufficient_contrast_colors:
37 |         return choose_tint_color_from_dominant_colors(
38 |             dominant_colors=dominant_colors + [(0, 0, 0), (1, 1, 1)],
39 |             background_color=background_color,
40 |         )
41 | 
42 |     # Of the colors with sufficient contrast, pick the one with the
43 |     # highest saturation. This is meant to optimise for colors that are
44 |     # more colourful/interesting than simple greys and browns.
45 |     hsv_candidates: dict[Color, Color] = {
46 |         rgb_col: colorsys.rgb_to_hsv(*rgb_col) for rgb_col in sufficient_contrast_colors
47 |     }
48 | 
49 |     return max(hsv_candidates, key=lambda rgb_col: hsv_candidates[rgb_col][1])
50 | 
51 | 
52 | def from_hex(hs: str | bytes) -> Color:
53 |     """
54 |     Returns an RGB tuple from a hex string, e.g. #ff0102 -> (255, 1, 2)
55 |     """
56 |     return int(hs[1:3], 16), int(hs[3:5], 16), int(hs[5:7], 16)
57 | 
58 | 
59 | def choose_tint_color_for_file(path: str) -> Color:
60 |     """
61 |     Returns the tint colour for a file.
62 |     """
63 |     background_color = (1, 1, 1)
64 | 
65 |     cmd = ["dominant_colours", "--no-palette", "--max-colours=12", path]
66 | 
67 |     dominant_colors = [
68 |         from_hex(line) for line in subprocess.check_output(cmd).splitlines()
69 |     ]
70 | 
71 |     colors = [(r / 255, g / 255, b / 255) for r, g, b in dominant_colors]
72 | 
73 |     return choose_tint_color_from_dominant_colors(
74 |         dominant_colors=colors, background_color=background_color
75 |     )
76 | 
77 | 
78 | def choose_tint_color(*, thumbnail_path: str, file_path: str) -> Color:
79 |     # In general, we use the thumbnail to choose the tint color. The thumbnail
80 |     # is what the tint color will usually appear next to. However, thumbnails
81 |     # for animated GIFs are MP4 videos rather than images, so we need to go to
82 |     # the original image to get the tint color.
83 |     if file_path.endswith((".jpg", ".jpeg", ".gif", ".png")):
84 |         return choose_tint_color_for_file(file_path)
85 |     else:
86 |         return choose_tint_color_for_file(thumbnail_path)
87 | 
--------------------------------------------------------------------------------
/docs/previewing-the-files.md:
--------------------------------------------------------------------------------
1 | # Previewing the files
2 | 
3 | As part of the web app, I create thumbnail images for every file.
4 | This makes it easier for me to find the file I'm looking for -- often I can spot it before I finish reading the text.
5 | 
6 | ![A web app showing a list of files, with a small thumbnail on the left-hand side next to each file.](thumbnails.png)
7 | 
8 | I've tried a couple of approaches for rendering file thumbnails.
9 | You can see the current implementation in [thumbnails.py][thumbnails.py].
10 | 
11 | [thumbnails.py]: https://github.com/alexwlchan/docstore/blob/main/src/docstore/thumbnails.py
12 | 
13 | 
14 | 
15 | ## Using pdftocairo to preview PDFs
16 | 
17 | Initially I was using [pdftocairo][pdftocairo] to render thumbnails, with a command like:
18 | 
19 | ```
20 | pdftocairo document.pdf \
21 |     -jpeg \
22 |     -singlefile \
23 |     -scale-to-x 400
24 | ```
25 | 
26 | This creates a JPEG thumbnail of `document.pdf` that's 400 pixels wide.
27 | 
28 | But it occasionally had issues, especially with PDFs I'd downloaded from my bank or HR system – for example, if it was missing a font, it rendered a completely blank thumbnail.
29 | That's not useful!
30 | 
31 | [pdftocairo]: https://www.mankier.com/1/pdftocairo
32 | 
33 | 
34 | 
35 | ## Using Quick Look to preview arbitrary files
36 | 
37 | As I started to store more files in docstore besides scanned PDFs, I needed to write more thumbnailing rules.
38 | For a while I tried to maintain that code myself, but I've switched to using [Quick Look][ql].
39 | This is a file previewing tool built into macOS: when looking at a file in the Finder, you can press space to see a preview of the file:
40 | 
41 | ![A Finder window with a Quick Look preview for a Keynote file titled "Using Python to organise my physical paper"](quick_look.png)
42 | 
43 | Quick Look exposes a command-line interface for creating thumbnails:
44 | 
45 | ```
46 | qlmanage -t -s 400 document.pdf
47 | ```
48 | 
49 | This creates a PNG thumbnail of `document.pdf` that's 400 pixels wide.
50 | 
51 | By default Quick Look supports a wide variety of file types, and it's pluggable – developers can write [Quick Look generators][ql_generators] to create previews if they have a custom file format.
52 | This means I let macOS handle the thumbnailing and don't have to worry about it in docstore.
53 | 
54 | [ql]: https://en.wikipedia.org/wiki/Quick_Look
55 | [ql_generators]: https://developer.apple.com/design/human-interface-guidelines/macos/system-capabilities/quick-look/
56 | 
57 | 
58 | 
59 | ## Using FFmpeg to preview animated GIFs
60 | 
61 | I have a handful of animated GIFs in docstore, and I want to get animated previews.
62 | For this I've found the best approach is to use [FFmpeg](https://ffmpeg.org/) to create a small video file that autoplays:
63 | 
64 | ```
65 | ffmpeg -i animated.gif \
66 |     -movflags faststart \
67 |     -pix_fmt yuv420p \
68 |     -vf scale=400:400 \
69 |     out.mp4
70 | ```
71 | 
72 | This is the same approach Twitter use for displaying GIFs.
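One detail that's easy to miss: the `yuv420p` pixel format only accepts frames with even dimensions.
docstore scales the longest edge of the GIF down to the target size, then rounds both dimensions down to even numbers before invoking FFmpeg (see `_create_gif_thumbnail_from_ffmpeg` in [thumbnails.py][thumbnails.py]).
A rough sketch of that calculation, using Pillow to read the source size (the helper name here is illustrative, not part of docstore):

```
from PIL import Image

def even_scaled_size(path: str, max_size: int = 400) -> tuple[int, int]:
    # Scale the longest edge down to max_size, preserving the aspect ratio...
    im = Image.open(path)
    if im.width > im.height and im.width >= max_size:
        width, height = max_size, int(im.height * max_size / im.width)
    else:
        width, height = int(im.width * max_size / im.height), max_size

    # ...then round both dimensions down to even values for yuv420p.
    return (width // 2) * 2, (height // 2) * 2
```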
73 | 
74 | 
75 | 
76 | ## Extracting tint colours with *k*-means colouring and Pillow
77 | 
78 | In the web app, there are links to the right-hand side of each thumbnail: to filter to other files with the same tag, or to the URL where I downloaded each file.
79 | I want to use a colour from the thumbnail to tint these links, just because it looks pretty:
80 | 
81 | ![A series of four colourful book covers, with metadata links to their right in matching colours.](tint_colors.png)
82 | 
83 | 
84 | For this, I use a technique called [*k*-means colouring][kmeans].
85 | I've written a [separate blog post][blog] about exactly how this works.
86 | 
87 | [kmeans]: https://en.wikipedia.org/wiki/K-means_clustering
88 | [blog]: https://alexwlchan.net/2019/08/finding-tint-colours-with-k-means/
89 | 
--------------------------------------------------------------------------------
/src/docstore/models.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import json
3 | import typing
4 | import uuid
5 | 
6 | import attr
7 | import cattr
8 | 
9 | from docstore.git import current_commit
10 | 
11 | 
12 | DB_SCHEMA = "v2.2.0"
13 | 
14 | 
15 | def _convert_to_datetime(d: datetime.datetime | str) -> datetime.datetime:
16 |     if isinstance(d, datetime.datetime):
17 |         return d
18 |     else:
19 |         return datetime.datetime.fromisoformat(d)
20 | 
21 | 
22 | def _convert_to_thumbnail(t: typing.Any) -> "Thumbnail":
23 |     if isinstance(t, Thumbnail):
24 |         return t
25 |     else:
26 |         return Thumbnail(**t)
27 | 
28 | 
29 | def _convert_to_dimensions(d: typing.Any) -> "Dimensions":
30 |     if isinstance(d, Dimensions):
31 |         return d
32 |     else:
33 |         return Dimensions(**d)
34 | 
35 | 
36 | def _convert_to_file(f_list: list[typing.Any]) -> "list[File]":
37 |     return [f if isinstance(f, File) else File(**f) for f in f_list]
38 | 
39 | 
40 | @attr.s
41 | class Dimensions:
42 |     width: int = attr.ib()
43 |     height: int = attr.ib()
44 | 
45 | 
46 | @attr.s
47 | class Thumbnail:
48 |     path: str = attr.ib()
49 |     dimensions: Dimensions = attr.ib(converter=_convert_to_dimensions)
50 |     tint_color: str = attr.ib()
51 | 
52 | 
53 | @attr.s
54 | class File:
55 |     filename: str = attr.ib(converter=str)
56 |     path: str = attr.ib()
57 |     size: int = attr.ib()
58 |     checksum: str = attr.ib()
59 |     thumbnail: Thumbnail = attr.ib(converter=_convert_to_thumbnail)
60 |     source_url: str | None = attr.ib(default=None)
61 |     date_saved: datetime.datetime = attr.ib(
62 |         factory=datetime.datetime.now, converter=_convert_to_datetime
63 |     )
64 |     id: str = attr.ib(default=attr.Factory(lambda: str(uuid.uuid4())))
65 | 
66 | 
67 | @attr.s
68 | class Document:
69 |     title: str = attr.ib()
70 |     id: str = attr.ib(default=attr.Factory(lambda: str(uuid.uuid4())))
71 |     date_saved: datetime.datetime = attr.ib(
72 |         factory=datetime.datetime.now, converter=_convert_to_datetime
73 |     )
74 |     tags: list[str] = attr.ib(factory=list)
75 |     files: list[File] = attr.ib(factory=list, converter=_convert_to_file)
76 | 
77 | 
78 | class DocstoreEncoder(json.JSONEncoder):
79 |     def default(self, obj: typing.Any) -> typing.Any:
80 |         if isinstance(obj, datetime.datetime):
81 |             return obj.isoformat()
82 |         else:  # pragma: no cover
83 |             return super().default(obj)
84 | 
85 | 
86 | def to_json(documents: list[Document]) -> str:
87 |     """
88 |     Returns a JSON string containing all the documents.
89 |     """
90 |     if not isinstance(documents, list) or not all(
91 |         isinstance(d, Document) for d in documents
92 |     ):
93 |         raise TypeError("Expected type List[Document]!")
94 | 
95 |     # Use the same order that's used to serve the documents; Python's sort()
96 |     # function goes faster if the documents are already in the right order.
97 |     documents = sorted(documents, key=lambda d: d.date_saved, reverse=True)
98 | 
99 |     return json.dumps(
100 |         {
101 |             "docstore": {
102 |                 "db_schema": DB_SCHEMA,
103 |                 "commit": current_commit(),
104 |                 "last_modified": datetime.datetime.now().isoformat(),
105 |             },
106 |             "documents": cattr.unstructure(documents),
107 |         },
108 |         indent=2,
109 |         sort_keys=True,
110 |         cls=DocstoreEncoder,
111 |     )
112 | 
113 | 
114 | def from_json(json_string: str) -> list[Document]:
115 |     """
116 |     Parses a JSON string containing all the documents.
117 |     """
118 |     parsed_structure = json.loads(json_string)
119 |     assert parsed_structure["docstore"]["db_schema"] == DB_SCHEMA
120 |     return cattr.structure(parsed_structure["documents"], list[Document])
121 | 
-------------------------------------------------------------------------------- /src/docstore/thumbnails.py: --------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import subprocess
4 | import sys
5 | import tempfile
6 | 
7 | from PIL import Image, UnidentifiedImageError
8 | 
9 | from docstore.models import Dimensions
10 | 
11 | 
12 | def _is_animated_gif(path: str) -> bool:
13 |     """
14 |     Returns True if the file at ``path`` is an animated GIF.
15 |     """
16 |     try:
17 |         im = Image.open(path)
18 |     except UnidentifiedImageError:
19 |         # Not an image
20 |         return False
21 |     else:
22 |         return getattr(im, "is_animated", False)
23 | 
24 | 
25 | def _create_gif_thumbnail_from_ffmpeg(*, path: str, max_size: int, out_dir: str) -> str:
26 |     im = Image.open(path)
27 | 
28 |     if im.width > im.height and im.width >= max_size:
29 |         width, height = (max_size, int(im.height * max_size / im.width))
30 |     else:
31 |         width, height = (int(im.width * max_size / im.height), max_size)
32 | 
33 |     # The yuv420p encoder requires even values
34 |     width, height = (int(width / 2) * 2, int(height / 2) * 2)
35 | 
36 |     out_path = os.path.join(out_dir, os.path.basename(path) + ".mp4")
37 | 
38 |     subprocess.check_call(
39 |         [
40 |             "ffmpeg",
41 |             "-i",
42 |             path,
43 |             "-movflags",
44 |             "faststart",
45 |             "-pix_fmt",
46 |             "yuv420p",
47 |             "-vf",
48 |             f"scale={width}:{height}",
49 |             out_path,
50 |         ],
51 |         stdout=subprocess.DEVNULL,
52 |     )
53 | 
54 |     return out_path
55 | 
56 | 
57 | def _create_thumbnail_from_quick_look(*, path: str, max_size: int, out_dir: str) -> str:
58 |     try:
59 |         subprocess.check_call(
60 |             ["qlmanage", "-t", path, "-s", f"{max_size}x{max_size}", "-o", out_dir],
61 |             stdout=subprocess.DEVNULL,
62 |             stderr=subprocess.DEVNULL,
63 |             timeout=5,
64 |         )
65 |     except subprocess.TimeoutExpired:
66 |         # It's possible for something to go wrong with the Quick Look
67 |         # process where it just hangs and doesn't create a thumbnail.
68 |         # If so, just continue without creating the thumbnail.
69 | pass 70 | 71 | try: 72 | result = os.path.join(out_dir, os.listdir(out_dir)[0]) 73 | except IndexError: 74 | print(f"Quick Look could not create a thumbnail for {path}", file=sys.stderr) 75 | result = os.path.join(out_dir, "generic_document.png") 76 | shutil.copyfile( 77 | src=os.path.join( 78 | os.path.dirname(os.path.abspath(__file__)), 79 | "static/generic_document.png", 80 | ), 81 | dst=result, 82 | ) 83 | 84 | if result.endswith(".png.png"): 85 | os.rename(result, result.replace(".png.png", ".png")) 86 | result = result.replace(".png.png", ".png") 87 | 88 | return result 89 | 90 | 91 | def create_thumbnail(path: str, *, max_size: int = 400) -> str: 92 | """ 93 | Creates a thumbnail of the file at ``path``. 94 | 95 | Returns the path to the new file. 96 | """ 97 | if _is_animated_gif(path): 98 | return _create_gif_thumbnail_from_ffmpeg( 99 | path=path, max_size=max_size, out_dir=tempfile.mkdtemp() 100 | ) 101 | else: 102 | return _create_thumbnail_from_quick_look( 103 | path=path, max_size=max_size, out_dir=tempfile.mkdtemp() 104 | ) 105 | 106 | 107 | def get_dimensions(path: str) -> Dimensions: 108 | """ 109 | Returns the (width, height) of a given path. 110 | """ 111 | if path.endswith(".png"): # image thumbnail 112 | im = Image.open(path) 113 | return Dimensions(width=im.width, height=im.height) 114 | 115 | elif path.endswith(".mp4"): # video thumbnail 116 | # See https://stackoverflow.com/a/29585066/1558022 117 | output = subprocess.check_output( 118 | [ 119 | "ffprobe", 120 | "-v", 121 | "error", 122 | "-show_entries", 123 | "stream=width,height", 124 | "-of", 125 | "csv=p=0:s=x", 126 | os.path.abspath(path), 127 | ] 128 | ) 129 | width, height = output.strip().split(b"x") 130 | return Dimensions(width=int(width), height=int(height)) 131 | 132 | else: # pragma: no cover 133 | raise ValueError(f"Unrecognised thumbnail type: {path}") 134 | -------------------------------------------------------------------------------- /docs/storing-the-metadata.md: -------------------------------------------------------------------------------- 1 | # Storing the metadata 2 | 3 | I store a certain amount of metadata alongside each file, including: 4 | 5 | * The original filename 6 | * When I saved it 7 | * A one-line human-readable description 8 | * What tags I'm using 9 | 10 | This document explains how I model the metadata and how I serialise it to disk. 11 | 12 | 13 | 14 | ## Modelling the metadata with attrs 15 | 16 | I use the [attrs library][attrs] to define my metadata models. 17 | The library provides a couple of decorators that let you define data classes without writing all the usual boilerplate. 18 | 19 | For example: 20 | 21 | ```pycon 22 | >>> import attr 23 | 24 | >>> @attr.s 25 | ... class Document: 26 | ... path = attr.ib() 27 | ... tags = attr.ib() 28 | ``` 29 | 30 | This defines a class called `Document` with two attributes `path` and `tags`. 31 | It gives me a constructor, and makes both those attributes available for reading/writing: 32 | 33 | ```pycon 34 | >>> doc = Document( 35 | ... path="scanned_doc.pdf", 36 | ... tags=["home", "bills", "acme energy"] 37 | ... ) 38 | 39 | >>> doc.path 40 | "scanned_doc.pdf" 41 | 42 | >>> doc.tags 43 | ["home", "bills", "acme energy"] 44 | 45 | >>> doc.tags.append("utilities:electricity") 46 | >>> doc.tags 47 | ["home", "bills", "acme energy", "utilities:electricity"] 48 | ``` 49 | 50 | The attrs library defines commonly used methods on the class, saving you from writing that boilerplate yourself. 
51 | For example, it includes a nice repr() of objects:
52 | 
53 | ```pycon
54 | >>> repr(doc)
55 | Document(path="scanned_doc.pdf", tags=["home", "bills", "acme energy", "utilities:electricity"])
56 | ```
57 | 
58 | That repr() can be eval()'d to get back the same value, and attrs provides methods for equality (shown below) and hashing:
59 | 
60 | ```pycon
61 | >>> eval(repr(doc)) == doc
62 | True
63 | 
64 | >>> doc == Document(path="cat.jpg", tags=["pets"])
65 | False
66 | ```
67 | 
68 | If this looks similar to the [dataclasses module][dataclasses] in the Python standard library, it's because attrs was a direct inspiration for dataclasses.
69 | I was using attrs before dataclasses existed and I've never been persuaded to switch.
70 | 
71 | Using attrs allows me to write short, compact models for my metadata.
72 | The entire model definition is less than 40 lines: [see models.py](https://github.com/alexwlchan/docstore/blob/a4b7972d147b538bbf48792566d55eeaea24e32a/src/docstore/models.py#L40-L71) for my model implementation.
73 | 
74 | [attrs]: https://www.attrs.org/en/stable/
75 | [dataclasses]: https://docs.python.org/3/library/dataclasses.html
76 | 
77 | 
78 | 
79 | ## Using JSON as a database
80 | 
81 | You can serialise an attrs model to a Python dict:
82 | 
83 | ```pycon
84 | >>> attr.asdict(doc)
85 | {"path": "scanned_doc.pdf", "tags": ["home", "bills", "acme energy", "utilities:electricity"]}
86 | ```
87 | 
88 | This looks pretty close to JSON, and I save all the metadata into a standalone JSON file that lives in the top-level directory of a docstore instance.
89 | 
90 | There are several reasons I like JSON for storing my docstore metadata:
91 | 
92 | - It maps very closely to data structures in Python.
93 |   I don't have to deal with any complex serialisation code.
94 | 
95 | - JSON is a simple format with parsing libraries in lots of languages.
96 |   Even if I lost all the code for docstore, I could still use the metadata.
97 | 
98 | - JSON is plain text, so it's easy to edit.
99 |   If I want to edit some metadata, I can open the metadata file in any text editor and make changes.
100 |   This means I didn't have to put any editing-related code in docstore itself.
101 | 
102 | I only have a few thousand files, so the performance impact of reading/writing all the JSON every time is minimal.
103 | You shouldn't use JSON for large data sets, but for small data sets it's absolutely fine.
104 | 
105 | 
106 | 
107 | ## Serialising attrs models to JSON and back
108 | 
109 | To save attrs models as JSON, or to read JSON as attrs models, I use the [cattrs library][cattrs].
110 | This provides a pair of functions to go in both directions:
111 | 
112 | ```pycon
113 | >>> cattr.unstructure(doc)
114 | {"path": "scanned_doc.pdf", "tags": ["home", "bills", "acme energy", "utilities:electricity"]}
115 | 
116 | >>> cattr.structure(
117 | ...     {"path": "cat.jpg", "tags": ["pets"]},
118 | ...     Document)
119 | Document(path="cat.jpg", tags=["pets"])
120 | ```
121 | 
122 | It has all the logic for doing validation, handling errors, and converting everything to the right type – so I don't have to write any custom serialisation code in docstore.
123 | 
124 | [cattrs]: https://cattrs.readthedocs.io/en/latest/
-------------------------------------------------------------------------------- /.github/install-github-bin: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Install one of my Rust binaries from GitHub.
3 | #
4 | # This was originally written to pull just my Rust binaries, but it
5 | # turns out it can work reasonably well for other projects. This allows
6 | # me to bypass Homebrew, which is generally nicer.
7 | #
8 | # For my CLI tools written in Rust, I use a GitHub Action [1]
9 | # to compile binaries and add them to a release on GitHub.
10 | #
11 | # These are the "canonical" versions, rather than recompiling on
12 | # each machine and having slightly different versions because I
13 | # was using different checkouts of the code.
14 | #
15 | # This script gets the latest version of a binary from GitHub and
16 | # adds it to my $PATH.
17 | #
18 | # This script isn't meant to be run directly (hence the underscore) --
19 | # I have other scripts that invoke it (e.g. install-vfd), so I can find
20 | # those scripts with autocomplete in my shell, even on a new machine.
21 | #
22 | # TODO: I publish macOS/Windows/Linux binaries, but this script only
23 | # fetches the macOS binary. In theory, it could detect which OS it's
24 | # running on and pick the right binary.
25 | #
26 | # [1]: https://github.com/taiki-e/upload-rust-binary-action
27 | 
28 | set -o errexit
29 | set -o nounset
30 | 
31 | if (( "$#" != 1 ))
32 | then
33 |   echo "Usage: _install-rust-bin <REPO_NAME>" >&2
34 |   exit 1
35 | fi
36 | 
37 | REPO_NAME="$1"
38 | RELEASES_API_URL="https://api.github.com/repos/$REPO_NAME/releases/latest"
39 | 
40 | # Fetch the asset URL using the GitHub Releases API [2].
41 | #
42 | # For the purposes of this script, these are the interesting bits of the
43 | # API response that we want to pay attention to:
44 | #
45 | #     {
46 | #       "assets": [
47 | #         {
48 | #           "name": "vfd-x86_64-apple-darwin.tar.gz",
49 | #           "url": "https://api.github.com/repos/alexwlchan/books.alexwlchan.net/releases/assets/64229966",
50 | #           ...
51 | #         },
52 | #         {
53 | #           "name": "vfd-x86_64-pc-windows-msvc.zip",
54 | #           "url": "https://api.github.com/repos/alexwlchan/books.alexwlchan.net/releases/assets/64229889",
55 | #           ...
56 | #         },
57 | #         {
58 | #           "name": "vfd-x86_64-unknown-linux-gnu.tar.gz",
59 | #           "url": "https://api.github.com/repos/alexwlchan/books.alexwlchan.net/releases/assets/64229611",
60 | #           ...
61 | #         }
62 | #       ],
63 | #     }
64 | #
65 | # [2]: https://docs.github.com/en/rest/releases/releases#get-the-latest-release
66 | #
67 | # Note: this will filter out ARM binaries because I'm not running on
68 | # Apple Silicon yet; this will need updating eventually.
69 | #
70 | ASSET_URL=$(curl --silent "$RELEASES_API_URL" \
71 |   | jq -r '.assets | .[] | select(.name | contains("darwin")) | select(.name | contains("arm") | not) | select(.name | contains("aarch64") | not) | .url' \
72 |   | grep -v arm64
73 | )
74 | 
75 | if [[ "$ASSET_URL" == "" ]]
76 | then
77 |   echo "No macOS download available for the latest version! Is it still building?" >&2
78 |   exit 1
79 | fi
80 | 
81 | # Download and unpack the asset using the GitHub Release Assets API [3].
82 | #
83 | # We supply the headers required by the GitHub API, and the `--location`
84 | # flag causes curl to follow redirects.
85 | #
86 | # Note: this assumes the binary is packaged as a tar.gz. The Windows
87 | # binaries are zipped instead of tar.gz-ed, so if you want to support
88 | # Windows, inspect the "content_type" field in the Releases API response.
89 | #
90 | # [3]: https://docs.github.com/en/rest/releases/assets#get-a-release-asset
91 | 
92 | cd "$(mktemp -d)"
93 | 
94 | curl \
95 |   --header "Accept: application/octet-stream" \
96 |   --location \
97 |   --silent \
98 |   "$ASSET_URL" > "asset.tar.gz"
99 | 
100 | # Identify the name of the binary, which may be different from the repo name.
101 | #
102 | # We list all the files in the asset package, which should contain a single
103 | # file, and assume that's the name of the binary.
104 | ASSET_FILES=$(tar --list --file "asset.tar.gz")
105 | 
106 | if [[ "$REPO_NAME" == "BurntSushi/ripgrep" ]]
107 | then
108 |   BINARY_PATH=$(echo "$ASSET_FILES" | grep "/rg$")
109 |   BINARY_NAME="rg"
110 | else
111 |   if (( $(echo "$ASSET_FILES" | wc -l) != 1 ))
112 |   then
113 |     echo "Release asset doesn't contain exactly 1 file; not sure what to do:" >&2
114 |     echo "$ASSET_FILES" >&2
115 |     exit 1
116 |   fi
117 | 
118 |   BINARY_PATH="$(echo "$ASSET_FILES" | head -n 1)"
119 |   BINARY_NAME="$BINARY_PATH"
120 | fi
121 | 
122 | # Now actually extract the binary, make it executable, and add it to the PATH.
123 | tar --extract --gunzip --file "asset.tar.gz"
124 | 
125 | chmod +x "$BINARY_PATH"
126 | sudo mv "$BINARY_PATH" /usr/local/bin
127 | 
128 | which "$BINARY_NAME"
129 | "$BINARY_NAME" --version
130 | 
-------------------------------------------------------------------------------- /src/docstore/tag_list.py: --------------------------------------------------------------------------------
1 | """
2 | Given a series of tags, arrange them into a hierarchy. For example:
3 |   - seasons
4 |     - autumn
5 |     - summer
6 |   - trees
7 |     - ash
8 |     - oak
9 |     - yew
10 |       - ancient
11 | This is based on
12 | https://github.com/dreamwidth/dw-free/blob/6ec1e146d3c464e506a77913f0abf0d51a944f95/styles/core2.s2#L4126
13 | """
14 | 
15 | import typing
16 | 
17 | 
18 | class HtmlLiteral(typing.TypedDict):
19 |     type: typing.Literal["html_literal"]
20 |     value: str
21 | 
22 | 
23 | class TagLink(typing.TypedDict):
24 |     type: typing.Literal["tag_link"]
25 |     name: str
26 |     count: int
27 |     display_name: str
28 | 
29 | 
30 | class TagText(typing.TypedDict):
31 |     type: typing.Literal["tag_text"]
32 |     display_name: str
33 | 
34 | 
35 | def render_tag_list(tag_tally: dict[str, int]) -> list[HtmlLiteral | TagLink | TagText]:
36 |     if not tag_tally:
37 |         return []
38 | 
39 |     prev_tags: list[str] = []
40 |     tag_list_pos = 0
41 |     tier_elements: list[HtmlLiteral | TagLink | TagText] = []
42 |     levels_to_close = 0
43 | 
44 |     result: list[HtmlLiteral | TagLink | TagText] = []
45 | 
46 |     for name, count in sorted(tag_tally.items()):
47 |         tags = name.split(":")
48 | 
49 |         pos = 0
50 |         show_lower_tiers = False
51 | 
52 |         for tier in tags:
53 |             # If we're on a tag's last tier and this tag isn't already selected,
54 |             # we need to return a link to the tag, otherwise plain text is returned.
55 |             if len(tags) == pos + 1:
56 |                 tier_elements = [
57 |                     {
58 |                         "type": "tag_link",
59 |                         "name": name,
60 |                         "count": count,
61 |                         "display_name": tier.lstrip("_"),
62 |                     }
63 |                 ]
64 |             else:
65 |                 tier_elements = [{"type": "tag_text", "display_name": tier}]
66 | 
67 |             # Prev tag has fewer tiers than the current tag.
68 |             if len(prev_tags) < pos + 1:
69 |                 result.append({"type": "html_literal", "value": "<ul><li>"})
70 |                 result.extend(tier_elements)
71 |                 levels_to_close += 1
72 | 
73 |             # Prev tag has the same or more tiers than the current tag.
74 |             elif tags[pos] != prev_tags[pos] or show_lower_tiers:
75 |                 # This tier is different from the previous tag's tier at the
76 |                 # same level, so close the lists that were opened for the
77 |                 # previous tag's lower tiers.
78 |                 i = levels_to_close
79 | 
80 |                 while i > pos + 1:
81 |                     result.append(
82 |                         {
83 |                             "type": "html_literal",
84 |                             "value": "</li></ul>",
85 |                         }
86 |                     )
87 |                     levels_to_close -= 1
88 |                     i -= 1
89 | 
90 |                 # If we just closed some lists, that means that any lower
91 |                 # tiers in this tag need to be explicitly displayed, even
92 |                 # if they match the same-level tier of the previous tag
93 |                 show_lower_tiers = True
94 | 
95 |                 if levels_to_close <= pos:
96 |                     # This is the first tier at this level, so open list
97 |                     result.append({"type": "html_literal", "value": "<ul><li>"})
98 |                     result.extend(tier_elements)
99 |                     levels_to_close += 1
100 |                 else:
101 |                     # The higher tiers are unchanged, so close the previous
102 |                     # tag and open a new item at the same level.
103 |                     result.append({"type": "html_literal", "value": "</li><li>"})
104 |                     result.extend(tier_elements)
105 | 
106 |             pos += 1
107 | 
108 |         prev_tags = tags
109 |         tag_list_pos += 1
110 | 
111 |     # We've now rendered every tag, so we need to close any lists that
112 |     # are still open.
113 |     #
114 |     # Note: because we return early when the tag tally is empty, there
115 |     # should always be at least one open list by the time we get here,
116 |     # so the "else" branch should be unreachable.
117 | 
118 |     if levels_to_close > 0:
119 |         # Close each open level in turn: every level that's still open
120 |         # needs both its list item and its list closing, working from
121 |         # the innermost list outwards.
122 |         while levels_to_close > 0:
123 |             result.append({"type": "html_literal", "value": "</li></ul>"})
124 |             levels_to_close -= 1
125 |     else:  # pragma: no cover
126 |         # I haven't been able to find a test case that triggers this
127 |         # particular branch, so I'm excluding it from coverage.
128 |         # If it does come up, come back and add a test for this line!
129 |         assert 0
130 | 
131 |     return result
-------------------------------------------------------------------------------- /src/docstore/server.py: --------------------------------------------------------------------------------
1 | import collections
2 | import datetime
3 | import functools
4 | import hashlib
5 | import os
6 | import pathlib
7 | import secrets
8 | import typing
9 | import urllib.parse
10 | from urllib.parse import parse_qsl, urlparse, urlencode
11 | 
12 | from flask import (
13 |     Flask,
14 |     Response as FlaskResponse,
15 |     make_response,
16 |     render_template,
17 |     request,
18 |     send_file,
19 |     send_from_directory,
20 | )
21 | import hyperlink
22 | import smartypants
23 | from werkzeug.middleware.profiler import ProfilerMiddleware
24 | 
25 | from .documents import find_original_filename, read_documents
26 | from .models import Document
27 | from .tag_cloud import TagCloud
28 | from .tag_list import render_tag_list
29 | from .text_utils import hostname, pretty_date
30 | 
31 | 
32 | def tags_with_prefix(document: Document, prefix: str) -> list[str]:
33 |     return [t for t in document.tags if t.startswith(prefix)]
34 | 
35 | 
36 | def tags_without_prefix(document: Document, prefix: str) -> list[str]:
37 |     return [t for t in document.tags if not t.startswith(prefix)]
38 | 
39 | 
40 | def url_without_sortby(u: str) -> str:
41 |     url = hyperlink.URL.from_text(u)
42 |     return str(url.remove("sortBy"))
43 | 
44 | 
45 | def serve_file(*, root: pathlib.Path, shard: str, filename: str) -> FlaskResponse:
46 |     """
47 |     Serves a file which has been saved in docstore.
48 | 
49 |     This adds the Content-Disposition header to the response, so files
50 |     are downloaded with the original filename they were uploaded as,
51 |     rather than the normalised filename.
52 | 53 | """ 54 | path = os.path.abspath(os.path.join(root, "files", shard, filename)) 55 | response = make_response(send_file(path)) 56 | 57 | original_filename = find_original_filename(root, path=path) 58 | 59 | # See https://stackoverflow.com/a/49481671/1558022 for UTF-8 encoding 60 | encoded_filename = urllib.parse.quote(original_filename, encoding="utf-8") 61 | response.headers["Content-Disposition"] = f"filename*=utf-8''{encoded_filename}" 62 | 63 | return response 64 | 65 | 66 | def create_app(title: str, root: pathlib.Path, thumbnail_width: int) -> Flask: 67 | app = Flask(__name__) 68 | 69 | app.config["THUMBNAIL_WIDTH"] = thumbnail_width 70 | 71 | app.jinja_env.trim_blocks = True 72 | app.jinja_env.lstrip_blocks = True 73 | 74 | app.jinja_env.filters["hostname"] = hostname 75 | app.jinja_env.filters["pretty_date"] = lambda d: pretty_date( 76 | d, now=datetime.datetime.now() 77 | ) 78 | app.jinja_env.filters["render_tag_list"] = render_tag_list 79 | app.jinja_env.filters["smartypants"] = smartypants.smartypants 80 | app.jinja_env.filters["url_without_sortby"] = url_without_sortby 81 | 82 | app.jinja_env.filters["tags_with_prefix"] = tags_with_prefix 83 | app.jinja_env.filters["tags_without_prefix"] = tags_without_prefix 84 | 85 | @app.route("/") 86 | def list_documents() -> str: 87 | request_tags = set(request.args.getlist("tag")) 88 | documents = [ 89 | doc for doc in read_documents(root) if request_tags.issubset(set(doc.tags)) 90 | ] 91 | 92 | tag_tally: dict[str, int] = collections.Counter() 93 | for doc in documents: 94 | for t in doc.tags: 95 | tag_tally[t] += 1 96 | 97 | try: 98 | page = int(request.args["page"]) 99 | except KeyError: 100 | page = 1 101 | 102 | sort_by = request.args.get("sortBy", "date (newest first)") 103 | 104 | if sort_by.startswith("date"): 105 | sort_key = lambda d: d.date_saved # noqa 106 | elif sort_by.startswith("title"): 107 | sort_key = lambda d: d.title.lower() # noqa 108 | elif sort_by == "random": 109 | if page == 1: 110 | app.config["_RANDOM_SEED"] = secrets.token_bytes() 111 | seed = app.config["_RANDOM_SEED"] 112 | 113 | def sort_key(d: Document) -> str: 114 | h = hashlib.md5() 115 | h.update(d.id.encode("utf8")) 116 | h.update(seed) 117 | return h.hexdigest() 118 | else: 119 | raise ValueError(f"Unrecognised sortBy query parameter: {sort_by}") 120 | 121 | if sort_by in {"date (newest first)", "title (Z to A)"}: 122 | sort_reverse = True 123 | else: 124 | sort_reverse = False 125 | 126 | html = render_template( 127 | "index.html", 128 | documents=sorted(documents, key=sort_key, reverse=sort_reverse), 129 | request_tags=request_tags, 130 | query_string=tuple(parse_qsl(urlparse(request.url).query)), 131 | tag_tally=tag_tally, 132 | title=title, 133 | page=page, 134 | sort_by=sort_by, 135 | TagCloud=TagCloud, 136 | ) 137 | 138 | return html 139 | 140 | @app.route("/thumbnails//") 141 | def thumbnails(shard: str, filename: str) -> FlaskResponse: 142 | return send_from_directory( 143 | os.path.abspath(os.path.join(root, "thumbnails", shard)), filename 144 | ) 145 | 146 | app.add_url_rule( 147 | rule="/files//", 148 | view_func=lambda shard, filename: serve_file( 149 | root=root, shard=shard, filename=filename 150 | ), 151 | ) 152 | 153 | QueryString: typing.TypeAlias = list[tuple[str, str]] 154 | 155 | @app.template_filter("add_tag") 156 | @functools.lru_cache() 157 | def add_tag(query_string: QueryString, tag: str) -> str: 158 | return "?" 
+ urlencode(
159 |             [(k, v) for k, v in query_string if k != "page"] + [("tag", tag)]
160 |         )
161 | 
162 |     @app.template_filter("remove_tag")
163 |     def remove_tag(query_string: QueryString, tag: str) -> str:
164 |         return "?" + urlencode(
165 |             [(k, v) for k, v in query_string if (k, v) != ("tag", tag)]
166 |         )
167 | 
168 |     @app.template_filter("set_page")
169 |     @functools.lru_cache()
170 |     def set_page(query_string: QueryString, page: int) -> str:
171 |         pageless_qs = [(k, v) for k, v in query_string if k != "page"]
172 |         if page == 1:
173 |             return "?" + urlencode(pageless_qs)
174 |         else:
175 |             return "?" + urlencode(pageless_qs + [("page", page)])
176 | 
177 |     return app
178 | 
179 | 
180 | def run_profiler(app: Flask, *, host: str, port: int) -> None:  # pragma: no cover
181 |     app.config["PROFILE"] = True
182 |     app.wsgi_app = ProfilerMiddleware(app.wsgi_app, restrictions=[30])  # type: ignore
183 |     app.run(host=host, port=port, debug=True)
184 | 
185 | 
186 | def run_server(
187 |     app: Flask, *, host: str, port: int, debug: bool
188 | ) -> None:  # pragma: no cover
189 |     app.run(host=host, port=port, debug=debug)
-------------------------------------------------------------------------------- /docs/storing-the-files.md: --------------------------------------------------------------------------------
1 | # Storing the files
2 | 
3 | Part of the point of docstore is to abstract away the management of individual files.
4 | I don't want to worry about managing individual files and folders – I want the tool to do that for me.
5 | 
6 | This document explains a bit about how docstore manages my files.
7 | 
8 | 
9 | 
10 | ## Where the files are stored
11 | 
12 | I run docstore on my home computer, which shouldn't be accessible from the Internet.
13 | **The files are stored on the local disk, not in cloud storage.**
14 | 
15 | I use docstore to store files with private information: bank statements, medical letters, rental contracts, and more.
16 | If I uploaded them to a cloud storage service like S3, there's a risk I'd misconfigure the permissions and inadvertently make the files public.
17 | For me, the security of knowing they're not in the cloud outweighs the potential convenience of having remote access.
18 | 
19 | 
20 | 
21 | ## How the files are named / filename normalisation
22 | 
23 | Although my scanned documents have autogenerated filenames, sometimes I download documents that I want to save (e.g. electronic banking statements), which have … interesting filename choices.
24 | 
25 | These are real filenames I've received:
26 | 
27 | <table>
28 |   <tr>
29 |     <th>Filename</th>
30 |     <th>Comments</th>
31 |   </tr>
32 |   <tr>
33 |     <td>VolcanoPattern.pdf</td>
34 |     <td>10/10 great name.</td>
35 |   </tr>
36 |   <tr>
37 |     <td>Alex Chan_5312.pdf</td>
38 |     <td>Spaces in filenames cause nothing but trouble.</td>
39 |   </tr>
40 |   <tr>
41 |     <td>Statement.pdf</td>
42 |     <td>This is a bank statement with no context. I have dozens of files with identical names, covering different accounts and date ranges.</td>
43 |   </tr>
44 |   <tr>
45 |     <td>Alexander Chan›Payslip November 2014-2015.PDF</td>
46 |     <td>Special characters are annoying.</td>
47 |   </tr>
48 |   <tr>
49 |     <td>V5C:3 scrappage note.pdf</td>
50 |     <td>I have no idea how I created this file. This is the V5C/3 form, so at some point the slash has been converted to a colon – but both the colon and slash are used as path separators on macOS, and are best avoided.</td>
51 |   </tr>
52 | </table>
53 | 
54 | So I can't rely on the original filename: maybe it contains special characters, or I have different files with the same filename.
55 | The original filename is a useful piece of metadata that I want to keep, but I can't use it for saving files.
56 | 
57 | **I save files under a normalised version of their original filename.**
58 | I want to keep as close to the original filename as possible -- so no UUIDs.
59 | Then I save the original filename as a bit of metadata in the database.
60 | 
61 | The normalisation process has two steps:
62 | 
63 | - Creating an ASCII-safe filename using [Dr Drang's slugify() function](http://www.leancrew.com/all-this/2014/10/asciifying/).
64 |   This uses the [Unidecode](https://pypi.org/project/Unidecode/) and [re libraries](https://docs.python.org/3/library/re.html) to remove any non-ASCII characters and spaces.
65 | 
66 | - Appending a random hex value before the filename extension if there are multiple files with the same name.
67 |   This avoids saving two files with the same name.
68 |   e.g. `Statement.pdf`, `Statement_1c5e.pdf`, `Statement_3fc9.pdf`, …
69 | 
70 | For the exact implementation, see [file_normalisation.py](https://github.com/alexwlchan/docstore/blob/main/src/docstore/file_normalisation.py).
71 | 
72 | 
73 | 
74 | ## Ensuring I don't save two files with the same name / exclusive-open mode in Python
75 | 
76 | What if two processes try to save a file with the same name simultaneously?
77 | How do I ensure the normalisation kicks in and adds the random hex value to keep the files apart?
78 | 
79 | This is probably overkill: I'm the only person saving documents, and I can't do multiple things at once.
80 | But it was pretty easy to add, and it's a useful example of a less well-known feature in Python.
81 | 
82 | If you've used Python, you probably know how to [read and write files][python_rw]:
83 | 
84 | ```pycon
85 | >>> with open("greeting.txt", mode="w") as outfile:
86 | ...     outfile.write("Hello world!")
87 | 12
88 | 
89 | >>> with open("greeting.txt", mode="r") as infile:
90 | ...     print(infile.read())
91 | Hello world!
92 | ```
93 | 
94 | The `mode` argument tells Python whether you're writing (`w`) or reading (`r`).
95 | These are by far the most commonly used values.
96 | 
97 | What if you want to write to a file, but only if it doesn't exist yet?
98 | You could check if it exists first:
99 | 
100 | ```pycon
101 | >>> if not os.path.exists("important.txt"):
102 | ...     with open("important.txt", mode="w"):
103 | ...         
104 | ```
105 | 
106 | but this is risky – what if the file is created between the existence check and when you open it?
107 | 
108 | It's better to use mode `x`, which means **exclusive open**.
109 | You write as normal, but if the file already exists, the `open()` throws a FileExistsError:
110 | 
111 | ```pycon
112 | >>> with open("greeting.txt", mode="x") as outfile:
113 | ...     outfile.write("Hello world!")
114 | 12
115 | 
116 | >>> with open("greeting.txt", mode="x") as outfile:
117 | ...     outfile.write("Bonjour le monde!")
118 | Traceback (most recent call last):
119 |   File "<stdin>", line 1, in <module>
120 | FileExistsError: [Errno 17] File exists: 'greeting.txt'
121 | ```
122 | 
123 | This is enforced at the OS-level so it's a bit more robust.
124 | I use this to ensure I don't save two files with the same name – one will succeed, the other will throw a FileExistsError and get a random hex value inserted to distinguish it.
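
Put together, the save logic looks something like this minimal sketch (`write_with_unique_name` is a hypothetical name I'm using for illustration, not docstore's exact code):

```python
import os
import secrets


def write_with_unique_name(dst: str, data: bytes) -> str:
    """Write ``data`` to ``dst``, picking a new name if ``dst`` is taken."""
    while True:
        try:
            # mode "xb" = exclusive create, in binary mode -- this throws
            # FileExistsError if there's already a file at ``dst``
            with open(dst, mode="xb") as out_file:
                out_file.write(data)
            return dst
        except FileExistsError:
            # e.g. Statement.pdf -> Statement_1c5e.pdf
            name, ext = os.path.splitext(dst)
            dst = f"{name}_{secrets.token_hex(2)}{ext}"
```
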
125 | 
126 | For the exact implementation, see [file_normalisation.py](https://github.com/alexwlchan/docstore/blob/main/src/docstore/file_normalisation.py).
127 | 
128 | [python_rw]: https://docs.python.org/3/tutorial/inputoutput.html#tut-files
129 | 
130 | 
131 | 
132 | ## Downloading files with their original filename / the Content-Disposition header
133 | 
134 | When I download a file from the web app, I want to download it with the original filename -- not the normalised version.
135 | 
136 | For example, if I have an HTML link:
137 | 
138 | ```html
139 | <a href="files/b/beijing.pdf">Download my file</a>
140 | ```
141 | 
142 | then if I downloaded this link, my web browser would download a file named `beijing.pdf`.
143 | 
144 | But you can use the [Content-Disposition header][cd_header] to suggest to a browser that it should download a file with a different name.
145 | In particular, if the server returns the header:
146 | 
147 | ```
148 | Content-Disposition: attachment; filename="北京.pdf"
149 | ```
150 | 
151 | then the browser will download the file as `北京.pdf`.
152 | 
153 | For the exact implementation, see [`serve_file()` in `server.py`](https://github.com/alexwlchan/docstore/blob/7cb1cfd708c212af4dc0673dc8da372f7b8c79a4/src/docstore/server.py#L39-L57).
154 | 
155 | [cd_header]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition
-------------------------------------------------------------------------------- /src/docstore/documents.py: --------------------------------------------------------------------------------
1 | import datetime
2 | import hashlib
3 | import json
4 | import os
5 | import pathlib
6 | import shutil
7 | import typing
8 | 
9 | import cattr
10 | 
11 | from docstore.file_normalisation import normalised_filename_copy
12 | from docstore.models import (
13 |     DocstoreEncoder,
14 |     Document,
15 |     File,
16 |     Thumbnail,
17 |     from_json,
18 |     to_json,
19 | )
20 | from docstore.text_utils import slugify
21 | from docstore.thumbnails import create_thumbnail, get_dimensions
22 | from docstore.tint_colors import choose_tint_color
23 | 
24 | 
25 | def db_path(root: pathlib.Path) -> pathlib.Path:
26 |     """
27 |     Returns the path to the database.
28 |     """
29 |     return root / "documents.json"
30 | 
31 | 
32 | class CachedDocuments(typing.TypedDict):
33 |     last_modified: float
34 |     contents: list[Document]
35 | 
36 | 
37 | _cached_documents: CachedDocuments = {
38 |     "last_modified": -1,
39 |     "contents": [],
40 | }
41 | 
42 | 
43 | def read_documents(root: pathlib.Path) -> list[Document]:
44 |     """
45 |     Get a list of all the documents.
46 |     """
47 |     # JSON parsing is somewhat expensive. By caching the result rather than
48 |     # going to disk each time, we see a ~10x speedup in returning responses
49 |     # from the server.
50 | try: 51 | if ( 52 | _cached_documents["last_modified"] is not None 53 | and os.stat(db_path(root)).st_mtime <= _cached_documents["last_modified"] 54 | ): 55 | return _cached_documents["contents"] 56 | except FileNotFoundError: 57 | pass 58 | 59 | try: 60 | with open(db_path(root)) as infile: 61 | result = from_json(infile.read()) 62 | except FileNotFoundError: 63 | return [] 64 | 65 | _cached_documents["last_modified"] = os.stat(db_path(root)).st_mtime 66 | _cached_documents["contents"] = result 67 | 68 | return result 69 | 70 | 71 | def write_documents(*, root: pathlib.Path, documents: list[Document]) -> None: 72 | json_string = to_json(documents) 73 | 74 | os.makedirs(root, exist_ok=True) 75 | 76 | with open(db_path(root), "w") as out_file: 77 | out_file.write(json_string) 78 | 79 | 80 | def sha256(path: pathlib.Path) -> str: 81 | h = hashlib.sha256() 82 | with open(path, "rb") as infile: 83 | for byte_block in iter(lambda: infile.read(4096), b""): 84 | h.update(byte_block) 85 | 86 | return "sha256:%s" % h.hexdigest() 87 | 88 | 89 | def store_new_document( 90 | *, 91 | root: pathlib.Path, 92 | path: pathlib.Path, 93 | title: str, 94 | tags: list[str], 95 | source_url: str | None, 96 | date_saved: datetime.datetime, 97 | ) -> Document: 98 | filename = os.path.basename(path) 99 | 100 | # Files are sharded by the first letter of their filename, 101 | # e.g. "aardvark.png" is saved in "a/aardvark.png" 102 | shard = slugify(filename)[0].lower() 103 | 104 | dst = os.path.join(root, "files", shard, filename) 105 | 106 | out_path = normalised_filename_copy(src=str(path), dst=dst) 107 | 108 | thumbnail_path = create_thumbnail(out_path) 109 | thumbnail_name = os.path.basename(thumbnail_path) 110 | thumb_out_path = os.path.join(root, "thumbnails", thumbnail_name[0], thumbnail_name) 111 | os.makedirs(os.path.dirname(thumb_out_path), exist_ok=True) 112 | shutil.move(thumbnail_path, thumb_out_path) 113 | 114 | tint_color = choose_tint_color(thumbnail_path=thumb_out_path, file_path=out_path) 115 | 116 | hex_tint_color = "#%02x%02x%02x" % tuple( 117 | int(component * 255) for component in tint_color 118 | ) 119 | 120 | new_document = Document( 121 | title=title, 122 | date_saved=date_saved, 123 | tags=tags, 124 | files=[ 125 | File( 126 | filename=filename, 127 | path=os.path.relpath(out_path, root), 128 | size=os.stat(out_path).st_size, 129 | checksum=sha256(pathlib.Path(out_path)), 130 | source_url=source_url, 131 | thumbnail=Thumbnail( 132 | path=os.path.relpath(thumb_out_path, root), 133 | dimensions=get_dimensions(thumb_out_path), 134 | tint_color=hex_tint_color, 135 | ), 136 | date_saved=date_saved, 137 | ) 138 | ], 139 | ) 140 | 141 | documents = read_documents(root) 142 | documents.append(new_document) 143 | 144 | write_documents(root=root, documents=documents) 145 | 146 | # Don't delete the original file until it's been successfully recorded 147 | # and a thumbnail created. 148 | os.unlink(path) 149 | 150 | return new_document 151 | 152 | 153 | def pairwise_merge_documents( 154 | root: pathlib.Path, 155 | *, 156 | doc1: Document, 157 | doc2: Document, 158 | new_title: str, 159 | new_tags: list[str], 160 | ) -> Document: 161 | """ 162 | Merge the files on two documents together. 
163 | 164 | Before: 2 documents with 1 file each 165 | After: 1 document with 2 files 166 | """ 167 | documents = read_documents(root) 168 | assert doc2 in documents 169 | documents.remove(doc2) 170 | 171 | # Modify the copy of the document that's about to be written; this will 172 | # throw an error if the document has changed between starting and finishing 173 | # the merge. 174 | stored_doc1 = documents[documents.index(doc1)] 175 | 176 | stored_doc1.date_saved = min([stored_doc1.date_saved, doc2.date_saved]) 177 | stored_doc1.tags = new_tags 178 | stored_doc1.title = new_title 179 | stored_doc1.files.extend(doc2.files) 180 | write_documents(root=root, documents=documents) 181 | 182 | return stored_doc1 183 | 184 | 185 | def delete_document(root: pathlib.Path, *, doc_id: str) -> None: 186 | documents = read_documents(root) 187 | doc = [d for d in documents if d.id == doc_id][0] 188 | 189 | delete_dir = os.path.join(root, "deleted", doc.id) 190 | os.makedirs(delete_dir, exist_ok=True) 191 | 192 | for f in doc.files: 193 | os.rename( 194 | os.path.join(root, f.path), 195 | os.path.join(delete_dir, os.path.basename(f.path)), 196 | ) 197 | os.unlink(os.path.join(root, f.thumbnail.path)) 198 | 199 | with open(os.path.join(delete_dir, "document.json"), "w") as outfile: 200 | outfile.write( 201 | json.dumps( 202 | cattr.unstructure(doc), indent=2, sort_keys=True, cls=DocstoreEncoder 203 | ) 204 | ) 205 | 206 | documents = [d for d in documents if d.id != doc_id] 207 | write_documents(root=root, documents=documents) 208 | 209 | 210 | def find_original_filename(root: pathlib.Path, *, path: str) -> str: 211 | """ 212 | Returns the name of the original file stored in this path. 213 | """ 214 | documents = read_documents(root) 215 | for d in documents: 216 | for f in d.files: 217 | if f.path == os.path.relpath(path, root): 218 | return f.filename 219 | 220 | raise ValueError(f"Couldn't find file stored with path {path}") 221 | -------------------------------------------------------------------------------- /tests/test_documents.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | import pathlib 5 | import shutil 6 | 7 | from docstore.documents import ( 8 | delete_document, 9 | pairwise_merge_documents, 10 | read_documents, 11 | sha256, 12 | store_new_document, 13 | write_documents, 14 | ) 15 | from docstore.models import Dimensions, Document, File, Thumbnail 16 | 17 | 18 | def test_sha256() -> None: 19 | p = pathlib.Path("tests/files/cluster.png") 20 | 21 | assert ( 22 | sha256(p) 23 | == "sha256:683cbee0c2dda22b42fd92bda0f31e4b6b49cd8650a7924d72a14a30f11bfbe5" 24 | ) 25 | 26 | 27 | def test_read_blank_documents_is_empty(tmpdir: pathlib.Path) -> None: 28 | assert read_documents(tmpdir) == [] 29 | 30 | 31 | def test_can_write_and_read_documents(tmpdir: pathlib.Path) -> None: 32 | documents = [Document(title="My first document")] 33 | 34 | write_documents(root=tmpdir, documents=documents) 35 | 36 | # Repeat a couple of times so we hit the caching paths. 
37 |     for _ in range(3):
38 |         assert read_documents(tmpdir) == documents
39 | 
40 | 
41 | def test_can_merge_documents(tmpdir: pathlib.Path, root: pathlib.Path) -> None:
42 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster1.png")
43 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster2.png")
44 | 
45 |     doc1 = store_new_document(
46 |         root=root,
47 |         path=tmpdir / "cluster1.png",
48 |         title="My first document",
49 |         tags=["tag1"],
50 |         source_url="https://example.org/cluster1.png",
51 |         date_saved=datetime.datetime.now(),
52 |     )
53 |     doc2 = store_new_document(
54 |         root=root,
55 |         path=tmpdir / "cluster2.png",
56 |         title="My second document",
57 |         tags=["tag2"],
58 |         source_url="https://example.org/cluster2.png",
59 |         date_saved=datetime.datetime.now(),
60 |     )
61 | 
62 |     pairwise_merge_documents(
63 |         root=root,
64 |         doc1=doc1,
65 |         doc2=doc2,
66 |         new_title="My merged document",
67 |         new_tags=["tag1", "tag2", "new_merged_tag"],
68 |     )
69 | 
70 |     stored_documents = read_documents(root)
71 | 
72 |     assert stored_documents == [
73 |         Document(
74 |             id=doc1.id,
75 |             date_saved=doc1.date_saved,
76 |             files=doc1.files + doc2.files,
77 |             title="My merged document",
78 |             tags=["tag1", "tag2", "new_merged_tag"],
79 |         )
80 |     ]
81 | 
82 | 
83 | def test_merging_uses_earliest_date(tmpdir: pathlib.Path) -> None:
84 |     doc1 = Document(title="Doc1", date_saved=datetime.datetime(2010, 1, 1))
85 |     doc2 = Document(title="Doc2", date_saved=datetime.datetime(2002, 2, 2))
86 | 
87 |     write_documents(root=tmpdir, documents=[doc1, doc2])
88 | 
89 |     pairwise_merge_documents(
90 |         root=tmpdir,
91 |         doc1=doc1,
92 |         doc2=doc2,
93 |         new_title="DocMerged",
94 |         new_tags=[],
95 |     )
96 | 
97 |     stored_documents = read_documents(tmpdir)
98 | 
99 |     assert doc2.date_saved < doc1.date_saved
100 |     assert len(stored_documents) == 1
101 |     assert stored_documents[0].date_saved == doc2.date_saved
102 | 
103 | 
104 | def test_store_new_document(tmpdir: pathlib.Path) -> None:
105 |     root = tmpdir / "root"
106 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png")
107 | 
108 |     documents = read_documents(root)
109 |     assert len(documents) == 0
110 | 
111 |     now = datetime.datetime(2020, 2, 20)
112 | 
113 |     new_document = store_new_document(
114 |         root=root,
115 |         path=tmpdir / "My Cluster.png",
116 |         title="My cluster title",
117 |         tags=["tag1", "tag2", "tag3"],
118 |         source_url="https://example.org/cluster.png",
119 |         date_saved=now,
120 |     )
121 | 
122 |     assert not os.path.exists(tmpdir / "My Cluster.png")
123 | 
124 |     assert isinstance(new_document, Document)
125 |     assert new_document.title == "My cluster title"
126 |     assert new_document.date_saved == now
127 |     assert new_document.tags == ["tag1", "tag2", "tag3"]
128 | 
129 |     assert len(new_document.files) == 1
130 |     new_file = new_document.files[0]
131 |     assert isinstance(new_file, File)
132 |     assert new_file.filename == "My Cluster.png"
133 |     assert new_file.path == "files/m/my-cluster.png"
134 |     assert new_file.size == 41151
135 |     assert (
136 |         new_file.checksum
137 |         == "sha256:683cbee0c2dda22b42fd92bda0f31e4b6b49cd8650a7924d72a14a30f11bfbe5"
138 |     )
139 |     assert new_file.source_url == "https://example.org/cluster.png"
140 |     assert new_file.date_saved == now
141 | 
142 |     assert new_file.thumbnail == Thumbnail(
143 |         path="thumbnails/m/my-cluster.png",
144 |         dimensions=Dimensions(400, 260),
145 |         tint_color="#007f7f",
146 |     )
147 |     assert os.path.exists(root / new_file.thumbnail.path)
148 | 
149 |     assert read_documents(root) == [new_document]
150 | 
151 |     # Storing a second document gets us both documents, but with different names
152 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png")
153 |     new_document2 = store_new_document(
154 |         root=root,
155 |         path=tmpdir / "My Cluster.png",
156 |         title="My second cluster title",
157 |         tags=["tag1", "tag2", "tag3", "tag4"],
158 |         source_url="https://example.org/cluster2.png",
159 |         date_saved=now,
160 |     )
161 | 
162 |     assert isinstance(new_document2, Document)
163 |     new_file2 = new_document2.files[0]
164 |     assert new_file2.filename == "My Cluster.png"
165 |     assert new_file2.path != "files/m/my-cluster.png"
166 |     assert new_file2.path.startswith("files/m/my-cluster_")
167 |     assert new_file2.path.endswith(".png")
168 | 
169 |     assert read_documents(root) == [new_document, new_document2]
170 | 
171 |     assert len(os.listdir(root / "files" / "m")) == 2
172 |     assert len(os.listdir(root / "thumbnails" / "m")) == 2
173 | 
174 | 
175 | def test_deleting_document(tmpdir: pathlib.Path, root: pathlib.Path) -> None:
176 |     root = tmpdir / "root"
177 |     shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster.png")
178 | 
179 |     doc1 = store_new_document(
180 |         root=root,
181 |         path=tmpdir / "cluster.png",
182 |         title="A document about to be deleted",
183 |         tags=[],
184 |         source_url="https://example.org/cluster.png",
185 |         date_saved=datetime.datetime.now(),
186 |     )
187 |     doc2 = Document(title="Doc1", date_saved=datetime.datetime(2010, 1, 1))
188 |     doc3 = Document(title="Doc2", date_saved=datetime.datetime(2002, 2, 2))
189 | 
190 |     write_documents(root=root, documents=[doc1, doc2, doc3])
191 | 
192 |     assert read_documents(root) == [doc1, doc2, doc3]
193 | 
194 |     delete_document(root, doc_id=doc1.id)
195 | 
196 |     assert read_documents(root) == [doc2, doc3]
197 | 
198 |     deleted_json_path = root / "deleted" / doc1.id / "document.json"
199 |     assert os.path.exists(deleted_json_path)
200 |     assert json.load(open(deleted_json_path))["id"] == doc1.id
201 |     assert not os.path.exists(root / "files" / "c" / "cluster.png")
202 |     assert os.path.exists(root / "deleted" / doc1.id / "cluster.png")
-------------------------------------------------------------------------------- /src/docstore/templates/index.html: --------------------------------------------------------------------------------
1 | 
2 | {% include "_head.html" %}
3 | 
4 | 
5 | 
10 | 
11 | 
12 | {% set page_size = 100 %} 13 | 14 | {% set page_start = (page - 1) * page_size + 1 %} 15 | {% set page_end = page_start + page_size - 1 %} 16 | 17 | {% if documents|length < page_end %} 18 | {% set page_end = documents|length %} 19 | {% endif %} 20 | 21 | {% set include_tags = True %} 22 | {% with placement="top" %} 23 | {% include "_meta_info.html" %} 24 | {% endwith %} 25 | 26 | 46 | 47 | 97 | 98 | {% for doc in documents[page_start - 1:page_end] %} 99 |
100 | 162 | 163 | 174 | 175 | 211 |
212 | {% endfor %} 213 | 214 | {% if page_end - page_start > 10 %} 215 | {% set include_tags = False %} 216 | {% with placement="bottom" %} 217 | {% include "_meta_info.html" %} 218 | {% endwith %} 219 | {% endif %} 220 |
221 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import pathlib 4 | import shutil 5 | import uuid 6 | 7 | from click.testing import CliRunner 8 | import pytest 9 | 10 | from docstore.cli import main 11 | from docstore.documents import read_documents, store_new_document, write_documents 12 | from docstore.models import Dimensions, Document, File, Thumbnail 13 | from test_models import is_recent 14 | 15 | 16 | class TestAdd: 17 | def test_stores_new_document( 18 | self, tmpdir: pathlib.Path, root: pathlib.Path 19 | ) -> None: 20 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png") 21 | 22 | runner = CliRunner() 23 | result = runner.invoke( 24 | main, 25 | [ 26 | f"--root={root}", 27 | "add", 28 | str(tmpdir / "My Cluster.png"), 29 | "--title", 30 | "My first document", 31 | "--tags", 32 | "tag1, tag2, tag3", 33 | ], 34 | ) 35 | 36 | assert result.exit_code == 0, result.output 37 | 38 | doc_id = result.output.strip() 39 | 40 | documents = read_documents(root) 41 | 42 | assert len(documents) == 1 43 | assert documents[0].id == doc_id 44 | assert documents[0].title == "My first document" 45 | assert documents[0].tags == ["tag1", "tag2", "tag3"] 46 | assert is_recent(documents[0].date_saved) 47 | 48 | assert len(documents[0].files) == 1 49 | f = documents[0].files[0] 50 | assert f.filename == "My Cluster.png" 51 | assert f.path == "files/m/my-cluster.png" 52 | assert f.source_url is None 53 | assert f.date_saved == documents[0].date_saved 54 | 55 | @pytest.mark.parametrize( 56 | "tag_arg, expected_tags", 57 | [ 58 | ("", []), 59 | ("tag with trailing whitespace ", ["tag with trailing whitespace"]), 60 | ( 61 | "multiple,comma,separated,tags", 62 | ["multiple", "comma", "separated", "tags"], 63 | ), 64 | ], 65 | ) 66 | def test_adds_tags_to_document( 67 | self, 68 | tmpdir: pathlib.Path, 69 | root: pathlib.Path, 70 | tag_arg: str, 71 | expected_tags: list[str], 72 | ) -> None: 73 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png") 74 | 75 | runner = CliRunner() 76 | result = runner.invoke( 77 | main, 78 | [ 79 | f"--root={root}", 80 | "add", 81 | str(tmpdir / "My Cluster.png"), 82 | "--title", 83 | "My second document", 84 | "--tags", 85 | tag_arg, 86 | ], 87 | ) 88 | 89 | assert result.exit_code == 0, result.output 90 | 91 | documents = read_documents(root) 92 | assert documents[0].tags == expected_tags 93 | 94 | @pytest.mark.parametrize( 95 | "source_url_arg, expected_source_url", 96 | [ 97 | ("", ""), 98 | ("https://example.org/cluster.png", "https://example.org/cluster.png"), 99 | ], 100 | ) 101 | def test_adds_source_url_to_file( 102 | self, 103 | tmpdir: pathlib.Path, 104 | root: pathlib.Path, 105 | source_url_arg: str, 106 | expected_source_url: str, 107 | ) -> None: 108 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "My Cluster.png") 109 | 110 | runner = CliRunner() 111 | result = runner.invoke( 112 | main, 113 | [ 114 | f"--root={root}", 115 | "add", 116 | str(tmpdir / "My Cluster.png"), 117 | "--title", 118 | "My stored document", 119 | "--tags", 120 | "tag1, tag2, tag3", 121 | "--source_url", 122 | source_url_arg, 123 | ], 124 | ) 125 | 126 | assert result.exit_code == 0, result.output 127 | 128 | documents = read_documents(root) 129 | assert documents[0].files[0].source_url == expected_source_url 130 | 131 | 132 | class TestMerge: 133 | 
@pytest.mark.parametrize("doc_count", [1, 2, 3, 4]) 134 | def test_merges_documents_with_identical_metadata( 135 | self, root: pathlib.Path, doc_count: int 136 | ) -> None: 137 | documents = [ 138 | Document(title="My Document", tags=["tag1", "tag2", "tag3"]) 139 | for _ in range(doc_count) 140 | ] 141 | 142 | write_documents(root=root, documents=documents) 143 | 144 | runner = CliRunner() 145 | result = runner.invoke( 146 | main, [f"--root={root}", "merge", "--yes"] + [doc.id for doc in documents] 147 | ) 148 | assert result.exit_code == 0, result.output 149 | 150 | if doc_count > 1: 151 | assert "Using common title: My Document\n" in result.output 152 | assert "Using common tags: tag1, tag2, tag3\n" in result.output 153 | 154 | stored_documents = read_documents(root) 155 | 156 | assert len(stored_documents) == 1 157 | assert stored_documents[0].id == documents[0].id 158 | assert stored_documents[0].title == "My Document" 159 | assert stored_documents[0].tags == ["tag1", "tag2", "tag3"] 160 | 161 | def test_merges_documents_with_inferred_metadata(self, root: pathlib.Path) -> None: 162 | documents = [ 163 | Document(title=f"My Document {i}", tags=[f"tag{i}"]) for i in range(3) 164 | ] 165 | 166 | write_documents(root=root, documents=documents) 167 | 168 | runner = CliRunner() 169 | result = runner.invoke( 170 | main, [f"--root={root}", "merge", "--yes"] + [doc.id for doc in documents] 171 | ) 172 | assert result.exit_code == 0, result.output 173 | 174 | assert "Guessed title: My Document\n" in result.output 175 | assert "Guessed tags: tag0, tag1, tag2\n" in result.output 176 | 177 | stored_documents = read_documents(root) 178 | 179 | assert len(stored_documents) == 1 180 | assert stored_documents[0].id == documents[0].id 181 | assert stored_documents[0].title == "My Document" 182 | assert stored_documents[0].tags == ["tag0", "tag1", "tag2"] 183 | 184 | @pytest.mark.parametrize("doc_count", [1, 2, 3, 4]) 185 | def test_merging_combines_files(self, root: pathlib.Path, doc_count: int) -> None: 186 | shutil.copyfile(src="tests/files/cluster.png", dst=root / "cluster.png") 187 | documents = [ 188 | Document( 189 | title="My Document", 190 | tags=["tag"], 191 | files=[ 192 | File( 193 | filename=f"cluster{i}.png", 194 | path="cluster.png", 195 | size=100, 196 | checksum="sha256:123", 197 | thumbnail=Thumbnail( 198 | path="cluster.png", 199 | dimensions=Dimensions(400, 300), 200 | tint_color="#000000", 201 | ), 202 | ) 203 | ], 204 | ) 205 | for i in range(doc_count) 206 | ] 207 | 208 | write_documents(root=root, documents=documents) 209 | 210 | runner = CliRunner() 211 | result = runner.invoke( 212 | main, [f"--root={root}", "merge", "--yes"] + [doc.id for doc in documents] 213 | ) 214 | assert result.exit_code == 0, result.output 215 | 216 | stored_documents = read_documents(root) 217 | 218 | assert len(stored_documents) == 1 219 | assert len(stored_documents[0].files) == doc_count 220 | 221 | 222 | def test_deleting_document_through_cli( 223 | tmpdir: pathlib.Path, root: pathlib.Path 224 | ) -> None: 225 | shutil.copyfile(src="tests/files/cluster.png", dst=tmpdir / "cluster.png") 226 | 227 | doc1 = store_new_document( 228 | root=root, 229 | path=tmpdir / "cluster.png", 230 | title="A document about to be deleted", 231 | tags=[], 232 | source_url="https://example.org/cluster.png", 233 | date_saved=datetime.datetime.now(), 234 | ) 235 | doc2 = Document(title="Doc1", date_saved=datetime.datetime(2010, 1, 1)) 236 | doc3 = Document(title="Doc2", date_saved=datetime.datetime(2002, 2, 2)) 237 | 238 | 
write_documents(root=root, documents=[doc1, doc2, doc3]) 239 | 240 | assert read_documents(root) == [doc1, doc2, doc3] 241 | 242 | runner = CliRunner() 243 | result = runner.invoke(main, [f"--root={root}", "delete", doc1.id, doc2.id]) 244 | assert result.exit_code == 0, result.output 245 | 246 | assert read_documents(root) == [doc3] 247 | 248 | for deleted_doc in [doc1, doc2]: 249 | deleted_json_path = root / "deleted" / deleted_doc.id / "document.json" 250 | assert os.path.exists(deleted_json_path) 251 | 252 | 253 | def test_deleting_in_empty_instance_is_error(root: pathlib.Path) -> None: 254 | runner = CliRunner() 255 | result = runner.invoke(main, [f"--root={root}", "delete", str(uuid.uuid4())]) 256 | 257 | assert result.exit_code == 1, result.output 258 | assert result.output.strip() == f"There is no docstore instance at {root}!" 259 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | import datetime 3 | import re 4 | import pathlib 5 | import shutil 6 | import typing 7 | 8 | import bs4 9 | from flask.testing import FlaskClient 10 | import pytest 11 | 12 | from docstore.documents import store_new_document, write_documents 13 | from docstore.models import Document 14 | from docstore.server import create_app 15 | 16 | 17 | @pytest.fixture 18 | def client(root: pathlib.Path) -> Iterator[FlaskClient]: 19 | app = create_app(root=root, title="My test instance", thumbnail_width=200) 20 | app.config["TESTING"] = True 21 | 22 | with app.test_client() as client: 23 | yield client 24 | 25 | 26 | def test_empty_response(client: FlaskClient) -> None: 27 | resp = client.get("/") 28 | assert resp.status_code == 200 29 | assert b"no documents found!" 
in resp.data 30 | 31 | 32 | def test_shows_documents( 33 | tmpdir: pathlib.Path, root: pathlib.Path, client: FlaskClient 34 | ) -> None: 35 | for _ in range(3): 36 | shutil.copyfile("tests/files/cluster.png", str(tmpdir / "cluster.png")) 37 | store_new_document( 38 | root=root, 39 | path=tmpdir / "cluster.png", 40 | title="My test document", 41 | tags=["tag1", "tag2", "tag3"], 42 | source_url="https://example.org/cluster", 43 | date_saved=datetime.datetime.now(), 44 | ) 45 | 46 | resp = client.get("/") 47 | assert resp.status_code == 200 48 | assert resp.data.count(b"My test document") == 3 49 | assert b"date saved: just now" in resp.data 50 | 51 | # TODO: Detect this thumbnail URL from the page HTML 52 | resp = client.get("/thumbnails/c/cluster.png") 53 | assert resp.data[:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a" # PNG magic number 54 | 55 | resp = client.get("/files/c/cluster.png") 56 | assert resp.data == open("tests/files/cluster.png", "rb").read() 57 | 58 | 59 | def test_filters_documents_by_tag(root: pathlib.Path, client: FlaskClient) -> None: 60 | documents = [Document(title=f"Document {i}", tags=[f"tag{i}"]) for i in range(3)] 61 | write_documents(root=root, documents=documents) 62 | 63 | resp = client.get("/?tag=tag0") 64 | assert resp.status_code == 200 65 | assert b"Document 0" in resp.data 66 | assert b"Document 1" not in resp.data 67 | assert b"Document 2" not in resp.data 68 | 69 | 70 | def test_paginates_document(root: pathlib.Path, client: FlaskClient) -> None: 71 | documents = [Document(title=f"Document {i}") for i in range(200)] 72 | write_documents(root=root, documents=documents) 73 | 74 | resp = client.get("/") 75 | assert resp.status_code == 200 76 | 77 | # More recent documents appear first 78 | assert b"Document 199" in resp.data 79 | assert b"Document 100" in resp.data 80 | assert b"Document 99" not in resp.data 81 | 82 | assert "« prev" in resp.data.decode("utf8") 83 | assert "next »" in resp.data.decode("utf8") 84 | 85 | resp_page_2 = client.get("/?page=2") 86 | assert resp_page_2.status_code == 200 87 | assert b"Document 100" not in resp_page_2.data 88 | assert b"Document 99" in resp_page_2.data 89 | assert b"Document 0" in resp_page_2.data 90 | 91 | 92 | def test_documents_with_lots_of_tags(root: pathlib.Path, client: FlaskClient) -> None: 93 | documents = [Document(title=f"Document {i}", tags=[f"tag{i}"]) for i in range(200)] 94 | 95 | documents.extend( 96 | [ 97 | Document(title="Another document", tags=["nest0:tag1"]), 98 | Document(title="Another document", tags=["nest0:tag1:tagA"]), 99 | Document(title="Another document", tags=["nest0:tag1:tagB"]), 100 | Document(title="Another document", tags=["nest1:tag1"]), 101 | ] 102 | ) 103 | 104 | write_documents(root=root, documents=documents) 105 | 106 | resp = client.get("/") 107 | assert resp.status_code == 200 108 | 109 | assert b'
' in resp.data 110 | 111 | 112 | def tidy(html_str: typing.Any) -> str: 113 | assert isinstance(html_str, str) 114 | return re.sub(r"\s+", " ", html_str.strip()) 115 | 116 | 117 | class TestCase(typing.TypedDict): 118 | tags: list[str] 119 | expected_title: str 120 | urls: list[str] 121 | 122 | 123 | @pytest.mark.parametrize( 124 | "test_case", 125 | [ 126 | { 127 | "tags": ["by:John Smith"], 128 | "expected_title": "{title}, by John Smith ({doc_id})", 129 | "urls": ["/", "/?tag=by%3AJohn+Smith"], 130 | }, 131 | { 132 | "tags": ["by:John Smith", "by:Jane Doe"], 133 | "expected_title": "{title}, by John Smith, Jane Doe ({doc_id})", 134 | "urls": [ 135 | "/", 136 | "/?tag=by%3AJohn+Smith", 137 | "/?tag=by%3AJane+Doe", 138 | "/?tag=by%3AJane+Doe&tag=by%3AJohn+Smith", 139 | ], 140 | }, 141 | { 142 | "tags": ["from:ACME Corp"], 143 | "expected_title": "{title}, from ACME Corp ({doc_id})", 144 | "urls": ["/", "/?tag=from%3AACME+Corp"], 145 | }, 146 | { 147 | "tags": ["from:ACME Corp", "from:Widget Inc"], 148 | "expected_title": "{title}, from ACME Corp, Widget Inc ({doc_id})", 149 | "urls": [ 150 | "/", 151 | "/?tag=from%3AACME+Corp", 152 | "/?tag=from%3AWidget+Inc", 153 | "/?tag=from%3AACME+Corp&tag=from%3AWidget+Inc", 154 | ], 155 | }, 156 | { 157 | "tags": ["by:John Smith", "from:ACME Corp"], 158 | "expected_title": "{title}, by John Smith, from ACME Corp ({doc_id})", 159 | "urls": [ 160 | "/", 161 | "/?tag=by%3AJohn+Smith", 162 | "/?tag=from%3AACME+Corp", 163 | "/?tag=by%3AJohn+Smith&tag=from%3AACME+Corp", 164 | ], 165 | }, 166 | ], 167 | ) 168 | def test_shows_attribution_tags( 169 | root: pathlib.Path, client: FlaskClient, test_case: TestCase 170 | ) -> None: 171 | doc_tags = test_case["tags"] + ["tag1", "tag2"] 172 | 173 | doc = Document(title="My document", tags=doc_tags) 174 | write_documents(root=root, documents=[doc]) 175 | 176 | for url in test_case["urls"]: 177 | print(url) 178 | resp = client.get(url) 179 | assert resp.status_code == 200 180 | 181 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 182 | 183 | h2_title = soup.find("h2", attrs={"class": "title"}) 184 | assert h2_title is not None 185 | assert tidy(h2_title.text) == test_case["expected_title"].format( 186 | title=doc.title, doc_id=doc.id 187 | ) 188 | 189 | tags_list = soup.find("div", attrs={"class": "tags"}) 190 | assert tags_list is not None 191 | assert tidy(tags_list.text) == "tagged with: tag1 tag2" 192 | 193 | 194 | def test_links_attribution_tags(root: pathlib.Path, client: FlaskClient) -> None: 195 | doc = Document(title="My document", tags=["by:John Smith"]) 196 | write_documents(root=root, documents=[doc]) 197 | 198 | # If the tag is not selected, the attribution tag in the title is a link 199 | # that filters to the selected tag. 200 | resp = client.get("/") 201 | assert resp.status_code == 200 202 | 203 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 204 | 205 | h2_title = soup.find("h2", attrs={"class": "title"}) 206 | assert isinstance(h2_title, bs4.Tag) 207 | assert h2_title.find("a", attrs={"href": "?tag=by%3AJohn+Smith"}) is not None 208 | 209 | # If the tag is selected, the attribution tag in the title is regular text, 210 | # not a link. 
211 | resp = client.get("/?tag=by%3aJohn+Smith") 212 | assert resp.status_code == 200 213 | 214 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 215 | 216 | h2_title = soup.find("h2", attrs={"class": "title"}) 217 | assert h2_title is not None 218 | assert h2_title.find("a") is None 219 | 220 | 221 | def test_sets_thumbnail_width(client: FlaskClient) -> None: 222 | """ 223 | If the user sets a custom thumbnail width, the appropriate CSS style is 224 | added to the rendered page. 225 | """ 226 | client.application.config["THUMBNAIL_WIDTH"] = 100 227 | 228 | resp = client.get("/") 229 | 230 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 231 | 232 | style_tag = soup.find("style") 233 | assert isinstance(style_tag, bs4.Tag) 234 | assert tidy(style_tag.string) == ".thumbnail { width: 100px; }" 235 | 236 | 237 | def test_tags_are_sorted_alphabetically( 238 | root: pathlib.Path, client: FlaskClient 239 | ) -> None: 240 | doc = Document(title="My document", tags=["bulgaria", "austria", "croatia"]) 241 | write_documents(root=root, documents=[doc]) 242 | 243 | resp = client.get("/") 244 | 245 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 246 | 247 | tags_div = soup.find("div", attrs={"class": "tags"}) 248 | assert tags_div is not None 249 | assert tidy(tags_div.text) == "tagged with: austria bulgaria croatia" 250 | 251 | 252 | def test_gets_curly_quotes(root: pathlib.Path, client: FlaskClient) -> None: 253 | app = create_app(root=root, title="Isn't this a good title?", thumbnail_width=200) 254 | app.config["TESTING"] = True 255 | 256 | with app.test_client() as client: 257 | resp = client.get("/") 258 | 259 | soup = bs4.BeautifulSoup(resp.data, "html.parser") 260 | 261 | title = soup.find("title") 262 | assert title is not None 263 | assert title.text.strip() == "docstore/Isn’t this a good title?" 264 | 265 | aside_inner = soup.find("div", attrs={"id": "aside_inner"}) 266 | assert aside_inner is not None 267 | assert aside_inner.text.strip() == "docstore/Isn’t this a good title?" 268 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # docstore 2 | 3 | docstore is a tool I wrote to help me manage my scanned documents and reference files. 4 | It uses [keyword tagging](https://en.wikipedia.org/wiki/Tag_(metadata)) to categorise files, and creates thumbnails to help identify files. 5 | 6 | It has two parts: 7 | 8 | * A CLI tool that lets me store new documents 9 | * A web app that lets me browse the documents I've already stored 10 | 11 | Here's an example of how I'd use the CLI tool to save a file: 12 | 13 | ``` 14 | docstore add ~/Desktop/'Contract of Employment.pdf' \ 15 | --source_url='https://email.example.com/message/1234' \ 16 | --title='2020-10: Contract of employment for ACME' \ 17 | --tags='employer:acme-corp, contract:employment' 18 | ``` 19 | 20 | Here's a screenshot of the web app: 21 | 22 | ![A screenshot of docstore](docstore.png) 23 | 24 | The web app allows me to filter by one or more tags, or to sort by title/date, to help me find the document I'm looking for. 25 | 26 | 27 | 28 | ## Usage 29 | 30 | Clone this repo and install the package locally: 31 | 32 | ```console 33 | $ git clone https://github.com/alexwlchan/docstore.git 34 | $ cd docstore 35 | $ pip3 install -e . 36 | ``` 37 | 38 | You can add files using `docstore add` and run the web app with `docstore serve`.
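For example, assuming a docstore collection in `~/Documents/docstore` and a file on the Desktop (both paths are just illustrations), the commands look something like:

```console
$ docstore --root ~/Documents/docstore add ~/Desktop/receipt.pdf
$ docstore --root ~/Documents/docstore serve
```

`docstore serve` binds to 127.0.0.1:3391 by default; run `docstore --help` to see the full list of commands and options.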
39 | 40 | Note that docstore is only intended for me to use -- it solves a specific problem that I have, and is designed around my exact needs. 41 | 42 | You're welcome to use it, but I'm unlikely to provide support or add features for other people. 43 | 44 | 45 | 46 | ## How it works: design and implementation notes 47 | 48 | I learnt a lot of stuff writing docstore, and the source code is public so other people can read it and see how it works. 49 | 50 | Everything is written in Python, with [Click][click] and [Flask][flask] being the core of the CLI and the web app. 51 | 52 | Because reading source code is a pretty inefficient way to learn, I have some documents that explain the key ideas: 53 | 54 | - [Storing the files](docs/storing-the-files.md) – where files are stored, what name they're stored under, ensuring I don't save two files with the same name 55 | - [Storing the metadata](docs/storing-the-metadata.md) – what metadata I store, how I model it, why I save it as JSON, how I serialise Python models to JSON and back 56 | - [Previewing the files](docs/previewing-the-files.md) – how I create file previews with Quick Look and FFmpeg, how I extract a tint colour from thumbnails for the web app 57 | 58 | [click]: https://palletsprojects.com/p/click/ 59 | [flask]: https://palletsprojects.com/p/flask/ 60 | 61 | 62 | 63 | ## Why I wrote it 64 | 65 | * **I prefer keyword tagging to files-and-folders as a way to organise files.** 66 | I'm a particular fan of how [Pinboard](https://pinboard.in/) does tagging, but I haven't found an app that stores files with Pinboard-like tagging. 67 | 68 | * **I want my documents stored locally.** 69 | My scanned paperwork in particular contains a lot of private information -- bank statements, medical letters, rental contracts, and more. 70 | I don't want to upload them to a cloud service and risk them being leaked. 71 | 72 | * **I'm very picky about how this sort of thing works.** 73 | I've tried a bunch of other apps and services for organising documents, but none of them were quite right. 74 | I found it easier to write my own tool than try to use something written by somebody else. 75 | 76 | It helps that my needs are quite simple -- the whole app is about a thousand lines of code, which is pretty manageable. 77 | 78 | 79 | 80 | ## Design principles 81 | 82 | * **My files and metadata should be portable.** 83 | All the data for a collection of files stored with docstore is kept in a single directory. 84 | That directory can be copied or synced to another machine, and I can start working with my files immediately -- no config or setup required. 85 | 86 | This is important for day-to-day utility, and for disaster recovery. 87 | If something happens to my main computer, I want to be able to get to my documents again (including the keyword tags for organisation) as quickly as possible. 88 | 89 | * **Use JSON as a database.** 90 | All the metadata about my documents is kept in a single JSON file. 91 | JSON is a simple, popular format with several advantages for me: 92 | 93 | - Lots of tools can read it. 94 | Pretty much every programming language has a JSON parser, so I'm guaranteed I'll be able to parse the metadata file for years to come. 95 | - I can edit JSON in a text editor. 96 | This saves me building editing features into docstore -- if I've made a typo or want to change something, I can edit the metadata JSON directly. 97 | - It maps directly to Python data structures (Python is what I use to write docstore).
98 | The serialisation and deserialisation isn't very complicated. 99 | 100 | If you were building an app that had to store a lot of documents or support multiple users, JSON would be a poor choice -- you'd want to use a proper database instead. 101 | My biggest docstore instance only has a few thousand files, and the cost of JSON parsing is negligible. 102 | 103 | * **A document can have multiple files.** 104 | 105 | This wasn't part of my original design, but I added it when I rewrote docstore in autumn 2020. 106 | This means that I can group files so they show up together. 107 | Examples of when I use this: 108 | 109 | - I have two scans of the same piece of paper 110 | - I have a scanned copy of a letter, and an electronic copy I was sent separately 111 | - I have multiple versions of a contract at different stages of signing 112 | 113 | Here's how a document is described in the JSON: 114 | 115 | ```json 116 | { 117 | "date_saved": "2020-10-03T16:30:08.471833", 118 | "files": [ 119 | { 120 | "checksum": "sha256:fe79444e61b9c009a22497a9878020da98f557476b7f993432bc94fa700e888a", 121 | "date_saved": "2020-10-03T16:30:08.471833", 122 | "filename": "Eldritchbot.pdf", 123 | "id": "331e2b59-fe82-48a4-8d59-f71b0f2ad7b3", 124 | "path": "files/e/eldritchbot.pdf", 125 | "size": 2215466, 126 | "source_url": "https://www.patreon.com/posts/visit-from-40137342", 127 | "thumbnail": { 128 | "path": "thumbnails/E/Eldritchbot.pdf.png" 129 | } 130 | }, 131 | { 132 | "checksum": "sha256:ebee96fbb3725e3c708388e6b3f446b933967849980aabb61c51a146942dc7f4", 133 | "date_saved": "2020-10-03T16:32:08.471833", 134 | "filename": "Eldritchbot.epub", 135 | "id": "00faef01-d3b4-4ff3-a226-770f652849e6", 136 | "path": "files/e/eldritchbot.epub", 137 | "size": 2215466, 138 | "source_url": "https://www.patreon.com/posts/visit-from-40137342", 139 | "thumbnail": { 140 | "path": "thumbnails/E/Eldritchbot.epub.png" 141 | } 142 | } 143 | ], 144 | "id": "9dd532c7-edf9-428a-9637-df9bb6030378", 145 | "tags": [ 146 | "smolrobots", 147 | "sci-fi", 148 | "by:Thomas Heasman-Hunt" 149 | ], 150 | "title": "A Visit from Eldritchbot" 151 | } 152 | ``` 153 | 154 | * **Stay close to the original filename.** 155 | 156 | As much as possible, I want docstore to use the original filename. 157 | This makes the underlying storage human-readable, and it means that if I lost the metadata, the files would still be somewhat useful. 158 | 159 | Here's what the underlying storage looks like: 160 | 161 | ``` 162 | docstore/ 163 | └── files/ 164 | ├── a/ 165 | │ ├── admin-renewal-cover-letter.html 166 | │ ├── advice-for-patients-and-visitors.pdf 167 | │ └── application-paperwork.pdf 168 | ├── b/ 169 | ├── c/ 170 | └── ... 171 | ``` 172 | 173 | docstore records the original filename in the metadata, and then does some normalisation before copying a file to its storage. 174 | The normalisation does a couple of things: 175 | 176 | * Remove any special characters and spaces. 177 | e.g. `alex.chan › payslip › january 2015–2016.pdf` becomes `alex-chan-payslip-january-2015-2016.pdf` 178 | 179 | * Lowercase the filename. 180 | e.g. `P60Certificate.pdf` becomes `p60certificate.pdf` 181 | 182 | * De-duplicate documents with the same name by adding some random hex to the end of the name. 183 | e.g. if I store two documents called `statement.pdf`, one will be stored as `statement.pdf` and the other as `statement_f97b.pdf`. 
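Sketched in Python, the idea looks something like this -- a simplified illustration of the rules above, not docstore's actual code, and the `normalised_filename` helper here is hypothetical:

```python
import os
import re
import secrets


def normalised_filename(filename: str, existing: set[str]) -> str:
    """A rough sketch of the normalisation rules described above."""
    name, ext = os.path.splitext(filename)

    # Replace runs of special characters and spaces with hyphens,
    # then lowercase everything.
    name = re.sub(r"[^a-zA-Z0-9]+", "-", name).strip("-").lower()
    result = name + ext.lower()

    # De-duplicate clashing names by appending some random hex.
    while result in existing:
        result = f"{name}_{secrets.token_hex(2)}{ext.lower()}"

    return result
```

e.g. `normalised_filename("P60Certificate.pdf", existing=set())` returns `p60certificate.pdf`, matching the example above.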
184 | 185 | This normalisation means I don't have to worry about whether my filesystem can cope with weird characters, or if I'm storing two different files with the same name. 186 | 187 | The thumbnails for each file use a similar filename, so it's easy to find the thumbnail that corresponds to a file (and vice versa). 188 | For example, if a document is stored as `p60-certificate.pdf`, the thumbnail is stored as `p60-certificate.pdf.png`. 189 | 190 | These normalised filenames aren't exposed through the web app – if I'm downloading a file, docstore sets a [`Content-Disposition` header](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition) that tells my browser to download it with the original filename. 191 | 192 | 193 | ## Technology 194 | 195 | * docstore is written in **Python**. 196 | The web app uses [**Flask**](https://pypi.org/project/Flask/), and the CLI uses [**Click**](https://pypi.org/project/click/). 197 | * I use [**attrs**](https://pypi.org/project/attrs/) for the internal models, and [**cattrs**](https://pypi.org/project/cattrs/) to serialise my internal models to JSON. 198 | * I use [macOS **Quick Look**](https://en.wikipedia.org/wiki/Quick_Look) and [**ffmpeg**](https://ffmpeg.org) to create thumbnails, and a [*k*-means clustering algorithm](https://alexwlchan.net/2019/08/finding-tint-colours-with-k-means/) to get the tint colour to go with the thumbnails. 199 | * The filename normalisation is based on the blog post ["ASCIIfying" by Dr. Drang](http://www.leancrew.com/all-this/2014/10/asciifying/). 200 | * The code for displaying tags in a list is based on [templates from Dreamwidth](https://github.com/dreamwidth/dw-free/blob/6ec1e146d3c464e506a77913f0abf0d51a944f95/styles/core2.s2#L4126-L4220). 201 | * The code for displaying a tag cloud is based on [jquery.tagcloud.js by addywaddy](https://github.com/addywaddy/jquery.tagcloud.js/). 202 | 203 | 204 | ## License 205 | 206 | MIT. 207 | -------------------------------------------------------------------------------- /src/docstore/cli.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | import datetime 3 | import functools 4 | import json 5 | import os 6 | import pathlib 7 | import sys 8 | import typing 9 | 10 | import click 11 | 12 | 13 | @click.group() 14 | @click.option( 15 | "--root", 16 | default=".", 17 | help="The root of the docstore database.", 18 | type=click.Path(), 19 | show_default=True, 20 | ) 21 | @click.pass_context 22 | def main(ctx, root): # type: ignore 23 | ctx.obj = pathlib.Path(root) 24 | 25 | 26 | def _require_existing_instance(inner): # type: ignore 27 | """ 28 | When you call ``docstore add``, most of the time you want to be adding 29 | documents to an existing instance, not creating a new instance. 30 | 31 | It's easy to get the directory wrong, so this decorator checks that you 32 | really meant to create a new instance, rather than add to an existing one. 33 | """ 34 | 35 | @functools.wraps(inner) 36 | def wrapper(*args, **kwargs): # type: ignore 37 | from docstore.documents import db_path 38 | 39 | root = click.get_current_context().obj 40 | 41 | if ( 42 | root == pathlib.Path(".")  # ctx.obj is a Path; comparing against the string "." is always False
43 | and not os.path.exists(db_path(pathlib.Path("."))) 44 | and not any(ag == "--root" or ag.startswith("--root=") for ag in sys.argv) 45 | ): # pragma: no cover 46 | click.echo( 47 | f"There is no existing docstore instance at {os.path.abspath('.')}", 48 | err=True, 49 | ) 50 | click.confirm("Do you want to create a new instance?", abort=True, err=True) 51 | 52 | return inner(*args, **kwargs) 53 | 54 | return wrapper 55 | 56 | 57 | @main.command(help="Run a docstore API server") 58 | @click.option( 59 | "--host", default="127.0.0.1", help="The interface to bind to.", show_default=True 60 | ) 61 | @click.option("--port", default=3391, help="The port to bind to.", show_default=True) 62 | @click.option("--title", default="", help="The title of the app.") 63 | @click.option( 64 | "--thumbnail_width", default=200, help="Thumbnail width (px).", show_default=True 65 | ) 66 | @click.option("--debug", default=False, is_flag=True, help="Run in debug mode.") 67 | @click.option("--profile", default=False, is_flag=True, help="Run a profiler.") 68 | @click.pass_obj 69 | def serve( 70 | root: pathlib.Path, 71 | host: str, 72 | port: int, 73 | debug: bool, 74 | profile: bool, 75 | title: str, 76 | thumbnail_width: int, 77 | ) -> None: # pragma: no cover 78 | from docstore.server import create_app, run_profiler, run_server 79 | 80 | app = create_app(root=root, title=title, thumbnail_width=thumbnail_width) 81 | 82 | if profile: 83 | run_profiler(app, host=host, port=port) 84 | else: 85 | run_server(app, host=host, port=port, debug=debug) 86 | 87 | 88 | def _add_document( 89 | root: pathlib.Path, 90 | path: pathlib.Path, 91 | title: str | None, 92 | tags: str | None, 93 | source_url: str | None, 94 | ) -> None: 95 | from docstore.documents import store_new_document 96 | 97 | document = store_new_document( 98 | root=root, 99 | path=path, 100 | title=title or "", 101 | tags=[t.strip() for t in (tags or "").split(",") if t.strip()], 102 | source_url=source_url, 103 | date_saved=datetime.datetime.now(), 104 | ) 105 | 106 | print(document.id) 107 | 108 | 109 | @main.command(help="Store a file in docstore") 110 | @click.argument("path", nargs=1, type=click.Path(), required=True) 111 | @click.option( 112 | "--title", 113 | help="The title of the file.", 114 | required=True, 115 | prompt="What is the title of the file?", 116 | ) 117 | @click.option( 118 | "--tags", 119 | help="The tags to apply to the file.", 120 | required=True, 121 | prompt="How should the file be tagged?", 122 | ) 123 | @click.option("--source_url", help="Where was this file downloaded from?") 124 | @click.pass_obj 125 | @_require_existing_instance # type: ignore 126 | def add(root, path, title, tags, source_url): 127 | return _add_document( 128 | root=root, path=path, title=title, tags=tags, source_url=source_url 129 | ) 130 | 131 | 132 | @main.command(help="Store a file on the web in docstore") 133 | @click.option( 134 | "--url", help="URL of the file to store.", required=True 135 | ) 136 | @click.option("--title", help="The title of the file.") 137 | @click.option("--tags", help="The tags to apply to the file.") 138 | @click.option("--source_url", help="Where was this file downloaded from?") 139 | @click.pass_obj 140 | @_require_existing_instance # type: ignore 141 | def add_from_url( 142 | root: pathlib.Path, 143 | url: str, 144 | title: str | None, 145 | tags: str | None, 146 | source_url: str | None, 147 | ) -> None: # pragma: no cover 148 | from docstore.downloads import download_file 149 | 150 | path = 
download_file(url) 151 | 152 | return _add_document( 153 | root=root, path=path, title=title, tags=tags, source_url=source_url 154 | ) 155 | 156 | 157 | @main.command(help="Migrate a V1 docstore") 158 | @click.option( 159 | "--v1_path", 160 | help="Path to the root of the V1 instance.", 161 | type=click.Path(path_type=pathlib.Path),  # pass a pathlib.Path, not a str, so the "/" joins below work 162 | required=True, 163 | ) 164 | @click.pass_obj 165 | def migrate(root: pathlib.Path, v1_path: pathlib.Path) -> None: # pragma: no cover 166 | documents = json.load(open(os.path.join(v1_path, "documents.json"))) 167 | 168 | for _, doc in documents.items(): 169 | stored_file_path = v1_path / "files" / doc["file_identifier"] 170 | 171 | try: 172 | filename_path = v1_path / "files" / doc["filename"] 173 | except KeyError: 174 | filename_path = stored_file_path 175 | 176 | if stored_file_path.exists(): 177 | os.rename(stored_file_path, filename_path) 178 | 179 | from docstore.documents import store_new_document 180 | 181 | store_new_document( 182 | root=root, 183 | path=filename_path, 184 | title=doc.get("title", ""), 185 | tags=doc.get("tags", []), 186 | source_url=doc.get("user_data", {}).get("source_url", ""), 187 | date_saved=datetime.datetime.fromisoformat(doc["date_created"]), 188 | ) 189 | print(doc.get("filename", os.path.basename(doc["file_identifier"]))) 190 | 191 | 192 | @main.command(help="Delete one or more documents") 193 | @click.argument("doc_ids", nargs=-1) 194 | @click.pass_obj 195 | def delete(root: pathlib.Path, doc_ids: list[str]) -> None: 196 | from docstore.documents import db_path, delete_document 197 | 198 | if not os.path.exists(db_path(root)): 199 | sys.exit(f"There is no docstore instance at {root}!") 200 | 201 | for d_id in doc_ids: 202 | delete_document(root=root, doc_id=d_id) 203 | print(d_id) 204 | 205 | 206 | @main.command(help="Verify your stored files") 207 | @click.pass_obj 208 | def verify(root: pathlib.Path) -> None: 209 | import collections 210 | from docstore.documents import read_documents, sha256 211 | import tqdm 212 | 213 | errors = collections.defaultdict(list) 214 | 215 | for doc in tqdm.tqdm(list(read_documents(root))): 216 | for f in doc.files: 217 | f_path = root / f.path 218 | if f.size != os.stat(f_path).st_size: 219 | errors[f.id].append( 220 | f"Size mismatch\n actual = {os.stat(f_path).st_size}\n expected = {f.size}" 221 | ) 222 | 223 | if f.checksum != sha256(f_path): 224 | errors[f.id].append( 225 | f"Checksum mismatch\n actual = {sha256(f_path)}\n expected = {f.checksum}" 226 | ) 227 | 228 | from pprint import pprint 229 | 230 | pprint(errors) 231 | 232 | 233 | @main.command(help="Merge the files on two or more documents") 234 | @click.argument("doc_ids", nargs=-1) 235 | @click.option("--yes", is_flag=True, help="Skip confirmation prompts.") 236 | @click.pass_obj 237 | def merge(root: pathlib.Path, doc_ids: list[str], yes: bool) -> None: 238 | if len(doc_ids) < 2:  # nothing to merge 239 | return 240 | 241 | from docstore.documents import read_documents 242 | 243 | documents = {d.id: d for d in read_documents(root)} 244 | 245 | documents_to_merge = [documents[d_id] for d_id in doc_ids] 246 | 247 | for doc in documents_to_merge: 248 | click.echo( 249 | f'{doc.id.split("-")[0]} {click.style(doc.title or "", fg="yellow")}' 250 | ) 251 | 252 | if not yes: # pragma: no cover 253 | click.confirm(f"Merge these {len(doc_ids)} documents?", abort=True) 254 | 255 | # What should the title of the merged document be?
256 | from docstore.merging import get_title_candidates 257 | 258 | title_candidates = get_title_candidates(documents_to_merge) 259 | 260 | if len(title_candidates) == 1: 261 | click.echo(f"Using common title: {click.style(title_candidates[0], fg='blue')}") 262 | new_title = title_candidates[0] 263 | else: 264 | print("") 265 | click.echo(f'Guessed title: {click.style(title_candidates[0], fg="blue")}') 266 | if yes or click.confirm("Use title?"): 267 | new_title = title_candidates[0] 268 | else: # pragma: no cover 269 | new_title = typing.cast( 270 | str, click.edit("\n".join(title_candidates)) 271 | ).strip() 272 | 273 | # What should the tags on the merged document be? 274 | from docstore.merging import get_union_of_tags 275 | 276 | all_tags = get_union_of_tags(documents_to_merge) 277 | 278 | print("") 279 | 280 | if all(doc.tags == all_tags for doc in documents_to_merge): 281 | click.echo(f"Using common tags: {click.style(', '.join(all_tags), fg='blue')}") 282 | new_tags = all_tags 283 | else: 284 | click.echo(f"Guessed tags: {click.style(', '.join(all_tags), fg='blue')}") 285 | if yes or click.confirm("Use tags?"): 286 | new_tags = all_tags 287 | else: # pragma: no cover 288 | new_tags = ( 289 | typing.cast(str, click.edit("\n".join(all_tags))).strip().splitlines() 290 | ) 291 | 292 | from docstore.documents import pairwise_merge_documents 293 | 294 | doc1 = documents[doc_ids[0]] 295 | for doc2_id in doc_ids[1:]: 296 | doc2 = documents[doc2_id] 297 | doc1 = pairwise_merge_documents( 298 | root=root, doc1=doc1, doc2=doc2, new_title=new_title, new_tags=new_tags 299 | ) 300 | 301 | 302 | def find_similar_pairs( 303 | tags: Iterable[str], *, required_similarity: int = 80 304 | ) -> Iterable[tuple[str, str]]: 305 | """ 306 | Find pairs of similar-looking tags in the collection ``tags``. 307 | 308 | Increase ``required_similarity`` for stricter matching (fewer results). 309 | """ 310 | import itertools 311 | 312 | from rapidfuzz import fuzz 313 | 314 | for t1, t2 in itertools.combinations(sorted(tags), 2): 315 | # Skip tags that share a namespace, e.g. utilities:gas, utilities:electricity 316 | if os.path.commonprefix([t1, t2]).endswith(":"): 317 | continue 318 | 319 | # Skip a tag and its own sub-tags, e.g. utilities, utilities:gas 320 | if t1.startswith(f"{t2}:") or t2.startswith(f"{t1}:"): 321 | continue 322 | 323 | if fuzz.ratio(t1, t2) > required_similarity: 324 | yield (t1, t2) 325 | 326 | 327 | @main.command(help="Show tags that might be similar") 328 | @click.pass_obj 329 | def show_similar_tags(root: pathlib.Path) -> None: 330 | import collections 331 | from docstore.documents import read_documents 332 | 333 | documents = read_documents(root) 334 | tags: dict[str, int] = collections.Counter() 335 | 336 | for doc in documents: 337 | for t in doc.tags: 338 | tags[t] += 1 339 | 340 | for t1, t2 in find_similar_pairs(set(tags)): 341 | print("%3d %s" % (tags[t1], t1)) 342 | print("%3d %s" % (tags[t2], t2)) 343 | print("") 344 | --------------------------------------------------------------------------------