├── docs ├── references.bib ├── _templates │ ├── autoapi │ │ ├── python │ │ │ ├── attribute.rst │ │ │ ├── exception.rst │ │ │ ├── package.rst │ │ │ ├── property.rst │ │ │ ├── method.rst │ │ │ ├── function.rst │ │ │ ├── data.rst │ │ │ ├── class.rst │ │ │ └── module.rst │ │ └── index.rst │ └── copyright.html ├── requirements.txt ├── Makefile ├── make.bat ├── _static │ └── styles │ │ └── my_theme.css ├── index.rst ├── conf.py └── abbrev_long.bib ├── src └── barecat │ ├── core │ ├── __init__.py │ └── sharder.py │ ├── threadsafe.py │ ├── progbar.py │ ├── codecs.py │ ├── to_tar_stream.py │ ├── from_tar_stream.py │ ├── exceptions.py │ ├── __init__.py │ ├── upgrade_database2.py │ ├── upgrade_database.py │ ├── consumed_threadpool.py │ ├── glob_to_regex.py │ ├── archive_formats.py │ ├── defrag.py │ ├── sql │ └── schema.sql │ ├── util.py │ ├── cli.py │ ├── cli_impl.py │ ├── common.py │ └── viewerqt6.py ├── MANIFEST.in ├── figure.png ├── .readthedocs.yaml ├── .github └── workflows │ └── python-publish.yml ├── LICENSE ├── pyproject.toml ├── tests ├── test_barecat.py └── test_cli.py ├── .gitignore └── README.md /docs/references.bib: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/barecat/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/barecat/sql/*.sql 2 | -------------------------------------------------------------------------------- /figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/isarandi/barecat/HEAD/figure.png -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/attribute.rst: -------------------------------------------------------------------------------- 1 | {% extends "python/data.rst" %} 2 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/exception.rst: -------------------------------------------------------------------------------- 1 | {% extends "python/class.rst" %} 2 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/package.rst: -------------------------------------------------------------------------------- 1 | {% extends "python/module.rst" %} 2 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/index.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Start at :class:`barecat.Barecat` to explore the API. 5 | 6 | .. toctree:: 7 | :titlesonly: 8 | 9 | {% for page in pages|selectattr("is_top_level_object") %} 10 | {{ page.include_path }} 11 | {% endfor %} 12 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-24.04 5 | tools: 6 | python: "3.10" 7 | commands: 8 | - python -m pip install . 
9 | - python -m pip install --no-cache-dir -r docs/requirements.txt 10 | - python -m sphinx -E -b html docs $READTHEDOCS_OUTPUT/html 11 | 12 | sphinx: 13 | configuration: docs/conf.py -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinxcontrib-bibtex 3 | sphinx-autoapi 4 | sphinx-autobuild 5 | sphinx-autodoc-typehints 6 | sphinxcontrib-prettyspecialmethods 7 | sphinx-autodoc-napoleon-typehints 8 | sphinx-codeautolink 9 | sphinx-rtd-theme 10 | pydata-sphinx-theme 11 | sphinxcontrib-napoleon 12 | Cython 13 | numpy 14 | setuptools-scm 15 | toml -------------------------------------------------------------------------------- /docs/_templates/copyright.html: -------------------------------------------------------------------------------- 1 | {# Displays the copyright information (which is defined in conf.py). #} 2 | {% if show_copyright and copyright %} 3 | 12 | {% endif %} -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/property.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:property:: {% if is_own_page %}{{ obj.id}}{% else %}{{ obj.short_name }}{% endif %} 10 | {% if obj.annotation %} 11 | 12 | :type: {{ obj.annotation }} 13 | {% endif %} 14 | {% for property in obj.properties %} 15 | 16 | :{{ property }}: 17 | {% endfor %} 18 | 19 | {% if obj.docstring %} 20 | 21 | {{ obj.docstring|indent(3) }} 22 | {% endif %} 23 | {% endif %} 24 | 25 | .. footbibliography:: -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /src/barecat/threadsafe.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import multiprocessing_utils 4 | 5 | from barecat.core import barecat as barecat 6 | 7 | 8 | def threadlocal_decorate(decorator): 9 | def my_decorator(fun): 10 | local = multiprocessing_utils.local() 11 | 12 | @functools.wraps(fun) 13 | def wrapper(*args, **kwargs): 14 | if not hasattr(local, 'fn'): 15 | local.fn = decorator(fun) 16 | return local.fn(*args, **kwargs) 17 | 18 | return wrapper 19 | 20 | return my_decorator 21 | 22 | 23 | @threadlocal_decorate(functools.lru_cache()) 24 | def get_cached_reader(path, auto_codec=True): 25 | return barecat.Barecat(path, readonly=True, auto_codec=auto_codec) 26 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/method.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:method:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ obj.args }}){% if obj.return_annotation is not none %} -> {{ obj.return_annotation }}{% endif %} 10 | {% for (args, return_annotation) in obj.overloads %} 11 | 12 | {%+ if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ args }}){% if return_annotation is not none %} -> {{ return_annotation }}{% endif %} 13 | {% endfor %} 14 | {% for property in obj.properties %} 15 | 16 | :{{ property }}: 17 | {% endfor %} 18 | 19 | {% if obj.docstring %} 20 | 21 | {{ obj.docstring|indent(3) }} 22 | {% endif %} 23 | {% endif %} 24 | 25 | .. footbibliography:: -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/function.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:function:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ obj.args }}){% if obj.return_annotation is not none %} -> {{ obj.return_annotation }}{% endif %} 10 | {% for (args, return_annotation) in obj.overloads %} 11 | 12 | {%+ if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ args }}){% if return_annotation is not none %} -> {{ return_annotation }}{% endif %} 13 | {% endfor %} 14 | {% for property in obj.properties %} 15 | 16 | :{{ property }}: 17 | {% endfor %} 18 | 19 | {% if obj.docstring %} 20 | 21 | {{ obj.docstring|indent(3) }} 22 | {% endif %} 23 | {% endif %} 24 | 25 | .. 
footbibliography:: -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | pypi-publish: 12 | name: Upload release to PyPI 13 | runs-on: ubuntu-latest 14 | environment: pypi 15 | permissions: 16 | id-token: write 17 | steps: 18 | - name: Check out repository 19 | uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: "3.x" 27 | 28 | - name: Install build dependencies 29 | run: python -m pip install --upgrade build 30 | 31 | - name: Build package distribution 32 | run: python -m build --sdist 33 | 34 | - name: Publish package distributions to PyPI 35 | uses: pypa/gh-action-pypi-publish@release/v1 36 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/data.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:{{ obj.type }}:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.name }}{% endif %} 10 | {% if obj.annotation is not none %} 11 | 12 | :type: {% if obj.annotation %} {{ obj.annotation }}{% endif %} 13 | {% endif %} 14 | {% if obj.value is not none %} 15 | 16 | {% if obj.value.splitlines()|count > 1 %} 17 | :value: Multiline-String 18 | 19 | .. raw:: html 20 | 21 |
<details><summary>Show Value</summary> 22 | 23 | .. code-block:: python 24 | 25 | {{ obj.value|indent(width=6,blank=true) }} 26 | 27 | .. raw:: html 28 | 29 | </details>
30 | 31 | {% else %} 32 | :value: {{ obj.value|truncate(100) }} 33 | {% endif %} 34 | {% endif %} 35 | 36 | {% if obj.docstring %} 37 | 38 | {{ obj.docstring|indent(3) }} 39 | {% endif %} 40 | {% endif %} 41 | 42 | .. footbibliography:: -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 István Sárándi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/barecat/progbar.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def is_running_in_jupyter_notebook(): 5 | try: 6 | # noinspection PyUnresolvedReferences 7 | shell = get_ipython().__class__.__name__ 8 | if shell == 'ZMQInteractiveShell': 9 | return True # Jupyter notebook or qtconsole 10 | elif shell == 'TerminalInteractiveShell': 11 | return False # Terminal running IPython 12 | else: 13 | return False # Other type (?) 
14 | except NameError: 15 | return False # Probably standard Python interpreter 16 | 17 | 18 | def progressbar(iterable=None, *args, **kwargs): 19 | import tqdm 20 | 21 | if is_running_in_jupyter_notebook(): 22 | return tqdm.notebook.tqdm(iterable, *args, **kwargs) 23 | elif sys.stdout.isatty(): 24 | return tqdm.tqdm(iterable, *args, dynamic_ncols=True, **kwargs) 25 | elif iterable is None: 26 | 27 | class X: 28 | def update(self, *a, **kw): 29 | pass 30 | 31 | return X() 32 | else: 33 | return iterable 34 | 35 | 36 | def progressbar_items(dictionary, *args, **kwargs): 37 | return progressbar(dictionary.items(), total=len(dictionary), *args, **kwargs) 38 | -------------------------------------------------------------------------------- /src/barecat/codecs.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | 4 | def encode_jpeg(data): 5 | import imageio.v2 as imageio 6 | 7 | with io.BytesIO() as f: 8 | imageio.imwrite(f, data, format='jpeg', quality=95) 9 | return f.getvalue() 10 | 11 | 12 | def decode_jpeg(data): 13 | import jpeg4py 14 | import numpy as np 15 | 16 | return jpeg4py.JPEG(np.frombuffer(data, np.uint8)).decode() 17 | 18 | 19 | def encode_msgpack_np(data): 20 | import msgpack_numpy 21 | 22 | return msgpack_numpy.packb(data) 23 | 24 | 25 | def decode_msgpack_np(data): 26 | import msgpack_numpy 27 | 28 | return msgpack_numpy.unpackb(data) 29 | 30 | 31 | def encode_npy(data): 32 | import numpy as np 33 | 34 | with io.BytesIO() as f: 35 | np.save(f, data) 36 | return f.getvalue() 37 | 38 | 39 | def decode_npy(data): 40 | import numpy as np 41 | 42 | with io.BytesIO(data) as f: 43 | return np.load(f) 44 | 45 | 46 | def encode_npz(data): 47 | import numpy as np 48 | 49 | with io.BytesIO() as f: 50 | np.savez(f, **data) 51 | return f.getvalue() 52 | 53 | 54 | def decode_npz(data): 55 | import numpy as np 56 | 57 | with io.BytesIO(data) as f: 58 | return dict(np.load(f)) 59 | -------------------------------------------------------------------------------- /src/barecat/to_tar_stream.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | import barecat.core.barecat as barecat_ 5 | from barecat.archive_formats import TarWriter 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser(description='Convert a barecat file to a tar stream on stdout') 10 | parser.add_argument('barecat_file', type=str, help='path to the source barecat file') 11 | parser.add_argument("args", nargs=argparse.REMAINDER, help="Ordered --in and --ex arguments") 12 | 13 | args = parser.parse_args() 14 | patterns = parse_in_ex_patterns(args) 15 | 16 | with ( 17 | barecat_.Barecat(args.barecat_file, readonly=True) as bc_reader, 18 | TarWriter(fileobj=sys.stdout.buffer, mode='w|') as tar_writer, 19 | ): 20 | for finfo in bc_reader.index.raw_iterglob_infos_incl_excl( 21 | patterns=patterns, only_files=True 22 | ): 23 | with bc_reader.open(finfo.path) as fileobj: 24 | tar_writer.add(finfo, fileobj) 25 | 26 | 27 | def parse_in_ex_patterns(args): 28 | patterns = [] 29 | i = 0 30 | while i < len(args.args): 31 | arg = args.args[i] 32 | 33 | if arg.startswith("--in="): 34 | patterns.append((True, arg.split("=", 1)[1])) 35 | 36 | elif arg.startswith("--ex="): 37 | patterns.append((False, arg.split("=", 1)[1])) 38 | 39 | elif arg == "--in": 40 | if i + 1 < len(args.args): 41 | patterns.append((True, args.args[i + 1])) 42 | i += 1 43 | 44 | elif arg == "--ex": 45 | if i + 1 < len(args.args): 46 |
patterns.append((False, args.args[i + 1])) 47 | i += 1 48 | 49 | i += 1 50 | 51 | return patterns 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /docs/_static/styles/my_theme.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | @import url("https://fonts.googleapis.com/css2?family=Mona+Sans:ital,wght@0,200..900;1,200..900&family=Geist:wght@100..900&family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&family=Outfit:wght@100..900&display=swap"); 3 | 4 | /*@media (min-width: 960px) { 5 | .bd-page-width { 6 | max-width: 120rem; 7 | } 8 | }*/ 9 | 10 | #rtd-footer-container { 11 | margin-top: 0 !important; 12 | } 13 | 14 | html[data-theme="light"] { 15 | --pst-color-table-row-hover-bg: #dfc6ff; 16 | --pst-color-link-hover: #845818; 17 | } 18 | 19 | html[data-theme="dark"] { 20 | --pst-color-table-row-hover-bg: #41296c; 21 | --pst-color-inline-code: #dd8cd4; 22 | } 23 | 24 | 25 | html[data-theme="dark"] dt:target { 26 | background-color: #4f4500; 27 | } 28 | 29 | html[data-theme="dark"] .linkcode-link { 30 | color: #9090ff; 31 | } 32 | 33 | html[data-theme="dark"] table.indextable tr.cap { 34 | background-color: #464646; 35 | } 36 | 37 | html[data-theme="dark"] a:visited { 38 | color: #9E67D0; 39 | } 40 | 41 | .navbar-brand .logo__title { 42 | font-family: "Mona Sans", sans-serif; 43 | font-size: 2.5rem; 44 | font-weight: 400; 45 | font-style: normal; 46 | } 47 | 48 | :root { 49 | --pst-font-family-monospace: "JetBrains Mono", monospace; 50 | --pst-font-family-heading: "Mona Sans", sans-serif; 51 | --pst-font-family-base: "Mona Sans", sans-serif; 52 | } 53 | 54 | body { 55 | font-weight: 450; 56 | } 57 | 58 | .bd-main .bd-content .bd-article-container { 59 | max-width: 100%; /* default is 60em */ 60 | } 61 | 62 | /*.bd-sidebar-primary { 63 | max-width: 20%; 64 | }*/ 65 | 66 | /* Ensure links in code blocks are underlined */ 67 | .highlight a { 68 | text-decoration: underline; 69 | color: #394198; /* Adjust color as needed */ 70 | } 71 | 72 | /* For additional emphasis, change hover effect */ 73 | .highlight a:hover { 74 | text-decoration: underline; 75 | color: #9090ff; 76 | } 77 | -------------------------------------------------------------------------------- /src/barecat/from_tar_stream.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import tarfile 4 | 5 | import barecat.core.barecat as barecat_ 6 | from barecat.common import BarecatDirInfo, BarecatFileInfo 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(description='Convert a tar stream to a barecat file') 11 | parser.add_argument('barecat_file', type=str, help='path to the target barecat file') 12 | parser.add_argument( 13 | '--shard-size-limit', 14 | type=str, 15 | default=None, 16 | help='maximum size of a shard in bytes (if not specified, ' 17 | 'all files will be concatenated into a single shard)', 18 | ) 19 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files') 20 | args = parser.parse_args() 21 | 22 | with barecat_.Barecat( 23 | args.barecat_file, 24 | shard_size_limit=args.shard_size_limit, 25 | readonly=False, 26 | overwrite=args.overwrite, 27 | ) as writer: 28 | with tarfile.open(fileobj=sys.stdin.buffer, mode='r|') as tar: 29 | for member in tar: 30 | if member.isdir(): 31 | dinfo = BarecatDirInfo( 32 | path=member.name, 33 | mode=member.mode, 34 |
uid=member.uid, 35 | gid=member.gid, 36 | mtime_ns=member.mtime * 1_000_000_000, 37 | ) 38 | writer.add(dinfo, dir_exist_ok=True) 39 | if member.isfile(): 40 | finfo = BarecatFileInfo( 41 | path=member.name, 42 | size=member.size, 43 | mode=member.mode, 44 | uid=member.uid, 45 | gid=member.gid, 46 | mtime_ns=member.mtime * 1_000_000_000, 47 | ) 48 | with tar.extractfile(member) as file_in_tar: 49 | writer.add(finfo, fileobj=file_in_tar) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /src/barecat/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions indicating various errors related to the use of Barecat archives""" 2 | 3 | 4 | class BarecatError(Exception): 5 | """Base class for all exceptions in Barecat""" 6 | 7 | def __init__(self, message: str): 8 | super().__init__(message) 9 | 10 | 11 | class FileExistsBarecatError(BarecatError): 12 | """Exception raised when trying to create a file that already exists 13 | 14 | Analogous to FileExistsError 15 | 16 | Args: 17 | path: path to the file that already exists 18 | """ 19 | 20 | def __init__(self, path: str): 21 | super().__init__(f'File already exists: {path}') 22 | 23 | 24 | class FileNotFoundBarecatError(BarecatError): 25 | """Exception raised when trying to access a file that does not exist 26 | 27 | Analogous to FileNotFoundError 28 | 29 | Args: 30 | path: path to the file that does not exist 31 | 32 | """ 33 | 34 | def __init__(self, path: str): 35 | super().__init__(f'File not found: {path}') 36 | 37 | 38 | class DirectoryNotEmptyBarecatError(BarecatError): 39 | """Exception raised when trying to delete a non-empty directory 40 | 41 | Args: 42 | path: path to the non-empty directory 43 | """ 44 | 45 | def __init__(self, path: str): 46 | super().__init__(f'Directory not empty: {path}') 47 | 48 | 49 | class IsADirectoryBarecatError(BarecatError): 50 | """Exception raised when trying to access a directory as a file. 
51 | 52 | Args: 53 | path: path to the directory 54 | 55 | """ 56 | 57 | def __init__(self, path: str): 58 | super().__init__(f'Is a directory: {path}') 59 | 60 | 61 | class NotADirectoryBarecatError(BarecatError): 62 | """Exception raised when trying to access a file as a directory.""" 63 | 64 | def __init__(self, message: str): 65 | super().__init__(message) 66 | 67 | 68 | class BarecatIntegrityError(BarecatError): 69 | """Exception raised when the CRC32C checksum of a file does not match the expected checksum""" 70 | 71 | def __init__(self, message: str): 72 | super().__init__(message) 73 | 74 | 75 | class NotEnoughSpaceBarecatError(BarecatError): 76 | """Exception raised when there is not enough space to write a file to the archive""" 77 | 78 | def __init__(self, message: str): 79 | super().__init__(message) 80 | -------------------------------------------------------------------------------- /src/barecat/__init__.py: -------------------------------------------------------------------------------- 1 | """Barecat is a fast random-access, mountable archive format for storing and accessing many small 2 | files.""" 3 | 4 | from .core.barecat import Barecat 5 | from .core.index import Index 6 | 7 | from .cli_impl import ( 8 | archive2barecat, 9 | barecat2archive, 10 | extract, 11 | merge, 12 | merge_symlink, 13 | read_index, 14 | write_index, 15 | ) 16 | from .common import ( 17 | BarecatFileInfo, 18 | BarecatDirInfo, 19 | BarecatEntryInfo, 20 | FileSection, 21 | Order, 22 | SHARD_SIZE_UNLIMITED, 23 | ) 24 | 25 | from .exceptions import ( 26 | BarecatError, 27 | BarecatIntegrityError, 28 | FileExistsBarecatError, 29 | FileNotFoundBarecatError, 30 | IsADirectoryBarecatError, 31 | NotEnoughSpaceBarecatError, 32 | DirectoryNotEmptyBarecatError, 33 | ) 34 | 35 | from .threadsafe import get_cached_reader 36 | 37 | 38 | def open(path, mode='r', auto_codec=False, threadsafe_reader=True): 39 | if mode == 'r': 40 | return Barecat(path, readonly=True, threadsafe=threadsafe_reader, auto_codec=auto_codec) 41 | elif mode == 'w+': 42 | return Barecat( 43 | path, 44 | readonly=False, 45 | overwrite=True, 46 | exist_ok=True, 47 | append_only=False, 48 | auto_codec=auto_codec, 49 | ) 50 | elif mode == 'r+': 51 | return Barecat( 52 | path, 53 | readonly=False, 54 | overwrite=False, 55 | exist_ok=True, 56 | append_only=False, 57 | auto_codec=auto_codec, 58 | ) 59 | elif mode == 'a+': 60 | return Barecat( 61 | path, 62 | readonly=False, 63 | overwrite=False, 64 | exist_ok=True, 65 | append_only=True, 66 | auto_codec=auto_codec, 67 | ) 68 | elif mode == 'ax+': 69 | return Barecat( 70 | path, 71 | readonly=False, 72 | overwrite=False, 73 | exist_ok=False, 74 | append_only=True, 75 | auto_codec=auto_codec, 76 | ) 77 | elif mode == 'x+': 78 | return Barecat( 79 | path, 80 | readonly=False, 81 | overwrite=False, 82 | exist_ok=False, 83 | append_only=False, 84 | auto_codec=auto_codec, 85 | ) 86 | else: 87 | raise ValueError(f"Invalid mode: {mode}") 88 | -------------------------------------------------------------------------------- /src/barecat/upgrade_database2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | 4 | import barecat 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser(description='Migrate index database to new version') 9 | parser.add_argument('path_in', type=str, help='Path to the old barecat') 10 | parser.add_argument('path_out', type=str, help='Path to the new barecat') 11 | 12 | args = parser.parse_args() 13 | 
upgrade_schema(args.path_in, args.path_out) 14 | 15 | 16 | def upgrade_schema(path_in: str, path_out: str): 17 | if os.path.exists(path_out + '-sqlite-index'): 18 | raise FileExistsError(f'Output path {path_out}-sqlite-index already exists') 19 | with barecat.Index(path_out + '-sqlite-index', readonly=False) as index_out: 20 | c = index_out.cursor 21 | c.execute('COMMIT') 22 | c.execute('PRAGMA foreign_keys=OFF') 23 | c.execute('PRAGMA synchronous=OFF') 24 | c.execute('PRAGMA journal_mode=OFF') 25 | c.execute(f'ATTACH DATABASE "file:{path_in}-sqlite-index?mode=ro" AS source') 26 | 27 | with index_out.no_triggers(), index_out.no_foreign_keys(): 28 | print('Migrating dir metadata...') 29 | c.execute( 30 | """ 31 | INSERT INTO dirs ( 32 | path, num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, 33 | mtime_ns) 34 | SELECT path, num_subdirs, num_files, num_files_tree, size_tree, mode, uid, 35 | gid, mtime_ns 36 | FROM source.dirs 37 | WHERE path != '' 38 | """ 39 | ) 40 | c.execute(""" 41 | UPDATE dirs 42 | SET (num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, mtime_ns) = 43 | (SELECT num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, mtime_ns 44 | FROM source.dirs WHERE path = '') 45 | WHERE path = '' 46 | """) 47 | 48 | 49 | print('Migrating file metadata...') 50 | c.execute( 51 | f""" 52 | INSERT INTO files ( 53 | path, shard, offset, size, crc32c, mode, uid, gid, mtime_ns) 54 | SELECT path, shard, offset, size, crc32c, mode, uid, gid, mtime_ns 55 | FROM source.files 56 | """ 57 | ) 58 | 59 | c.execute( 60 | f""" 61 | INSERT OR REPLACE INTO config (key, value_text, value_int) 62 | SELECT key, value_text, value_int 63 | FROM source.config 64 | """ 65 | ) 66 | 67 | index_out.conn.commit() 68 | c.execute("DETACH DATABASE source") 69 | index_out.optimize() 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=64", 4 | "wheel", 5 | "setuptools_scm[toml]>=8" 6 | ] 7 | build-backend = "setuptools.build_meta" 8 | 9 | [project] 10 | name = "barecat" 11 | dynamic = ["version"] 12 | description = "Scalable archive format for storing millions of small files with random access and SQLite indexing." 
13 | readme = "README.md" 14 | requires-python = ">=3.9" 15 | license = { file = "LICENSE" } 16 | 17 | authors = [ 18 | { name = "István Sárándi", email = "istvan.sarandi@gmail.com" } 19 | ] 20 | 21 | dependencies = [ 22 | "multiprocessing-utils", 23 | "tqdm", 24 | "crc32c" 25 | ] 26 | 27 | keywords = [ 28 | "sqlite", 29 | "dataset", 30 | "storage", 31 | "archive", 32 | "random-access", 33 | "image-dataset", 34 | "filesystem", 35 | "key-value-store", 36 | "deep-learning", 37 | "data-loader", 38 | "file-indexing" 39 | ] 40 | 41 | classifiers = [ 42 | "Development Status :: 4 - Beta", 43 | "Intended Audience :: Developers", 44 | "Intended Audience :: Science/Research", 45 | "Topic :: Scientific/Engineering :: Information Analysis", 46 | "Topic :: Software Development :: Libraries", 47 | "Topic :: System :: Archiving", 48 | "Topic :: System :: Filesystems", 49 | "License :: OSI Approved :: MIT License", 50 | "Programming Language :: Python", 51 | "Programming Language :: Python :: 3", 52 | "Programming Language :: Cython", 53 | "Operating System :: POSIX :: Linux" 54 | ] 55 | 56 | [project.scripts] 57 | barecat-create = "barecat.cli:create" 58 | barecat-extract = "barecat.cli:extract" 59 | barecat-merge = "barecat.cli:merge" 60 | barecat-merge-symlink = "barecat.cli:merge_symlink" 61 | barecat-extract-single = "barecat.cli:extract_single" 62 | barecat-index-to-csv = "barecat.cli:index_to_csv" 63 | barecat-verify = "barecat.cli:verify_integrity" 64 | barecat-to-ncdu-json = "barecat.cli:print_ncdu_json" 65 | archive2barecat = "barecat.cli:archive2barecat" 66 | barecat2archive = "barecat.cli:barecat2archive" 67 | barecat-defrag = "barecat.cli:defrag" 68 | barecat-create-recursive = "barecat.cli:create_recursive" 69 | barecat-viewer = "barecat.viewerqt6:main" 70 | barecat-upgrade-database = "barecat.upgrade_database:main" 71 | 72 | [project.urls] 73 | Homepage = "https://github.com/isarandi/barecat" 74 | Documentation = "https://istvansarandi.com/docs/barecat/api/barecat/Barecat.html" 75 | Repository = "https://github.com/isarandi/barecat" 76 | Issues = "https://github.com/isarandi/barecat/issues" 77 | Author = "https://istvansarandi.com" 78 | 79 | [tool.setuptools_scm] 80 | version_scheme = "guess-next-dev" 81 | local_scheme = "no-local-version" 82 | write_to = "src/barecat/_version.py" 83 | 84 | [tool.setuptools] 85 | package-dir = { "" = "src" } 86 | 87 | [tool.setuptools.packages.find] 88 | where = ["src"] 89 | 90 | [tool.black] 91 | line-length = 99 92 | skip-string-normalization = true 93 | -------------------------------------------------------------------------------- /tests/test_barecat.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import barecat 4 | from barecat import Barecat, BarecatFileInfo, BarecatDirInfo 5 | import pytest 6 | import tempfile 7 | import os.path as osp 8 | 9 | 10 | def test_barecat(): 11 | tempdir = tempfile.mkdtemp() 12 | filepath = osp.join(tempdir, 'test.barecat') 13 | with barecat.Barecat(filepath, readonly=False) as bc: 14 | bc['some/path.txt'] = b'hello' 15 | 16 | with barecat.Barecat(filepath, readonly=True) as bc: 17 | assert bc['some/path.txt'] == b'hello' 18 | 19 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 20 | bc.add(BarecatFileInfo(path='some/path.txt', mode=0o666), data=b'hello world') 21 | bc.add(BarecatDirInfo(path='some/dir', mode=0o777)) 22 | 23 | with barecat.Barecat(filepath, readonly=True) as bc: 24 | assert bc['some/path.txt'] == b'hello world' 25 | assert 
bc.listdir('some/dir') == [] 26 | 27 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 28 | bc['some/path.txt'] = b'hello world' 29 | assert bc['some/path.txt'] == b'hello world' 30 | del bc['some/path.txt'] 31 | with pytest.raises(KeyError): 32 | a = bc['some/path.txt'] 33 | 34 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 35 | bc['some/path.txt'] = b'hello world' 36 | 37 | with barecat.Barecat(filepath, readonly=True) as bc: 38 | with bc.open('some/path.txt') as f: 39 | f.seek(6) 40 | assert f.read() == b'world' 41 | 42 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 43 | bc['dir/file.txt'] = b'Hello, world!' 44 | bc['dir/subdir/file2.txt'] = b'Hello, world2!' 45 | 46 | with barecat.Barecat(filepath, readonly=True) as bc: 47 | assert bc.listdir('dir/subdir') == ['file2.txt'] 48 | 49 | assert list(bc.walk('dir')) == [ 50 | ('dir', ['subdir'], ['file.txt']), 51 | ('dir/subdir', [], ['file2.txt']), 52 | ] 53 | 54 | with open(osp.join(tempdir, 'file.txt'), 'wb') as f: 55 | f.write(b'Hello, world!') 56 | os.mkdir(osp.join(tempdir, 'dir2')) 57 | 58 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 59 | bc.add_by_path(osp.join(tempdir, 'file.txt')) 60 | bc.add_by_path(osp.join(tempdir, 'dir2'), store_path='dir') 61 | 62 | with barecat.Barecat(filepath, readonly=True) as bc: 63 | assert bc[osp.join(tempdir, 'file.txt')] == b'Hello, world!' 64 | assert bc.listdir('dir') == [] 65 | 66 | with Barecat(filepath, readonly=False, overwrite=True) as bc: 67 | bc.add(BarecatFileInfo(path='file.txt', mode=0o666), data=b'Hello, world!') 68 | bc.add(BarecatDirInfo(path='dir', mode=0o777)) 69 | 70 | with Barecat(filepath, readonly=True) as bc: 71 | assert bc['file.txt'] == b'Hello, world!' 72 | assert bc.listdir('dir') == [] 73 | 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | *_cython.c 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | -------------------------------------------------------------------------------- /src/barecat/upgrade_database.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | import sqlite3 4 | 5 | import barecat 6 | import barecat_cython 7 | from barecat.consumed_threadpool import ConsumedThreadPool 8 | from barecat.progbar import progressbar 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(description='Migrate index database to new version') 13 | parser.add_argument('path', type=str, help='Path to the old barecat') 14 | parser.add_argument( 15 | '--workers', type=int, default=8, help='Number of workers for calculating crc32c' 16 | ) 17 | 18 | args = parser.parse_args() 19 | dbase_path = args.path + '-sqlite-index' 20 | if not os.path.exists(dbase_path): 21 | raise FileNotFoundError(f'{dbase_path} does not exist!') 22 | 23 | os.rename(args.path + '-sqlite-index', args.path + '-sqlite-index.old') 24 | upgrade_schema(args.path) 25 | update_crc32c(args.path, workers=args.workers) 26 | 27 | 28 | def upgrade_schema(path: str): 29 | with barecat.Index(path + '-sqlite-index', readonly=False) as index_out: 30 | c = index_out.cursor 31 | c.execute('COMMIT') 32 | c.execute('PRAGMA foreign_keys=OFF') 33 | c.execute('PRAGMA synchronous=OFF') 34 | c.execute('PRAGMA journal_mode=OFF') 35 | c.execute('PRAGMA recursive_triggers=ON') 36 | c.execute(f'ATTACH DATABASE "file:{path}-sqlite-index.old?mode=ro" AS source') 37 | print('Migrating dir metadata...') 38 | c.execute( 39 | """ 40 | INSERT INTO dirs (path) 41 | SELECT path FROM source.directories 42 | WHERE path != '' 43 | """ 44 | ) 45 | print('Migrating file metadata...') 46 | c.execute( 47 | f""" 48 | INSERT INTO files (path, shard, offset, size) 49 | SELECT path, shard, offset, size 50 | FROM source.files 51 | """ 52 | ) 53 | 54 | c.execute('COMMIT') 55 | c.execute("DETACH DATABASE source") 56 | 57 | 58 | def update_crc32c(path_out: str, workers=8): 59 | with ( 60 | barecat_cython.BarecatMmapCython(path_out) as sh, 61 | barecat.Index(path_out + '-sqlite-index', readonly=False) as index, 62 | ): 63 | c = index.cursor 64 | c.execute('COMMIT') 65 | c.execute('PRAGMA synchronous=OFF') 66 | c.execute('PRAGMA journal_mode=OFF') 67 | index._triggers_enabled = False 68 | 69 | print('Calculating crc32c for all files to separate database...') 70 | path_newcrc_temp = f'{path_out}-sqlite-index-newcrc-temp' 71 | with ConsumedThreadPool( 72 | temp_crc_writer_main, 73 | main_args=(path_newcrc_temp,), 74 | max_workers=workers, 75 | queue_size=1024, 76 | ) as ctp: 77 | for fi in progressbar( 78 | index.iter_all_fileinfos(order=barecat.Order.ADDRESS), total=index.num_files 79 | ): 80 | ctp.submit( 81 | sh.crc32c_from_address, userdata=fi.path, args=(fi.shard, fi.offset, fi.size) 82 | ) 83 | 84 | print('Updating crc32c in the barecat index...') 85 | c.execute(f'ATTACH DATABASE "file:{path_newcrc_temp}?mode=ro" AS newdb') 86 | c.execute( 87 | """ 88 | UPDATE files 89 | SET crc32c=newdb.crc32c.crc32c 90 | FROM newdb.crc32c 91 | WHERE files.path=newdb.crc32c.path 92 | """ 93 | ) 94 | c.execute('COMMIT') 95 | c.execute('DETACH DATABASE newdb') 96 | 97 | os.remove(path_newcrc_temp) 98 | 99 | 100 | def temp_crc_writer_main(dbpath, future_iter): 101 | with sqlite3.connect(dbpath) as conn: 102 | c = conn.cursor() 103 | c.execute('PRAGMA synchronous=OFF') 104 | c.execute('PRAGMA journal_mode=OFF') 105 | c.execute("CREATE TABLE IF NOT EXISTS crc32c (path TEXT PRIMARY KEY, crc32c 
INTEGER)") 106 | for future in future_iter: 107 | path = future.userdata 108 | crc32c = future.result() 109 | c.execute("INSERT INTO crc32c (path, crc32c) VALUES (?, ?)", (path, crc32c)) 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /src/barecat/consumed_threadpool.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import os 3 | import queue 4 | import threading 5 | 6 | 7 | class ConsumedThreadPool: 8 | """This class solves a form of the producer-consumer problem. 9 | There is one main producer, whose items need to be processed in parallel by one of several 10 | workers, and finally the processed items are consumed by a single consumer thread. 11 | 12 | So the three steps are: 13 | 14 | 1. The main thread constructs this object, then iterates and calls submit() for each item, 15 | passing the appropriate processing function and arguments to submit(). 16 | 2. The workers process the items in parallel threads, these are the threads created by a 17 | ThreadPoolExecutor. 18 | 3. The consumer thread consumes the items, in the form of futures, running the consumer_main 19 | function originally passed to the constructor. 20 | 21 | The main producer's loop is meant to be computationally inexpensive, something that generates 22 | "tasks". The worker threads do the heavy lifting. 23 | The consumer does something that must happen in a serial manner or otherwise must happen in the 24 | same, single thread. 25 | 26 | Example: 27 | 28 | def producer_main(): 29 | with ConsumedThreadPool(consumer_main, main_args=('hello',), max_workers=8) as pool: 30 | for i in range(100): 31 | pool.submit(process_fn, userdata='anything', args=(i,)) 32 | 33 | def process_fn(i): 34 | return i * 2 35 | 36 | def consumer_main(greeting, future_iter): 37 | print(greeting) 38 | for future in future_iter: 39 | print(future.userdata) 40 | print(future.result()) 41 | """ 42 | 43 | def __init__( 44 | self, consumer_main, main_args=None, main_kwargs=None, max_workers=None, queue_size=None 45 | ): 46 | if max_workers is None: 47 | max_workers = len(os.sched_getaffinity(0)) 48 | if queue_size is None: 49 | queue_size = max_workers * 2 50 | self.q = queue.Queue(queue_size) 51 | self.semaphore = threading.Semaphore(queue_size) 52 | self.executor = concurrent.futures.ThreadPoolExecutor(max_workers) 53 | 54 | self.consumer_error_queue = queue.Queue() 55 | self.consumer_main = consumer_main 56 | 57 | if main_kwargs is None: 58 | main_kwargs = {} 59 | self.consumer_thread = threading.Thread( 60 | target=self._safe_consumer_main, args=(main_args, main_kwargs) 61 | ) 62 | self.consumer_thread.start() 63 | 64 | def _safe_consumer_main(self, main_args, main_kwargs): 65 | try: 66 | main_kwargs = {**main_kwargs, 'future_iter': IterableQueue(self.q)} 67 | self.consumer_main(*main_args, **main_kwargs) 68 | except Exception as e: 69 | self.consumer_error_queue.put(e) 70 | 71 | def submit(self, fn=None, userdata=None, args=None, kwargs=None): 72 | if not self.consumer_error_queue.empty(): 73 | consumer_exception = self.consumer_error_queue.get() 74 | raise RuntimeError('Consumer thread raised an exception') from consumer_exception 75 | 76 | self.semaphore.acquire() 77 | if args is None: 78 | args = () 79 | if kwargs is None: 80 | kwargs = {} 81 | if fn is None: 82 | fn = noop 83 | future = self.executor.submit(fn, *args, **kwargs) 84 | future.userdata = userdata 85 | 
future.add_done_callback(lambda f: self.semaphore.release()) 86 | future.add_done_callback(self.q.put) 87 | 88 | def close(self): 89 | self.executor.shutdown(wait=True) 90 | self.q.put(None) 91 | self.q.join() 92 | self.consumer_thread.join() 93 | 94 | if not self.consumer_error_queue.empty(): 95 | consumer_exception = self.consumer_error_queue.get() 96 | raise RuntimeError('Consumer thread raised an exception') from consumer_exception 97 | 98 | def __enter__(self): 99 | return self 100 | 101 | def __exit__(self, exc_type, exc_val, exc_tb): 102 | self.close() 103 | 104 | 105 | class IterableQueue: 106 | def __init__(self, q): 107 | self.q = q 108 | 109 | def __iter__(self): 110 | while (item := self.q.get()) is not None: 111 | yield item 112 | self.q.task_done() 113 | self.q.task_done() 114 | 115 | 116 | def noop(): 117 | pass 118 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/class.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | {% set visible_children = obj.children|selectattr("display")|list %} 10 | {% set own_page_children = visible_children|selectattr("type", "in", own_page_types)|list %} 11 | {% if is_own_page and own_page_children %} 12 | .. toctree:: 13 | :hidden: 14 | 15 | {% for child in own_page_children %} 16 | {{ child.include_path }} 17 | {% endfor %} 18 | 19 | {% endif %} 20 | .. py:{{ obj.type }}:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}{% if obj.args %}({{ obj.args }}){% endif %} 21 | 22 | {% for (args, return_annotation) in obj.overloads %} 23 | {{ " " * (obj.type | length) }} {{ obj.short_name }}{% if args %}({{ args }}){% endif %} 24 | 25 | {% endfor %} 26 | {% if obj.bases %} 27 | {% if "show-inheritance" in autoapi_options %} 28 | 29 | Bases: {% for base in obj.bases %}{{ base|link_objs }}{% if not loop.last %}, {% endif %}{% endfor %} 30 | {% endif %} 31 | 32 | 33 | {% if "show-inheritance-diagram" in autoapi_options and obj.bases != ["object"] %} 34 | .. autoapi-inheritance-diagram:: {{ obj.obj["full_name"] }} 35 | :parts: 1 36 | {% if "private-members" in autoapi_options %} 37 | :private-bases: 38 | {% endif %} 39 | 40 | {% endif %} 41 | {% endif %} 42 | {% if obj.docstring %} 43 | 44 | {{ obj.docstring|indent(3) }} 45 | {% endif %} 46 | {% for obj_item in visible_children %} 47 | {% if obj_item.type not in own_page_types %} 48 | 49 | {{ obj_item.render()|indent(3) }} 50 | {% endif %} 51 | {% endfor %} 52 | {% if is_own_page and own_page_children %} 53 | {% set visible_attributes = own_page_children|selectattr("type", "equalto", "attribute")|list %} 54 | {% if visible_attributes %} 55 | Attributes 56 | ---------- 57 | 58 | .. autoapisummary:: 59 | 60 | {% for attribute in visible_attributes %} 61 | {{ attribute.id }} 62 | {% endfor %} 63 | 64 | 65 | {% endif %} 66 | {% set visible_properties = own_page_children|selectattr("type", "equalto", "property")|list %} 67 | {% if visible_properties %} 68 | Properties 69 | ---------- 70 | 71 | .. autoapisummary:: 72 | 73 | {% for property in visible_properties %} 74 | {{ property.id }} 75 | {% endfor %} 76 | 77 | 78 | {% endif %} 79 | {% set visible_exceptions = own_page_children|selectattr("type", "equalto", "exception")|list %} 80 | {% if visible_exceptions %} 81 | Exceptions 82 | ---------- 83 | 84 | .. 
autoapisummary:: 85 | 86 | {% for exception in visible_exceptions %} 87 | {{ exception.id }} 88 | {% endfor %} 89 | 90 | 91 | {% endif %} 92 | {% set visible_classes = own_page_children|selectattr("type", "equalto", "class")|list %} 93 | {% if visible_classes %} 94 | Classes 95 | ------- 96 | 97 | .. autoapisummary:: 98 | 99 | {% for klass in visible_classes %} 100 | {{ klass.id }} 101 | {% endfor %} 102 | 103 | 104 | {% endif %} 105 | 106 | {% set static_methods = own_page_children|selectattr("type", "equalto", "method")|selectattr("properties", "defined")|selectattr("properties", "equalto", ["staticmethod"])|list %} 107 | {% set class_methods = own_page_children|selectattr("type", "equalto", "method")|selectattr("properties", "defined")|selectattr("properties", "equalto", ["classmethod"])|list %} 108 | {% set instance_methods = own_page_children|selectattr("type", "equalto", "method")|rejectattr("properties", "equalto", ["staticmethod"])|rejectattr("properties", "equalto", ["classmethod"])|list %} 109 | 110 | {% if instance_methods %} 111 | Instance Methods 112 | ---------------- 113 | 114 | .. autoapisummary:: 115 | 116 | {% for method in instance_methods %} 117 | {{ method.id }} 118 | {% endfor %} 119 | 120 | 121 | {% endif %} 122 | {% if class_methods %} 123 | Class Methods 124 | ------------- 125 | 126 | .. autoapisummary:: 127 | 128 | {% for method in class_methods %} 129 | {{ method.id }} 130 | {% endfor %} 131 | 132 | 133 | {% endif %} 134 | {% if static_methods %} 135 | Static Methods 136 | -------------- 137 | 138 | .. autoapisummary:: 139 | 140 | {% for method in static_methods %} 141 | {{ method.id }} 142 | {% endfor %} 143 | 144 | 145 | {% endif %} 146 | {% endif %} 147 | {% endif %} 148 | 149 | 150 | .. footbibliography:: -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/module.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.id }} 6 | {{ "=" * obj.id|length }} 7 | 8 | .. py:module:: {{ obj.name }} 9 | 10 | {% if obj.docstring %} 11 | .. autoapi-nested-parse:: 12 | 13 | {{ obj.docstring|indent(3) }} 14 | 15 | {% endif %} 16 | 17 | {% block submodules %} 18 | {% set visible_subpackages = obj.subpackages|selectattr("display")|list %} 19 | {% set visible_submodules = obj.submodules|selectattr("display")|list %} 20 | {% set visible_submodules = (visible_subpackages + visible_submodules)|sort %} 21 | {% if visible_submodules %} 22 | Submodules 23 | ---------- 24 | 25 | .. toctree:: 26 | :maxdepth: 1 27 | 28 | {% for submodule in visible_submodules %} 29 | {{ submodule.include_path }} 30 | {% endfor %} 31 | 32 | 33 | {% endif %} 34 | {% endblock %} 35 | {% block content %} 36 | {% set visible_children = obj.children|selectattr("display")|list %} 37 | {% if visible_children %} 38 | {% set visible_attributes = visible_children|selectattr("type", "equalto", "data")|list %} 39 | {% if visible_attributes %} 40 | {% if "attribute" in own_page_types or "show-module-summary" in autoapi_options %} 41 | Attributes 42 | ---------- 43 | 44 | {% if "attribute" in own_page_types %} 45 | .. toctree:: 46 | :hidden: 47 | 48 | {% for attribute in visible_attributes %} 49 | {{ attribute.include_path }} 50 | {% endfor %} 51 | 52 | {% endif %} 53 | .. 
autoapisummary:: 54 | 55 | {% for attribute in visible_attributes %} 56 | {{ attribute.id }} 57 | {% endfor %} 58 | {% endif %} 59 | 60 | 61 | {% endif %} 62 | {% set visible_exceptions = visible_children|selectattr("type", "equalto", "exception")|list %} 63 | {% if visible_exceptions %} 64 | {% if "exception" in own_page_types or "show-module-summary" in autoapi_options %} 65 | Exceptions 66 | ---------- 67 | 68 | {% if "exception" in own_page_types %} 69 | .. toctree:: 70 | :hidden: 71 | 72 | {% for exception in visible_exceptions %} 73 | {{ exception.include_path }} 74 | {% endfor %} 75 | 76 | {% endif %} 77 | .. autoapisummary:: 78 | 79 | {% for exception in visible_exceptions %} 80 | {{ exception.id }} 81 | {% endfor %} 82 | {% endif %} 83 | 84 | 85 | {% endif %} 86 | {% set visible_classes = visible_children|selectattr("type", "equalto", "class")|list %} 87 | {% if visible_classes %} 88 | {% if "class" in own_page_types or "show-module-summary" in autoapi_options %} 89 | Classes 90 | ------- 91 | 92 | {% if "class" in own_page_types %} 93 | .. toctree:: 94 | :hidden: 95 | 96 | {% for klass in visible_classes %} 97 | {{ klass.include_path }} 98 | {% endfor %} 99 | 100 | {% endif %} 101 | .. autoapisummary:: 102 | 103 | {% for klass in visible_classes %} 104 | {{ klass.id }} 105 | {% endfor %} 106 | {% endif %} 107 | 108 | 109 | {% endif %} 110 | {% set visible_functions = visible_children|selectattr("type", "equalto", "function")|list %} 111 | {% if visible_functions %} 112 | {% if "function" in own_page_types or "show-module-summary" in autoapi_options %} 113 | Functions 114 | --------- 115 | 116 | {% if "function" in own_page_types %} 117 | .. toctree:: 118 | :hidden: 119 | 120 | {% for function in visible_functions %} 121 | {{ function.include_path }} 122 | {% endfor %} 123 | 124 | {% endif %} 125 | .. autoapisummary:: 126 | 127 | {% for function in visible_functions %} 128 | {{ function.id }} 129 | {% endfor %} 130 | {% endif %} 131 | 132 | 133 | {% endif %} 134 | {% set this_page_children = visible_children|rejectattr("type", "in", own_page_types)|list %} 135 | {% if this_page_children %} 136 | {{ obj.type|title }} Contents 137 | {{ "-" * obj.type|length }}--------- 138 | 139 | {% for obj_item in this_page_children %} 140 | {{ obj_item.render()|indent(0) }} 141 | {% endfor %} 142 | {% endif %} 143 | {% endif %} 144 | {% endblock %} 145 | {% else %} 146 | .. py:module:: {{ obj.name }} 147 | 148 | {% if obj.docstring %} 149 | .. autoapi-nested-parse:: 150 | 151 | {{ obj.docstring|indent(6) }} 152 | 153 | {% endif %} 154 | {% for obj_item in visible_children %} 155 | {{ obj_item.render()|indent(3) }} 156 | {% endfor %} 157 | {% endif %} 158 | {% endif %} 159 | 160 | .. footbibliography:: -------------------------------------------------------------------------------- /src/barecat/glob_to_regex.py: -------------------------------------------------------------------------------- 1 | # This is copied from CPython main branch as of 2024-12-07. 2 | import re 3 | import os.path 4 | import functools 5 | 6 | _re_setops_sub = re.compile(r'([&~|])').sub 7 | _re_escape = functools.lru_cache(maxsize=512)(re.escape) 8 | 9 | 10 | def glob_to_regex(pat, *, recursive=False, include_hidden=False, seps=None): 11 | """Translate a pathname with shell wildcards to a regular expression. 12 | 13 | If `recursive` is true, the pattern segment '**' will match any number of 14 | path segments. 15 | 16 | If `include_hidden` is true, wildcards can match path segments beginning 17 | with a dot ('.'). 
18 | 19 | If a sequence of separator characters is given to `seps`, they will be 20 | used to split the pattern into segments and match path separators. If not 21 | given, os.path.sep and os.path.altsep (where available) are used. 22 | """ 23 | if not seps: 24 | if os.path.altsep: 25 | seps = (os.path.sep, os.path.altsep) 26 | else: 27 | seps = os.path.sep 28 | escaped_seps = ''.join(map(re.escape, seps)) 29 | any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps 30 | not_sep = f'[^{escaped_seps}]' 31 | if include_hidden: 32 | one_last_segment = f'{not_sep}+' 33 | one_segment = f'{one_last_segment}{any_sep}' 34 | any_segments = f'(?:.+{any_sep})?' 35 | any_last_segments = '.*' 36 | else: 37 | one_last_segment = f'[^{escaped_seps}.]{not_sep}*' 38 | one_segment = f'{one_last_segment}{any_sep}' 39 | any_segments = f'(?:{one_segment})*' 40 | any_last_segments = f'{any_segments}(?:{one_last_segment})?' 41 | 42 | results = [] 43 | parts = re.split(any_sep, pat) 44 | last_part_idx = len(parts) - 1 45 | for idx, part in enumerate(parts): 46 | if part == '*': 47 | results.append(one_segment if idx < last_part_idx else one_last_segment) 48 | elif recursive and part == '**': 49 | if idx < last_part_idx: 50 | if parts[idx + 1] != '**': 51 | results.append(any_segments) 52 | else: 53 | results.append(any_last_segments) 54 | else: 55 | if part: 56 | if not include_hidden and part[0] in '*?': 57 | results.append(r'(?!\.)') 58 | results.extend(_translate(part, f'{not_sep}*', not_sep)[0]) 59 | if idx < last_part_idx: 60 | results.append(any_sep) 61 | res = ''.join(results) 62 | return fr'(?s:{res})\Z' 63 | 64 | 65 | def _translate(pat, star, question_mark): 66 | res = [] 67 | add = res.append 68 | star_indices = [] 69 | 70 | i, n = 0, len(pat) 71 | while i < n: 72 | c = pat[i] 73 | i = i + 1 74 | if c == '*': 75 | # store the position of the wildcard 76 | star_indices.append(len(res)) 77 | add(star) 78 | # compress consecutive `*` into one 79 | while i < n and pat[i] == '*': 80 | i += 1 81 | elif c == '?': 82 | add(question_mark) 83 | elif c == '[': 84 | j = i 85 | if j < n and pat[j] == '!': 86 | j = j + 1 87 | if j < n and pat[j] == ']': 88 | j = j + 1 89 | while j < n and pat[j] != ']': 90 | j = j + 1 91 | if j >= n: 92 | add('\\[') 93 | else: 94 | stuff = pat[i:j] 95 | if '-' not in stuff: 96 | stuff = stuff.replace('\\', r'\\') 97 | else: 98 | chunks = [] 99 | k = i + 2 if pat[i] == '!' else i + 1 100 | while True: 101 | k = pat.find('-', k, j) 102 | if k < 0: 103 | break 104 | chunks.append(pat[i:k]) 105 | i = k + 1 106 | k = k + 3 107 | chunk = pat[i:j] 108 | if chunk: 109 | chunks.append(chunk) 110 | else: 111 | chunks[-1] += '-' 112 | # Remove empty ranges -- invalid in RE. 113 | for k in range(len(chunks) - 1, 0, -1): 114 | if chunks[k - 1][-1] > chunks[k][0]: 115 | chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:] 116 | del chunks[k] 117 | # Escape backslashes and hyphens for set difference (--). 118 | # Hyphens that create ranges shouldn't be escaped. 119 | stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') for s in chunks) 120 | i = j + 1 121 | if not stuff: 122 | # Empty range: never match. 123 | add('(?!)') 124 | elif stuff == '!': 125 | # Negated empty range: match any character. 126 | add('.') 127 | else: 128 | # Escape set operations (&&, ~~ and ||). 
129 |                     stuff = _re_setops_sub(r'\\\1', stuff)
130 |                     if stuff[0] == '!':
131 |                         stuff = '^' + stuff[1:]
132 |                     elif stuff[0] in ('^', '['):
133 |                         stuff = '\\' + stuff
134 |                     add(f'[{stuff}]')
135 |         else:
136 |             add(_re_escape(c))
137 |     assert i == n
138 |     return res, star_indices
139 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Barecat
2 | 
3 | **[Full API Reference Docs](https://istvansarandi.com/docs/barecat/api/barecat/Barecat.html)**
4 | 
5 | Barecat (**bare** con**cat**enation) is a highly scalable, simple aggregate storage format for
6 | storing many (tens of millions and more) small files, with a focus on fast random access and
7 | minimal overhead.
8 | 
9 | Barecat can be thought of as a simple filesystem, or as something akin to an indexed tarball, or a
10 | key-value store. Indeed, it can be [mounted via FUSE](https://github.com/isarandi/barecat-mount), converted to a tarball, or used like a dictionary
11 | within Python.
12 | 
13 | Barecat associates strings (file paths) with binary data (file contents). It's like a dictionary,
14 | but it has some special handling for '/' characters in the keys, supporting a filesystem-like
15 | experience (`listdir`, `walk`, `glob`, etc.).
16 | 
17 | Internally, all the data is simply concatenated one after another into one or more data shard files.
18 | Additionally, an index is maintained in an SQLite database, which stores the shard number, the offset
19 | and the size of each inner file (as well as a checksum, and further filesystem-like metadata
20 | such as the modification time). Barecat also maintains aggregate statistics for each directory, such as the
21 | total number of files and total file size.
22 | 
23 | 
24 | ![Architecture](./figure.png)
25 | 
26 | As you can see, the Barecat format is very simple. Readers/writers are easy to write in any language, since
27 | SQLite is a widely supported format.
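As an illustration, a complete standalone reader fits in a few lines of plain Python. The sketch below is not part of the package; it assumes the index/shard file naming shown in the CLI section further down (`mydata.barecat-sqlite-index`, `mydata.barecat-shard-00000`, ...) and the `files` table defined in `src/barecat/sql/schema.sql`. Note that stored paths have no leading slash.

```python
import sqlite3


def read_stored_file(archive_path, inner_path):
    # Look up which shard holds the file, and at what offset and size.
    conn = sqlite3.connect(f'{archive_path}-sqlite-index')
    try:
        row = conn.execute(
            'SELECT shard, offset, size FROM files WHERE path = ?',
            (inner_path,)).fetchone()
    finally:
        conn.close()
    if row is None:
        raise FileNotFoundError(inner_path)
    shard, offset, size = row
    # A shard is just raw concatenated file contents: seek and read.
    with open(f'{archive_path}-shard-{shard:05d}', 'rb') as f:
        f.seek(offset)
        return f.read(size)
```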
28 | 
29 | 
30 | ## Background
31 | 
32 | A typical use case for Barecat is storing image files for training deep learning models, where the
33 | files are accessed randomly during training. The files are typically stored on a network file
34 | system, where accessing many small files can be slow, and clusters often limit the number of
35 | files a user may store. It is therefore necessary to merge the small files into larger ones.
36 | However, typical archive formats such as tar are not suitable, since they don't allow fast random
37 | lookups. In tar, one has to scan the entire archive, as there is no central directory.
38 | Zip is better, but still requires scanning the central directory, which can be slow for very large
39 | archives with millions or tens of millions of files.
40 | 
41 | To support very large datasets, we need an index into the archive, and the index itself must not
42 | have to be loaded entirely into memory.
43 | 
44 | Therefore, in this format the metadata is indexed separately in an SQLite database for fast lookup
45 | by path. The index also allows fast listing of directory contents and contains aggregate
46 | statistics (total file size, number of files) for each directory.
47 | 
48 | ## Features
49 | 
50 | - **Fast random access**: The archive can be accessed randomly, addressed by filepath,
51 | without having to scan the entire archive or all the metadata.
52 | The index is stored in a separate SQLite database file, which itself does not need to be loaded
53 | entirely into memory. Ideal for storing training image data for deep learning jobs.
54 | - **Sharding**: To make it easier to move the data around or to distribute it across multiple
55 | storage devices, the archive can be split into multiple files of equal size (shards, or volumes).
56 | The shards do not have to be concatenated to be used; the library keeps all shard files open
57 | and loads data from the appropriate one during normal operations.
58 | - **Fast browsing**: The SQLite database contains an index on the parent directories, allowing
59 | fast listing of directory contents and aggregate statistics (total file size, number of files).
60 | - **Intuitive API**: Familiar filesystem-like API, as well as a dictionary-like one.
61 | - **Mountable**: The archive can be efficiently mounted in readonly or read-write mode.
62 | - **Simple storage format**: The files are simply concatenated after each other, and the index contains
63 | the offsets and sizes of each file. There is no header format to understand. The index can be
64 | dumped into any format with simple SQL queries.
65 | 
66 | ## Command line interface
67 | 
68 | To create a Barecat archive, use the `barecat-create` or `barecat-create-recursive` commands, which
69 | are executables installed automatically with the pip package.
70 | 
71 | ```bash
72 | barecat-create --file=mydata.barecat --shard-size=100G < path_of_paths.txt
73 | 
74 | find dirname -name '*.jpg' -print0 | barecat-create --null --file=mydata.barecat --shard-size=100G
75 | 
76 | barecat-create-recursive dir1 dir2 dir3 --file=mydata.barecat --shard-size=100G
77 | ```
78 | 
79 | This may yield the following files:
80 | 
81 | ```
82 | mydata.barecat-shard-00000
83 | mydata.barecat-shard-00001
84 | mydata.barecat-sqlite-index
85 | ```
86 | 
87 | The files can be extracted again. Unix-like permissions, modification times and owner info are
88 | preserved.
89 | 
90 | ```bash
91 | barecat-extract --file=mydata.barecat --target-directory=targetdir/
92 | ```
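Besides creation and extraction, the pip package installs further maintenance commands (entry points defined in `src/barecat/cli.py`); for example, the ones exercised by the test suite:

```bash
barecat-verify mydata.barecat        # check CRC32C checksums, directory stats and gaps
barecat-defrag mydata.barecat        # reclaim gaps left by deleted files
barecat-index-to-csv mydata.barecat-sqlite-index > index.csv
barecat-extract-single --barecat-file=mydata.barecat --path=path/inside/archive.jpg > out.jpg
```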
93 | 
94 | ## Python API
95 | 
96 | ```python
97 | 
98 | import barecat
99 | 
100 | with barecat.Barecat('mydata.barecat', readonly=False) as bc:
101 |     bc['path/to/file/as/stored.jpg'] = binary_file_data
102 |     bc.add_by_path('path/to/file/on/disk.jpg')
103 | 
104 |     with open('path', 'rb') as f:
105 |         bc.add('path/to/file/on/disk.jpg', fileobj=f)
106 | 
107 | with barecat.Barecat('mydata.barecat') as bc:
108 |     binary_file_data = bc['path/to/file.jpg']
109 |     entrynames = bc.listdir('path/to')
110 |     for root, dirs, files in bc.walk('path/to/something'):
111 |         print(root, dirs, files)
112 | 
113 |     paths = bc.glob('path/to/**/*.jpg', recursive=True)
114 | 
115 |     with bc.open('path/to/file.jpg', 'rb') as f:
116 |         data = f.read(123)
117 | ```
118 | 
119 | ## Image viewer
120 | 
121 | Barecat comes with a simple image viewer that can be used to browse the contents of a Barecat
122 | archive.
123 | 
124 | ```bash
125 | barecat-image-viewer mydata.barecat
126 | ```
127 | 
128 | 
129 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Barecat
2 | =======
3 | 
4 | Barecat (**bare** con**cat**enation) is a highly scalable, simple aggregate storage format for
5 | storing many (tens of millions and more) small files, with a focus on fast random access and
6 | minimal overhead.
7 | 
8 | Barecat can be thought of as a simple filesystem, or as something akin to an indexed tarball, or a
9 | key-value store. Indeed, it can be `mounted via FUSE <https://github.com/isarandi/barecat-mount>`_, converted to a tarball, or used like a dictionary
10 | within Python.
11 | 
12 | Barecat associates strings (file paths) with binary data (file contents). It's like a dictionary,
13 | but it has some special handling for '/' characters in the keys, supporting a filesystem-like
14 | experience (``listdir``, ``walk``, ``glob``, etc.).
15 | 
16 | Internally, all the data is simply concatenated one after another into one or more data shard files.
17 | Additionally, an index is maintained in an SQLite database, which stores the shard number, the offset
18 | and the size of each inner file (as well as a checksum, and further filesystem-like metadata
19 | such as the modification time). Barecat also maintains aggregate statistics for each directory, such as the
20 | total number of files and total file size.
21 | 
22 | .. image:: ../figure.png
23 | 
24 | As you can see, the Barecat format is very simple. Readers/writers are easy to write in any language, since
25 | SQLite is a widely supported format.
26 | 
27 | Background
28 | ----------
29 | 
30 | A typical use case for Barecat is storing image files for training deep learning models, where the
31 | files are accessed randomly during training. The files are typically stored on a network file
32 | system, where accessing many small files can be slow, and clusters often limit the number of
33 | files a user may store. It is therefore necessary to merge the small files into larger ones.
34 | However, typical archive formats such as tar are not suitable, since they don't allow fast random
35 | lookups. In tar, one has to scan the entire archive, as there is no central directory.
36 | Zip is better, but still requires scanning the central directory, which can be slow for very large
37 | archives with millions or tens of millions of files.
38 | 
39 | To support very large datasets, we need an index into the archive, and the index itself must not
40 | have to be loaded entirely into memory.
41 | 
42 | Therefore, in this format the metadata is indexed separately in an SQLite database for fast lookup
43 | by path. The index also allows fast listing of directory contents and contains aggregate
44 | statistics (total file size, number of files) for each directory.
45 | 
46 | Features
47 | --------
48 | 
49 | - **Fast random access**: The archive can be accessed randomly, addressed by filepath,
50 | without having to scan the entire archive or all the metadata.
51 | The index is stored in a separate SQLite database file, which itself does not need to be loaded
52 | entirely into memory. Ideal for storing training image data for deep learning jobs.
53 | - **Sharding**: To make it easier to move the data around or to distribute it across multiple
54 | storage devices, the archive can be split into multiple files of equal size (shards, or volumes).
55 | The shards do not have to be concatenated to be used; the library keeps all shard files open
56 | and loads data from the appropriate one during normal operations.
57 | - **Fast browsing**: The SQLite database contains an index on the parent directories, allowing
58 | fast listing of directory contents and aggregate statistics (total file size, number of files).
59 | - **Intuitive API**: Familiar filesystem-like API, as well as a dictionary-like one.
60 | - **Mountable**: The archive can be efficiently mounted in readonly or read-write mode.
61 | - **Simple storage format**: The files are simply concatenated after each other, and the index contains
62 | the offsets and sizes of each file. There is no header format to understand. The index can be
63 | dumped into any format with simple SQL queries, as shown in the example below.
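For example, a CSV dump of the whole index needs nothing beyond the stock ``sqlite3`` shell (a sketch; the ``files`` table and its columns are defined in ``src/barecat/sql/schema.sql``):

.. code-block:: bash

    sqlite3 -header -csv mydata.barecat-sqlite-index \
        'SELECT path, shard, offset, size, crc32c FROM files ORDER BY path' > index.csv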
64 | 
65 | Command line interface
66 | ----------------------
67 | 
68 | To create a Barecat archive, use the ``barecat-create`` or ``barecat-create-recursive`` commands, which
69 | are executables installed automatically with the pip package.
70 | 
71 | .. code-block:: bash
72 | 
73 |     barecat-create --file=mydata.barecat --shard-size=100G < path_of_paths.txt
74 | 
75 |     find dirname -name '*.jpg' -print0 | barecat-create --null --file=mydata.barecat --shard-size=100G
76 | 
77 |     barecat-create-recursive dir1 dir2 dir3 --file=mydata.barecat --shard-size=100G
78 | 
79 | This may yield the following files:
80 | 
81 | .. code-block:: text
82 | 
83 |     mydata.barecat-shard-00000
84 |     mydata.barecat-shard-00001
85 |     mydata.barecat-sqlite-index
86 | 
87 | The files can be extracted again. Unix-like permissions, modification times and owner info are
88 | preserved.
89 | 
90 | .. code-block:: bash
91 | 
92 |     barecat-extract --file=mydata.barecat --target-directory=targetdir/
93 | 
94 | Python API
95 | ----------
96 | 
97 | .. code-block:: python
98 | 
99 |     import barecat
100 | 
101 |     with barecat.Barecat('mydata.barecat', readonly=False) as bc:
102 |         bc['path/to/file/as/stored.jpg'] = binary_file_data
103 |         bc.add_by_path('path/to/file/on/disk.jpg')
104 | 
105 |         with open('path', 'rb') as f:
106 |             bc.add('path/to/file/on/disk.jpg', fileobj=f)
107 | 
108 |     with barecat.Barecat('mydata.barecat') as bc:
109 |         binary_file_data = bc['path/to/file.jpg']
110 |         entrynames = bc.listdir('path/to')
111 |         for root, dirs, files in bc.walk('path/to/something'):
112 |             print(root, dirs, files)
113 | 
114 |         paths = bc.glob('path/to/**/*.jpg', recursive=True)
115 | 
116 |         with bc.open('path/to/file.jpg', 'rb') as f:
117 |             data = f.read(123)
118 | 
119 | Image viewer
120 | ------------
121 | 
122 | Barecat comes with a simple image viewer that can be used to browse the contents of a Barecat
123 | archive.
124 | 
125 | .. code-block:: bash
126 | 
127 |     barecat-image-viewer mydata.barecat
128 | 
129 | Sitemap
130 | -------
131 | 
132 | ..
toctree:: 133 | :maxdepth: 3 134 | :caption: Contents 135 | 136 | 137 | * :ref:`genindex` 138 | * :ref:`modindex` 139 | * :ref:`search` 140 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import types 2 | import contextlib 3 | import importlib 4 | import inspect 5 | import os 6 | import re 7 | import sys 8 | from enum import Enum 9 | 10 | import setuptools_scm 11 | import toml 12 | 13 | sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) 14 | 15 | 16 | pyproject_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "pyproject.toml")) 17 | 18 | with open(pyproject_path) as f: 19 | data = toml.load(f) 20 | 21 | project_info = data["project"] 22 | project_slug = project_info["name"].replace(" ", "-").lower() 23 | tool_urls = project_info.get("urls", {}) 24 | 25 | repo_url = tool_urls.get("Repository", "") 26 | author_url = tool_urls.get("Author", "") 27 | github_username = re.match(r"https://github\.com/([^/]+)/?", repo_url)[1] 28 | 29 | project = project_info["name"] 30 | release = setuptools_scm.get_version('..') 31 | version = ".".join(release.split(".")[:2]) 32 | main_module_name = project_slug.replace('-', '_') 33 | repo_name = project_slug 34 | module = importlib.import_module(main_module_name) 35 | globals()[main_module_name] = module 36 | 37 | 38 | # -- Project information ----------------------------------------------------- 39 | linkcode_url = repo_url 40 | 41 | author = project_info["authors"][0]["name"] 42 | copyright = f'%Y' 43 | 44 | # -- General configuration --------------------------------------------------- 45 | add_module_names = False 46 | python_use_unqualified_type_names = True 47 | extensions = [ 48 | 'sphinx.ext.autodoc', 49 | 'sphinx.ext.napoleon', 50 | 'sphinx.ext.autosummary', 51 | 'sphinx.ext.intersphinx', 52 | 'sphinx.ext.linkcode', 53 | 'sphinx.ext.autodoc.typehints', 54 | 'sphinxcontrib.bibtex', 55 | 'autoapi.extension', 56 | 'sphinx.ext.inheritance_diagram', 57 | 'sphinx_codeautolink', 58 | ] 59 | bibtex_bibfiles = ['abbrev_long.bib', 'references.bib'] 60 | bibtex_footbibliography_header = ".. 
rubric:: References" 61 | intersphinx_mapping = { 62 | 'python': ('https://docs.python.org/3', None), 63 | 'torch': ('https://pytorch.org/docs/main/', None), 64 | 'numpy': ('https://numpy.org/doc/stable/', None), 65 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None), 66 | } 67 | 68 | github_username = github_username 69 | github_repository = repo_name 70 | autodoc_show_sourcelink = False 71 | html_show_sourcelink = False 72 | 73 | templates_path = ['_templates'] 74 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 75 | python_display_short_literal_types = True 76 | 77 | html_title = project 78 | html_theme = 'pydata_sphinx_theme' 79 | html_theme_options = { 80 | "show_toc_level": 3, 81 | "icon_links": [ 82 | { 83 | "name": "GitHub", 84 | "url": repo_url, 85 | "icon": "fa-brands fa-square-github", 86 | "type": "fontawesome", 87 | } 88 | ], 89 | } 90 | html_static_path = ['_static'] 91 | html_css_files = ['styles/my_theme.css'] 92 | 93 | html_context = { 94 | "author_url": author_url, 95 | "author": author, 96 | } 97 | 98 | toc_object_entries_show_parents = "hide" 99 | 100 | autoapi_root = 'api' 101 | autoapi_member_order = 'bysource' 102 | autodoc_typehints = 'description' 103 | autoapi_own_page_level = 'attribute' 104 | autoapi_type = 'python' 105 | autodoc_default_options = { 106 | 'members': True, 107 | 'inherited-members': True, 108 | 'undoc-members': False, 109 | 'exclude-members': '__init__, __weakref__, __repr__, __str__', 110 | } 111 | autoapi_options = ['members', 'show-inheritance', 'special-members', 'show-module-summary'] 112 | autoapi_add_toctree_entry = True 113 | autoapi_dirs = ['../src'] 114 | autoapi_template_dir = '_templates/autoapi' 115 | 116 | autodoc_member_order = 'bysource' 117 | autoclass_content = 'class' 118 | 119 | autosummary_generate = True 120 | autosummary_imported_members = False 121 | 122 | 123 | def autodoc_skip_member(app, what, name, obj, skip, options): 124 | """ 125 | Skip members (functions, classes, modules) without docstrings. 
126 | """ 127 | # Check if the object has a __doc__ attribute 128 | if not getattr(obj, 'docstring', None): 129 | print('no docstring', name) 130 | return True # Skip if there's no docstring 131 | elif what in ('class', 'function', 'attribute'): 132 | # Check if the module of the class has a docstring 133 | print('checking module', name) 134 | module_name = '.'.join(name.split('.')[:-1]) 135 | 136 | try: 137 | module = importlib.import_module(module_name) 138 | return not getattr(module, '__doc__', None) 139 | except ModuleNotFoundError as e: 140 | print('module not found', module_name, str(e)) 141 | return None 142 | 143 | 144 | def linkcode_resolve(domain, info): 145 | if domain != 'py': 146 | return None 147 | 148 | file, start, end = get_line_numbers(eval(info['fullname'])) 149 | relpath = os.path.relpath(file, os.path.dirname(module.__file__)) 150 | return f'{repo_url}/blob/v{release}/src/{main_module_name}/{relpath}#L{start}-L{end}' 151 | 152 | 153 | def get_line_numbers(obj): 154 | if isinstance(obj, property): 155 | obj = obj.fget 156 | 157 | if isinstance(obj, Enum): 158 | return get_enum_member_line_numbers(obj) 159 | 160 | if inspect.ismemberdescriptor(obj): 161 | return get_member_line_numbers(obj) 162 | 163 | with module_restored(obj): 164 | lines = inspect.getsourcelines(obj) 165 | file = inspect.getsourcefile(obj) 166 | 167 | start, end = lines[1], lines[1] + len(lines[0]) - 1 168 | return file, start, end 169 | 170 | 171 | def get_enum_member_line_numbers(obj): 172 | class_ = obj.__class__ 173 | with module_restored(class_): 174 | source_lines, start_line = inspect.getsourcelines(class_) 175 | 176 | for i, line in enumerate(source_lines): 177 | if f"{obj.name} =" in line: 178 | return inspect.getsourcefile(class_), start_line + i, start_line + i 179 | else: 180 | raise ValueError(f"Enum member {obj.name} not found in {class_}") 181 | 182 | 183 | def get_member_line_numbers(obj: types.MemberDescriptorType): 184 | class_ = obj.__objclass__ 185 | with module_restored(class_): 186 | source_lines, start_line = inspect.getsourcelines(class_) 187 | 188 | for i, line in enumerate(source_lines): 189 | if f"{obj.__name__} = " in line: 190 | return inspect.getsourcefile(class_), start_line + i, start_line + i 191 | else: 192 | raise ValueError(f"Member {obj.__name__} not found in {class_}") 193 | 194 | 195 | @contextlib.contextmanager 196 | def module_restored(obj): 197 | if not hasattr(obj, '_module_original_'): 198 | yield 199 | else: 200 | fake_module = obj.__module__ 201 | obj.__module__ = obj._module_original_ 202 | yield 203 | obj.__module__ = fake_module 204 | 205 | 206 | def setup(app): 207 | app.connect('autoapi-skip-member', autodoc_skip_member) 208 | app.connect('autodoc-skip-member', autodoc_skip_member) 209 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import barecat 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def temp_jpeg_dir(tmp_path): 9 | """ 10 | Creates a complex temporary directory with sample JPEG files. 
11 |     """
12 |     (tmp_path / "dir1").mkdir()
13 |     (tmp_path / "dir1/subdir1").mkdir()
14 |     (tmp_path / "dir1/subdir1/test1.jpg").write_bytes(b"dummy data1")
15 |     (tmp_path / "dir1/subdir2").mkdir()
16 |     (tmp_path / "dir1/subdir2/test2.jpg").write_bytes(b"dummy data2")
17 |     (tmp_path / "dir2").mkdir()
18 |     (tmp_path / "dir2/test3.jpg").write_bytes(b"dummy data3")
19 |     (tmp_path / "dir2/empty_subdir").mkdir()
20 |     (tmp_path / "dir3").mkdir()
21 |     return tmp_path
22 | 
23 | 
24 | @pytest.fixture
25 | def barecat_archive(temp_jpeg_dir):
26 |     """
27 |     Creates a standard Barecat archive for testing.
28 |     """
29 |     archive_file = temp_jpeg_dir / "mydata.barecat"
30 | 
31 |     create_cmd = [
32 |         "barecat-create-recursive",
33 |         "--file", str(archive_file),
34 |         "--overwrite",
35 |         str(temp_jpeg_dir / "dir1"),
36 |         str(temp_jpeg_dir / "dir2"),
37 |         str(temp_jpeg_dir / "dir3"),
38 |         '--shard-size=22'
39 |     ]
40 |     subprocess.run(create_cmd, check=True)
41 | 
42 |     return archive_file
43 | 
44 | 
45 | def test_barecat_creation(temp_jpeg_dir):
46 |     """
47 |     Runs `find` with `barecat-create` and verifies the output.
48 |     """
49 |     output_file = temp_jpeg_dir / "mydata.barecat"
50 |     cmd = f"cd {temp_jpeg_dir}; find . -name '*.jpg' -print0 | sort -z | barecat-create --null --file={output_file} --overwrite --shard-size=22"
51 | 
52 |     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
53 | 
54 |     with barecat.Barecat(output_file) as reader:
55 |         file_list = list(reader)
56 |         assert len(file_list) == 3, "Expected 3 files in the archive"
57 |         assert "dir1/subdir1/test1.jpg" in file_list, "Expected dir1/subdir1/test1.jpg in the archive"
58 |         assert "dir1/subdir2/test2.jpg" in file_list, "Expected dir1/subdir2/test2.jpg in the archive"
59 |         assert "dir2/test3.jpg" in file_list, "Expected dir2/test3.jpg in the archive"
60 |         assert reader[
61 |             "dir1/subdir1/test1.jpg"] == b"dummy data1", "Expected dir1/subdir1/test1.jpg to contain 'dummy data1'"
62 |         assert reader[
63 |             "dir1/subdir2/test2.jpg"] == b"dummy data2", "Expected dir1/subdir2/test2.jpg to contain 'dummy data2'"
64 |         assert reader[
65 |             "dir2/test3.jpg"] == b"dummy data3", "Expected dir2/test3.jpg to contain 'dummy data3'"
66 |         assert reader.sharder.num_shards == 2, "Expected 2 shards in the archive"
67 | 
68 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
69 |     assert (temp_jpeg_dir / "mydata.barecat-sqlite-index").exists(), "Output file was not created"
70 | 
71 | def test_barecat_creation_workers(temp_jpeg_dir):
72 |     """
73 |     Runs `find` with `barecat-create --workers=8` and verifies the output.
74 |     """
75 |     output_file = temp_jpeg_dir / "mydata.barecat"
76 |     cmd = f"cd {temp_jpeg_dir}; find . -name '*.jpg' -print0 | sort -z | barecat-create --null --file={output_file} --overwrite --shard-size=22 --workers=8"
77 | 
78 |     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
79 | 
80 |     with barecat.Barecat(output_file) as reader:
81 |         file_list = list(reader)
82 |         assert len(file_list) == 3, "Expected 3 files in the archive"
83 |         assert "dir1/subdir1/test1.jpg" in file_list, "Expected dir1/subdir1/test1.jpg in the archive"
84 |         assert "dir1/subdir2/test2.jpg" in file_list, "Expected dir1/subdir2/test2.jpg in the archive"
85 |         assert "dir2/test3.jpg" in file_list, "Expected dir2/test3.jpg in the archive"
86 |         assert reader[
87 |             "dir1/subdir1/test1.jpg"] == b"dummy data1", "Expected dir1/subdir1/test1.jpg to contain 'dummy data1'"
88 |         assert reader[
89 |             "dir1/subdir2/test2.jpg"] == b"dummy data2", "Expected dir1/subdir2/test2.jpg to contain 'dummy data2'"
90 |         assert reader[
91 |             "dir2/test3.jpg"] == b"dummy data3", "Expected dir2/test3.jpg to contain 'dummy data3'"
92 |         assert reader.sharder.num_shards == 2, "Expected 2 shards in the archive"
93 | 
94 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
95 |     assert (temp_jpeg_dir / "mydata.barecat-sqlite-index").exists(), "Output file was not created"
96 | 
97 | 
98 | def test_extract_single(barecat_archive):
99 |     """
100 |     Tests `barecat-extract-single` to ensure a specific file is correctly extracted from the archive.
101 |     """
102 |     extract_cmd = [
103 |         "barecat-extract-single",
104 |         "--barecat-file", str(barecat_archive),
105 |         "--path", "dir1/subdir1/test1.jpg"
106 |     ]
107 | 
108 |     result = subprocess.run(extract_cmd, capture_output=True)
109 | 
110 |     assert result.stdout == b"dummy data1", "Unexpected content in extracted file"
111 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
112 | 
113 | 
114 | def test_defrag(barecat_archive):
115 |     """
116 |     Tests `barecat-defrag` to ensure the archive can be defragmented properly.
117 |     """
118 | 
119 | 
120 |     with barecat.Barecat(barecat_archive, readonly=False) as bc:
121 |         first_file = next(iter(bc.index.iter_all_filepaths(barecat.Order.ADDRESS)))
122 | 
123 |         del bc[first_file]
124 |         assert first_file not in bc
125 |         assert bc.total_logical_size != bc.total_physical_size_seek
126 | 
127 | 
128 |     defrag_cmd = [
129 |         "barecat-defrag",
130 |         str(barecat_archive)
131 |     ]
132 | 
133 |     result = subprocess.run(defrag_cmd, capture_output=True, text=True)
134 | 
135 |     with barecat.Barecat(barecat_archive) as reader:
136 |         assert reader.total_logical_size == reader.total_physical_size_seek
137 |         assert reader.sharder.num_shards == 1
138 | 
139 | 
140 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
141 | 
142 | 
143 | def test_verify_integrity(barecat_archive):
144 |     """
145 |     Tests `barecat-verify` to ensure the archive's integrity.
146 | """ 147 | verify_cmd = [ 148 | "barecat-verify", 149 | str(barecat_archive) 150 | ] 151 | 152 | result = subprocess.run(verify_cmd, capture_output=True, text=True) 153 | 154 | assert result.returncode == 0, f"Command failed: {result.stderr}" 155 | 156 | # now edit the file and verify again 157 | with open(f'{barecat_archive}-shard-00000', "r+b") as f: 158 | f.seek(0) 159 | f.write(b"junk") 160 | 161 | result = subprocess.run(verify_cmd, capture_output=True, text=True) 162 | assert result.returncode != 0, f"Command should have failed: {result.stderr}" 163 | assert 'CRC32C' in result.stdout, "Expected CRC mismatch error message" 164 | 165 | 166 | def test_index_to_csv(barecat_archive): 167 | """ 168 | Tests `barecat-index-to-csv` to ensure index can be dumped as CSV. 169 | """ 170 | csv_cmd = [ 171 | "barecat-index-to-csv", 172 | str(barecat_archive) + "-sqlite-index" 173 | ] 174 | 175 | result = subprocess.run(csv_cmd, capture_output=True, text=True) 176 | 177 | assert '"path","shard","offset","size","crc32c"' in result.stdout, "CSV output missing expected header" 178 | assert result.returncode == 0, f"Command failed: {result.stderr}" 179 | -------------------------------------------------------------------------------- /src/barecat/archive_formats.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import shutil 3 | import tarfile 4 | import zipfile 5 | from datetime import datetime 6 | 7 | from barecat.core.index import BarecatDirInfo, BarecatFileInfo, BarecatEntryInfo 8 | from barecat.progbar import progressbar 9 | 10 | 11 | def iter_archive(src_path): 12 | if src_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')): 13 | return iter_tarfile(src_path) 14 | elif src_path.endswith('.zip'): 15 | return iter_zipfile(src_path) 16 | else: 17 | raise ValueError('Unsupported archive format') 18 | 19 | 20 | def iter_archive_nocontent(src_path): 21 | if src_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')): 22 | return iter_tarfile_nocontent(src_path) 23 | elif src_path.endswith('.zip'): 24 | return iter_zipfile_nocontent(src_path) 25 | else: 26 | raise ValueError('Unsupported archive format') 27 | 28 | 29 | def iter_zipfile(path): 30 | with zipfile.ZipFile(path, mode='r') as zipf: 31 | for member in progressbar(zipf.infolist(), desc='Packing files', unit=' files'): 32 | if member.is_dir(): 33 | di = BarecatDirInfo(path=member.filename) 34 | di.mtime_dt = datetime(*member.date_time) 35 | yield di, None 36 | else: 37 | fi = BarecatFileInfo(path=member.filename, size=member.file_size) 38 | fi.mtime_dt = datetime(*member.date_time) 39 | with zipf.open(member) as file_in_zip: 40 | yield fi, file_in_zip 41 | 42 | 43 | def iter_zipfile_nocontent(path): 44 | with open(path, 'rb') as f: 45 | with zipfile.ZipFile(f, mode='r') as zipf: 46 | for member in progressbar(zipf.infolist(), desc='Packing files', unit=' files'): 47 | if member.is_dir(): 48 | di = BarecatDirInfo(path=member.filename) 49 | di.mtime_dt = datetime(*member.date_time) 50 | yield di 51 | else: 52 | f.seek(member.header_offset + 26) 53 | namelen = int.from_bytes(f.read(2), byteorder='little') 54 | extralen = int.from_bytes(f.read(2), byteorder='little') 55 | data_offset = member.header_offset + 30 + namelen + extralen 56 | 57 | fi = BarecatFileInfo( 58 | path=member.filename, shard=0, offset=data_offset, size=member.file_size 59 | ) 60 | fi.mtime_dt = datetime(*member.date_time) 61 | yield fi 62 | 63 | 64 | def iter_tarfile(path): 65 | tar_file_size = 
osp.getsize(path) // 1024 // 1024 66 | pbar = progressbar(None, desc='Packing files', unit=' MB', total=tar_file_size) 67 | progpos = 0 68 | 69 | with tarfile.open(path, mode='r|*') as tar: 70 | for member in tar: 71 | if member.isdir(): 72 | di = BarecatDirInfo( 73 | path=member.name, 74 | mode=member.mode, 75 | uid=member.uid, 76 | gid=member.gid, 77 | mtime_ns=member.mtime * 1_000_000_000, 78 | ) 79 | yield di, None 80 | if member.isfile(): 81 | fi = BarecatFileInfo( 82 | path=member.name, 83 | size=member.size, 84 | mode=member.mode, 85 | uid=member.uid, 86 | gid=member.gid, 87 | mtime_ns=member.mtime * 1_000_000_000, 88 | ) 89 | 90 | with tar.extractfile(member) as file_in_tar: 91 | yield fi, file_in_tar 92 | 93 | new_pos = tar.fileobj.tell() // 1024 // 1024 94 | delta = new_pos - progpos 95 | pbar.update(delta) 96 | progpos += delta 97 | 98 | 99 | def iter_tarfile_nocontent(path): 100 | tar_file_size = osp.getsize(path) // 1024 // 1024 101 | pbar = progressbar(None, desc='Packing files', unit=' MB', total=tar_file_size) 102 | progpos = 0 103 | 104 | with tarfile.open(path, mode='r|*') as tar: 105 | for member in tar: 106 | if member.isdir(): 107 | di = BarecatDirInfo( 108 | path=member.name, 109 | mode=member.mode, 110 | uid=member.uid, 111 | gid=member.gid, 112 | mtime_ns=member.mtime * 1_000_000_000, 113 | ) 114 | yield di 115 | if member.isfile(): 116 | fi = BarecatFileInfo( 117 | path=member.name, 118 | shard=0, 119 | offset=member.offset_data, 120 | size=member.size, 121 | mode=member.mode, 122 | uid=member.uid, 123 | gid=member.gid, 124 | mtime_ns=member.mtime * 1_000_000_000, 125 | ) 126 | yield fi 127 | new_pos = tar.fileobj.tell() // 1024 // 1024 128 | delta = new_pos - progpos 129 | pbar.update(delta) 130 | progpos += delta 131 | 132 | 133 | def get_archive_writer(target_path): 134 | if target_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')): 135 | return TarWriter(target_path) 136 | elif target_path.endswith('.zip'): 137 | return ZipWriter(target_path) 138 | else: 139 | raise ValueError('Unsupported archive format') 140 | 141 | 142 | class ZipWriter: 143 | def __init__(self, target_path): 144 | self.zip = zipfile.ZipFile(target_path, mode='w') 145 | 146 | def add(self, info: BarecatEntryInfo, fileobj=None): 147 | if isinstance(info, BarecatDirInfo): 148 | zipinfo = zipfile.ZipInfo(info.path + '/') 149 | zipinfo.date_time = info.mtime_dt.timetuple()[:6] 150 | self.zip.writestr(zipinfo, '') 151 | else: 152 | zipinfo = zipfile.ZipInfo(info.path) 153 | zipinfo.date_time = info.mtime_dt.timetuple()[:6] 154 | zipinfo.file_size = info.size 155 | with self.zip.open(zipinfo, 'w') as file_in_zip: 156 | shutil.copyfileobj(fileobj, file_in_zip) 157 | 158 | def close(self): 159 | self.zip.close() 160 | 161 | def __enter__(self): 162 | return self 163 | 164 | def __exit__(self, *args): 165 | self.close() 166 | 167 | 168 | class TarWriter: 169 | def __init__(self, *args, **kwargs): 170 | if 'mode' not in kwargs: 171 | kwargs['mode'] = 'w' 172 | self.tar = tarfile.open(*args, **kwargs) 173 | 174 | def add(self, info: BarecatEntryInfo, fileobj=None): 175 | tarinfo = tarfile.TarInfo(info.path) 176 | tarinfo.uid = info.uid or 0 177 | tarinfo.gid = info.gid or 0 178 | if info.mtime_ns is not None: 179 | tarinfo.mtime = info.mtime_ns // 1_000_000_000 180 | if isinstance(info, BarecatDirInfo): 181 | tarinfo.type = tarfile.DIRTYPE 182 | tarinfo.mode = 0o755 if info.mode is None else info.mode 183 | self.tar.addfile(tarinfo) 184 | else: 185 | tarinfo.size = info.size 186 | tarinfo.mode = 
0o644 if info.mode is None else info.mode 187 | self.tar.addfile(tarinfo, fileobj) 188 | 189 | def close(self): 190 | self.tar.close() 191 | 192 | def __enter__(self): 193 | return self 194 | 195 | def __exit__(self, *args): 196 | self.close() 197 | -------------------------------------------------------------------------------- /src/barecat/defrag.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import os 5 | import time 6 | from typing import TYPE_CHECKING 7 | 8 | from barecat.core.index import Order 9 | from barecat.progbar import progressbar 10 | 11 | if TYPE_CHECKING: 12 | from barecat.core.barecat import Barecat 13 | 14 | 15 | class BarecatDefragger: 16 | def __init__(self, bc: Barecat): 17 | self.bc = bc 18 | self.index = bc.index 19 | self.shard_size_limit = bc.shard_size_limit 20 | self.readonly = bc.readonly 21 | self.shard_files = bc.sharder.shard_files 22 | 23 | def get_gaps(self): 24 | gaps = self.index.fetch_all(""" 25 | WITH x AS ( 26 | SELECT config.value_int AS shard_size_limit 27 | FROM config 28 | WHERE config.key = 'shard_size_limit' 29 | ), 30 | first_gaps AS ( 31 | SELECT 32 | f.shard, 33 | 0 AS offset, 34 | MIN(f.offset) AS size 35 | FROM files f 36 | GROUP BY f.shard 37 | ), 38 | nonfirst_gaps AS ( 39 | SELECT 40 | f.shard, 41 | (f.offset + f.size) AS offset, 42 | coalesce( 43 | lead(f.offset, 1) OVER (PARTITION BY f.shard ORDER BY f.offset), 44 | x.shard_size_limit 45 | ) - (f.offset + f.size) AS size 46 | FROM files f, x 47 | ), 48 | all_gaps AS (SELECT * FROM first_gaps UNION ALL SELECT * FROM nonfirst_gaps) 49 | SELECT shard, offset, size 50 | FROM all_gaps 51 | WHERE size > 0 52 | ORDER BY shard, offset 53 | """, rowcls=FragmentGap) 54 | 55 | empty_shard_gaps = [ 56 | FragmentGap(shard, 0, self.shard_size_limit) 57 | for shard in range(len(self.shard_files)) 58 | if self.bc.index.logical_shard_end(shard) == 0] 59 | gaps.extend(empty_shard_gaps) 60 | gaps.sort(key=lambda gap: (gap.shard, gap.offset)) 61 | return gaps 62 | 63 | # gaps = [] 64 | # prev_end = 0 65 | # prev_shard = -1 66 | # for fi in self.index.iter_all_fileinfos(order=Order.ADDRESS): 67 | # if fi.shard > prev_shard: 68 | # if self.shard_size_limit > prev_end and prev_shard >= 0: 69 | # gaps.append(FragmentGap(prev_shard, prev_end, self.shard_size_limit - 70 | # prev_end)) 71 | # for i in range(prev_shard + 1, fi.shard): 72 | # gaps.append(FragmentGap(i, 0, self.shard_size_limit)) 73 | # prev_end = 0 74 | # if fi.offset > prev_end: 75 | # gaps.append(FragmentGap(fi.shard, prev_end, fi.offset - prev_end)) 76 | # prev_shard = fi.shard 77 | # prev_end = fi.offset + fi.size 78 | # return gaps 79 | 80 | def needs_defrag(self): 81 | # check if total size of shards is larger than the sum of the sizes of the files in index 82 | # the getsize() function may not be fully up to date but this is only a heuristic anyway. 
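        # (A physical size above the logical size means the shards contain byte
        # ranges that no index entry references, i.e. gaps left by deleted or
        # overwritten files, which is exactly what defragmentation reclaims.)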
83 | return self.bc.total_physical_size_seek > self.bc.total_logical_size 84 | 85 | def get_defrag_info(self): 86 | return self.bc.total_physical_size_seek, self.bc.total_logical_size 87 | 88 | def defrag(self): 89 | if self.readonly: 90 | raise ValueError('Cannot defrag a read-only Barecat') 91 | 92 | new_shard = 0 93 | new_offset = 0 94 | 95 | old_total = self.bc.total_physical_size_seek 96 | 97 | try: 98 | for i in range(len(self.shard_files)): 99 | self.bc.sharder.reopen_shard(i, 'r+b') 100 | 101 | file_iter = self.index.iter_all_fileinfos(order=Order.ADDRESS) 102 | for fi in progressbar(file_iter, total=self.index.num_files, desc='Defragging'): 103 | if (self.shard_size_limit is not None and new_offset + fi.size > 104 | self.shard_size_limit): 105 | self.shard_files[new_shard].truncate(new_offset) 106 | self.bc.sharder.reopen_shard(new_shard, 'rb') 107 | new_shard += 1 108 | new_offset = 0 109 | 110 | if not (new_shard == fi.shard and new_offset == fi.offset): 111 | shift_n_bytes( 112 | self.shard_files[fi.shard], self.shard_files[new_shard], 113 | fi.offset, new_offset, fi.size) 114 | self.index.move_file(fi.path, new_shard, new_offset) 115 | 116 | new_offset += fi.size 117 | 118 | # Truncate the last shard to its real size (the others are truncated already) 119 | self.shard_files[new_shard].truncate(new_offset) 120 | # Close and delete all shards after the last one 121 | for i in range(new_shard + 1, len(self.shard_files)): 122 | self.shard_files[i].close() 123 | os.remove(self.shard_files[i].name) 124 | del self.shard_files[new_shard + 1:] 125 | 126 | new_total = self.bc.total_physical_size_seek 127 | return old_total - new_total 128 | finally: 129 | self.bc.sharder.reopen_shards() 130 | 131 | def defrag_quick(self, time_max_seconds=5): 132 | if self.readonly: 133 | raise ValueError('Cannot defrag a read-only Barecat') 134 | 135 | start_time = time.monotonic() 136 | # Collect all gaps in the shards 137 | gaps = self.get_gaps() 138 | freed_space = 0 139 | try: 140 | for i in range(len(self.shard_files)): 141 | self.bc.sharder.reopen_shard(i, 'r+b') 142 | 143 | for fi in self.index.iter_all_fileinfos(order=Order.ADDRESS | Order.DESC): 144 | moved = self.move_to_earlier_gap(fi, gaps) 145 | if not moved or time.monotonic() - start_time > time_max_seconds: 146 | # We stop when we reach the first file that cannot be moved to an earlier gap 147 | break 148 | freed_space += fi.size 149 | 150 | self.bc.truncate_all_to_logical_size() 151 | finally: 152 | self.bc.sharder.reopen_shards() 153 | 154 | return freed_space 155 | 156 | def move_to_earlier_gap(self, fi, gaps): 157 | for i_gap, gap in enumerate(gaps): 158 | if gap.shard > fi.shard or (gap.shard == fi.shard and gap.offset >= fi.offset): 159 | # reached the gap that is after the file, no move is possible 160 | return False 161 | if gap.size >= fi.size: 162 | shift_n_bytes( 163 | self.shard_files[fi.shard], self.shard_files[gap.shard], fi.offset, 164 | gap.offset, fi.size) 165 | self.index.move_file(fi.path, gap.shard, gap.offset) 166 | gap.size -= fi.size 167 | gap.offset += fi.size 168 | if gap.size == 0: 169 | # even though we are changing the list while in a for loop that is iterating 170 | # over it, this is safe because we are immediately returning in this iteration. 
171 | del gaps[i_gap] 172 | return True 173 | return False 174 | 175 | 176 | def shift_n_bytes(src_file, dst_file, src_offset, dst_offset, length, bufsize=64 * 1024): 177 | if src_file == dst_file and src_offset < dst_offset: 178 | raise ValueError('This function can only shift left' 179 | ' because defragging is done towards the left') 180 | 181 | bytes_to_copy = length 182 | while bytes_to_copy > 0: 183 | src_file.seek(src_offset) 184 | data = src_file.read(min(bufsize, bytes_to_copy)) 185 | if not data: 186 | raise ValueError('Unexpected EOF') 187 | 188 | dst_file.seek(dst_offset) 189 | dst_file.write(data) 190 | 191 | len_data = len(data) 192 | src_offset += len_data 193 | dst_offset += len_data 194 | bytes_to_copy -= len_data 195 | 196 | 197 | @dataclasses.dataclass 198 | class FragmentGap: 199 | shard: int 200 | offset: int 201 | size: int 202 | 203 | @classmethod 204 | def row_factory(cls, cursor, row): 205 | field_names = [d[0] for d in cursor.description] 206 | return cls(**dict(zip(field_names, row))) 207 | -------------------------------------------------------------------------------- /src/barecat/sql/schema.sql: -------------------------------------------------------------------------------- 1 | -- Description: Schema for the barecat database 2 | 3 | 4 | --#################################### Tables 5 | CREATE TABLE files 6 | ( 7 | path TEXT PRIMARY KEY NOT NULL, 8 | parent TEXT GENERATED ALWAYS AS ( -- Parent directory is computed automatically 9 | rtrim(rtrim(path, replace(path, '/', '')), '/') 10 | ) VIRTUAL NOT NULL REFERENCES dirs (path) ON DELETE RESTRICT, 11 | 12 | shard INTEGER NOT NULL, 13 | offset INTEGER NOT NULL, 14 | size INTEGER DEFAULT 0, 15 | crc32c INTEGER DEFAULT NULL, 16 | 17 | mode INTEGER DEFAULT NULL, 18 | uid INTEGER DEFAULT NULL, 19 | gid INTEGER DEFAULT NULL, 20 | mtime_ns INTEGER DEFAULT NULL 21 | ); 22 | 23 | CREATE TABLE dirs 24 | ( 25 | path TEXT PRIMARY KEY, 26 | parent TEXT GENERATED ALWAYS AS ( 27 | CASE 28 | WHEN path = '' THEN NULL 29 | ELSE rtrim(rtrim(path, replace(path, '/', '')), '/') 30 | END 31 | ) VIRTUAL REFERENCES dirs (path) ON DELETE RESTRICT, 32 | 33 | num_subdirs INTEGER DEFAULT 0, -- These are maintained by triggers 34 | num_files INTEGER DEFAULT 0, 35 | num_files_tree INTEGER DEFAULT 0, 36 | size_tree INTEGER DEFAULT 0, 37 | 38 | mode INTEGER DEFAULT NULL, 39 | uid INTEGER DEFAULT NULL, 40 | gid INTEGER DEFAULT NULL, 41 | mtime_ns INTEGER DEFAULT NULL 42 | ); 43 | 44 | CREATE TABLE config -- For now, this table only holds the `shard_size_limit` 45 | ( 46 | key TEXT PRIMARY KEY, 47 | value_text TEXT DEFAULT NULL, 48 | value_int INTEGER DEFAULT NULL 49 | ) WITHOUT ROWID; 50 | 51 | INSERT INTO config (key, value_int) 52 | VALUES ('use_triggers', 1), 53 | ('shard_size_limit', CAST(power(2, 63) - 1 AS INTEGER)), 54 | ('schema_version_major', 0), 55 | ('schema_version_minor', 2); 56 | 57 | -- Indexes 58 | CREATE INDEX idx_files_parent ON files (parent); 59 | CREATE INDEX idx_dirs_parent ON dirs (parent); 60 | CREATE INDEX idx_files_shard_offset ON files (shard, offset); 61 | 62 | --#################################### Triggers 63 | -- The idea is: we propagate changes up the tree with triggers, as this is cumbersome to do in 64 | -- the Python code. There is no propagation downwards (for example when moving a dir, we do not 65 | -- update all the children with triggers). This is because the Python code can do this 66 | -- quite easily. 
Furthermore, if we did it with triggers, the chain would start upward again 67 | -- with a circular mess. So we only propagate upwards the tree. 68 | -- We propagate two kinds of things: 69 | -- 1) statistics: direct and aggregate file count and aggregate size 70 | -- 2) modification time of the parent directory 71 | -- We don't update the modification time of the entity being inserted or modified, 72 | -- this can be simply done in the Python code. If the app doesn't supply mtime, presumably it 73 | -- doesn't care about it, so the overhead of triggering it makes no sense. 74 | 75 | ---- Files: add, del, move, resize 76 | CREATE TRIGGER add_file -- Upsert the parent when adding a file 77 | AFTER INSERT 78 | ON files 79 | WHEN (SELECT value_int 80 | FROM config 81 | WHERE key = 'use_triggers') = 1 82 | BEGIN 83 | -- Add the parent directory if it doesn't exist 84 | INSERT INTO dirs (path, num_files, num_files_tree, size_tree, mtime_ns) 85 | VALUES (NEW.parent, 1, 1, NEW.size, 86 | CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)) 87 | -- If the parent directory already exists, update it 88 | ON CONFLICT(path) DO UPDATE 89 | SET num_files = num_files + 1, 90 | num_files_tree = num_files_tree + 1, 91 | size_tree = size_tree + excluded.size_tree, 92 | mtime_ns = excluded.mtime_ns; 93 | END; 94 | 95 | CREATE TRIGGER del_file -- Update the parent when deleting a file 96 | AFTER DELETE 97 | ON files 98 | WHEN (SELECT value_int 99 | FROM config 100 | WHERE key = 'use_triggers') = 1 101 | BEGIN 102 | UPDATE dirs 103 | SET num_files = num_files - 1, 104 | num_files_tree = num_files_tree - 1, 105 | size_tree = size_tree - OLD.size, 106 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER) 107 | WHERE path = OLD.parent; 108 | END; 109 | 110 | CREATE TRIGGER move_file -- Update both parents when moving a file 111 | AFTER UPDATE OF path 112 | ON files 113 | WHEN NEW.parent != OLD.parent 114 | AND (SELECT value_int 115 | FROM config 116 | WHERE key = 'use_triggers') = 1 117 | BEGIN 118 | UPDATE dirs 119 | SET num_files = num_files + 1, 120 | num_files_tree = num_files_tree + 1, 121 | size_tree = size_tree + NEW.size, 122 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER) 123 | WHERE path = NEW.parent; 124 | UPDATE dirs 125 | SET num_files = num_files - 1, 126 | num_files_tree = num_files_tree - 1, 127 | size_tree = size_tree - OLD.size, 128 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER) 129 | WHERE path = OLD.parent; 130 | END; 131 | 132 | CREATE TRIGGER resize_file -- When file size changes 133 | AFTER UPDATE OF size 134 | ON files 135 | WHEN NEW.parent == OLD.parent -- and the file was not moved 136 | AND (SELECT value_int 137 | FROM config 138 | WHERE key = 'use_triggers') = 1 139 | BEGIN 140 | UPDATE dirs 141 | SET size_tree = size_tree + NEW.size - OLD.size 142 | WHERE path = OLD.parent; 143 | END; 144 | 145 | ---- Directories: add, del, move, resize 146 | CREATE TRIGGER add_subdir -- Upsert the parent when adding a directory 147 | AFTER INSERT 148 | ON dirs 149 | WHEN (SELECT value_int 150 | FROM config 151 | WHERE key = 'use_triggers') = 1 152 | BEGIN 153 | INSERT INTO dirs (path, num_subdirs, size_tree, num_files_tree, mtime_ns) 154 | VALUES (NEW.parent, 1, NEW.size_tree, NEW.num_files_tree, 155 | CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)) 156 | ON CONFLICT(path) DO UPDATE 157 | SET num_subdirs = num_subdirs + 1, 158 | size_tree = size_tree + excluded.size_tree, 159 | 
num_files_tree = num_files_tree + excluded.num_files_tree,
160 |         mtime_ns = excluded.mtime_ns;
161 | END;
162 | 
163 | CREATE TRIGGER del_subdir -- Update the parent when deleting a directory
164 |     AFTER DELETE
165 |     ON dirs
166 |     WHEN (SELECT value_int
167 |           FROM config
168 |           WHERE key = 'use_triggers') = 1
169 | BEGIN
170 |     UPDATE dirs
171 |     SET num_subdirs = num_subdirs - 1,
172 |         num_files = num_files - OLD.num_files,
173 |         size_tree = size_tree - OLD.size_tree,
174 |         num_files_tree = num_files_tree - OLD.num_files_tree,
175 |         mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
176 |     WHERE path = OLD.parent;
177 | END;
178 | 
179 | CREATE TRIGGER move_subdir -- Update both parents when moving a directory
180 |     AFTER UPDATE OF path
181 |     ON dirs
182 |     WHEN NEW.parent != OLD.parent
183 |         AND (SELECT value_int
184 |              FROM config
185 |              WHERE key = 'use_triggers') = 1
186 | BEGIN
187 |     UPDATE dirs
188 |     SET num_subdirs = num_subdirs - 1,
189 |         num_files = num_files - OLD.num_files,
190 |         size_tree = size_tree - OLD.size_tree,
191 |         num_files_tree = num_files_tree - OLD.num_files_tree,
192 |         mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
193 |     WHERE path = OLD.parent;
194 |     UPDATE dirs
195 |     SET num_subdirs = num_subdirs + 1,
196 |         num_files = num_files + NEW.num_files,
197 |         size_tree = size_tree + NEW.size_tree,
198 |         num_files_tree = num_files_tree + NEW.num_files_tree,
199 |         mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
200 |     WHERE path = NEW.parent;
201 | END;
202 | 
203 | 
204 | CREATE TRIGGER resize_dir -- Update the parent when a directory changes size
205 |     AFTER UPDATE OF size_tree, num_files_tree
206 |     ON dirs
207 |     WHEN NEW.parent = OLD.parent AND
208 |         (NEW.size_tree != OLD.size_tree OR NEW.num_files_tree != OLD.num_files_tree)
209 |         AND (SELECT value_int
210 |              FROM config
211 |              WHERE key = 'use_triggers') = 1
212 | BEGIN
213 |     UPDATE dirs
214 |     SET size_tree = size_tree + (NEW.size_tree - OLD.size_tree),
215 |         num_files_tree = num_files_tree + (NEW.num_files_tree - OLD.num_files_tree)
216 |     WHERE path = OLD.parent;
217 | END;
--------------------------------------------------------------------------------
/src/barecat/util.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import glob
3 | import itertools
4 | import os
5 | import os.path as osp
6 | import shutil
7 | from datetime import datetime
8 | 
9 | import crc32c as crc32c_lib
10 | 
11 | 
12 | def read_file(input_path, mode='r'):
13 |     with open(input_path, mode) as f:
14 |         return f.read()
15 | 
16 | 
17 | def remove(path):
18 |     index_path = f'{path}-sqlite-index'
19 |     shard_paths = glob.glob(f'{path}-shard-?????')
20 |     for path in [index_path] + shard_paths:
21 |         os.remove(path)
22 | 
23 | 
24 | def exists(path):
25 |     index_path = f'{path}-sqlite-index'
26 |     shard_paths = glob.glob(f'{path}-shard-?????')
27 |     return osp.exists(index_path) or len(shard_paths) > 0
28 | 
29 | 
30 | # From `more-itertools` package.
31 | def chunked(iterable, n, strict=False):
32 |     """Break *iterable* into lists of length *n*:
33 | 
34 |     >>> list(chunked([1, 2, 3, 4, 5, 6], 3))
35 |     [[1, 2, 3], [4, 5, 6]]
36 | 
37 |     By default, the last yielded list will have fewer than *n* elements
38 |     if the length of *iterable* is not divisible by *n*:
39 | 
40 |     >>> list(chunked([1, 2, 3, 4, 5, 6, 7, 8], 3))
41 |     [[1, 2, 3], [4, 5, 6], [7, 8]]
42 | 
43 |     To use a fill-in value instead, see the :func:`grouper` recipe.
44 | 45 | If the length of *iterable* is not divisible by *n* and *strict* is 46 | ``True``, then ``ValueError`` will be raised before the last 47 | list is yielded. 48 | 49 | """ 50 | iterator = iter(functools.partial(take, n, iter(iterable)), []) 51 | if strict: 52 | if n is None: 53 | raise ValueError('n must not be None when using strict mode.') 54 | 55 | def ret(): 56 | for chunk in iterator: 57 | if len(chunk) != n: 58 | raise ValueError('iterable is not divisible by n.') 59 | yield chunk 60 | 61 | return iter(ret()) 62 | else: 63 | return iterator 64 | 65 | 66 | def take(n, iterable): 67 | """Return first *n* items of the iterable as a list. 68 | 69 | >>> take(3, range(10)) 70 | [0, 1, 2] 71 | 72 | If there are fewer than *n* items in the iterable, all of them are 73 | returned. 74 | 75 | >>> take(10, range(3)) 76 | [0, 1, 2] 77 | 78 | """ 79 | return list(itertools.islice(iterable, n)) 80 | 81 | 82 | def copy_n_bytes(src_file, dest_file, n=None, bufsize=64 * 1024): 83 | if n is None: 84 | return shutil.copyfileobj(src_file, dest_file, bufsize) 85 | 86 | bytes_to_copy = n 87 | while bytes_to_copy > 0: 88 | data = src_file.read(min(bufsize, bytes_to_copy)) 89 | if not data: 90 | raise ValueError('Unexpected EOF') 91 | 92 | dest_file.write(data) 93 | bytes_to_copy -= len(data) 94 | 95 | 96 | def normalize_path(path): 97 | x = osp.normpath(path).removeprefix('/') 98 | return '' if x == '.' else x 99 | 100 | 101 | def get_parent(path): 102 | if path == '': 103 | # root already, has no parent 104 | return b'\x00' 105 | 106 | partition = path.rpartition('/') 107 | return partition[0] 108 | 109 | 110 | def partition_path(path): 111 | if path == '': 112 | # root already, has no parent 113 | return b'\x00', path 114 | 115 | parts = path.rpartition('/') 116 | return parts[0], parts[2] 117 | 118 | 119 | def get_ancestors(path): 120 | yield '' 121 | for i in range(len(path)): 122 | if path[i] == '/': 123 | yield path[:i] 124 | 125 | 126 | def reopen(file, mode): 127 | if file.mode == mode: 128 | return file 129 | file.close() 130 | return open_(file.name, mode) 131 | 132 | 133 | def fileobj_crc32c_until_end(fileobj, bufsize=64 * 1024): 134 | crc32c = 0 135 | while chunk := fileobj.read(bufsize): 136 | crc32c = crc32c_lib.crc32c(chunk, crc32c) 137 | return crc32c 138 | 139 | 140 | def fileobj_crc32c(fileobj, size=-1, bufsize=64 * 1024): 141 | if size == -1 or size is None: 142 | return fileobj_crc32c_until_end(fileobj, bufsize) 143 | 144 | crc32c = 0 145 | n_full_bufs, remainder = divmod(size, bufsize) 146 | 147 | for _ in range(n_full_bufs): 148 | data = fileobj.read(bufsize) 149 | if len(data) != bufsize: 150 | raise ValueError('Unexpected EOF') 151 | crc32c = crc32c_lib.crc32c(data, crc32c) 152 | 153 | if remainder: 154 | data = fileobj.read(remainder) 155 | if len(data) != remainder: 156 | raise ValueError('Unexpected EOF') 157 | crc32c = crc32c_lib.crc32c(data, crc32c) 158 | 159 | return crc32c 160 | 161 | 162 | def copyfileobj_crc32c_until_end(src_file, dst_file, bufsize=64 * 1024): 163 | crc32c = 0 164 | size = 0 165 | while chunk := src_file.read(bufsize): 166 | dst_file.write(chunk) 167 | crc32c = crc32c_lib.crc32c(chunk, crc32c) 168 | size += len(chunk) 169 | return size, crc32c 170 | 171 | 172 | def copyfileobj_crc32c(src_file, dst_file, size=None, bufsize=64 * 1024): 173 | if size is None: 174 | return copyfileobj_crc32c_until_end(src_file, dst_file, bufsize) 175 | 176 | crc32c = 0 177 | n_bytes_transferred = 0 178 | n_full_bufs, remainder = divmod(size, bufsize) 179 | 180 | for _ in 
range(n_full_bufs):
181 |         data = src_file.read(bufsize)
182 |         if len(data) != bufsize:
183 |             raise ValueError('Unexpected EOF')
184 | 
185 |         crc32c = crc32c_lib.crc32c(data, crc32c)
186 |         n_written = dst_file.write(data)
187 |         if n_written != len(data):
188 |             raise ValueError('Unexpected write problem')
189 | 
190 |         n_bytes_transferred += n_written
191 | 
192 |     if remainder:
193 |         data = src_file.read(remainder)
194 |         if len(data) != remainder:
195 |             raise ValueError('Unexpected EOF')
196 | 
197 |         crc32c = crc32c_lib.crc32c(data, crc32c)
198 |         n_written = dst_file.write(data)
199 |         if n_written != len(data):
200 |             raise ValueError('Unexpected write problem')
201 | 
202 |         n_bytes_transferred += n_written
203 | 
204 |     return n_bytes_transferred, crc32c
205 | 
206 | 
207 | def copyfileobj(src_file, dst_file, size=None, bufsize=64 * 1024):
208 |     if size is None:
209 |         return shutil.copyfileobj(src_file, dst_file, bufsize)
210 | 
211 |     n_bytes_transferred = 0
212 |     nreads, remainder = divmod(size, bufsize)
213 | 
214 |     for _ in range(nreads):
215 |         data = src_file.read(bufsize)
216 |         dst_file.write(data)
217 |         n_bytes_transferred += len(data)
218 | 
219 |     if remainder:
220 |         data = src_file.read(remainder)
221 |         dst_file.write(data)
222 |         n_bytes_transferred += len(data)
223 | 
224 |     return n_bytes_transferred
225 | 
226 | 
227 | def write_zeroes(file, n, bufsize=64 * 1024):
228 |     n_written = 0
229 |     if n >= bufsize:
230 |         zeroes = bytearray(bufsize)
231 |         while n >= bufsize:
232 |             n_written += file.write(zeroes)
233 |             n -= bufsize
234 |     n_written += file.write(bytearray(n))
235 |     return n_written
236 | 
237 | 
238 | def raise_if_readonly(method):
239 |     @functools.wraps(method)
240 |     def wrapper(self, *args, **kwargs):
241 |         if self.readonly:
242 |             raise PermissionError('This function is not allowed in readonly mode')
243 |         return method(self, *args, **kwargs)
244 | 
245 |     return wrapper
246 | 
247 | 
248 | def raise_if_append_only(method):
249 |     @functools.wraps(method)
250 |     def wrapper(self, *args, **kwargs):
251 |         if self.append_only:
252 |             raise PermissionError('This function is not allowed in append-only mode')
253 |         return method(self, *args, **kwargs)
254 | 
255 |     return wrapper
256 | 
257 | 
258 | def raise_if_readonly_or_append_only(method):
259 |     @functools.wraps(method)
260 |     def wrapper(self, *args, **kwargs):
261 |         if self.readonly or self.append_only:
262 |             raise PermissionError('This function is not allowed in readonly or append-only mode')
263 |         return method(self, *args, **kwargs)
264 | 
265 |     return wrapper
266 | 
267 | 
268 | def parse_size(size):
269 |     if size is None:
270 |         return None
271 |     units = dict(K=1024, M=1024**2, G=1024**3, T=1024**4)
272 |     size = size.upper()
273 | 
274 |     for unit, factor in units.items():
275 |         if unit in size:
276 |             return int(float(size.replace(unit, '')) * factor)
277 | 
278 |     return int(size)
279 | 
280 | 
281 | def open_(path, mode, *args, **kwargs):
282 |     # This is like open() but supports an additional mode 'ax+b', which is like
283 |     # 'x+b' in that it fails if the file already exists and creates it if it doesn't,
284 |     # but it also opens the file in append mode, like 'a+b'
285 | 
286 |     if sorted(mode) == sorted('ax+b'):
287 |         fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_RDWR | os.O_APPEND)  # exclusive creation ('x') plus append ('a')
288 |         return os.fdopen(fd, 'a+b', *args, **kwargs)
289 |     return open(path, mode, *args, **kwargs)
290 | 
291 | 
292 | def datetime_to_ns(dt):
293 |     return int(dt.timestamp() * 1e9)
294 | 
295 | 
296 | def ns_to_datetime(ns):
297 |     return datetime.fromtimestamp(ns / 1e9)
298 | 
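# --- Illustration (not part of the library) ------------------------------------
# A sketch of how the CRC32C helpers above can pair with the SQLite index to
# re-verify a single stored file, assuming the index/shard naming used by the
# CLI ({path}-sqlite-index, {path}-shard-00000, ...) and the `files` table from
# src/barecat/sql/schema.sql (error handling elided):
#
#     import sqlite3
#
#     def verify_one_file(archive_path, inner_path):
#         conn = sqlite3.connect(f'{archive_path}-sqlite-index')
#         try:
#             shard, offset, size, crc_expected = conn.execute(
#                 'SELECT shard, offset, size, crc32c FROM files WHERE path = ?',
#                 (inner_path,)).fetchone()
#         finally:
#             conn.close()
#         with open(f'{archive_path}-shard-{shard:05d}', 'rb') as f:
#             f.seek(offset)
#             return fileobj_crc32c(f, size) == crc_expected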
--------------------------------------------------------------------------------
/src/barecat/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import csv
3 | import pickle
4 | import sys
5 | 
6 | import barecat
7 | import barecat.cli_impl as impl
8 | from barecat.common import Order
9 | from barecat.defrag import BarecatDefragger
10 | from barecat.util import parse_size
11 | 
12 | 
13 | def create():
14 |     parser = argparse.ArgumentParser(
15 |         description='Concatenate files to sharded blobs and create an sqlite index.'
16 |     )
17 |     parser.add_argument('--file', type=str, help='target path', required=True)
18 |     parser.add_argument(
19 |         '--null',
20 |         action='store_true',
21 |         help='read input paths from stdin, separated by null bytes as output by '
22 |         'the find command with the -print0 option (otherwise newlines are '
23 |         'interpreted as delimiters)',
24 |     )
25 |     parser.add_argument('--workers', type=int, default=None)
26 |     parser.add_argument(
27 |         '--shard-size-limit',
28 |         type=str,
29 |         default=None,
30 |         help='maximum size of a shard in bytes (if not specified, '
31 |         'all files will be concatenated into a single shard)',
32 |     )
33 |     parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
34 | 
35 |     args = parser.parse_args()
36 |     impl.create_from_stdin_paths(
37 |         target_path=args.file,
38 |         shard_size_limit=parse_size(args.shard_size_limit),
39 |         zero_terminated=args.null,
40 |         overwrite=args.overwrite,
41 |         workers=args.workers,
42 |     )
43 | 
44 | 
45 | def create_recursive():
46 |     # Accepts --file, --shard-size-limit, --workers and --overwrite; the positional
47 |     # arguments are the paths to pack. If a single positional path is supplied, the
48 |     # --strip-root flag strips that root from the paths stored in the archive.
49 |     parser = argparse.ArgumentParser(
50 |         description='Concatenate files to sharded blobs and create an sqlite index.'
51 | ) 52 | parser.add_argument('--file', type=str, help='target path', required=True) 53 | parser.add_argument('--workers', type=int, default=None) 54 | parser.add_argument( 55 | '--shard-size-limit', 56 | type=str, 57 | default=None, 58 | help='maximum size of a shard in bytes (if not specified, ' 59 | 'all files will be concatenated into a single shard)', 60 | ) 61 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files') 62 | parser.add_argument('paths', type=str, nargs='+', help='paths to pack') 63 | parser.add_argument( 64 | '--strip-root', 65 | action='store_true', 66 | help='strip the root from the paths (only applicable if a single path is provided)', 67 | ) 68 | 69 | args = parser.parse_args() 70 | impl.create_recursive( 71 | target_path=args.file, 72 | shard_size_limit=parse_size(args.shard_size_limit), 73 | roots=args.paths, 74 | overwrite=args.overwrite, 75 | workers=args.workers, 76 | strip_root=args.strip_root, 77 | ) 78 | 79 | 80 | def extract(): 81 | parser = argparse.ArgumentParser(description='Extract files from a barecat archive.') 82 | parser.add_argument('--file', type=str, help='path to the archive file') 83 | parser.add_argument('--target-directory', type=str, help='path to the target directory') 84 | args = parser.parse_args() 85 | impl.extract(args.file, args.target_directory) 86 | 87 | 88 | def extract_single(): 89 | parser = argparse.ArgumentParser(description='Extract a single file from a barecat archive.') 90 | parser.add_argument('--barecat-file', type=str, help='path to the archive file') 91 | parser.add_argument('--path', type=str, help='path to the file to extract, within the archive') 92 | args = parser.parse_args() 93 | with barecat.Barecat(args.barecat_file) as reader: 94 | sys.stdout.buffer.write(reader[args.path]) 95 | 96 | 97 | def index_to_csv(): 98 | parser = argparse.ArgumentParser(description='Dump the index contents as csv') 99 | parser.add_argument('file', type=str, help='path to the index file') 100 | args = parser.parse_args() 101 | 102 | writer = csv.writer(sys.stdout, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) 103 | writer.writerow(['path', 'shard', 'offset', 'size', 'crc32c']) 104 | with barecat.Index(args.file) as index: 105 | for f in index.iter_all_fileinfos(order=Order.PATH): 106 | writer.writerow([f.path, f.shard, f.offset, f.size, f.crc32c]) 107 | 108 | 109 | def index_to_pickledict(): 110 | parser = argparse.ArgumentParser(description='Dump the index contents as a pickled dictionary') 111 | parser.add_argument('file', type=str, help='path to the index file') 112 | parser.add_argument('outfile', type=str, help='path to the result file') 113 | args = parser.parse_args() 114 | 115 | with barecat.Index(args.file) as index_reader: 116 | dicti = dict(index_reader.items()) 117 | 118 | with open(args.outfile, 'xb') as outfile: 119 | pickle.dump(dicti, outfile) 120 | 121 | 122 | def merge(): 123 | parser = argparse.ArgumentParser(description='Merge existing Barecat archives into one.') 124 | parser.add_argument( 125 | 'input_paths', metavar='N', type=str, nargs='+', help='paths to the archives to merge' 126 | ) 127 | parser.add_argument('--output', required=True, help='output path') 128 | parser.add_argument( 129 | '--shard-size-limit', 130 | type=str, 131 | default=None, 132 | help='maximum size of a shard in bytes (if not specified, ' 133 | 'all files will be concatenated into a single shard)', 134 | ) 135 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing 
files')
136 |     parser.add_argument(
137 |         '--ignore-duplicates',
138 |         action='store_true',
139 |         help='if true then if a later file has the same path as an earlier one,'
140 |         ' skip it; if false then raise an error',
141 |     )
142 | 
143 |     args = parser.parse_args()
144 |     impl.merge(
145 |         source_paths=args.input_paths,
146 |         target_path=args.output,
147 |         shard_size_limit=parse_size(args.shard_size_limit),
148 |         overwrite=args.overwrite,
149 |         ignore_duplicates=args.ignore_duplicates,
150 |     )
151 | 
152 | 
153 | def merge_symlink():
154 |     parser = argparse.ArgumentParser(description='Merge existing Barecat archives into one, symlinking the shards instead of copying them.')
155 |     parser.add_argument(
156 |         'input_paths', metavar='N', type=str, nargs='+', help='paths to the archives to merge'
157 |     )
158 |     parser.add_argument('--output', required=True, help='output path')
159 |     parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
160 |     parser.add_argument(
161 |         '--ignore-duplicates',
162 |         action='store_true',
163 |         help='if true then if a later file has the same path as an earlier one,'
164 |         ' skip it; if false then raise an error',
165 |     )
166 | 
167 |     args = parser.parse_args()
168 |     impl.merge_symlink(
169 |         source_paths=args.input_paths,
170 |         target_path=args.output,
171 |         overwrite=args.overwrite,
172 |         ignore_duplicates=args.ignore_duplicates,
173 |     )
174 | 
175 | 
176 | def verify_integrity():
177 |     parser = argparse.ArgumentParser(
178 |         description='Verify the integrity of a Barecat archive, including CRC32C, directory '
179 |         'stats and no gaps between stored files.'
180 |     )
181 |     parser.add_argument('file', type=str, help='path to the index file')
182 |     parser.add_argument(
183 |         '--quick', action='store_true', help='CRC32C is only verified on the last file'
184 |     )
185 |     args = parser.parse_args()
186 | 
187 |     with barecat.Barecat(args.file) as bc:
188 |         if not bc.verify_integrity(quick=args.quick):
189 |             print('Integrity errors were found.')
190 |             sys.exit(1)
191 | 
192 | 
193 | def defrag():
194 |     parser = argparse.ArgumentParser(
195 |         description='Defragment a Barecat archive to remove gaps left by deleted files.'
196 |     )
197 |     parser.add_argument('file', type=str, help='path to the index file')
198 |     parser.add_argument(
199 |         '--quick',
200 |         action='store_true',
201 |         help='faster but less thorough attempt at defrag, using the best-fit '
202 |         'algorithm to move the last files into gaps.',
203 |     )
204 | 
205 |     args = parser.parse_args()
206 |     with barecat.Barecat(args.file, readonly=False, append_only=False) as bc:
207 |         defragger = BarecatDefragger(bc)
208 |         if defragger.needs_defrag():
209 |             if args.quick:
210 |                 defragger.defrag_quick()
211 |             else:
212 |                 defragger.defrag()
213 | 
214 | 
215 | def archive2barecat():
216 |     parser = argparse.ArgumentParser(
217 |         description='Convert a tar or zip archive to a Barecat archive.'
218 |     )
219 |     # 2 positional args are the tar file and the target barecat file
220 |     parser.add_argument('archive_file', type=str, help='path to the tar or zip file')
221 |     parser.add_argument('barecat_file', type=str, help='path to the target barecat file')
222 | 
223 |     parser.add_argument(
224 |         '--shard-size-limit',
225 |         type=str,
226 |         default=None,
227 |         help='maximum size of a shard in bytes (if not specified, '
228 |         'all files will be concatenated into a single shard)',
229 |     )
230 |     parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
231 |     args = parser.parse_args()
232 |     impl.archive2barecat(
233 |         src_path=args.archive_file,
234 |         target_path=args.barecat_file,
235 |         shard_size_limit=parse_size(args.shard_size_limit),
236 |         overwrite=args.overwrite,
237 |     )
238 | 
239 | 
240 | def barecat2archive():
241 |     parser = argparse.ArgumentParser(
242 |         description='Convert a Barecat archive to a tar or zip archive.'
243 |     )
244 |     # 2 positional args are the barecat file and the target tar file
245 |     parser.add_argument('barecat_file', type=str, help='path to the barecat file')
246 |     parser.add_argument('archive_file', type=str, help='path to the target archive file')
247 | 
248 |     args = parser.parse_args()
249 |     impl.barecat2archive(src_path=args.barecat_file, target_path=args.archive_file)
250 | 
251 | 
252 | 
253 | 
254 | def print_ncdu_json():
255 |     parser = argparse.ArgumentParser(
256 |         description='Print the contents of a Barecat as JSON in the format expected by ncdu.'
257 |     )
258 |     parser.add_argument('file', type=str, help='path to the index file')
259 |     args = parser.parse_args()
260 |     impl.print_ncdu_json(args.file)
261 | 
--------------------------------------------------------------------------------
/src/barecat/cli_impl.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import itertools
3 | import json
4 | import os
5 | import os.path as osp
6 | import shutil
7 | import stat
8 | import sys
9 | import time
10 | 
11 | import barecat.util
12 | from barecat.archive_formats import (
13 |     get_archive_writer,
14 |     iter_archive,
15 |     iter_archive_nocontent,
16 |     TarWriter,
17 | )
18 | from barecat.consumed_threadpool import ConsumedThreadPool
19 | from barecat.core import barecat as barecat_
20 | from barecat.core.index import BarecatDirInfo, BarecatFileInfo, Order
21 | from barecat.core.sharder import Sharder
22 | from barecat.progbar import progressbar
23 | 
24 | 
25 | def create_from_stdin_paths(
26 |     target_path, shard_size_limit, zero_terminated=False, overwrite=False, workers=None
27 | ):
28 |     iterator = generate_from_stdin(zero_terminated)
29 |     create(iterator, target_path, shard_size_limit, overwrite, workers)
30 | 
31 | 
32 | def create_recursive(target_path, shard_size_limit, roots, overwrite, strip_root, workers=None):
33 |     iterator = generate_from_walks(roots, strip_root)
34 |     create(iterator, target_path, shard_size_limit, overwrite, workers)
35 | 
36 | 
37 | def generate_from_stdin(zero_terminated=False):
38 |     if zero_terminated:
39 |         input_paths = iterate_zero_terminated(sys.stdin.buffer)
40 |     else:
41 |         input_paths = (l.rstrip('\n') for l in sys.stdin)
42 | 
43 |     for input_path in progressbar(input_paths, desc='Packing files', unit=' files'):
44 |         yield input_path, input_path
45 | 
46 | 
47 | def generate_from_walks(roots, strip_root):
48 |     for root in roots:
49 |         if not strip_root:
50 |             yield root, osp.basename(root)
51 | 
52 |         for dirpath, subdirnames, filenames in os.walk(root):
53 | 
for entryname in itertools.chain(filenames, subdirnames): 54 | full_path = osp.join(dirpath, entryname) 55 | relpath = osp.relpath(full_path, start=root) 56 | if not strip_root: 57 | store_path = osp.join(osp.basename(root), relpath) 58 | else: 59 | store_path = relpath 60 | yield full_path, store_path 61 | 62 | 63 | def create( 64 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False, workers=8 65 | ): 66 | if workers is None: 67 | create_without_workers( 68 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite 69 | ) 70 | else: 71 | create_with_workers( 72 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite, workers 73 | ) 74 | 75 | 76 | def create_without_workers( 77 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False 78 | ): 79 | with barecat_.Barecat( 80 | target_path, 81 | shard_size_limit=shard_size_limit, 82 | readonly=False, 83 | overwrite=overwrite, 84 | append_only=False, 85 | ) as writer: 86 | for filesys_path, store_path in filesys_and_store_path_pairs: 87 | writer.add_by_path(filesys_path, store_path) 88 | 89 | 90 | def create_with_workers( 91 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False, workers=8 92 | ): 93 | if overwrite and barecat.util.exists(target_path): 94 | barecat.util.remove(target_path) 95 | 96 | with ( 97 | Sharder( 98 | target_path, 99 | shard_size_limit=shard_size_limit, 100 | readonly=False, 101 | append_only=False, 102 | threadsafe=True, 103 | allow_writing_symlinked_shard=False, 104 | ) as sharder, 105 | ConsumedThreadPool( 106 | index_writer_main, main_args=(f'{target_path}-sqlite-index',), max_workers=workers 107 | ) as ctp, 108 | ): 109 | for filesys_path, store_path in filesys_and_store_path_pairs: 110 | statresult = os.stat(filesys_path) 111 | 112 | if stat.S_ISDIR(statresult.st_mode): 113 | dinfo = BarecatDirInfo(path=store_path) 114 | dinfo.fill_from_statresult(statresult) 115 | ctp.submit(userdata=dinfo) 116 | else: 117 | finfo = BarecatFileInfo(path=store_path) 118 | finfo.fill_from_statresult(statresult) 119 | finfo.shard, finfo.offset = sharder.reserve(finfo.size) 120 | ctp.submit( 121 | sharder.add_by_path, 122 | userdata=finfo, 123 | args=(filesys_path, finfo.shard, finfo.offset, finfo.size), 124 | kwargs=dict(raise_if_cannot_fit=True), 125 | ) 126 | 127 | 128 | def index_writer_main(target_path, future_iter): 129 | with barecat_.Index(target_path, readonly=False) as index_writer: 130 | for future in future_iter: 131 | info = future.userdata 132 | if isinstance(info, BarecatDirInfo): 133 | index_writer.add_dir(info) 134 | continue 135 | 136 | shard_real, offset_real, size_real, crc32c = future.result() 137 | info.shard = shard_real 138 | info.offset = offset_real 139 | info.crc32c = crc32c 140 | 141 | if info.size != size_real: 142 | raise ValueError('Size mismatch!') 143 | index_writer.add_file(info) 144 | 145 | 146 | def extract(barecat_path, target_directory): 147 | with barecat_.Barecat(barecat_path) as reader: 148 | for path_in_archive in progressbar(reader, desc='Extracting files', unit=' files'): 149 | target_path = osp.join(target_directory, path_in_archive) 150 | os.makedirs(osp.dirname(target_path), exist_ok=True) 151 | with open(target_path, 'wb') as output_file: 152 | shutil.copyfileobj(reader.open(path_in_archive), output_file) 153 | 154 | 155 | def merge(source_paths, target_path, shard_size_limit, overwrite=False, ignore_duplicates=False): 156 | with barecat_.Barecat( 157 | target_path, 
shard_size_limit=shard_size_limit, readonly=False, overwrite=overwrite
158 |     ) as writer:
159 |         for source_path in source_paths:
160 |             print(f'Merging files from {source_path}')
161 |             writer.merge_from_other_barecat(source_path, ignore_duplicates=ignore_duplicates)
162 | 
163 | 
164 | def merge_symlink(source_paths, target_path, overwrite=False, ignore_duplicates=False):
165 |     index_path = f'{target_path}-sqlite-index'
166 |     if overwrite and osp.exists(index_path):
167 |         os.remove(index_path)
168 | 
169 |     with barecat_.Index(index_path, readonly=False) as index_writer:
170 |         c = index_writer.cursor
171 |         c.execute("COMMIT")
172 |         c.execute('PRAGMA synchronous=OFF')
173 |         c.execute('PRAGMA journal_mode=OFF')
174 | 
175 |         i_out_shard = 0
176 |         for source_path in source_paths:
177 |             index_writer.merge_from_other_barecat(
178 |                 f'{source_path}-sqlite-index', ignore_duplicates=ignore_duplicates
179 |             )
180 |             for shard_path in sorted(glob.glob(f'{source_path}-shard-*')):
181 |                 os.symlink(
182 |                     osp.relpath(shard_path, start=osp.dirname(target_path)),
183 |                     f'{target_path}-shard-{i_out_shard:05d}',
184 |                 )
185 |                 i_out_shard += 1
186 | 
187 | 
188 | def write_index(dictionary, target_path):
189 |     with barecat_.Index(target_path, readonly=False) as index_writer:
190 |         for path, (shard, offset, size) in dictionary.items():
191 |             index_writer.add_file(
192 |                 BarecatFileInfo(path=path, shard=shard, offset=offset, size=size)
193 |             )
194 | 
195 | 
196 | def read_index(path):
197 |     with barecat_.Index(path) as reader:
198 |         return dict(reader.items())
199 | 
200 | 
201 | def iterate_zero_terminated(fileobj):
202 |     partial_path = b''
203 |     while chunk := fileobj.read(4096):
204 |         parts = chunk.split(b'\x00')
205 |         parts[0] = partial_path + parts[0]
206 |         partial_path = parts.pop()
207 | 
208 |         for input_path in parts:
209 |             input_path = input_path.decode()
210 |             yield input_path
211 | 
212 | 
213 | def archive2barecat(src_path, target_path, shard_size_limit, overwrite=False):
214 |     with barecat_.Barecat(
215 |         target_path, shard_size_limit=shard_size_limit, readonly=False, overwrite=overwrite
216 |     ) as writer:
217 |         for file_or_dir_info, fileobj in iter_archive(src_path):
218 |             writer.add(file_or_dir_info, fileobj=fileobj, dir_exist_ok=True)
219 | 
220 | 
221 | def wrap_archive(src_path, target_path, overwrite=False):
222 |     index_path = f'{target_path}-sqlite-index'
223 |     if overwrite and osp.exists(index_path):
224 |         os.remove(index_path)
225 | 
226 |     with barecat_.Index(index_path, readonly=False) as index:
227 |         for file_or_dir_info in iter_archive_nocontent(src_path):
228 |             index.add(file_or_dir_info)
229 | 
230 |     os.symlink(src_path, f'{target_path}-shard-00000')
231 | 
232 | 
233 | def barecat2archive(src_path, target_path):
234 |     with barecat_.Barecat(src_path, readonly=True) as bc:
235 |         with get_archive_writer(target_path) as target_archive:
236 |             infos = bc.index.iter_all_infos(order=Order.PATH)
237 |             num_total = bc.index.num_files + bc.index.num_dirs
238 |             for entry in progressbar(infos, total=num_total, desc='Writing', unit=' entries'):
239 |                 if isinstance(entry, BarecatDirInfo):
240 |                     target_archive.add(entry)
241 |                 else:
242 |                     with bc.open(entry.path) as file_in_barecat:
243 |                         target_archive.add(entry, fileobj=file_in_barecat)
244 | 
245 | 
246 | def print_ncdu_json(path):
247 |     timestamp = time.time()
248 |     import importlib.metadata
249 | 
250 |     progver = importlib.metadata.version('barecat')
251 |     progver = '.'.join(progver.split('.')[:3])
252 | 
253 |     print(f'[1,1,{{"progname":"barecat","progver": "{progver}","timestamp":{timestamp}}},')
254 |     with barecat_.Index(path) as index_reader:
255 |         _print_ncdu_json(index_reader, '')
256 |     print(']')
257 | 
258 | 
259 | def _print_ncdu_json(index_reader, dirpath):
260 |     basename = '/' if dirpath == '' else osp.basename(dirpath)
261 | 
262 |     print('[', json.dumps(dict(name=basename, asize=4096, ino=0)), end='')
263 |     infos = index_reader.listdir_infos(dirpath)
264 |     file_infos = [f for f in infos if isinstance(f, BarecatFileInfo)]
265 |     subdir_infos = [d for d in infos if isinstance(d, BarecatDirInfo)]
266 |     del infos
267 | 
268 |     if file_infos:
269 |         filedump = json.dumps(
270 |             [dict(name=osp.basename(fi.path), asize=fi.size, dsize=fi.size, ino=0) for fi in file_infos]
271 |         )
272 |         print(',', filedump[1:-1], end='')
273 |         del file_infos
274 | 
275 |     for subdir in subdir_infos:
276 |         print(',')
277 |         _print_ncdu_json(index_reader, subdir.path)
278 | 
279 |     print(']', end='')
280 | 
--------------------------------------------------------------------------------
/src/barecat/core/sharder.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import os.path as osp
4 | import shutil
5 | from contextlib import AbstractContextManager
6 | 
7 | import crc32c as crc32c_lib
8 | from barecat.common import FileSection
9 | from barecat.util import (
10 |     copyfileobj,
11 |     copyfileobj_crc32c,
12 |     open_,
13 |     raise_if_readonly,
14 |     reopen,
15 |     write_zeroes,
16 | )
17 | 
18 | 
19 | class Sharder(AbstractContextManager):
20 |     def __init__(
21 |         self,
22 |         path,
23 |         shard_size_limit=None,
24 |         readonly=True,
25 |         append_only=False,
26 |         threadsafe=False,
27 |         allow_writing_symlinked_shard=False,
28 |     ):
29 | 
30 |         self.path = path
31 |         self.readonly = readonly
32 |         self.append_only = append_only
33 |         self.threadsafe = threadsafe
34 |         self.allow_writing_symlinked_shard = allow_writing_symlinked_shard
35 | 
36 |         self.shard_size_limit = shard_size_limit
37 | 
38 |         if readonly:
39 |             self.shard_mode_nonlast = 'rb'
40 |             self.shard_mode_last_existing = 'rb'
41 |             self.shard_mode_new = 'rb'
42 |         elif append_only:
43 |             self.shard_mode_nonlast = 'rb'
44 |             self.shard_mode_last_existing = 'a+b'
45 |             self.shard_mode_new = 'ax+b'
46 |         else:
47 |             self.shard_mode_nonlast = 'r+b'
48 |             self.shard_mode_last_existing = 'r+b'
49 |             self.shard_mode_new = 'x+b'
50 | 
51 |         self._shard_files = None
52 |         if threadsafe:
53 |             import multiprocessing_utils
54 | 
55 |             self.local = multiprocessing_utils.local()
56 |         else:
57 |             self.local = None
58 | 
59 |     # READING
60 |     def readinto_from_address(self, shard, offset, buffer, expected_crc32c=None):
61 |         shard_file = self.shard_files[shard]
62 |         shard_file.seek(offset)
63 |         num_read = shard_file.readinto(buffer)
64 |         if expected_crc32c is not None and crc32c_lib.crc32c(buffer[:num_read]) != expected_crc32c:
65 |             raise ValueError('CRC32C mismatch')
66 |         return num_read
67 | 
68 |     def read_from_address(self, shard, offset, size, expected_crc32c=None):
69 |         shard_file = self.shard_files[shard]
70 |         shard_file.seek(offset)
71 |         data = shard_file.read(size)
72 |         if expected_crc32c is not None and crc32c_lib.crc32c(data) != expected_crc32c:
73 |             raise ValueError('CRC32C mismatch')
74 |         return data
75 | 
76 |     def open_from_address(self, shard, offset, size, mode='r'):
77 |         return FileSection(self.shard_files[shard], offset, size, readonly=mode in ('r', 'rb'))
78 | 
79 |     # WRITING
80 |     @raise_if_readonly
81 |     def add_by_path(self, filesys_path, shard, offset, size, raise_if_cannot_fit=False):
82 |         with 
open(filesys_path, 'rb') as in_file: 83 | return self.add( 84 | shard, offset, size, fileobj=in_file, raise_if_cannot_fit=raise_if_cannot_fit 85 | ) 86 | 87 | @raise_if_readonly 88 | def reopen_current_shard(self, mode): 89 | return self.reopen_shard(self.num_shards - 1, mode) 90 | 91 | @raise_if_readonly 92 | def reopen_shard(self, shard_number, mode): 93 | if mode != 'rb' and shard_number != self.num_shards - 1: 94 | self.raise_if_append_only( 95 | 'Cannot change mode of non-last shard in an append-only Barecat' 96 | ) 97 | self.shard_files[shard_number] = reopen(self.shard_files[shard_number], mode) 98 | return self.shard_files[shard_number] 99 | 100 | @raise_if_readonly 101 | def reopen_shards(self): 102 | for i in range(self.num_shards): 103 | if i == self.num_shards - 1: 104 | mode = self.shard_mode_last_existing 105 | else: 106 | mode = self.shard_mode_nonlast 107 | self.reopen_shard(i, mode) 108 | 109 | @raise_if_readonly 110 | def start_new_shard(self): 111 | self.reopen_current_shard(self.shard_mode_nonlast) 112 | new_shard_file = open_(f'{self.path}-shard-{self.num_shards:05d}', self.shard_mode_new) 113 | self.shard_files.append(new_shard_file) 114 | return new_shard_file 115 | 116 | @raise_if_readonly 117 | def start_new_shard_and_transfer_last_file(self, offset, size): 118 | self.raise_if_readonly('Cannot add to a read-only Barecat') 119 | 120 | old_shard_file = self.reopen_current_shard('r+b') 121 | new_shard_file = open_(f'{self.path}-shard-{self.num_shards:05d}', self.shard_mode_new) 122 | old_shard_file.seek(offset) 123 | copyfileobj(old_shard_file, new_shard_file, size) 124 | old_shard_file.truncate(offset) 125 | self.reopen_current_shard(self.shard_mode_nonlast) 126 | 127 | self.shard_files.append(new_shard_file) 128 | return new_shard_file 129 | 130 | @raise_if_readonly 131 | def add( 132 | self, 133 | shard=None, 134 | offset=None, 135 | size=None, 136 | data=None, 137 | fileobj=None, 138 | bufsize=shutil.COPY_BUFSIZE, 139 | raise_if_cannot_fit=False, 140 | ): 141 | if data is None and fileobj is None: 142 | raise ValueError('Either data or fileobj must be provided') 143 | if data is not None and fileobj is not None: 144 | raise ValueError('Both data and fileobj cannot be provided') 145 | if data is not None and size is not None and size != len(data): 146 | raise ValueError('Specified size does not match the length of the data') 147 | if shard is None and offset is not None: 148 | raise ValueError('Offset cannot be specified without a shard') 149 | if shard is not None and offset is None: 150 | raise ValueError('Shard cannot be specified without an offset') 151 | 152 | if size is None and data is not None: 153 | size = len(data) 154 | 155 | if shard is None: 156 | shard_file = self.shard_files[-1] 157 | shard = self.num_shards - 1 158 | offset = shard_file.seek(0, os.SEEK_END) 159 | else: 160 | self.ensure_open_shards(shard) 161 | shard_file = self.shard_files[shard] 162 | shard_file.seek(offset) 163 | 164 | offset_real = offset 165 | shard_real = shard 166 | if size is not None: 167 | if size > self.shard_size_limit: 168 | raise ValueError(f'File is too large to fit into a shard') 169 | if offset + size > self.shard_size_limit: 170 | if raise_if_cannot_fit: 171 | raise ValueError(f'File does not fit in the shard') 172 | shard_file = self.start_new_shard() 173 | offset_real = 0 174 | shard_real = self.num_shards - 1 175 | 176 | if data is not None: 177 | if not isinstance(data, (bytes, bytearray, memoryview)): 178 | raise ValueError( 179 | 'Data must be bytes, 
bytearray or memoryview. Are you using auto_codec/register_codec wrong?' 180 | ) 181 | shard_file.write(data) 182 | crc32c = crc32c_lib.crc32c(data) 183 | size_real = len(data) 184 | else: 185 | size_real, crc32c = copyfileobj_crc32c(fileobj, shard_file, size, bufsize) 186 | if size is not None and size != size_real: 187 | raise ValueError(f'Size mismatch! Expected {size}, got only {size_real}') 188 | 189 | if offset_real + size_real > self.shard_size_limit: 190 | if raise_if_cannot_fit: 191 | 192 | raise ValueError('File does not fit in the shard') 193 | self.start_new_shard_and_transfer_last_file(offset_real, size_real) 194 | offset_real = 0 195 | shard_real = self.num_shards - 1 196 | 197 | return shard_real, offset_real, size_real, crc32c 198 | 199 | def reserve(self, size): 200 | if size > self.shard_size_limit: 201 | raise ValueError(f'File is too large to fit into a shard') 202 | 203 | shard_file = self.shard_files[-1] 204 | offset = shard_file.seek(0, os.SEEK_END) 205 | if offset + size > self.shard_size_limit: 206 | shard_file = self.start_new_shard() 207 | offset = 0 208 | 209 | shard_file.seek(offset) 210 | write_zeroes(shard_file, size) 211 | shard_file.flush() 212 | return self.num_shards - 1, offset 213 | 214 | def ensure_open_shards(self, shard_id): 215 | if self.num_shards < shard_id + 1: 216 | for i in range(self.num_shards, shard_id + 1): 217 | self.shard_files.append( 218 | open_(f'{self.path}-shard-{i:05d}', mode=self.shard_mode_nonlast) 219 | ) 220 | 221 | def open_shard_files(self): 222 | shard_paths = sorted(glob.glob(f'{self.path}-shard-?????')) 223 | if ( 224 | not self.readonly 225 | and not self.allow_writing_symlinked_shard 226 | and any(osp.islink(p) for p in shard_paths) 227 | ): 228 | raise ValueError( 229 | 'Writing symlinked shards was disabled in this Barecat ' 230 | '(allow_writing_symlinked_shard on the constructor)' 231 | ) 232 | 233 | shard_files_nonlast = [open_(p, mode=self.shard_mode_nonlast) for p in shard_paths[:-1]] 234 | last_shard_name = f'{self.path}-shard-{len(shard_files_nonlast):05d}' 235 | try: 236 | last_shard_file = open_(last_shard_name, mode=self.shard_mode_last_existing) 237 | except FileNotFoundError: 238 | if self.readonly: 239 | raise 240 | last_shard_file = open_(last_shard_name, mode=self.shard_mode_new) 241 | 242 | return shard_files_nonlast + [last_shard_file] 243 | 244 | def truncate_all_to_logical_size(self, logical_shard_ends): 245 | shard_files = self.shard_files 246 | for i in range(self.num_shards - 1, 0, -1): 247 | if logical_shard_ends[i] == 0: 248 | shard_files[i].truncate(0) 249 | shard_files[i].close() 250 | os.remove(shard_files[i].name) 251 | del shard_files[i] 252 | else: 253 | break 254 | for i, f in enumerate(self.shard_files): 255 | f.truncate(logical_shard_ends[i]) 256 | self.reopen_current_shard(self.shard_mode_last_existing) 257 | 258 | def close(self): 259 | for f in self.shard_files: 260 | f.close() 261 | 262 | def raise_if_readonly(self, message): 263 | if self.readonly: 264 | raise ValueError(message) 265 | 266 | def raise_if_append_only(self, message): 267 | if self.append_only: 268 | raise ValueError(message) 269 | 270 | def physical_shard_end(self, shard_number): 271 | return self.shard_files[shard_number].seek(0, os.SEEK_END) 272 | 273 | @property 274 | def num_shards(self): 275 | return len(self.shard_files) 276 | 277 | @property 278 | def total_physical_size_seek(self): 279 | return sum(self.physical_shard_end(i) for i in range(self.num_shards)) 280 | 281 | @property 282 | def 
total_physical_size_stat(self): 283 | return sum(osp.getsize(f.name) for f in self.shard_files) 284 | 285 | # THREADSAFE 286 | @property 287 | def shard_files(self): 288 | if self.local is None: 289 | if self._shard_files is None: 290 | self._shard_files = self.open_shard_files() 291 | return self._shard_files 292 | try: 293 | return self.local.shard_files 294 | except AttributeError: 295 | self.local.shard_files = self.open_shard_files() 296 | return self.local.shard_files 297 | 298 | def __exit__(self, exc_type, exc_val, exc_tb): 299 | self.close() 300 | -------------------------------------------------------------------------------- /docs/abbrev_long.bib: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%% Journals %%%%%%%%%%%%%%%% 2 | @string{IJCV = "International Journal of Computer Vision (IJCV)"} 3 | @string{CVIU = "Computer Vision and Image Understanding (CVIU)"} 4 | @string{PR = "Pattern Recognition"} 5 | @string{PRL = "Pattern Recognition Letters"} 6 | 7 | @string{ML = "Machine Learning"} 8 | @string{AI = "Artificial Intelligence"} 9 | @string{AR = "Autonomous Robots"} 10 | @string{MVA = "Machine Vision and Applications"} 11 | @string{IVC = "Image and Vision Computing"} 12 | @string{BBS = "Behavioral and Brain Sciences (BBS)"} 13 | @string{VR = "Vision Research"} 14 | @string{IR = "Information Retrieval"} 15 | @string{NN = "Neural Networks"} 16 | @string{CAG = "Computers \& Graphics"} 17 | @string{CVGIP = "Computer Vision, Graphics, and Image Processing (CVGIP)"} 18 | @string{CVGIPIU = "CVGIP: Image Understanding"} 19 | @string{PP = "Perception \& Psychophysics"} 20 | @string{FTCGV = "Foundations and Trends in Computer Graphics and Vision"} 21 | @string{AdvRob = "Advanced Robotics"} 22 | 23 | @string{Nature = "Nature"} 24 | @string{Science = "Science"} 25 | @string{Mechatronics = "Mechatronics"} 26 | @string{NRN = "Nature Reviews Neuroscience"} 27 | @string{NM = "Nature Methods"} 28 | @string{PHY = "Physical Review E"} 29 | @string{PsychRev = "Psychological Review"} 30 | 31 | @string{JMLR = "Journal of Machine Learning Research (JMLR)"} 32 | @string{JSC = "Journal of Scientific Computing"} 33 | @string{JCN = "Journal of Cognitive Neuroscience"} 34 | @string{JEPHPP = "Journal of Experimental Psychology: Human Perception and Performance"} 35 | @string{JECP = "Journal of Experimental Child Psychology"} 36 | @string{JB = "Journal of Biomechanics"} 37 | 38 | @string{EURASIP = "EURASIP Journal on Advances in Signal Processing"} 39 | @string{PRESENCE = "Presence: Teleoperators and Virtual Environments"} 40 | @string{BMB = "The Bulletin of Mathematical Biophysics"} 41 | 42 | @string{TVC = "The Visual Computer"} 43 | @string{TJSC = "The Journal of Supercomputing"} 44 | 45 | % IEEE 46 | @string{PIEEE = "Proceedings of the IEEE"} 47 | @string{RAL = "IEEE Robotics and Automation Letters (RA-L)"} 48 | @string{CGA = "IEEE Computer Graphics and Applications"} 49 | @string{IEEEA = "IEEE Access"} 50 | @string{TPAMI = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"} 51 | @string{PAMI = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"} 52 | @string{TC = "IEEE Transactions on Communications"} 53 | @string{TCyb = "IEEE Transactions on Cybernetics"} 54 | @string{TSE = "IEEE Transactions on Software Engineering"} 55 | @string{TIV = "IEEE Transactions on Intelligent Vehicles"} 56 | @string{TIP = "IEEE Transactions on Image Processing"} 57 | @string{TOR = "IEEE Transactions on Robotics"} 58 | 
@string{TAC = "IEEE Transactions on Automatic Control"} 59 | @string{TITS = "IEEE Transactions on Intelligent Transportation Systems (T-ITS)"} 60 | @string{TOC = "IEEE Transactions on Computers"} 61 | @string{TVT = "IEEE Transactions on Vehicular Technologies"} 62 | @string{TNN = "IEEE Transactions on Neural Networks"} 63 | @string{THMS = "IEEE Transactions on Human-Machine Systems"} 64 | @string{TCSVT = "IEEE Transactions on Circuits and Systems for Video Technology"} 65 | @string{TBIOM = "IEEE Transactions on Biometrics, Behavior, and Identity Science (T-BIOM)"} 66 | @string{TIT = "IEEE Transactions on Information Theory"} 67 | @string{TVCG = "IEEE Transactions on Visualization and Computer Graphics (TVCG)"} 68 | @string{TSSC = "IEEE Transactions on Systems Science and Cybernetics"} 69 | @string{IRETIT= "IRE Transactions on Information Theory"} 70 | @string{IJTEHM= "IEEE Journal of Translational Engineering in Health and Medicine"} 71 | 72 | 73 | % ACM 74 | @string{TOCHI = "ACM Transactions on Computer-Human Interaction (TOCHI)"} 75 | @string{TOG = "ACM Transactions on Graphics (TOG)"} 76 | @string{CACM = "Communications of the ACM (CACM)"} 77 | @string{IMWUT = "Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)"} 78 | @string{CSUR = "ACM Computing Surveys (CSUR)"} 79 | @string{THRI = "ACM Transactions on Human-Robot Interaction"} 80 | 81 | @string{AnnStat = "Annals of Statistics"} 82 | @string{JC = "Journal of Classification"} 83 | @string{IJRR = "International Journal of Robotics Research (IJRR)"} 84 | @string{RSS = "Robotics: Science and Systems (RSS)"} 85 | 86 | @string{PLOSOne = "PLOS One"} 87 | @string{SMO = "Sports Medicine -- Open"} 88 | @string{IJMIR = "International Journal of Multimedia Information Retrieval (IJMIR)"} 89 | 90 | @string{BiolCyb = "Biological Cybernetics"} 91 | @string{Psychomet = "Psychometrika"} 92 | @string{Biotelem = "Biotelemetry"} 93 | @string{NC = "Neural Computation"} 94 | @string{Neurocomputing = "Neurocomputing"} 95 | @string{PhilosMag = "London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science"} 96 | 97 | @string{TST = "Tsinghua Science and Technology"} 98 | @string{VRIH = "Virtual Reality \& Intelligent Hardware (VRIH)"} 99 | @string{AR = "Autonomous Robots Journal"} 100 | @string{ISPRS = "ISPRS Journal of Photogrammetry and Remote Sensing (P\&RS)"} 101 | @string{MMS = "Multimedia Systems"} 102 | @string{SSS = "Social Studies of Science"} 103 | @string{SIREV = "SIAM Review"} 104 | 105 | @string{Sensors = "Sensors"} 106 | @string{Electronics = "Electronics"} 107 | 108 | @string{ARVC = "Annual Review of Vision Science"} 109 | @string{ARP = "Annual Review of Psychology"} 110 | @string{PRSLB = "Proceedings of the Royal Society of London. 
Series B, Biological Sciences"} 111 | @string{PRSA = "Proceedings of the Royal Society A"} 112 | 113 | @string{TJP = "The Journal of Physiology"} 114 | @string{USSRCMMP = "USSR Computational Mathematics and Mathematical Physics"} 115 | @string{CRHSAS = "Comptes rendus hebdomadaires des séances de l'Académie des sciences"} 116 | 117 | 118 | %%%%%%%%%%%%%%%%%%%%% Conferences %%%%%%%%%%%%%% 119 | @string{CVPR = "IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"} 120 | @string{ICCV = "IEEE/CVF International Conference on Computer Vision (ICCV)"} 121 | @string{WACV = "IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)"} 122 | 123 | @string{ECCV = "European Conference on Computer Vision (ECCV)"} 124 | @string{ACCV = "Asian Conference on Computer Vision (ACCV)"} 125 | @string{BMVC = "British Machine Vision Conference (BMVC)"} 126 | @string{DAGM = "DAGM Annual Pattern Recognition Symposium"} 127 | @string{GCPR = "DAGM German Conference on Pattern Recognition (GCPR)"} 128 | 129 | @string{NIPS = "Advances in Neural Information Processing Systems (NIPS)"} 130 | @string{NeurIPS = "Advances in Neural Information Processing Systems (NeurIPS)"} 131 | @string{NeurIPSDB = "Neural Information Processing Systems: Datasets and Benchmarks Track"} 132 | 133 | @string{TDV = "International Conference on 3D Vision (3DV)"} 134 | @string{ICML = "International Conference on Machine Learning (ICML)"} 135 | @string{ICLR = "International Conference on Learning Representations (ICLR)"} 136 | @string{ICPR = "International Conference on Pattern Recogntion (ICPR)"} 137 | @string{CAIP = "International Conference on Analysis of Images and Patterns (CAIP)"} 138 | @string{ICIAP = "International Conference on Image Analysis and Processing (ICIAP)"} 139 | @string{ICIAR = "International Conference on Image Analysis and Recognition (ICIAR)"} 140 | 141 | @string{ISCS = "IEEE International Symposium on Circuits and Systems (ISCAS)"} 142 | @string{FG = "IEEE International Conference on Automatic Face and Gesture Recognition (FG)"} 143 | @string{CDC = "IEEE Conference on Decision and Control (CDC)"} 144 | @string{IROS = "IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)"} 145 | @string{ICRA = "IEEE International Conference on Robotics and Automation (ICRA)"} 146 | @string{IVS = "IEEE Intelligent Vehicles Symposium (IV)"} 147 | @string{ICASSP = "IEEE Conference on Acoustics, Speech and Signal Processing (ICASSP)"} 148 | @string{ITW = "IEEE Information Theory Workshop (ITW)"} 149 | @string{ICIP = "IEEE International Conference on Image Processing (ICIP)"} 150 | @string{ICME = "IEEE International Conference on Multimedia \& Expo (ICME)"} 151 | @string{CITS = "IEEE Conference on Intelligent Transportation Systems (ITSC)"} 152 | @string{RSS = "Robotics: Science and Systems (RSS)"} 153 | 154 | @string{SIGGRAPH = "ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH)"} 155 | @STRING{SIGGRAPHAsia = "ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH Asia)"} 156 | @string{CHI = "ACM Conference on Human Factors in Computing Systems (CHI)"} 157 | @string{MMSys = "ACM Multimedia Systems Conference (MMSys)"} 158 | @string{SIGMOD = "ACM SIGMOD International Conference on Management of Data"} 159 | @string{MM = "ACM International Conference on Multimedia"} 160 | @string{KDD = "ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)"} 161 | @string{AAAI = "AAAI Conference on Artificial Intelligence"} 162 | @string{AAAI = "AAAI Conference on Artificial Intelligence"} 
163 | @string{IJCAI = "International Joint Conference on Artificial Intelligence (IJCAI)"} 164 | 165 | @string{ACC = "American Control Conference (ACC)"} 166 | @string{WAPCV = "International Workshop on Attention in Cognitive Systems (WAPCV)"} 167 | @string{COLT92 = "Annual Workshop on Computational Learning Theory (COLT)"} 168 | 169 | @string{SIBGRAPI = "SIBGRAPI Conference on Graphics, Patterns and Images"} 170 | @string{ICIRA = "International Conference on Intelligent Robotics and Applications (ICIRA)"} 171 | 172 | @string{AISTAT = "International Conference on Artificial Intelligence and Statistics (AISTATS)"} 173 | @string{AISTATS = "International Conference on Artificial Intelligence and Statistics (AISTATS)"} 174 | 175 | @string{SCIA = "Scandinavian Conference on Image Analysis (SCIA)"} 176 | @string{EUROCOLT = "European Conference on Computational Learning Theory (EuroCOLT)"} 177 | @string{ICVS = "International Conference on Computer Vision Systems (ICVS)"} 178 | @string{EMMCVPR = "International Conference on Energy Minimization Methods in Computer Vision and Pattern Recognition (EMMCVPR)"} 179 | @string{IJCNN = "International Joint Conference on Neural Networks (IJCNN)"} 180 | 181 | @string{MICCAI = "International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)"} 182 | @string{ICANN = "International Conference on Artificial Neural Networks (ICANN)"} 183 | @string{ISMIR = "International Society for Music Information Retrieval Conference (ISMIR)"} 184 | @string{AMDO = "International Conference on Articulated Motion and Deformable Objects (AMDO)"} 185 | @string{Allerton = "Annual Allerton Conference on Communication, Control, and Computing"} 186 | @string{OSDI = "USENIX Symposium on Operating Systems Design and Implementation (OSDI)"} 187 | 188 | @string{BRACIS = "Brazilian Conference on Intelligent Systems (BRACIS)"} 189 | @string{MIDL = "Medical Imaging with Deep Learning (MIDL)"} 190 | @string{TDBODYTECH = "International Conference and Exhibition on 3D Body Scanning and Processing Technologies (3DBODY.TECH)"} 191 | @string{IAS = "International Conference on Intelligent Autonomous Systems"} 192 | @string{CoRL = "Conference on Robot Learning"} 193 | @string{CRV = "Conference on Computer and Robot Vision"} 194 | @string{ICONIP = "International Conference on Neural Information Processing"} 195 | @string{SGP = "Symposium on Geometry Processing"} 196 | 197 | 198 | @string{WACV_until_2016 = "IEEE Workshop on Applications of Computer Vision (WACV)"} 199 | %%%%%%%%%%%%%%%%%%%%% Workshops %%%%%%%%%%%%%% 200 | @string{ICCVW = "IEEE International Conference on Computer Vision -- Workshops (ICCVW)"} 201 | @string{ECCVW = "European Conference on Computer Vision -- Workshops (ECCVW)"} 202 | @string{CVPRW = "IEEE Conference on Computer Vision and Pattern Recognition -- Workshops (CVPRW)"} 203 | @string{IROSW = "IEEE/RSJ International Conference on Intelligent Robots and Systems -- Workshops (IROSW)"} 204 | @string{WACVW = "IEEE Winter Conference on Applications of Computer Vision -- Workshops (WACVW)"} 205 | @string{MICCAIW = "International Conference on Medical Image Computing and Computer Assisted Intervention -- Workshops (MICCAIW)"} 206 | 207 | @string{MMWVSCC = "ACM Multimedia Conference (MM) -- Workshop on Visual Analysis in Smart and Connected Communities (VSCC)"} 208 | -------------------------------------------------------------------------------- /src/barecat/common.py: -------------------------------------------------------------------------------- 1 | 
import io 2 | import os 3 | from datetime import datetime 4 | from enum import Flag, auto 5 | from typing import Union, TYPE_CHECKING, Optional 6 | from barecat.util import datetime_to_ns, normalize_path, ns_to_datetime 7 | 8 | if TYPE_CHECKING: 9 | from barecat import BarecatEntryInfo 10 | 11 | SHARD_SIZE_UNLIMITED = (1 << 63) - 1 #: An extremely large integer, representing unlimited size 12 | 13 | 14 | class BarecatEntryInfo: 15 | """ 16 | Base class for file and directory information classes. 17 | 18 | The two subclasses are :class:`barecat.BarecatFileInfo` and :class:`barecat.BarecatDirInfo`. 19 | 20 | Args: 21 | path: path to the file or directory 22 | mode: file mode, i.e. permissions 23 | uid: user ID 24 | gid: group ID 25 | mtime_ns: last modification time in nanoseconds since the Unix epoch 26 | """ 27 | 28 | __slots__ = ('_path', 'mode', 'uid', 'gid', 'mtime_ns') 29 | 30 | def __init__( 31 | self, 32 | path: Optional[str] = None, 33 | mode: Optional[int] = None, 34 | uid: Optional[int] = None, 35 | gid: Optional[int] = None, 36 | mtime_ns: Optional[Union[int, datetime]] = None, 37 | ): 38 | self._path = normalize_path(path) 39 | self.mode = mode 40 | """File mode, i.e., permissions.""" 41 | 42 | self.uid = uid 43 | """User ID.""" 44 | 45 | self.gid = gid 46 | """Group ID.""" 47 | 48 | self.mtime_ns = mtime_ns 49 | """Last modification time in nanoseconds since the Unix epoch.""" 50 | 51 | if isinstance(self.mtime_ns, datetime): 52 | self.mtime_ns = datetime_to_ns(self.mtime_ns) 53 | 54 | @property 55 | def path(self): 56 | """Path to the file or directory. The path is normalized on assignment.""" 57 | return self._path 58 | 59 | @path.setter 60 | def path(self, value): 61 | self._path = normalize_path(value) 62 | 63 | @property 64 | def mtime_dt(self) -> Optional[datetime]: 65 | """Last modification time as a datetime object.""" 66 | return ns_to_datetime(self.mtime_ns) if self.mtime_ns else None 67 | 68 | @mtime_dt.setter 69 | def mtime_dt(self, dt: datetime): 70 | self.mtime_ns = datetime_to_ns(dt) 71 | 72 | def update_mtime(self): 73 | """Update the last modification time to the current time.""" 74 | self.mtime_dt = datetime.now() 75 | 76 | def fill_from_statresult(self, s: os.stat_result): 77 | """Fills the metadata information from a stat result, obtained from the file system. 78 | 79 | Args: 80 | s: stat result object to fill the metadata from 81 | """ 82 | self.mode = s.st_mode 83 | self.uid = s.st_uid 84 | self.gid = s.st_gid 85 | self.mtime_ns = s.st_mtime_ns 86 | 87 | @classmethod 88 | def row_factory(cls, cursor, row): 89 | """Factory method for creating instances from SQLite query results. 90 | 91 | Args: 92 | cursor: SQLite cursor object 93 | row: row from the query result 94 | """ 95 | 96 | # Raw construction without any of that property business or validation, just for speed 97 | instance = cls.__new__(cls) 98 | for field, value in zip(cursor.description, row): 99 | fieldname = field[0] 100 | if fieldname == 'path': 101 | instance._path = value 102 | else: 103 | object.__setattr__(instance, fieldname, value) 104 | return instance 105 | 106 | 107 | class BarecatFileInfo(BarecatEntryInfo): 108 | """ 109 | Describes file information such as path, location in the shards and metadata. 110 | 111 | This class is used both when retrieving existing file information and when adding new files. 
112 | 113 | Args: 114 | path: path to the file inside the archive 115 | mode: file mode, i.e., permissions 116 | uid: user ID 117 | gid: group ID 118 | mtime_ns: last modification time in nanoseconds since the Unix epoch 119 | shard: shard number 120 | offset: offset within the shard in bytes 121 | size: size of the file in bytes 122 | crc32c: CRC32C checksum of the file contents 123 | """ 124 | 125 | __slots__ = ('shard', 'offset', 'size', 'crc32c') 126 | 127 | def __init__( 128 | self, 129 | path: Optional[str] = None, 130 | mode: Optional[int] = None, 131 | uid: Optional[int] = None, 132 | gid: Optional[int] = None, 133 | mtime_ns: Optional[Union[int, datetime]] = None, 134 | shard: Optional[int] = None, 135 | offset: Optional[int] = None, 136 | size: Optional[int] = None, 137 | crc32c: Optional[int] = None, 138 | ): 139 | super().__init__(path, mode, uid, gid, mtime_ns) 140 | self.shard = shard 141 | """Shard number where the file is located.""" 142 | 143 | self.offset = offset 144 | """Offset within the shard in bytes.""" 145 | 146 | self.size = size 147 | """Size of the file in bytes.""" 148 | 149 | self.crc32c = crc32c 150 | """CRC32C checksum of the file contents.""" 151 | 152 | def asdict(self) -> dict: 153 | """Returns a dictionary representation of the file information. 154 | 155 | Returns: 156 | Dictionary with keys 'path', 'shard', 'offset', 'size', 'crc32c', 'mode', 'uid', 157 | 'gid', 'mtime_ns' 158 | """ 159 | return dict( 160 | path=self.path, 161 | shard=self.shard, 162 | offset=self.offset, 163 | size=self.size, 164 | crc32c=self.crc32c, 165 | mode=self.mode, 166 | uid=self.uid, 167 | gid=self.gid, 168 | mtime_ns=self.mtime_ns, 169 | ) 170 | 171 | def fill_from_statresult(self, s: os.stat_result): 172 | """Fills the file metadata information from a stat result, obtained from the file system. 173 | 174 | Args: 175 | s: stat result object to fill the metadata from 176 | """ 177 | super().fill_from_statresult(s) 178 | self.size = s.st_size 179 | 180 | @property 181 | def end(self) -> int: 182 | """End position of the file in the shard.""" 183 | return self.offset + self.size 184 | 185 | 186 | class BarecatDirInfo(BarecatEntryInfo): 187 | """ 188 | Describes directory information such as path, metadata and statistics. 189 | 190 | This class is used both when retrieving existing directory information and when adding new 191 | directories. 
192 | 
193 |     Args:
194 |         path: path to the directory inside the archive
195 |         mode: directory mode, i.e., permissions
196 |         uid: user ID
197 |         gid: group ID
198 |         mtime_ns: last modification time in nanoseconds since the Unix epoch
199 |         num_subdirs: number of subdirectories in the directory
200 |         num_files: number of files in the directory
201 |         size_tree: total size of the directory contents in bytes
202 |         num_files_tree: total number of files in the directory and its subdirectories
203 |     """
204 | 
205 |     __slots__ = ('num_subdirs', 'num_files', 'size_tree', 'num_files_tree')
206 | 
207 |     def __init__(
208 |         self,
209 |         path: Optional[str] = None,
210 |         mode: Optional[int] = None,
211 |         uid: Optional[int] = None,
212 |         gid: Optional[int] = None,
213 |         mtime_ns: Optional[Union[int, datetime]] = None,
214 |         num_subdirs: Optional[int] = None,
215 |         num_files: Optional[int] = None,
216 |         size_tree: Optional[int] = None,
217 |         num_files_tree: Optional[int] = None,
218 |     ):
219 |         super().__init__(path, mode, uid, gid, mtime_ns)
220 |         self.num_subdirs = num_subdirs
221 |         """Number of immediate subdirectories in the directory."""
222 | 
223 |         self.num_files = num_files
224 |         """Number of immediate files in the directory."""
225 | 
226 |         self.size_tree = size_tree
227 |         """Total size of the directory's contents (recursively) in bytes."""
228 | 
229 |         self.num_files_tree = num_files_tree
230 |         """Total number of files in the directory and its subdirectories, recursively."""
231 | 
232 |     def asdict(self) -> dict:
233 |         """Returns a dictionary representation of the directory information.
234 | 
235 |         Returns:
236 |             Dictionary with keys 'path', 'num_subdirs', 'num_files', 'size_tree', 'num_files_tree',
237 |             'mode', 'uid', 'gid', 'mtime_ns'
238 |         """
239 |         return dict(
240 |             path=self.path,
241 |             num_subdirs=self.num_subdirs,
242 |             num_files=self.num_files,
243 |             size_tree=self.size_tree,
244 |             num_files_tree=self.num_files_tree,
245 |             mode=self.mode,
246 |             uid=self.uid,
247 |             gid=self.gid,
248 |             mtime_ns=self.mtime_ns,
249 |         )
250 | 
251 |     @property
252 |     def num_entries(self) -> int:
253 |         """Total number of entries in the directory, including subdirectories and files."""
254 |         return self.num_subdirs + self.num_files
255 | 
256 |     def fill_from_statresult(self, s: os.stat_result):
257 |         """Fills the directory metadata information from a stat result, from the file system.
258 | 
259 |         Args:
260 |             s: stat result object to fill the metadata from
261 |         """
262 |         super().fill_from_statresult(s)
263 |         self.num_subdirs = s.st_nlink - 2
264 | 
265 | 
266 | class Order(Flag):
267 |     """Ordering specification for file and directory listings.
268 | 
269 |     The ordering can be by address (shard and offset), path, or random. The order can be ascending
270 |     or descending. The default order is ANY, which is the order in which SQLite yields rows.
271 |     """
272 | 
273 |     ANY = auto()
274 |     """Default order, as returned by SQLite"""
275 | 
276 |     RANDOM = auto()
277 |     """Randomized order"""
278 | 
279 |     ADDRESS = auto()
280 |     """Order by shard and offset position"""
281 | 
282 |     PATH = auto()
283 |     """Alphabetical order by path"""
284 | 
285 |     DESC = auto()
286 |     """Descending order"""
287 | 
288 |     def as_query_text(self) -> str:
289 |         """Returns the SQL ORDER BY clause corresponding to the ordering specification."""
290 | 
291 |         if self & Order.ADDRESS and self & Order.DESC:
292 |             return ' ORDER BY shard DESC, offset DESC'
293 |         elif self & Order.ADDRESS:
294 |             return ' ORDER BY shard, offset'
295 |         elif self & Order.PATH and self & Order.DESC:
296 |             return ' ORDER BY path DESC'
297 |         elif self & Order.PATH:
298 |             return ' ORDER BY path'
299 |         elif self & Order.RANDOM:
300 |             return ' ORDER BY RANDOM()'
301 |         return ''
302 | 
303 | 
304 | class FileSection(io.IOBase):
305 |     """File-like object representing a section of a file.
306 | 
307 |     Args:
308 |         file: file-like object to read from or write to
309 |         start: start position of the section in the file
310 |         size: size of the section
311 |         readonly: whether the section should be read-only
312 |     """
313 | 
314 |     def __init__(self, file: io.RawIOBase, start: int, size: int, readonly: bool = True):
315 |         self.file = file
316 |         self.start = start
317 |         self.end = start + size
318 |         self.position = start
319 |         self.readonly = readonly
320 | 
321 |     def read(self, size: int = -1) -> bytes:
322 |         """Read bytes from the section, starting from the current position.
323 | 
324 |         Args:
325 |             size: number of bytes to read, or -1 to read until the end of the section
326 | 
327 |         Returns:
328 |             Bytes read from the section.
329 |         """
330 |         if size == -1:
331 |             size = self.end - self.position
332 | 
333 |         size = min(size, self.end - self.position)
334 |         self.file.seek(self.position)
335 |         data = self.file.read(size)
336 |         self.position += len(data)
337 |         return data
338 | 
339 |     def readinto(self, buffer: Union[bytearray, memoryview]) -> int:
340 |         """Read bytes into a buffer from the section, starting from the current position.
341 | 
342 |         Will read up to the length of the buffer or until the end of the section.
343 | 
344 |         Args:
345 |             buffer: destination buffer to read into
346 | 
347 |         Returns:
348 |             Number of bytes read into the buffer.
349 |         """
350 |         size = min(len(buffer), self.end - self.position)
351 |         if size == 0:
352 |             return 0
353 | 
354 |         self.file.seek(self.position)
355 |         num_read = self.file.readinto(buffer[:size])
356 |         self.position += num_read
357 |         return num_read
358 | 
359 |     def readall(self) -> bytes:
360 |         """Read all remaining bytes from the section.
361 | 
362 |         Returns:
363 |             Bytes read from the section.
364 |         """
365 | 
366 |         return self.read()
367 | 
368 |     def readable(self):
369 |         """Always returns True, since the section is always readable."""
370 |         return True
371 | 
372 |     def writable(self):
373 |         return not self.readonly
374 | 
375 |     def write(self, data: Union[bytes, bytearray, memoryview]) -> int:
376 |         """Write data to the section, starting from the current position.
377 | 
378 |         Args:
379 |             data: data to write to the section
380 | 
381 |         Returns:
382 |             Number of bytes written to the section.
383 | 384 | Raises: 385 | PermissionError: if the section is read-only 386 | EOFError: if the write would go past the end of the section 387 | """ 388 | 389 | if self.readonly: 390 | raise PermissionError('Cannot write to a read-only file section') 391 | 392 | if self.position + len(data) > self.end: 393 | raise EOFError('Cannot write past the end of the section') 394 | 395 | self.file.seek(self.position) 396 | n_written = self.file.write(data) 397 | self.position += n_written 398 | return n_written 399 | 400 | def readline(self, size: int = -1) -> bytes: 401 | size = min(size, self.end - self.position) 402 | if size == -1: 403 | size = self.end - self.position 404 | 405 | self.file.seek(self.position) 406 | data = self.file.readline(size) 407 | 408 | self.position += len(data) 409 | return data 410 | 411 | def tell(self): 412 | return self.position - self.start 413 | 414 | def seek(self, offset, whence=0): 415 | if whence == io.SEEK_SET: 416 | new_position = self.start + offset 417 | elif whence == io.SEEK_CUR: 418 | new_position = self.position + offset 419 | elif whence == io.SEEK_END: 420 | new_position = self.end + offset 421 | else: 422 | raise ValueError(f"Invalid value for whence: {whence}") 423 | 424 | if new_position < self.start or new_position > self.end: 425 | raise EOFError("Seek position out of bounds") 426 | 427 | self.position = new_position 428 | return self.position - self.start 429 | 430 | def close(self): 431 | """Close the file section, this is a no-op, since the real shard file is not closed.""" 432 | pass 433 | 434 | @property 435 | def size(self) -> int: 436 | """Size of the section in bytes.""" 437 | return self.end - self.start 438 | 439 | def __exit__(self, exc_type, exc_val, exc_tb): 440 | self.close() 441 | -------------------------------------------------------------------------------- /src/barecat/viewerqt6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import pprint 5 | import re 6 | import shutil 7 | import sys 8 | from typing import List 9 | 10 | import msgpack_numpy 11 | from PyQt6.QtCore import QBuffer, QByteArray, QMimeData, QModelIndex, Qt, pyqtSlot 12 | from PyQt6.QtGui import ( 13 | QClipboard, 14 | QFont, 15 | QFontMetrics, 16 | QImageReader, 17 | QPixmap, 18 | QStandardItem, 19 | QStandardItemModel, 20 | ) 21 | from PyQt6.QtWidgets import ( 22 | QAbstractItemView, 23 | QApplication, 24 | QFileDialog, 25 | QHBoxLayout, 26 | QHeaderView, 27 | QLabel, 28 | QMenu, 29 | QScrollArea, 30 | QSplitter, 31 | QStyleFactory, 32 | QTableView, 33 | QTreeView, 34 | QVBoxLayout, 35 | QWidget, 36 | ) 37 | 38 | import barecat 39 | from barecat.common import BarecatDirInfo, BarecatFileInfo 40 | 41 | 42 | def main(): 43 | app = QApplication(sys.argv) 44 | app.setStyle(QStyleFactory.create(QApplication.style().objectName())) 45 | 46 | parser = argparse.ArgumentParser(description='View images stored in a barecat archive.') 47 | parser.add_argument('path', type=str, help='path to load from') 48 | args = parser.parse_args() 49 | viewer = BarecatViewer(args.path) 50 | viewer.show() 51 | sys.exit(app.exec()) 52 | 53 | 54 | class BarecatViewer(QWidget): 55 | def __init__(self, path): 56 | super().__init__() 57 | self.file_reader = barecat.Barecat(path) 58 | self.barecat_path = path 59 | self.tree = QTreeView() 60 | self.tree.setEditTriggers(QAbstractItemView.EditTrigger.NoEditTriggers) 61 | 62 | self.file_table = self.create_file_table() 63 | self.content_viewer = 
ContentViewer() 64 | self.content_viewer.label.setWordWrap(True) 65 | font = QFont("Courier New") # Replace with the desired monospace font 66 | self.content_viewer.label.setFont(font) 67 | 68 | splitter = QSplitter() 69 | splitter.addWidget(self.tree) 70 | splitter.addWidget(self.file_table) 71 | splitter.addWidget(self.content_viewer) 72 | splitter.setSizes([650, 650, 1000]) 73 | layout = QHBoxLayout() 74 | layout.addWidget(splitter) 75 | self.setLayout(layout) 76 | 77 | self.resize(2400, 800) 78 | 79 | self.fill_tree() 80 | self.tree.selectionModel().selectionChanged.connect(self.update_file_table) 81 | self.tree.activated.connect(self.expand_tree_item) 82 | self.tree.doubleClicked.connect(self.expand_tree_item) 83 | self.tree.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu) 84 | self.tree.customContextMenuRequested.connect(self.show_tree_context_menu) 85 | 86 | root_index = self.tree.model().index(0, 0) 87 | self.tree.setCurrentIndex(root_index) 88 | 89 | def create_file_table(self): 90 | ft = QTableView() 91 | ft.verticalHeader().setVisible(False) 92 | ft.verticalHeader().setDefaultSectionSize(20) 93 | ft.setShowGrid(False) 94 | ft.setSelectionMode(QAbstractItemView.SelectionMode.SingleSelection) 95 | ft.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows) 96 | ft.setEditTriggers(QAbstractItemView.EditTrigger.NoEditTriggers) 97 | model = QStandardItemModel() 98 | model.setHorizontalHeaderLabels(['Name', 'Size']) 99 | ft.setModel(model) 100 | ft.selectionModel().selectionChanged.connect(self.show_selected_file) 101 | ft.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch) 102 | ft.horizontalHeader().setStyleSheet( 103 | "QHeaderView::section {font-weight: normal; text-align: left;}" 104 | ) 105 | ft.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu) 106 | ft.customContextMenuRequested.connect(self.show_file_table_context_menu) 107 | return ft 108 | 109 | def fill_tree(self): 110 | root_item = TreeItem(self.file_reader) 111 | dinfo: BarecatDirInfo = self.file_reader.index.lookup_dir('') 112 | item = TreeItem( 113 | self.file_reader, 114 | path='', 115 | size=dinfo.size_tree, 116 | count=dinfo.num_files_tree, 117 | has_subdirs=dinfo.num_subdirs > 0, 118 | parent=root_item, 119 | ) 120 | root_item.children.append(item) 121 | self.model = LazyItemModel(root_item) 122 | self.tree.setModel(self.model) 123 | 124 | root_index = self.tree.model().index(0, 0) 125 | self.tree.expand(root_index) # Expand the root item by default 126 | self.tree.setColumnWidth(0, 400) 127 | self.tree.setColumnWidth(1, 70) 128 | self.tree.setColumnWidth(2, 70) 129 | 130 | @pyqtSlot(QModelIndex) 131 | def expand_tree_item(self, index): 132 | if self.tree.isExpanded(index): 133 | self.tree.collapse(index) 134 | else: 135 | self.tree.expand(index) 136 | 137 | def update_file_table(self, selected, deselected): 138 | indexes = selected.indexes() 139 | if not indexes: 140 | return 141 | 142 | index = indexes[0] # Get the first selected index 143 | item = index.internalPointer() 144 | 145 | model = self.file_table.model() 146 | model.removeRows(0, model.rowCount()) 147 | finfos: List[BarecatFileInfo] = self.file_reader.index.list_direct_fileinfos(item.path) 148 | finfos = sorted(finfos, key=lambda x: natural_sort_key(x.path)) 149 | for finfo in finfos: 150 | file_item = QStandardItem(osp.basename(finfo.path)) 151 | file_item.setData(finfo, Qt.ItemDataRole.UserRole) # Store the fileinfo as user data 152 | model.appendRow([file_item, 
    def show_selected_file(self, selected, deselected):
        indexes = selected.indexes()
        if not indexes:
            return
        finfo = self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
        self.show_file(finfo)

    def show_file(self, finfo):
        content = self.file_reader.read(finfo)
        extension = osp.splitext(finfo.path)[1].lower()
        if extension in ('.jpg', '.jpeg', '.png', '.gif', '.bmp'):
            byte_array = QByteArray(content)
            buffer = QBuffer(byte_array)
            imageReader = QImageReader()
            imageReader.setDecideFormatFromContent(True)
            imageReader.setQuality(100)
            imageReader.setDevice(buffer)
            qim = imageReader.read()

            if not qim.isNull():
                pixmap = QPixmap.fromImage(qim)
                self.content_viewer.setPixmap(pixmap)
        elif extension == '.msgpack':
            data = msgpack_numpy.unpackb(content)
            self.content_viewer.setText(data)
        else:
            self.content_viewer.setText(repr(content))

    def update_image_label(self, pixmap):
        self.content_viewer.setPixmap(pixmap)

    def show_file_table_context_menu(self, position):
        menu = QMenu()
        extract_action = menu.addAction("Extract file...")
        copy_path_action = menu.addAction("Copy path")

        action = menu.exec(self.file_table.viewport().mapToGlobal(position))

        if action == extract_action:
            indexes = self.file_table.selectionModel().selectedRows()
            if indexes:
                finfo = (
                    self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
                )
                default_filename = osp.basename(finfo.path)
                target_filename, _ = QFileDialog.getSaveFileName(
                    self, "Select Target File", default_filename
                )
                if target_filename:
                    self.extract_file(finfo.path, target_filename)
        elif action == copy_path_action:
            indexes = self.file_table.selectionModel().selectedRows()
            if indexes:
                finfo = (
                    self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
                )
                clipboard = QApplication.clipboard()
                clipboard.setText(finfo.path)

    def show_tree_context_menu(self, position):
        menu = QMenu()
        extract_action = menu.addAction("Extract directory...")
        copy_path_action = menu.addAction("Copy path")

        action = menu.exec(self.tree.viewport().mapToGlobal(position))
        if action == extract_action:
            index = self.tree.indexAt(position)
            if index.isValid():
                if target_directory := QFileDialog.getExistingDirectory(
                    self, "Select Target Directory"
                ):
                    self.extract_directory(index.internalPointer().path, target_directory)
        elif action == copy_path_action:
            index = self.tree.indexAt(position)
            if index.isValid():
                clipboard = QApplication.clipboard()
                clipboard.setText(index.internalPointer().path)
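
    # Extraction streams archive contents with shutil.copyfileobj, so files
    # are copied out in chunks rather than loaded into memory whole.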
    def extract_file(self, path_of_what_to_extract, target_filename):
        with self.file_reader.open(path_of_what_to_extract) as src:
            with open(target_filename, 'wb') as f:
                shutil.copyfileobj(src, f)

    def extract_directory(self, dir_in_archive, target_directory):
        basename = osp.basename(dir_in_archive)
        for dinfo, _, finfos in self.file_reader.index.walk_infos(dir_in_archive):
            for finfo in finfos:
                target_path = osp.join(
                    target_directory, basename, osp.relpath(finfo.path, dir_in_archive)
                )
                os.makedirs(osp.dirname(target_path), exist_ok=True)
                with self.file_reader.open(finfo.path) as src:
                    with open(target_path, 'wb') as f:
                        shutil.copyfileobj(src, f)


class ContentViewer(QWidget):
    def __init__(self):
        super().__init__()
        self.label = QLabel()
        self.originalPixmap = None
        self.originalText = None  # Holds the original (unformatted) text content
        self.scrollArea = QScrollArea(self)
        self.scrollArea.setWidgetResizable(True)
        self.scrollArea.setWidget(self.label)
        layout = QVBoxLayout(self)
        layout.addWidget(self.scrollArea)

        self.label.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.label.customContextMenuRequested.connect(self.show_context_menu)

    def setPixmap(self, pixmap):
        self.originalPixmap = pixmap
        self.originalText = None  # Reset the original text
        self.updateImage()

    def setText(self, original_data):
        self.originalText = original_data  # Store the original data
        self.originalPixmap = None  # Reset the pixmap
        self.updateText()

    def updateImage(self):
        if self.originalPixmap:
            availableSize = self.scrollArea.size()
            if (
                self.originalPixmap.width() > availableSize.width()
                or self.originalPixmap.height() > availableSize.height()
            ):
                scaledPixmap = self.originalPixmap.scaled(
                    availableSize,
                    Qt.AspectRatioMode.KeepAspectRatio,
                    Qt.TransformationMode.SmoothTransformation,
                )
            else:
                scaledPixmap = self.originalPixmap
            self.label.setPixmap(scaledPixmap)
            self.label.setAlignment(Qt.AlignmentFlag.AlignCenter)

    def updateText(self):
        if self.originalText:
            # Calculate the maximum line width in characters
            width_pixels = self.scrollArea.width()
            fm = QFontMetrics(self.label.font())
            average_char_width_pixels = fm.averageCharWidth()
            max_line_width = width_pixels // average_char_width_pixels

            # Pretty-print the text
            pp = pprint.PrettyPrinter(
                indent=2, width=max_line_width, compact=True, sort_dicts=False
            )
            formatted_text = pp.pformat(self.originalText)
            self.label.setText(formatted_text)
            self.label.setAlignment(Qt.AlignmentFlag.AlignLeft | Qt.AlignmentFlag.AlignTop)

    def resizeEvent(self, event):
        if self.originalPixmap:
            self.updateImage()
        elif self.originalText:
            self.updateText()
        super().resizeEvent(event)

    def show_context_menu(self, position):
        menu = QMenu()
        copy_image_action = menu.addAction("Copy image")

        # The signal comes from the label, so the position is in label coordinates.
        action = menu.exec(self.label.mapToGlobal(position))

        if action == copy_image_action and self.originalPixmap:
            clipboard = QApplication.clipboard()
            mime_data = QMimeData()
            mime_data.setImageData(self.originalPixmap.toImage())
            clipboard.setMimeData(mime_data, QClipboard.Mode.Clipboard)
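

# LazyItemModel populates the directory tree lazily, following Qt's
# fetch-on-demand protocol: when a node is expanded, the view first asks
# canFetchMore(), and if that returns True it calls fetchMore(), which loads
# the node's subdirectory infos from the barecat index.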
class LazyItemModel(QStandardItemModel):
    def __init__(self, root):
        super().__init__()
        self.root = root

    def index(self, row, column, parent=QModelIndex()):
        if not self.hasIndex(row, column, parent):
            return QModelIndex()
        parent_item = self.root if not parent.isValid() else parent.internalPointer()
        return (
            self.createIndex(row, column, parent_item.children[row])
            if row < len(parent_item.children)
            else QModelIndex()
        )

    def parent(self, index):
        if not index.isValid():
            return QModelIndex()
        parent_item = index.internalPointer().parent
        return self.createIndex(parent_item.row, 0, parent_item) if parent_item else QModelIndex()

    def rowCount(self, parent=QModelIndex()):
        parent_item = self.root if not parent.isValid() else parent.internalPointer()
        return len(parent_item.children)

    def columnCount(self, parent=QModelIndex()):
        return 3  # Name, Size, Count

    def headerData(self, section, orientation, role):
        if role == Qt.ItemDataRole.DisplayRole and orientation == Qt.Orientation.Horizontal:
            return ["Name", "Size", "Count"][section]
        return None

    def data(self, index, role):
        item = index.internalPointer()
        if role == Qt.ItemDataRole.DisplayRole:
            if index.column() == 0:
                if item.parent == self.root:
                    return '[root]'
                return osp.basename(item.path)
            elif index.column() == 1:
                return format_size(item.size)
            elif index.column() == 2:
                return format_count(item.count)
        elif role == Qt.ItemDataRole.TextAlignmentRole:
            if index.column() in [1, 2]:
                return Qt.AlignmentFlag.AlignRight
        return None

    def canFetchMore(self, index):
        if not index.isValid():
            return False
        return not index.internalPointer().fetched

    def fetchMore(self, index):
        item = index.internalPointer()
        if item == self.root:
            return
        item.fetch_more()
        if item.children:
            # Announce the newly loaded children to attached views.
            self.beginInsertRows(index, 0, len(item.children) - 1)
            self.endInsertRows()

    def hasChildren(self, index=QModelIndex()):
        if not index.isValid():
            return True
        return index.internalPointer().has_subdirs


class TreeItem:
    def __init__(self, file_reader, path='', size=0, count=0, has_subdirs=True, parent=None):
        self.file_reader = file_reader

        self.path = path
        self.parent = parent
        self.children = []

        self.size = size
        self.count = count
        self.has_subdirs = has_subdirs
        self.fetched = False

    def fetch_more(self):
        if self.fetched:
            return
        subdir_infos = self.file_reader.index.list_subdir_dirinfos(self.path)
        subdir_infos = sorted(subdir_infos, key=lambda x: natural_sort_key(x.path))
        for dinfo in subdir_infos:
            self.children.append(
                TreeItem(
                    self.file_reader,
                    path=dinfo.path,
                    size=dinfo.size_tree,
                    count=dinfo.num_files_tree,
                    has_subdirs=dinfo.num_subdirs > 0,
                    parent=self,
                )
            )

        self.fetched = True

    @property
    def row(self):
        return self.parent.children.index(self) if self.parent else 0


def format_size(size):
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    index = 0
    while size >= 1024 and index < len(units) - 1:
        index += 1
        size /= 1024
    return f'{size:.2f} {units[index]}'
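

# For example:
#   format_size(1536)       -> '1.50 KB'
#   format_count(2_500_000) -> '2.5 M'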
def format_count(count):
    units = ['', ' K', ' M', ' B']
    unit_index = 0
    while count >= 1000 and unit_index < len(units) - 1:
        count /= 1000
        unit_index += 1
    if unit_index == 0:
        return str(count)
    return f'{count:.1f}{units[unit_index]}'


def natural_sort_key(s):
    """Normal string sort puts '10' before '2'. Natural sort puts '2' before '10'."""
    return [float(t) if t.isdigit() else t for t in re.split('([0-9]+)', s)]


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------