├── docs
│   ├── references.bib
│   ├── _templates
│   │   ├── autoapi
│   │   │   ├── python
│   │   │   │   ├── attribute.rst
│   │   │   │   ├── exception.rst
│   │   │   │   ├── package.rst
│   │   │   │   ├── property.rst
│   │   │   │   ├── method.rst
│   │   │   │   ├── function.rst
│   │   │   │   ├── data.rst
│   │   │   │   ├── class.rst
│   │   │   │   └── module.rst
│   │   │   └── index.rst
│   │   └── copyright.html
│   ├── requirements.txt
│   ├── Makefile
│   ├── make.bat
│   ├── _static
│   │   └── styles
│   │       └── my_theme.css
│   ├── index.rst
│   ├── conf.py
│   └── abbrev_long.bib
├── src
│   └── barecat
│       ├── core
│       │   ├── __init__.py
│       │   └── sharder.py
│       ├── threadsafe.py
│       ├── progbar.py
│       ├── codecs.py
│       ├── to_tar_stream.py
│       ├── from_tar_stream.py
│       ├── exceptions.py
│       ├── __init__.py
│       ├── upgrade_database2.py
│       ├── upgrade_database.py
│       ├── consumed_threadpool.py
│       ├── glob_to_regex.py
│       ├── archive_formats.py
│       ├── defrag.py
│       ├── sql
│       │   └── schema.sql
│       ├── util.py
│       ├── cli.py
│       ├── cli_impl.py
│       ├── common.py
│       └── viewerqt6.py
├── MANIFEST.in
├── figure.png
├── .readthedocs.yaml
├── .github
│   └── workflows
│       └── python-publish.yml
├── LICENSE
├── pyproject.toml
├── tests
│   ├── test_barecat.py
│   └── test_cli.py
├── .gitignore
└── README.md
/docs/references.bib:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/barecat/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/barecat/sql/*.sql
2 |
--------------------------------------------------------------------------------
/figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isarandi/barecat/HEAD/figure.png
--------------------------------------------------------------------------------
/docs/_templates/autoapi/python/attribute.rst:
--------------------------------------------------------------------------------
1 | {% extends "python/data.rst" %}
2 |
--------------------------------------------------------------------------------
/docs/_templates/autoapi/python/exception.rst:
--------------------------------------------------------------------------------
1 | {% extends "python/class.rst" %}
2 |
--------------------------------------------------------------------------------
/docs/_templates/autoapi/python/package.rst:
--------------------------------------------------------------------------------
1 | {% extends "python/module.rst" %}
2 |
--------------------------------------------------------------------------------
/docs/_templates/autoapi/index.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | =============
3 |
4 | Start at :class:`barecat.Barecat` to explore the API.
5 |
6 | .. toctree::
7 | :titlesonly:
8 |
9 | {% for page in pages|selectattr("is_top_level_object") %}
10 | {{ page.include_path }}
11 | {% endfor %}
12 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-24.04
5 | tools:
6 | python: "3.10"
7 | commands:
8 | - python -m pip install .
9 | - python -m pip install --no-cache-dir -r docs/requirements.txt
10 | - python -m sphinx -E -b html docs $READTHEDOCS_OUTPUT/html
11 |
12 | sphinx:
13 | configuration: docs/conf.py
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinxcontrib-bibtex
3 | sphinx-autoapi
4 | sphinx-autobuild
5 | sphinx-autodoc-typehints
6 | sphinxcontrib-prettyspecialmethods
7 | sphinx-autodoc-napoleon-typehints
8 | sphinx-codeautolink
9 | sphinx-rtd-theme
10 | pydata-sphinx-theme
11 | sphinxcontrib-napoleon
12 | Cython
13 | numpy
14 | setuptools-scm
15 | toml
--------------------------------------------------------------------------------
/docs/_templates/copyright.html:
--------------------------------------------------------------------------------
1 | {# Displays the copyright information (which is defined in conf.py). #}
2 | {% if show_copyright and copyright %}
3 |
4 | {% if hasdoc('copyright') %}
5 | © {% trans copyright=copyright|e %}Copyright {{ copyright }} {% endtrans %}.
6 |
7 | {% else %}
8 | {% trans copyright=copyright|e %}© Copyright {{ copyright }}, {{ author }}.{% endtrans %}
9 |
10 | {% endif %}
11 |
12 | {% endif %}
--------------------------------------------------------------------------------
/docs/_templates/autoapi/python/property.rst:
--------------------------------------------------------------------------------
1 | :html_theme.sidebar_secondary.remove: true
2 |
3 | {% if obj.display %}
4 | {% if is_own_page %}
5 | {{ obj.name }}
6 | {{ "=" * obj.name | length }}
7 |
8 | {% endif %}
9 | .. py:property:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}
10 | {% if obj.annotation %}
11 |
12 | :type: {{ obj.annotation }}
13 | {% endif %}
14 | {% for property in obj.properties %}
15 |
16 | :{{ property }}:
17 | {% endfor %}
18 |
19 | {% if obj.docstring %}
20 |
21 | {{ obj.docstring|indent(3) }}
22 | {% endif %}
23 | {% endif %}
24 |
25 | .. footbibliography::
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/src/barecat/threadsafe.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | import multiprocessing_utils
4 |
5 | from barecat.core import barecat as barecat
6 |
7 |
8 | def threadlocal_decorate(decorator):
9 | def my_decorator(fun):
10 | local = multiprocessing_utils.local()
11 |
12 | @functools.wraps(fun)
13 | def wrapper(*args, **kwargs):
14 | if not hasattr(local, 'fn'):
15 | local.fn = decorator(fun)
16 | return local.fn(*args, **kwargs)
17 |
18 | return wrapper
19 |
20 | return my_decorator
21 |
22 |
23 | @threadlocal_decorate(functools.lru_cache())
24 | def get_cached_reader(path, auto_codec=True):
25 | return barecat.Barecat(path, readonly=True, auto_codec=auto_codec)
26 |
--------------------------------------------------------------------------------
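A usage sketch for get_cached_reader (illustration, not a repository file; the archive path 'data.barecat' and the member paths are hypothetical). Since the lru_cache is applied per thread via threadlocal_decorate, each worker thread lazily opens and reuses its own read-only Barecat instance, keeping SQLite connections and file handles thread-local:

from concurrent.futures import ThreadPoolExecutor

from barecat.threadsafe import get_cached_reader


def read_member(path_in_archive):
    # The first call in each thread opens a reader; later calls reuse it.
    reader = get_cached_reader('data.barecat')
    return reader[path_in_archive]


with ThreadPoolExecutor(max_workers=4) as pool:
    contents = list(pool.map(read_member, ['a.txt', 'b.txt']))
--------------------------------------------------------------------------------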
/docs/_templates/autoapi/python/method.rst:
--------------------------------------------------------------------------------
1 | :html_theme.sidebar_secondary.remove: true
2 |
3 | {% if obj.display %}
4 | {% if is_own_page %}
5 | {{ obj.name }}
6 | {{ "=" * obj.name | length }}
7 |
8 | {% endif %}
9 | .. py:method:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ obj.args }}){% if obj.return_annotation is not none %} -> {{ obj.return_annotation }}{% endif %}
10 | {% for (args, return_annotation) in obj.overloads %}
11 |
12 | {%+ if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ args }}){% if return_annotation is not none %} -> {{ return_annotation }}{% endif %}
13 | {% endfor %}
14 | {% for property in obj.properties %}
15 |
16 | :{{ property }}:
17 | {% endfor %}
18 |
19 | {% if obj.docstring %}
20 |
21 | {{ obj.docstring|indent(3) }}
22 | {% endif %}
23 | {% endif %}
24 |
25 | .. footbibliography::
--------------------------------------------------------------------------------
/docs/_templates/autoapi/python/function.rst:
--------------------------------------------------------------------------------
1 | :html_theme.sidebar_secondary.remove: true
2 |
3 | {% if obj.display %}
4 | {% if is_own_page %}
5 | {{ obj.name }}
6 | {{ "=" * obj.name | length }}
7 |
8 | {% endif %}
9 | .. py:function:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ obj.args }}){% if obj.return_annotation is not none %} -> {{ obj.return_annotation }}{% endif %}
10 | {% for (args, return_annotation) in obj.overloads %}
11 |
12 | {%+ if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ args }}){% if return_annotation is not none %} -> {{ return_annotation }}{% endif %}
13 | {% endfor %}
14 | {% for property in obj.properties %}
15 |
16 | :{{ property }}:
17 | {% endfor %}
18 |
19 | {% if obj.docstring %}
20 |
21 | {{ obj.docstring|indent(3) }}
22 | {% endif %}
23 | {% endif %}
24 |
25 | .. footbibliography::
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | permissions:
8 | contents: read
9 |
10 | jobs:
11 | pypi-publish:
12 | name: Upload release to PyPI
13 | runs-on: ubuntu-latest
14 | environment: pypi
15 | permissions:
16 | id-token: write
17 | steps:
18 | - name: Check out repository
19 | uses: actions/checkout@v4
20 | with:
21 | fetch-depth: 0
22 |
23 | - name: Set up Python
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: "3.x"
27 |
28 | - name: Install build dependencies
29 | run: python -m pip install --upgrade build
30 |
31 | - name: Build package distribution
32 | run: python -m build --sdist
33 |
34 | - name: Publish package distributions to PyPI
35 | uses: pypa/gh-action-pypi-publish@release/v1
36 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/_templates/autoapi/python/data.rst:
--------------------------------------------------------------------------------
1 | :html_theme.sidebar_secondary.remove: true
2 |
3 | {% if obj.display %}
4 | {% if is_own_page %}
5 | {{ obj.name }}
6 | {{ "=" * obj.name | length }}
7 |
8 | {% endif %}
9 | .. py:{{ obj.type }}:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.name }}{% endif %}
10 | {% if obj.annotation is not none %}
11 |
12 | :type: {% if obj.annotation %} {{ obj.annotation }}{% endif %}
13 | {% endif %}
14 | {% if obj.value is not none %}
15 |
16 | {% if obj.value.splitlines()|count > 1 %}
17 | :value: Multiline-String
18 |
19 | .. raw:: html
20 |
21 |     <details><summary>Show Value</summary>
22 |
23 | .. code-block:: python
24 |
25 | {{ obj.value|indent(width=6,blank=true) }}
26 |
27 | .. raw:: html
28 |
29 |     </details>
30 |
31 | {% else %}
32 | :value: {{ obj.value|truncate(100) }}
33 | {% endif %}
34 | {% endif %}
35 |
36 | {% if obj.docstring %}
37 |
38 | {{ obj.docstring|indent(3) }}
39 | {% endif %}
40 | {% endif %}
41 |
42 | .. footbibliography::
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 István Sárándi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/barecat/progbar.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
4 | def is_running_in_jupyter_notebook():
5 | try:
6 | # noinspection PyUnresolvedReferences
7 | shell = get_ipython().__class__.__name__
8 | if shell == 'ZMQInteractiveShell':
9 | return True # Jupyter notebook or qtconsole
10 | elif shell == 'TerminalInteractiveShell':
11 | return False # Terminal running IPython
12 | else:
13 | return False # Other type (?)
14 | except NameError:
15 | return False # Probably standard Python interpreter
16 |
17 |
18 | def progressbar(iterable=None, *args, **kwargs):
19 |     import tqdm
20 |
21 |     if is_running_in_jupyter_notebook():
22 |         import tqdm.notebook
23 |
24 |         return tqdm.notebook.tqdm(iterable, *args, **kwargs)
25 |     elif sys.stdout.isatty():
26 |         return tqdm.tqdm(iterable, *args, dynamic_ncols=True, **kwargs)
27 |     elif iterable is None:
28 |
29 |         class X:
30 |             def update(self, *a, **kw):
31 |                 pass
32 |
33 |         return X()
34 |     else:
35 |         return iterable
36 |
37 |
38 | def progressbar_items(dictionary, *args, **kwargs):
39 |     return progressbar(dictionary.items(), *args, total=len(dictionary), **kwargs)
40 |
--------------------------------------------------------------------------------
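A usage sketch for progressbar (illustration, not a repository file). The wrapper picks tqdm.notebook in Jupyter, regular tqdm in a terminal, and degrades to the bare iterable (or a no-op object) when output is not interactive:

from barecat.progbar import progressbar, progressbar_items

# Extra arguments are forwarded to tqdm; they are ignored in the
# non-interactive fallback, which returns the iterable unchanged.
for i in progressbar(range(1000), desc='processing'):
    pass

# Same selection logic for dict items, with total taken from len(dictionary).
for key, value in progressbar_items({'a': 1, 'b': 2}):
    pass
--------------------------------------------------------------------------------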
/src/barecat/codecs.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 |
4 | def encode_jpeg(data):
5 | import imageio.v2 as imageio
6 |
7 | with io.BytesIO() as f:
8 | imageio.imwrite(f, data, format='jpeg', quality=95)
9 | return f.getvalue()
10 |
11 |
12 | def decode_jpeg(data):
13 | import jpeg4py
14 | import numpy as np
15 |
16 | return jpeg4py.JPEG(np.frombuffer(data, np.uint8)).decode()
17 |
18 |
19 | def encode_msgpack_np(data):
20 | import msgpack_numpy
21 |
22 | return msgpack_numpy.packb(data)
23 |
24 |
25 | def decode_msgpack_np(data):
26 | import msgpack_numpy
27 |
28 | return msgpack_numpy.unpackb(data)
29 |
30 |
31 | def encode_npy(data):
32 | import numpy as np
33 |
34 | with io.BytesIO() as f:
35 | np.save(f, data)
36 | return f.getvalue()
37 |
38 |
39 | def decode_npy(data):
40 | import numpy as np
41 |
42 | with io.BytesIO(data) as f:
43 | return np.load(f)
44 |
45 |
46 | def encode_npz(data):
47 | import numpy as np
48 |
49 | with io.BytesIO() as f:
50 | np.savez(f, **data)
51 | return f.getvalue()
52 |
53 |
54 | def decode_npz(data):
55 | import numpy as np
56 |
57 | with io.BytesIO(data) as f:
58 | return dict(np.load(f))
59 |
--------------------------------------------------------------------------------
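A round-trip sketch for the npy codec (illustration, not a repository file; requires numpy):

import numpy as np

from barecat.codecs import decode_npy, encode_npy

arr = np.arange(6, dtype=np.float32).reshape(2, 3)
data = encode_npy(arr)  # serialized bytes, e.g. for storage as an archive entry
restored = decode_npy(data)
assert np.array_equal(arr, restored)
--------------------------------------------------------------------------------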
/src/barecat/to_tar_stream.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 |
4 | import barecat.core.barecat as barecat_
5 | from barecat.archive_formats import TarWriter
6 |
7 |
8 | def main():
9 |     parser = argparse.ArgumentParser(description='Convert a barecat file to a tar stream')
10 |     parser.add_argument('barecat_file', type=str, help='path to the source barecat file')
11 | parser.add_argument("args", nargs=argparse.REMAINDER, help="Ordered --in and --ex arguments")
12 |
13 | args = parser.parse_args()
14 | patterns = parse_in_ex_patterns(args)
15 |
16 | with (
17 | barecat_.Barecat(args.barecat_file, readonly=True) as bc_reader,
18 | TarWriter(fileobj=sys.stdout.buffer, mode='w|') as tar_writer,
19 | ):
20 | for finfo in bc_reader.index.raw_iterglob_infos_incl_excl(
21 | patterns=patterns, only_files=True
22 | ):
23 | with bc_reader.open(finfo.path) as fileobj:
24 | tar_writer.add(finfo, fileobj)
25 |
26 |
27 | def parse_in_ex_patterns(args):
28 | patterns = []
29 | i = 0
30 | while i < len(args.args):
31 | arg = args.args[i]
32 |
33 | if arg.startswith("--in="):
34 | patterns.append((True, arg.split("=", 1)[1]))
35 |
36 | elif arg.startswith("--ex="):
37 | patterns.append((False, arg.split("=", 1)[1]))
38 |
39 | elif arg == "--in":
40 | if i + 1 < len(args.args):
41 | patterns.append((True, args.args[i + 1]))
42 | i += 1
43 |
44 | elif arg == "--ex":
45 | if i + 1 < len(args.args):
46 | patterns.append((False, args.args[i + 1]))
47 | i += 1
48 |
49 | i += 1
50 |
51 | return patterns
52 |
53 |
54 | if __name__ == '__main__':
55 |     main()
56 |
--------------------------------------------------------------------------------
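A sketch of how parse_in_ex_patterns interprets the ordered --in/--ex arguments (illustration, not a repository file; the patterns are hypothetical):

from argparse import Namespace

from barecat.to_tar_stream import parse_in_ex_patterns

# Both the '--in PATTERN' and '--in=PATTERN' forms are accepted, and the
# relative order of includes and excludes is preserved in the result.
args = Namespace(args=['--in', 'imgs/**', '--ex=**/*.tmp', '--in=keep.tmp'])
print(parse_in_ex_patterns(args))
# [(True, 'imgs/**'), (False, '**/*.tmp'), (True, 'keep.tmp')]
--------------------------------------------------------------------------------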
/docs/_static/styles/my_theme.css:
--------------------------------------------------------------------------------
1 | @import url("theme.css");
2 | @import url("https://fonts.googleapis.com/css2?family=Mona+Sans:ital,wght@0,200..900;1,200..900&family=Geist:wght@100..900&family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&family=Outfit:wght@100..900&display=swap");
3 |
4 | /*@media (min-width: 960px) {
5 | .bd-page-width {
6 | max-width: 120rem;
7 | }
8 | }*/
9 |
10 | #rtd-footer-container {
11 | margin-top: 0 !important;
12 | }
13 |
14 | html[data-theme="light"] {
15 | --pst-color-table-row-hover-bg: #dfc6ff;
16 | --pst-color-link-hover: #845818;
17 | }
18 |
19 | html[data-theme="dark"] {
20 | --pst-color-table-row-hover-bg: #41296c;
21 | --pst-color-inline-code: #dd8cd4;
22 | }
23 |
24 |
25 | html[data-theme="dark"] dt:target {
26 | background-color: #4f4500;
27 | }
28 |
29 | html[data-theme="dark"] .linkcode-link {
30 | color: #9090ff;
31 | }
32 |
33 | html[data-theme="dark"] table.indextable tr.cap {
34 | background-color: #464646;
35 | }
36 |
37 | html[data-theme="dark"] a:visited {
38 | color: #9E67D0;
39 | }
40 |
41 | .navbar-brand .logo__title {
42 | font-family: "Mona Sans", sans-serif;
43 | font-size: 2.5rem;
44 | font-weight: 400;
45 | font-style: normal;
46 | }
47 |
48 | :root {
49 | --pst-font-family-monospace: "JetBrains Mono", monospace;
50 | --pst-font-family-heading: "Mona Sans", sans-serif;
51 | --pst-font-family-base: "Mona Sans", sans-serif;
52 | }
53 |
54 | body {
55 | font-weight: 450;
56 | }
57 |
58 | .bd-main .bd-content .bd-article-container {
59 | max-width: 100%; /* default is 60em */
60 | }
61 |
62 | /*.bd-sidebar-primary {
63 | max-width: 20%;
64 | }*/
65 |
66 | /* Ensure links in code blocks are underlined */
67 | .highlight a {
68 | text-decoration: underline;
69 | color: #394198; /* Adjust color as needed */
70 | }
71 |
72 | /* For additional emphasis, change hover effect */
73 | .highlight a:hover {
74 | text-decoration: underline;
75 | color: #9090ff;
76 | }
77 |
--------------------------------------------------------------------------------
/src/barecat/from_tar_stream.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | import tarfile
4 |
5 | import barecat.core.barecat as barecat_
6 | from barecat.common import BarecatDirInfo, BarecatFileInfo
7 |
8 |
9 | def main():
10 | parser = argparse.ArgumentParser(description='Convert a tar stream to a barecat file')
11 | parser.add_argument('barecat_file', type=str, help='path to the target barecat file')
12 | parser.add_argument(
13 | '--shard-size-limit',
14 | type=str,
15 | default=None,
16 | help='maximum size of a shard in bytes (if not specified, '
17 | 'all files will be concatenated into a single shard)',
18 | )
19 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
20 | args = parser.parse_args()
21 |
22 | with barecat_.Barecat(
23 | args.barecat_file,
24 | shard_size_limit=args.shard_size_limit,
25 | readonly=False,
26 | overwrite=args.overwrite,
27 | ) as writer:
28 | with tarfile.open(fileobj=sys.stdin.buffer, mode='r|') as tar:
29 | for member in tar:
30 | if member.isdir():
31 | dinfo = BarecatDirInfo(
32 | path=member.name,
33 | mode=member.mode,
34 | uid=member.uid,
35 | gid=member.gid,
36 | mtime_ns=member.mtime * 1_000_000_000,
37 | )
38 | writer.add(dinfo, dir_exist_ok=True)
39 | if member.isfile():
40 | finfo = BarecatFileInfo(
41 | path=member.name,
42 | size=member.size,
43 | mode=member.mode,
44 | uid=member.uid,
45 | gid=member.gid,
46 | mtime_ns=member.mtime * 1_000_000_000,
47 | )
48 | with tar.extractfile(member) as file_in_tar:
49 | writer.add(finfo, fileobj=file_in_tar)
50 |
51 |
52 | if __name__ == '__main__':
53 | main()
54 |
--------------------------------------------------------------------------------
/src/barecat/exceptions.py:
--------------------------------------------------------------------------------
1 | """Exceptions indicating various errors related to the use of Barecat archives"""
2 |
3 |
4 | class BarecatError(Exception):
5 | """Base class for all exceptions in Barecat"""
6 |
7 | def __init__(self, message: str):
8 | super().__init__(message)
9 |
10 |
11 | class FileExistsBarecatError(BarecatError):
12 | """Exception raised when trying to create a file that already exists
13 |
14 | Analogous to FileExistsError
15 |
16 | Args:
17 | path: path to the file that already exists
18 | """
19 |
20 | def __init__(self, path: str):
21 | super().__init__(f'File already exists: {path}')
22 |
23 |
24 | class FileNotFoundBarecatError(BarecatError):
25 | """Exception raised when trying to access a file that does not exist
26 |
27 | Analogous to FileNotFoundError
28 |
29 | Args:
30 | path: path to the file that does not exist
31 |
32 | """
33 |
34 | def __init__(self, path: str):
35 | super().__init__(f'File not found: {path}')
36 |
37 |
38 | class DirectoryNotEmptyBarecatError(BarecatError):
39 | """Exception raised when trying to delete a non-empty directory
40 |
41 | Args:
42 | path: path to the non-empty directory
43 | """
44 |
45 | def __init__(self, path: str):
46 | super().__init__(f'Directory not empty: {path}')
47 |
48 |
49 | class IsADirectoryBarecatError(BarecatError):
50 | """Exception raised when trying to access a directory as a file.
51 |
52 | Args:
53 | path: path to the directory
54 |
55 | """
56 |
57 | def __init__(self, path: str):
58 | super().__init__(f'Is a directory: {path}')
59 |
60 |
61 | class NotADirectoryBarecatError(BarecatError):
62 | """Exception raised when trying to access a file as a directory."""
63 |
64 | def __init__(self, message: str):
65 | super().__init__(message)
66 |
67 |
68 | class BarecatIntegrityError(BarecatError):
69 | """Exception raised when the CRC32C checksum of a file does not match the expected checksum"""
70 |
71 | def __init__(self, message: str):
72 | super().__init__(message)
73 |
74 |
75 | class NotEnoughSpaceBarecatError(BarecatError):
76 | """Exception raised when there is not enough space to write a file to the archive"""
77 |
78 | def __init__(self, message: str):
79 | super().__init__(message)
80 |
--------------------------------------------------------------------------------
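A sketch of using the exception hierarchy (illustration, not a repository file). All exceptions derive from BarecatError, so a broad handler catches any of them, while the specific classes mirror the builtin filesystem errors:

from barecat.exceptions import (
    BarecatError,
    FileExistsBarecatError,  # analogous to FileExistsError
    FileNotFoundBarecatError,  # analogous to FileNotFoundError
)

try:
    raise FileNotFoundBarecatError('missing/file.txt')
except BarecatError as e:
    print(e)  # File not found: missing/file.txt
--------------------------------------------------------------------------------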
/src/barecat/__init__.py:
--------------------------------------------------------------------------------
1 | """Barecat is a fast random-access, mountable archive format for storing and accessing many small
2 | files."""
3 |
4 | from .core.barecat import Barecat
5 | from .core.index import Index
6 |
7 | from .cli_impl import (
8 | archive2barecat,
9 | barecat2archive,
10 | extract,
11 | merge,
12 | merge_symlink,
13 | read_index,
14 | write_index,
15 | )
16 | from .common import (
17 | BarecatFileInfo,
18 | BarecatDirInfo,
19 | BarecatEntryInfo,
20 | FileSection,
21 | Order,
22 | SHARD_SIZE_UNLIMITED,
23 | )
24 |
25 | from .exceptions import (
26 | BarecatError,
27 | BarecatIntegrityError,
28 | FileExistsBarecatError,
29 | FileNotFoundBarecatError,
30 | IsADirectoryBarecatError,
31 | NotEnoughSpaceBarecatError,
32 | DirectoryNotEmptyBarecatError,
33 | )
34 |
35 | from .threadsafe import get_cached_reader
36 |
37 |
38 | def open(path, mode='r', auto_codec=False, threadsafe_reader=True):
39 | if mode == 'r':
40 | return Barecat(path, readonly=True, threadsafe=threadsafe_reader, auto_codec=auto_codec)
41 | elif mode == 'w+':
42 | return Barecat(
43 | path,
44 | readonly=False,
45 | overwrite=True,
46 | exist_ok=True,
47 | append_only=False,
48 | auto_codec=auto_codec,
49 | )
50 | elif mode == 'r+':
51 | return Barecat(
52 | path,
53 | readonly=False,
54 | overwrite=False,
55 | exist_ok=True,
56 | append_only=False,
57 | auto_codec=auto_codec,
58 | )
59 | elif mode == 'a+':
60 | return Barecat(
61 | path,
62 | readonly=False,
63 | overwrite=False,
64 | exist_ok=True,
65 | append_only=True,
66 | auto_codec=auto_codec,
67 | )
68 | elif mode == 'ax+':
69 | return Barecat(
70 | path,
71 | readonly=False,
72 | overwrite=False,
73 | exist_ok=False,
74 | append_only=True,
75 | auto_codec=auto_codec,
76 | )
77 | elif mode == 'x+':
78 | return Barecat(
79 | path,
80 | readonly=False,
81 | overwrite=False,
82 | exist_ok=False,
83 | append_only=False,
84 | auto_codec=auto_codec,
85 | )
86 | else:
87 | raise ValueError(f"Invalid mode: {mode}")
88 |
--------------------------------------------------------------------------------
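A usage sketch for barecat.open (illustration, not a repository file; 'example.barecat' is a hypothetical path, and the dict-style access follows tests/test_barecat.py). The mode strings roughly mirror the builtin open(): 'r' is read-only, 'w+' creates or overwrites, 'r+' modifies an existing archive, 'a+' is append-only, and the 'x' variants fail if the archive already exists:

import barecat

with barecat.open('example.barecat', 'w+') as bc:
    bc['hello.txt'] = b'hello world'

with barecat.open('example.barecat', 'r') as bc:
    print(bc['hello.txt'])  # b'hello world'
--------------------------------------------------------------------------------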
/src/barecat/upgrade_database2.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path
3 |
4 | import barecat
5 |
6 |
7 | def main():
8 | parser = argparse.ArgumentParser(description='Migrate index database to new version')
9 | parser.add_argument('path_in', type=str, help='Path to the old barecat')
10 | parser.add_argument('path_out', type=str, help='Path to the new barecat')
11 |
12 | args = parser.parse_args()
13 | upgrade_schema(args.path_in, args.path_out)
14 |
15 |
16 | def upgrade_schema(path_in: str, path_out: str):
17 | if os.path.exists(path_out + '-sqlite-index'):
18 | raise FileExistsError(f'Output path {path_out}-sqlite-index already exists')
19 | with barecat.Index(path_out + '-sqlite-index', readonly=False) as index_out:
20 | c = index_out.cursor
21 | c.execute('COMMIT')
22 | c.execute('PRAGMA foreign_keys=OFF')
23 | c.execute('PRAGMA synchronous=OFF')
24 | c.execute('PRAGMA journal_mode=OFF')
25 | c.execute(f'ATTACH DATABASE "file:{path_in}-sqlite-index?mode=ro" AS source')
26 |
27 | with index_out.no_triggers(), index_out.no_foreign_keys():
28 | print('Migrating dir metadata...')
29 | c.execute(
30 | """
31 | INSERT INTO dirs (
32 | path, num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid,
33 | mtime_ns)
34 | SELECT path, num_subdirs, num_files, num_files_tree, size_tree, mode, uid,
35 | gid, mtime_ns
36 | FROM source.dirs
37 | WHERE path != ''
38 | """
39 | )
40 | c.execute("""
41 | UPDATE dirs
42 | SET (num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, mtime_ns) =
43 | (SELECT num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, mtime_ns
44 | FROM source.dirs WHERE path = '')
45 | WHERE path = ''
46 | """)
47 |
48 |
49 | print('Migrating file metadata...')
50 | c.execute(
51 | f"""
52 | INSERT INTO files (
53 | path, shard, offset, size, crc32c, mode, uid, gid, mtime_ns)
54 | SELECT path, shard, offset, size, crc32c, mode, uid, gid, mtime_ns
55 | FROM source.files
56 | """
57 | )
58 |
59 | c.execute(
60 | f"""
61 | INSERT OR REPLACE INTO config (key, value_text, value_int)
62 | SELECT key, value_text, value_int
63 | FROM source.config
64 | """
65 | )
66 |
67 | index_out.conn.commit()
68 | c.execute("DETACH DATABASE source")
69 | index_out.optimize()
70 |
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=64",
4 | "wheel",
5 | "setuptools_scm[toml]>=8"
6 | ]
7 | build-backend = "setuptools.build_meta"
8 |
9 | [project]
10 | name = "barecat"
11 | dynamic = ["version"]
12 | description = "Scalable archive format for storing millions of small files with random access and SQLite indexing."
13 | readme = "README.md"
14 | requires-python = ">=3.9"
15 | license = { file = "LICENSE" }
16 |
17 | authors = [
18 | { name = "István Sárándi", email = "istvan.sarandi@gmail.com" }
19 | ]
20 |
21 | dependencies = [
22 | "multiprocessing-utils",
23 | "tqdm",
24 | "crc32c"
25 | ]
26 |
27 | keywords = [
28 | "sqlite",
29 | "dataset",
30 | "storage",
31 | "archive",
32 | "random-access",
33 | "image-dataset",
34 | "filesystem",
35 | "key-value-store",
36 | "deep-learning",
37 | "data-loader",
38 | "file-indexing"
39 | ]
40 |
41 | classifiers = [
42 | "Development Status :: 4 - Beta",
43 | "Intended Audience :: Developers",
44 | "Intended Audience :: Science/Research",
45 | "Topic :: Scientific/Engineering :: Information Analysis",
46 | "Topic :: Software Development :: Libraries",
47 | "Topic :: System :: Archiving",
48 | "Topic :: System :: Filesystems",
49 | "License :: OSI Approved :: MIT License",
50 | "Programming Language :: Python",
51 | "Programming Language :: Python :: 3",
52 | "Programming Language :: Cython",
53 | "Operating System :: POSIX :: Linux"
54 | ]
55 |
56 | [project.scripts]
57 | barecat-create = "barecat.cli:create"
58 | barecat-extract = "barecat.cli:extract"
59 | barecat-merge = "barecat.cli:merge"
60 | barecat-merge-symlink = "barecat.cli:merge_symlink"
61 | barecat-extract-single = "barecat.cli:extract_single"
62 | barecat-index-to-csv = "barecat.cli:index_to_csv"
63 | barecat-verify = "barecat.cli:verify_integrity"
64 | barecat-to-ncdu-json = "barecat.cli:print_ncdu_json"
65 | archive2barecat = "barecat.cli:archive2barecat"
66 | barecat2archive = "barecat.cli:barecat2archive"
67 | barecat-defrag = "barecat.cli:defrag"
68 | barecat-create-recursive = "barecat.cli:create_recursive"
69 | barecat-viewer = "barecat.viewerqt6:main"
70 | barecat-upgrade-database = "barecat.upgrade_database:main"
71 |
72 | [project.urls]
73 | Homepage = "https://github.com/isarandi/barecat"
74 | Documentation = "https://istvansarandi.com/docs/barecat/api/barecat/Barecat.html"
75 | Repository = "https://github.com/isarandi/barecat"
76 | Issues = "https://github.com/isarandi/barecat/issues"
77 | Author = "https://istvansarandi.com"
78 |
79 | [tool.setuptools_scm]
80 | version_scheme = "guess-next-dev"
81 | local_scheme = "no-local-version"
82 | write_to = "src/barecat/_version.py"
83 |
84 | [tool.setuptools]
85 | package-dir = { "" = "src" }
86 |
87 | [tool.setuptools.packages.find]
88 | where = ["src"]
89 |
90 | [tool.black]
91 | line-length = 99
92 | skip-string-normalization = true
93 |
--------------------------------------------------------------------------------
/tests/test_barecat.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import barecat
4 | from barecat import Barecat, BarecatFileInfo, BarecatDirInfo
5 | import pytest
6 | import tempfile
7 | import os.path as osp
8 |
9 |
10 | def test_barecat():
11 | tempdir = tempfile.mkdtemp()
12 | filepath = osp.join(tempdir, 'test.barecat')
13 | with barecat.Barecat(filepath, readonly=False) as bc:
14 | bc['some/path.txt'] = b'hello'
15 |
16 | with barecat.Barecat(filepath, readonly=True) as bc:
17 | assert bc['some/path.txt'] == b'hello'
18 |
19 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc:
20 | bc.add(BarecatFileInfo(path='some/path.txt', mode=0o666), data=b'hello world')
21 | bc.add(BarecatDirInfo(path='some/dir', mode=0o777))
22 |
23 | with barecat.Barecat(filepath, readonly=True) as bc:
24 | assert bc['some/path.txt'] == b'hello world'
25 | assert bc.listdir('some/dir') == []
26 |
27 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc:
28 | bc['some/path.txt'] = b'hello world'
29 | assert bc['some/path.txt'] == b'hello world'
30 | del bc['some/path.txt']
31 | with pytest.raises(KeyError):
32 | a = bc['some/path.txt']
33 |
34 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc:
35 | bc['some/path.txt'] = b'hello world'
36 |
37 | with barecat.Barecat(filepath, readonly=True) as bc:
38 | with bc.open('some/path.txt') as f:
39 | f.seek(6)
40 | assert f.read() == b'world'
41 |
42 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc:
43 | bc['dir/file.txt'] = b'Hello, world!'
44 | bc['dir/subdir/file2.txt'] = b'Hello, world2!'
45 |
46 | with barecat.Barecat(filepath, readonly=True) as bc:
47 | assert bc.listdir('dir/subdir') == ['file2.txt']
48 |
49 | assert list(bc.walk('dir')) == [
50 | ('dir', ['subdir'], ['file.txt']),
51 | ('dir/subdir', [], ['file2.txt']),
52 | ]
53 |
54 | with open(osp.join(tempdir, 'file.txt'), 'wb') as f:
55 | f.write(b'Hello, world!')
56 | os.mkdir(osp.join(tempdir, 'dir2'))
57 |
58 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc:
59 | bc.add_by_path(osp.join(tempdir, 'file.txt'))
60 | bc.add_by_path(osp.join(tempdir, 'dir2'), store_path='dir')
61 |
62 | with barecat.Barecat(filepath, readonly=True) as bc:
63 | assert bc[osp.join(tempdir, 'file.txt')] == b'Hello, world!'
64 | assert bc.listdir('dir') == []
65 |
66 | with Barecat(filepath, readonly=False, overwrite=True) as bc:
67 | bc.add(BarecatFileInfo(path='file.txt', mode=0o666), data=b'Hello, world!')
68 | bc.add(BarecatDirInfo(path='dir', mode=0o777))
69 |
70 | with Barecat(filepath, readonly=True) as bc:
71 | assert bc['file.txt'] == b'Hello, world!'
72 | assert bc.listdir('dir') == []
73 |
74 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | *_cython.c
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/src/barecat/upgrade_database.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path
3 | import sqlite3
4 |
5 | import barecat
6 | import barecat_cython
7 | from barecat.consumed_threadpool import ConsumedThreadPool
8 | from barecat.progbar import progressbar
9 |
10 |
11 | def main():
12 | parser = argparse.ArgumentParser(description='Migrate index database to new version')
13 | parser.add_argument('path', type=str, help='Path to the old barecat')
14 | parser.add_argument(
15 | '--workers', type=int, default=8, help='Number of workers for calculating crc32c'
16 | )
17 |
18 | args = parser.parse_args()
19 | dbase_path = args.path + '-sqlite-index'
20 | if not os.path.exists(dbase_path):
21 | raise FileNotFoundError(f'{dbase_path} does not exist!')
22 |
23 | os.rename(args.path + '-sqlite-index', args.path + '-sqlite-index.old')
24 | upgrade_schema(args.path)
25 | update_crc32c(args.path, workers=args.workers)
26 |
27 |
28 | def upgrade_schema(path: str):
29 | with barecat.Index(path + '-sqlite-index', readonly=False) as index_out:
30 | c = index_out.cursor
31 | c.execute('COMMIT')
32 | c.execute('PRAGMA foreign_keys=OFF')
33 | c.execute('PRAGMA synchronous=OFF')
34 | c.execute('PRAGMA journal_mode=OFF')
35 | c.execute('PRAGMA recursive_triggers=ON')
36 | c.execute(f'ATTACH DATABASE "file:{path}-sqlite-index.old?mode=ro" AS source')
37 | print('Migrating dir metadata...')
38 | c.execute(
39 | """
40 | INSERT INTO dirs (path)
41 | SELECT path FROM source.directories
42 | WHERE path != ''
43 | """
44 | )
45 | print('Migrating file metadata...')
46 | c.execute(
47 | f"""
48 | INSERT INTO files (path, shard, offset, size)
49 | SELECT path, shard, offset, size
50 | FROM source.files
51 | """
52 | )
53 |
54 | c.execute('COMMIT')
55 | c.execute("DETACH DATABASE source")
56 |
57 |
58 | def update_crc32c(path_out: str, workers=8):
59 | with (
60 | barecat_cython.BarecatMmapCython(path_out) as sh,
61 | barecat.Index(path_out + '-sqlite-index', readonly=False) as index,
62 | ):
63 | c = index.cursor
64 | c.execute('COMMIT')
65 | c.execute('PRAGMA synchronous=OFF')
66 | c.execute('PRAGMA journal_mode=OFF')
67 | index._triggers_enabled = False
68 |
69 | print('Calculating crc32c for all files to separate database...')
70 | path_newcrc_temp = f'{path_out}-sqlite-index-newcrc-temp'
71 | with ConsumedThreadPool(
72 | temp_crc_writer_main,
73 | main_args=(path_newcrc_temp,),
74 | max_workers=workers,
75 | queue_size=1024,
76 | ) as ctp:
77 | for fi in progressbar(
78 | index.iter_all_fileinfos(order=barecat.Order.ADDRESS), total=index.num_files
79 | ):
80 | ctp.submit(
81 | sh.crc32c_from_address, userdata=fi.path, args=(fi.shard, fi.offset, fi.size)
82 | )
83 |
84 | print('Updating crc32c in the barecat index...')
85 | c.execute(f'ATTACH DATABASE "file:{path_newcrc_temp}?mode=ro" AS newdb')
86 | c.execute(
87 | """
88 | UPDATE files
89 | SET crc32c=newdb.crc32c.crc32c
90 | FROM newdb.crc32c
91 | WHERE files.path=newdb.crc32c.path
92 | """
93 | )
94 | c.execute('COMMIT')
95 | c.execute('DETACH DATABASE newdb')
96 |
97 | os.remove(path_newcrc_temp)
98 |
99 |
100 | def temp_crc_writer_main(dbpath, future_iter):
101 | with sqlite3.connect(dbpath) as conn:
102 | c = conn.cursor()
103 | c.execute('PRAGMA synchronous=OFF')
104 | c.execute('PRAGMA journal_mode=OFF')
105 | c.execute("CREATE TABLE IF NOT EXISTS crc32c (path TEXT PRIMARY KEY, crc32c INTEGER)")
106 | for future in future_iter:
107 | path = future.userdata
108 | crc32c = future.result()
109 | c.execute("INSERT INTO crc32c (path, crc32c) VALUES (?, ?)", (path, crc32c))
110 |
111 |
112 | if __name__ == '__main__':
113 | main()
114 |
--------------------------------------------------------------------------------
/src/barecat/consumed_threadpool.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures
2 | import os
3 | import queue
4 | import threading
5 |
6 |
7 | class ConsumedThreadPool:
8 | """This class solves a form of the producer-consumer problem.
9 | There is one main producer, whose items need to be processed in parallel by one of several
10 | workers, and finally the processed items are consumed by a single consumer thread.
11 |
12 | So the three steps are:
13 |
14 | 1. The main thread constructs this object, then iterates and calls submit() for each item,
15 | passing the appropriate processing function and arguments to submit().
16 | 2. The workers process the items in parallel threads, these are the threads created by a
17 | ThreadPoolExecutor.
18 | 3. The consumer thread consumes the items, in the form of futures, running the consumer_main
19 | function originally passed to the constructor.
20 |
21 | The main producer's loop is meant to be computationally inexpensive, something that generates
22 | "tasks". The worker threads do the heavy lifting.
23 | The consumer does something that must happen in a serial manner or otherwise must happen in the
24 | same, single thread.
25 |
26 | Example:
27 |
28 | def producer_main():
29 | with ConsumedThreadPool(consumer_main, main_args=('hello',), max_workers=8) as pool:
30 | for i in range(100):
31 | pool.submit(process_fn, userdata='anything', args=(i,))
32 |
33 | def process_fn(i):
34 | return i * 2
35 |
36 | def consumer_main(greeting, future_iter):
37 | print(greeting)
38 | for future in future_iter:
39 | print(future.userdata)
40 | print(future.result())
41 | """
42 |
43 | def __init__(
44 | self, consumer_main, main_args=None, main_kwargs=None, max_workers=None, queue_size=None
45 | ):
46 | if max_workers is None:
47 | max_workers = len(os.sched_getaffinity(0))
48 | if queue_size is None:
49 | queue_size = max_workers * 2
50 | self.q = queue.Queue(queue_size)
51 | self.semaphore = threading.Semaphore(queue_size)
52 | self.executor = concurrent.futures.ThreadPoolExecutor(max_workers)
53 |
54 | self.consumer_error_queue = queue.Queue()
55 | self.consumer_main = consumer_main
56 |
57 | if main_kwargs is None:
58 | main_kwargs = {}
59 | self.consumer_thread = threading.Thread(
60 | target=self._safe_consumer_main, args=(main_args, main_kwargs)
61 | )
62 | self.consumer_thread.start()
63 |
64 | def _safe_consumer_main(self, main_args, main_kwargs):
65 | try:
66 | main_kwargs = {**main_kwargs, 'future_iter': IterableQueue(self.q)}
67 | self.consumer_main(*main_args, **main_kwargs)
68 | except Exception as e:
69 | self.consumer_error_queue.put(e)
70 |
71 | def submit(self, fn=None, userdata=None, args=None, kwargs=None):
72 | if not self.consumer_error_queue.empty():
73 | consumer_exception = self.consumer_error_queue.get()
74 | raise RuntimeError('Consumer thread raised an exception') from consumer_exception
75 |
76 | self.semaphore.acquire()
77 | if args is None:
78 | args = ()
79 | if kwargs is None:
80 | kwargs = {}
81 | if fn is None:
82 | fn = noop
83 | future = self.executor.submit(fn, *args, **kwargs)
84 | future.userdata = userdata
85 | future.add_done_callback(lambda f: self.semaphore.release())
86 | future.add_done_callback(self.q.put)
87 |
88 | def close(self):
89 | self.executor.shutdown(wait=True)
90 | self.q.put(None)
91 | self.q.join()
92 | self.consumer_thread.join()
93 |
94 | if not self.consumer_error_queue.empty():
95 | consumer_exception = self.consumer_error_queue.get()
96 | raise RuntimeError('Consumer thread raised an exception') from consumer_exception
97 |
98 | def __enter__(self):
99 | return self
100 |
101 | def __exit__(self, exc_type, exc_val, exc_tb):
102 | self.close()
103 |
104 |
105 | class IterableQueue:
106 | def __init__(self, q):
107 | self.q = q
108 |
109 | def __iter__(self):
110 | while (item := self.q.get()) is not None:
111 | yield item
112 | self.q.task_done()
113 | self.q.task_done()
114 |
115 |
116 | def noop():
117 | pass
118 |
--------------------------------------------------------------------------------
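A self-contained, runnable version of the docstring example (illustration, not a repository file):

from barecat.consumed_threadpool import ConsumedThreadPool


def process_fn(i):
    return i * 2  # the heavy lifting, executed in parallel worker threads


def consumer_main(greeting, future_iter):
    # Runs in a single consumer thread; futures arrive in completion order.
    print(greeting)
    total = 0
    for future in future_iter:
        total += future.result()
    print('sum:', total)  # 2 * (0 + 1 + ... + 99) = 9900


with ConsumedThreadPool(consumer_main, main_args=('hello',), max_workers=8) as pool:
    for i in range(100):
        pool.submit(process_fn, userdata=i, args=(i,))
--------------------------------------------------------------------------------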
/docs/_templates/autoapi/python/class.rst:
--------------------------------------------------------------------------------
1 | :html_theme.sidebar_secondary.remove: true
2 |
3 | {% if obj.display %}
4 | {% if is_own_page %}
5 | {{ obj.name }}
6 | {{ "=" * obj.name | length }}
7 |
8 | {% endif %}
9 | {% set visible_children = obj.children|selectattr("display")|list %}
10 | {% set own_page_children = visible_children|selectattr("type", "in", own_page_types)|list %}
11 | {% if is_own_page and own_page_children %}
12 | .. toctree::
13 | :hidden:
14 |
15 | {% for child in own_page_children %}
16 | {{ child.include_path }}
17 | {% endfor %}
18 |
19 | {% endif %}
20 | .. py:{{ obj.type }}:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}{% if obj.args %}({{ obj.args }}){% endif %}
21 |
22 | {% for (args, return_annotation) in obj.overloads %}
23 | {{ " " * (obj.type | length) }} {{ obj.short_name }}{% if args %}({{ args }}){% endif %}
24 |
25 | {% endfor %}
26 | {% if obj.bases %}
27 | {% if "show-inheritance" in autoapi_options %}
28 |
29 | Bases: {% for base in obj.bases %}{{ base|link_objs }}{% if not loop.last %}, {% endif %}{% endfor %}
30 | {% endif %}
31 |
32 |
33 | {% if "show-inheritance-diagram" in autoapi_options and obj.bases != ["object"] %}
34 | .. autoapi-inheritance-diagram:: {{ obj.obj["full_name"] }}
35 | :parts: 1
36 | {% if "private-members" in autoapi_options %}
37 | :private-bases:
38 | {% endif %}
39 |
40 | {% endif %}
41 | {% endif %}
42 | {% if obj.docstring %}
43 |
44 | {{ obj.docstring|indent(3) }}
45 | {% endif %}
46 | {% for obj_item in visible_children %}
47 | {% if obj_item.type not in own_page_types %}
48 |
49 | {{ obj_item.render()|indent(3) }}
50 | {% endif %}
51 | {% endfor %}
52 | {% if is_own_page and own_page_children %}
53 | {% set visible_attributes = own_page_children|selectattr("type", "equalto", "attribute")|list %}
54 | {% if visible_attributes %}
55 | Attributes
56 | ----------
57 |
58 | .. autoapisummary::
59 |
60 | {% for attribute in visible_attributes %}
61 | {{ attribute.id }}
62 | {% endfor %}
63 |
64 |
65 | {% endif %}
66 | {% set visible_properties = own_page_children|selectattr("type", "equalto", "property")|list %}
67 | {% if visible_properties %}
68 | Properties
69 | ----------
70 |
71 | .. autoapisummary::
72 |
73 | {% for property in visible_properties %}
74 | {{ property.id }}
75 | {% endfor %}
76 |
77 |
78 | {% endif %}
79 | {% set visible_exceptions = own_page_children|selectattr("type", "equalto", "exception")|list %}
80 | {% if visible_exceptions %}
81 | Exceptions
82 | ----------
83 |
84 | .. autoapisummary::
85 |
86 | {% for exception in visible_exceptions %}
87 | {{ exception.id }}
88 | {% endfor %}
89 |
90 |
91 | {% endif %}
92 | {% set visible_classes = own_page_children|selectattr("type", "equalto", "class")|list %}
93 | {% if visible_classes %}
94 | Classes
95 | -------
96 |
97 | .. autoapisummary::
98 |
99 | {% for klass in visible_classes %}
100 | {{ klass.id }}
101 | {% endfor %}
102 |
103 |
104 | {% endif %}
105 |
106 | {% set static_methods = own_page_children|selectattr("type", "equalto", "method")|selectattr("properties", "defined")|selectattr("properties", "equalto", ["staticmethod"])|list %}
107 | {% set class_methods = own_page_children|selectattr("type", "equalto", "method")|selectattr("properties", "defined")|selectattr("properties", "equalto", ["classmethod"])|list %}
108 | {% set instance_methods = own_page_children|selectattr("type", "equalto", "method")|rejectattr("properties", "equalto", ["staticmethod"])|rejectattr("properties", "equalto", ["classmethod"])|list %}
109 |
110 | {% if instance_methods %}
111 | Instance Methods
112 | ----------------
113 |
114 | .. autoapisummary::
115 |
116 | {% for method in instance_methods %}
117 | {{ method.id }}
118 | {% endfor %}
119 |
120 |
121 | {% endif %}
122 | {% if class_methods %}
123 | Class Methods
124 | -------------
125 |
126 | .. autoapisummary::
127 |
128 | {% for method in class_methods %}
129 | {{ method.id }}
130 | {% endfor %}
131 |
132 |
133 | {% endif %}
134 | {% if static_methods %}
135 | Static Methods
136 | --------------
137 |
138 | .. autoapisummary::
139 |
140 | {% for method in static_methods %}
141 | {{ method.id }}
142 | {% endfor %}
143 |
144 |
145 | {% endif %}
146 | {% endif %}
147 | {% endif %}
148 |
149 |
150 | .. footbibliography::
--------------------------------------------------------------------------------
/docs/_templates/autoapi/python/module.rst:
--------------------------------------------------------------------------------
1 | :html_theme.sidebar_secondary.remove: true
2 |
3 | {% if obj.display %}
4 | {% if is_own_page %}
5 | {{ obj.id }}
6 | {{ "=" * obj.id|length }}
7 |
8 | .. py:module:: {{ obj.name }}
9 |
10 | {% if obj.docstring %}
11 | .. autoapi-nested-parse::
12 |
13 | {{ obj.docstring|indent(3) }}
14 |
15 | {% endif %}
16 |
17 | {% block submodules %}
18 | {% set visible_subpackages = obj.subpackages|selectattr("display")|list %}
19 | {% set visible_submodules = obj.submodules|selectattr("display")|list %}
20 | {% set visible_submodules = (visible_subpackages + visible_submodules)|sort %}
21 | {% if visible_submodules %}
22 | Submodules
23 | ----------
24 |
25 | .. toctree::
26 | :maxdepth: 1
27 |
28 | {% for submodule in visible_submodules %}
29 | {{ submodule.include_path }}
30 | {% endfor %}
31 |
32 |
33 | {% endif %}
34 | {% endblock %}
35 | {% block content %}
36 | {% set visible_children = obj.children|selectattr("display")|list %}
37 | {% if visible_children %}
38 | {% set visible_attributes = visible_children|selectattr("type", "equalto", "data")|list %}
39 | {% if visible_attributes %}
40 | {% if "attribute" in own_page_types or "show-module-summary" in autoapi_options %}
41 | Attributes
42 | ----------
43 |
44 | {% if "attribute" in own_page_types %}
45 | .. toctree::
46 | :hidden:
47 |
48 | {% for attribute in visible_attributes %}
49 | {{ attribute.include_path }}
50 | {% endfor %}
51 |
52 | {% endif %}
53 | .. autoapisummary::
54 |
55 | {% for attribute in visible_attributes %}
56 | {{ attribute.id }}
57 | {% endfor %}
58 | {% endif %}
59 |
60 |
61 | {% endif %}
62 | {% set visible_exceptions = visible_children|selectattr("type", "equalto", "exception")|list %}
63 | {% if visible_exceptions %}
64 | {% if "exception" in own_page_types or "show-module-summary" in autoapi_options %}
65 | Exceptions
66 | ----------
67 |
68 | {% if "exception" in own_page_types %}
69 | .. toctree::
70 | :hidden:
71 |
72 | {% for exception in visible_exceptions %}
73 | {{ exception.include_path }}
74 | {% endfor %}
75 |
76 | {% endif %}
77 | .. autoapisummary::
78 |
79 | {% for exception in visible_exceptions %}
80 | {{ exception.id }}
81 | {% endfor %}
82 | {% endif %}
83 |
84 |
85 | {% endif %}
86 | {% set visible_classes = visible_children|selectattr("type", "equalto", "class")|list %}
87 | {% if visible_classes %}
88 | {% if "class" in own_page_types or "show-module-summary" in autoapi_options %}
89 | Classes
90 | -------
91 |
92 | {% if "class" in own_page_types %}
93 | .. toctree::
94 | :hidden:
95 |
96 | {% for klass in visible_classes %}
97 | {{ klass.include_path }}
98 | {% endfor %}
99 |
100 | {% endif %}
101 | .. autoapisummary::
102 |
103 | {% for klass in visible_classes %}
104 | {{ klass.id }}
105 | {% endfor %}
106 | {% endif %}
107 |
108 |
109 | {% endif %}
110 | {% set visible_functions = visible_children|selectattr("type", "equalto", "function")|list %}
111 | {% if visible_functions %}
112 | {% if "function" in own_page_types or "show-module-summary" in autoapi_options %}
113 | Functions
114 | ---------
115 |
116 | {% if "function" in own_page_types %}
117 | .. toctree::
118 | :hidden:
119 |
120 | {% for function in visible_functions %}
121 | {{ function.include_path }}
122 | {% endfor %}
123 |
124 | {% endif %}
125 | .. autoapisummary::
126 |
127 | {% for function in visible_functions %}
128 | {{ function.id }}
129 | {% endfor %}
130 | {% endif %}
131 |
132 |
133 | {% endif %}
134 | {% set this_page_children = visible_children|rejectattr("type", "in", own_page_types)|list %}
135 | {% if this_page_children %}
136 | {{ obj.type|title }} Contents
137 | {{ "-" * obj.type|length }}---------
138 |
139 | {% for obj_item in this_page_children %}
140 | {{ obj_item.render()|indent(0) }}
141 | {% endfor %}
142 | {% endif %}
143 | {% endif %}
144 | {% endblock %}
145 | {% else %}
146 | .. py:module:: {{ obj.name }}
147 |
148 | {% if obj.docstring %}
149 | .. autoapi-nested-parse::
150 |
151 | {{ obj.docstring|indent(6) }}
152 |
153 | {% endif %}
154 | {% for obj_item in visible_children %}
155 | {{ obj_item.render()|indent(3) }}
156 | {% endfor %}
157 | {% endif %}
158 | {% endif %}
159 |
160 | .. footbibliography::
--------------------------------------------------------------------------------
/src/barecat/glob_to_regex.py:
--------------------------------------------------------------------------------
1 | # This is copied from CPython main branch as of 2024-12-07.
2 | import re
3 | import os.path
4 | import functools
5 |
6 | _re_setops_sub = re.compile(r'([&~|])').sub
7 | _re_escape = functools.lru_cache(maxsize=512)(re.escape)
8 |
9 |
10 | def glob_to_regex(pat, *, recursive=False, include_hidden=False, seps=None):
11 | """Translate a pathname with shell wildcards to a regular expression.
12 |
13 | If `recursive` is true, the pattern segment '**' will match any number of
14 | path segments.
15 |
16 | If `include_hidden` is true, wildcards can match path segments beginning
17 | with a dot ('.').
18 |
19 | If a sequence of separator characters is given to `seps`, they will be
20 | used to split the pattern into segments and match path separators. If not
21 | given, os.path.sep and os.path.altsep (where available) are used.
22 | """
23 | if not seps:
24 | if os.path.altsep:
25 | seps = (os.path.sep, os.path.altsep)
26 | else:
27 | seps = os.path.sep
28 | escaped_seps = ''.join(map(re.escape, seps))
29 | any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
30 | not_sep = f'[^{escaped_seps}]'
31 | if include_hidden:
32 | one_last_segment = f'{not_sep}+'
33 | one_segment = f'{one_last_segment}{any_sep}'
34 | any_segments = f'(?:.+{any_sep})?'
35 | any_last_segments = '.*'
36 | else:
37 | one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
38 | one_segment = f'{one_last_segment}{any_sep}'
39 | any_segments = f'(?:{one_segment})*'
40 | any_last_segments = f'{any_segments}(?:{one_last_segment})?'
41 |
42 | results = []
43 | parts = re.split(any_sep, pat)
44 | last_part_idx = len(parts) - 1
45 | for idx, part in enumerate(parts):
46 | if part == '*':
47 | results.append(one_segment if idx < last_part_idx else one_last_segment)
48 | elif recursive and part == '**':
49 | if idx < last_part_idx:
50 | if parts[idx + 1] != '**':
51 | results.append(any_segments)
52 | else:
53 | results.append(any_last_segments)
54 | else:
55 | if part:
56 | if not include_hidden and part[0] in '*?':
57 | results.append(r'(?!\.)')
58 | results.extend(_translate(part, f'{not_sep}*', not_sep)[0])
59 | if idx < last_part_idx:
60 | results.append(any_sep)
61 | res = ''.join(results)
62 | return fr'(?s:{res})\Z'
63 |
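# Illustrative usage (not part of the CPython-derived code): with recursive=True,
# '**' spans any number of path segments, and hidden segments (starting with '.')
# are excluded unless include_hidden=True:
#
#     rx = re.compile(glob_to_regex('photos/**/*.jpg', recursive=True, seps='/'))
#     assert rx.match('photos/2024/trip/img1.jpg')
#     assert not rx.match('photos/.hidden/img1.jpg')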
64 |
65 | def _translate(pat, star, question_mark):
66 | res = []
67 | add = res.append
68 | star_indices = []
69 |
70 | i, n = 0, len(pat)
71 | while i < n:
72 | c = pat[i]
73 | i = i + 1
74 | if c == '*':
75 | # store the position of the wildcard
76 | star_indices.append(len(res))
77 | add(star)
78 | # compress consecutive `*` into one
79 | while i < n and pat[i] == '*':
80 | i += 1
81 | elif c == '?':
82 | add(question_mark)
83 | elif c == '[':
84 | j = i
85 | if j < n and pat[j] == '!':
86 | j = j + 1
87 | if j < n and pat[j] == ']':
88 | j = j + 1
89 | while j < n and pat[j] != ']':
90 | j = j + 1
91 | if j >= n:
92 | add('\\[')
93 | else:
94 | stuff = pat[i:j]
95 | if '-' not in stuff:
96 | stuff = stuff.replace('\\', r'\\')
97 | else:
98 | chunks = []
99 | k = i + 2 if pat[i] == '!' else i + 1
100 | while True:
101 | k = pat.find('-', k, j)
102 | if k < 0:
103 | break
104 | chunks.append(pat[i:k])
105 | i = k + 1
106 | k = k + 3
107 | chunk = pat[i:j]
108 | if chunk:
109 | chunks.append(chunk)
110 | else:
111 | chunks[-1] += '-'
112 | # Remove empty ranges -- invalid in RE.
113 | for k in range(len(chunks) - 1, 0, -1):
114 | if chunks[k - 1][-1] > chunks[k][0]:
115 | chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:]
116 | del chunks[k]
117 | # Escape backslashes and hyphens for set difference (--).
118 | # Hyphens that create ranges shouldn't be escaped.
119 | stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') for s in chunks)
120 | i = j + 1
121 | if not stuff:
122 | # Empty range: never match.
123 | add('(?!)')
124 | elif stuff == '!':
125 | # Negated empty range: match any character.
126 | add('.')
127 | else:
128 | # Escape set operations (&&, ~~ and ||).
129 | stuff = _re_setops_sub(r'\\\1', stuff)
130 | if stuff[0] == '!':
131 | stuff = '^' + stuff[1:]
132 | elif stuff[0] in ('^', '['):
133 | stuff = '\\' + stuff
134 | add(f'[{stuff}]')
135 | else:
136 | add(_re_escape(c))
137 | assert i == n
138 | return res, star_indices
139 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Barecat
2 |
3 | **[Full API Reference Docs](https://istvansarandi.com/docs/barecat/api/barecat/Barecat.html)**
4 |
5 | Barecat (**bare** con**cat**enation) is a highly scalable, simple aggregate storage format for
6 | storing many (tens of millions and more) small files, with a focus on fast random access and
7 | minimal overhead.
8 |
9 | Barecat can be thought of as a simple filesystem, or as something akin to an indexed tarball, or a
10 | key-value store. Indeed, it can be [mounted via FUSE](https://github.com/isarandi/barecat-mount), converted to a tarball, or used like a dictionary
11 | within Python.
12 |
13 | Barecat associates strings (file paths) with binary data (file contents). It's like a dictionary,
14 | but it has some special handling for '/' characters in the keys, supporting a filesystem-like
15 | experience (`listdir`, `walk`, `glob`, etc).
16 |
17 | Internally, all the file contents are simply concatenated one after another into one or more data shard files.
18 | Additionally, an index is maintained in an SQLite database, which stores the shard number, the offset,
19 | and the size of each inner file (as well as a checksum and further filesystem-like metadata
20 | such as modification time). Barecat also maintains aggregate statistics for each directory, such as the
21 | total number of files and total file size.
22 |
23 |
24 | 
25 |
26 | As you can see, the Barecat format is very simple. Readers/writers are easy to write in any language, since
27 | SQLite is a widely-supported format.
28 |
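For illustration, here is a minimal sketch of reading one file with nothing but the Python
standard library. It assumes only what is described above and in `src/barecat/sql/schema.sql`:
the index table `files(path, shard, offset, size, ...)` and the shard naming shown further below.

```python
import sqlite3

# Look up where the file's bytes live, then read them straight from the shard.
con = sqlite3.connect('mydata.barecat-sqlite-index')
shard, offset, size = con.execute(
    'SELECT shard, offset, size FROM files WHERE path = ?',
    ('path/to/file.jpg',),
).fetchone()
with open(f'mydata.barecat-shard-{shard:05d}', 'rb') as f:
    f.seek(offset)
    data = f.read(size)
```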
29 |
30 | ## Background
31 |
32 | A typical use case for Barecat is storing image files for training deep learning models, where the
33 | files are accessed randomly during training. The files are typically stored on a network file
34 | system, where accessing many small files can be slow, and clusters often put a limit on the number
35 | of files a user may store. It is therefore necessary to merge the small files into larger ones.
36 | However, typical archive formats such as tar are not suitable, since they don't allow fast random
37 | lookups. In tar, one has to scan the entire archive as there is no central directory.
38 | Zip is better, but still requires scanning the central directory, which can be slow for very large
39 | archives with millions or tens of millions of files.
40 |
41 | We need an index into the archive, and to support very large datasets, the index itself
42 | must not need to be loaded into memory.
43 |
44 | Therefore, in this format the metadata is indexed separately in an SQLite database for fast lookup
45 | based on paths. The index also allows fast listing of directory contents and contains aggregate
46 | statistics (total file size, number of files) for each directory.
47 |
48 | ## Features
49 |
50 | - **Fast random access**: The archive can be accessed randomly, addressed by filepath,
51 | without having to scan the entire archive or all the metadata.
52 | The index is stored in a separate SQLite database file, which itself does not need to be loaded
53 | entirely into memory. Ideal for storing training image data for deep learning jobs.
54 | - **Sharding**: To make it easier to move the data around or to distribute it across multiple
55 | storage devices, the archive can be split into multiple files of equal size (shards, or volumes).
56 |   The shards do not have to be concatenated to be used; the library keeps all shard files open
57 |   and loads data from the appropriate one during normal operations.
58 | - **Fast browsing**: The SQLite database contains an index for the parent directories, allowing
59 | fast listing of directory contents and aggregate statistics (total file size, number of files).
60 | - **Intuitive API**: Familiar filesystem-like API, as well as a dictionary-like one.
61 | - **Mountable**: The archive can be efficiently mounted in readonly or read-write mode.
62 | - **Simple storage format**: The files are simply concatenated after each other and the index contains
63 |   the offsets and sizes of each file. There is no header format to understand. The index can be
64 |   dumped into any format with simple SQL queries (see the example below).
65 |
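For example, the whole index can be dumped to CSV with the stock `sqlite3` shell (an
illustrative one-liner; the package also ships a `barecat-index-to-csv` command):

```bash
sqlite3 -csv -header mydata.barecat-sqlite-index \
    'SELECT path, shard, offset, size, crc32c FROM files' > index.csv
```
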
66 | ## Command line interface
67 |
68 | To create a Barecat archive, use the `barecat-create` or `barecat-create-recursive` commands, which
69 | are installed as executables along with the pip package.
70 |
71 | ```bash
72 | barecat-create --file=mydata.barecat --shard-size=100G < path_of_paths.txt
73 |
74 | find dirname -name '*.jpg' -print0 | barecat-create --null --file=mydata.barecat --shard-size=100G
75 |
76 | barecat-create-recursive dir1 dir2 dir3 --file=mydata.barecat --shard-size=100G
77 | ```
78 |
79 | This may yield the following files:
80 |
81 | ```
82 | mydata.barecat-shard-00000
83 | mydata.barecat-shard-00001
84 | mydata.barecat-sqlite-index
85 | ```
86 |
87 | The files can be extracted again; Unix-like permissions, modification times, and owner
88 | information are preserved.
89 |
90 | ```bash
91 | barecat-extract --file=mydata.barecat --target-directory=targetdir/
92 | ```
93 |
94 | ## Python API
95 |
96 | ```python
97 |
98 | import barecat
99 |
100 | with barecat.Barecat('mydata.barecat', readonly=False) as bc:
101 | bc['path/to/file/as/stored.jpg'] = binary_file_data
102 | bc.add_by_path('path/to/file/on/disk.jpg')
103 |
104 |     with open('file/on/disk.jpg', 'rb') as f:
105 | bc.add('path/to/file/on/disk.jpg', fileobj=f)
106 |
107 | with barecat.Barecat('mydata.barecat') as bc:
108 | binary_file_data = bc['path/to/file.jpg']
109 | entrynames = bc.listdir('path/to')
110 | for root, dirs, files in bc.walk('path/to/something'):
111 | print(root, dirs, files)
112 |
113 | paths = bc.glob('path/to/**/*.jpg', recursive=True)
114 |
115 | with bc.open('path/to/file.jpg', 'rb') as f:
116 | data = f.read(123)
117 | ```
118 |
119 | ## Image viewer
120 |
121 | Barecat comes with a simple image viewer that can be used to browse the contents of a Barecat
122 | archive.
123 |
124 | ```bash
125 | barecat-image-viewer mydata.barecat
126 | ```
127 |
128 |
129 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Barecat
2 | =======
3 |
4 | Barecat (**bare** con**cat**enation) is a highly scalable, simple aggregate storage format for
5 | storing many (tens of millions and more) small files, with a focus on fast random access and
6 | minimal overhead.
7 |
8 | Barecat can be thought of as a simple filesystem, or as something akin to an indexed tarball, or a
9 | key-value store. Indeed, it can be `mounted via FUSE <https://github.com/isarandi/barecat-mount>`_, converted to a tarball, or used like a dictionary
10 | within Python.
11 |
12 | Barecat associates strings (file paths) with binary data (file contents). It's like a dictionary,
13 | but it has some special handling for '/' characters in the keys, supporting a filesystem-like
14 | experience (``listdir``, ``walk``, ``glob``, etc).
15 |
16 | Internally, all the file contents are simply concatenated one after another into one or more data shard files.
17 | Additionally, an index is maintained in an SQLite database, which stores the shard number, the offset,
18 | and the size of each inner file (as well as a checksum and further filesystem-like metadata
19 | such as modification time). Barecat also maintains aggregate statistics for each directory, such as the
20 | total number of files and total file size.
21 |
22 | .. image:: ../figure.png
23 |
24 | As you can see, the Barecat format is very simple. Readers/writers are easy to write in any language, since
25 | SQLite is a widely-supported format.
26 |
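For illustration, here is a minimal sketch of reading one file with nothing but the Python
standard library. It assumes only what is described above and in ``src/barecat/sql/schema.sql``:
the index table ``files(path, shard, offset, size, ...)`` and the shard naming shown further below.

.. code-block:: python

    import sqlite3

    # Look up where the file's bytes live, then read them straight from the shard.
    con = sqlite3.connect('mydata.barecat-sqlite-index')
    shard, offset, size = con.execute(
        'SELECT shard, offset, size FROM files WHERE path = ?',
        ('path/to/file.jpg',),
    ).fetchone()
    with open(f'mydata.barecat-shard-{shard:05d}', 'rb') as f:
        f.seek(offset)
        data = f.read(size)
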
27 | Background
28 | ----------
29 |
30 | A typical use case for Barecat is storing image files for training deep learning models, where the
31 | files are accessed randomly during training. The files are typically stored on a network file
32 | system, where accessing many small files can be slow, and clusters often put a limit on the number
33 | of files a user may store. It is therefore necessary to merge the small files into larger ones.
34 | However, typical archive formats such as tar are not suitable, since they don't allow fast random
35 | lookups. In tar, one has to scan the entire archive as there is no central directory.
36 | Zip is better, but still requires scanning the central directory, which can be slow for very large
37 | archives with millions or tens of millions of files.
38 |
39 | We need an index into the archive, and to support very large datasets, the index itself
40 | must not need to be loaded into memory.
41 |
42 | Therefore, in this format the metadata is indexed separately in an SQLite database for fast lookup
43 | based on paths. The index also allows fast listing of directory contents and contains aggregate
44 | statistics (total file size, number of files) for each directory.
45 |
46 | Features
47 | --------
48 |
49 | - **Fast random access**: The archive can be accessed randomly, addressed by filepath,
50 | without having to scan the entire archive or all the metadata.
51 | The index is stored in a separate SQLite database file, which itself does not need to be loaded
52 | entirely into memory. Ideal for storing training image data for deep learning jobs.
53 | - **Sharding**: To make it easier to move the data around or to distribute it across multiple
54 | storage devices, the archive can be split into multiple files of equal size (shards, or volumes).
55 |   The shards do not have to be concatenated to be used; the library keeps all shard files open
56 |   and loads data from the appropriate one during normal operations.
57 | - **Fast browsing**: The SQLite database contains an index for the parent directories, allowing
58 | fast listing of directory contents and aggregate statistics (total file size, number of files).
59 | - **Intuitive API**: Familiar filesystem-like API, as well as a dictionary-like one.
60 | - **Mountable**: The archive can be efficiently mounted in readonly or read-write mode.
61 | - **Simple storage format**: The files are simply concatenated after each other and the index contains
62 |   the offsets and sizes of each file. There is no header format to understand. The index can be
63 |   dumped into any format with simple SQL queries (see the example below).
64 |
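For example, the whole index can be dumped to CSV with the stock ``sqlite3`` shell (an
illustrative one-liner; the package also ships a ``barecat-index-to-csv`` command):

.. code-block:: bash

    sqlite3 -csv -header mydata.barecat-sqlite-index \
        'SELECT path, shard, offset, size, crc32c FROM files' > index.csv
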
65 | Command line interface
66 | ----------------------
67 |
68 | To create a Barecat archive, use the ``barecat-create`` or ``barecat-create-recursive`` commands, which
69 | are installed as executables along with the pip package.
70 |
71 | .. code-block:: bash
72 |
73 | barecat-create --file=mydata.barecat --shard-size=100G < path_of_paths.txt
74 |
75 | find dirname -name '*.jpg' -print0 | barecat-create --null --file=mydata.barecat --shard-size=100G
76 |
77 | barecat-create-recursive dir1 dir2 dir3 --file=mydata.barecat --shard-size=100G
78 |
79 | This may yield the following files:
80 |
81 | .. code-block:: text
82 |
83 | mydata.barecat-shard-00000
84 | mydata.barecat-shard-00001
85 | mydata.barecat-sqlite-index
86 |
87 | The files can be extracted again; Unix-like permissions, modification times, and owner
88 | information are preserved.
89 |
90 | .. code-block:: bash
91 |
92 | barecat-extract --file=mydata.barecat --target-directory=targetdir/
93 |
94 | Python API
95 | ----------
96 |
97 | .. code-block:: python
98 |
99 | import barecat
100 |
101 | with barecat.Barecat('mydata.barecat', readonly=False) as bc:
102 | bc['path/to/file/as/stored.jpg'] = binary_file_data
103 | bc.add_by_path('path/to/file/on/disk.jpg')
104 |
105 |     with open('file/on/disk.jpg', 'rb') as f:
106 | bc.add('path/to/file/on/disk.jpg', fileobj=f)
107 |
108 | with barecat.Barecat('mydata.barecat') as bc:
109 | binary_file_data = bc['path/to/file.jpg']
110 | entrynames = bc.listdir('path/to')
111 | for root, dirs, files in bc.walk('path/to/something'):
112 | print(root, dirs, files)
113 |
114 | paths = bc.glob('path/to/**/*.jpg', recursive=True)
115 |
116 | with bc.open('path/to/file.jpg', 'rb') as f:
117 | data = f.read(123)
118 |
119 | Image viewer
120 | ------------
121 |
122 | Barecat comes with a simple image viewer that can be used to browse the contents of a Barecat
123 | archive.
124 |
125 | .. code-block:: bash
126 |
127 | barecat-image-viewer mydata.barecat
128 |
129 | Sitemap
130 | -------
131 |
132 | .. toctree::
133 | :maxdepth: 3
134 | :caption: Contents
135 |
136 |
137 | * :ref:`genindex`
138 | * :ref:`modindex`
139 | * :ref:`search`
140 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | import types
2 | import contextlib
3 | import importlib
4 | import inspect
5 | import os
6 | import re
7 | import sys
8 | from enum import Enum
9 |
10 | import setuptools_scm
11 | import toml
12 |
13 | sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
14 |
15 |
16 | pyproject_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "pyproject.toml"))
17 |
18 | with open(pyproject_path) as f:
19 | data = toml.load(f)
20 |
21 | project_info = data["project"]
22 | project_slug = project_info["name"].replace(" ", "-").lower()
23 | tool_urls = project_info.get("urls", {})
24 |
25 | repo_url = tool_urls.get("Repository", "")
26 | author_url = tool_urls.get("Author", "")
27 | github_username = re.match(r"https://github\.com/([^/]+)/?", repo_url)[1]
28 |
29 | project = project_info["name"]
30 | release = setuptools_scm.get_version('..')
31 | version = ".".join(release.split(".")[:2])
32 | main_module_name = project_slug.replace('-', '_')
33 | repo_name = project_slug
34 | module = importlib.import_module(main_module_name)
35 | globals()[main_module_name] = module
36 |
37 |
38 | # -- Project information -----------------------------------------------------
39 | linkcode_url = repo_url
40 |
41 | author = project_info["authors"][0]["name"]
42 | copyright = '%Y'
43 |
44 | # -- General configuration ---------------------------------------------------
45 | add_module_names = False
46 | python_use_unqualified_type_names = True
47 | extensions = [
48 | 'sphinx.ext.autodoc',
49 | 'sphinx.ext.napoleon',
50 | 'sphinx.ext.autosummary',
51 | 'sphinx.ext.intersphinx',
52 | 'sphinx.ext.linkcode',
53 | 'sphinx.ext.autodoc.typehints',
54 | 'sphinxcontrib.bibtex',
55 | 'autoapi.extension',
56 | 'sphinx.ext.inheritance_diagram',
57 | 'sphinx_codeautolink',
58 | ]
59 | bibtex_bibfiles = ['abbrev_long.bib', 'references.bib']
60 | bibtex_footbibliography_header = ".. rubric:: References"
61 | intersphinx_mapping = {
62 | 'python': ('https://docs.python.org/3', None),
63 | 'torch': ('https://pytorch.org/docs/main/', None),
64 | 'numpy': ('https://numpy.org/doc/stable/', None),
65 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None),
66 | }
67 |
68 | github_username = github_username
69 | github_repository = repo_name
70 | autodoc_show_sourcelink = False
71 | html_show_sourcelink = False
72 |
73 | templates_path = ['_templates']
74 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
75 | python_display_short_literal_types = True
76 |
77 | html_title = project
78 | html_theme = 'pydata_sphinx_theme'
79 | html_theme_options = {
80 | "show_toc_level": 3,
81 | "icon_links": [
82 | {
83 | "name": "GitHub",
84 | "url": repo_url,
85 | "icon": "fa-brands fa-square-github",
86 | "type": "fontawesome",
87 | }
88 | ],
89 | }
90 | html_static_path = ['_static']
91 | html_css_files = ['styles/my_theme.css']
92 |
93 | html_context = {
94 | "author_url": author_url,
95 | "author": author,
96 | }
97 |
98 | toc_object_entries_show_parents = "hide"
99 |
100 | autoapi_root = 'api'
101 | autoapi_member_order = 'bysource'
102 | autodoc_typehints = 'description'
103 | autoapi_own_page_level = 'attribute'
104 | autoapi_type = 'python'
105 | autodoc_default_options = {
106 | 'members': True,
107 | 'inherited-members': True,
108 | 'undoc-members': False,
109 | 'exclude-members': '__init__, __weakref__, __repr__, __str__',
110 | }
111 | autoapi_options = ['members', 'show-inheritance', 'special-members', 'show-module-summary']
112 | autoapi_add_toctree_entry = True
113 | autoapi_dirs = ['../src']
114 | autoapi_template_dir = '_templates/autoapi'
115 |
116 | autodoc_member_order = 'bysource'
117 | autoclass_content = 'class'
118 |
119 | autosummary_generate = True
120 | autosummary_imported_members = False
121 |
122 |
123 | def autodoc_skip_member(app, what, name, obj, skip, options):
124 | """
125 | Skip members (functions, classes, modules) without docstrings.
126 | """
127 | # Check if the object has a __doc__ attribute
128 | if not getattr(obj, 'docstring', None):
129 | print('no docstring', name)
130 | return True # Skip if there's no docstring
131 | elif what in ('class', 'function', 'attribute'):
132 | # Check if the module of the class has a docstring
133 | print('checking module', name)
134 | module_name = '.'.join(name.split('.')[:-1])
135 |
136 | try:
137 | module = importlib.import_module(module_name)
138 | return not getattr(module, '__doc__', None)
139 | except ModuleNotFoundError as e:
140 | print('module not found', module_name, str(e))
141 | return None
142 |
143 |
144 | def linkcode_resolve(domain, info):
145 | if domain != 'py':
146 | return None
147 |
148 | file, start, end = get_line_numbers(eval(info['fullname']))
149 | relpath = os.path.relpath(file, os.path.dirname(module.__file__))
150 | return f'{repo_url}/blob/v{release}/src/{main_module_name}/{relpath}#L{start}-L{end}'
151 |
152 |
153 | def get_line_numbers(obj):
154 | if isinstance(obj, property):
155 | obj = obj.fget
156 |
157 | if isinstance(obj, Enum):
158 | return get_enum_member_line_numbers(obj)
159 |
160 | if inspect.ismemberdescriptor(obj):
161 | return get_member_line_numbers(obj)
162 |
163 | with module_restored(obj):
164 | lines = inspect.getsourcelines(obj)
165 | file = inspect.getsourcefile(obj)
166 |
167 | start, end = lines[1], lines[1] + len(lines[0]) - 1
168 | return file, start, end
169 |
170 |
171 | def get_enum_member_line_numbers(obj):
172 | class_ = obj.__class__
173 | with module_restored(class_):
174 | source_lines, start_line = inspect.getsourcelines(class_)
175 |
176 | for i, line in enumerate(source_lines):
177 | if f"{obj.name} =" in line:
178 | return inspect.getsourcefile(class_), start_line + i, start_line + i
179 | else:
180 | raise ValueError(f"Enum member {obj.name} not found in {class_}")
181 |
182 |
183 | def get_member_line_numbers(obj: types.MemberDescriptorType):
184 | class_ = obj.__objclass__
185 | with module_restored(class_):
186 | source_lines, start_line = inspect.getsourcelines(class_)
187 |
188 | for i, line in enumerate(source_lines):
189 | if f"{obj.__name__} = " in line:
190 | return inspect.getsourcefile(class_), start_line + i, start_line + i
191 | else:
192 | raise ValueError(f"Member {obj.__name__} not found in {class_}")
193 |
194 |
195 | @contextlib.contextmanager
196 | def module_restored(obj):
197 | if not hasattr(obj, '_module_original_'):
198 | yield
199 | else:
200 | fake_module = obj.__module__
201 | obj.__module__ = obj._module_original_
202 | yield
203 | obj.__module__ = fake_module
204 |
205 |
206 | def setup(app):
207 | app.connect('autoapi-skip-member', autodoc_skip_member)
208 | app.connect('autodoc-skip-member', autodoc_skip_member)
209 |
--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 | import barecat
4 | import pytest
5 |
6 |
7 | @pytest.fixture
8 | def temp_jpeg_dir(tmp_path):
9 | """
10 | Creates a complex temporary directory with sample JPEG files.
11 | """
12 | (tmp_path / "dir1").mkdir()
13 | (tmp_path / "dir1/subdir1").mkdir()
14 | (tmp_path / "dir1/subdir1/test1.jpg").write_bytes(b"dummy data1")
15 | (tmp_path / "dir1/subdir2").mkdir()
16 | (tmp_path / "dir1/subdir2/test2.jpg").write_bytes(b"dummy data2")
17 | (tmp_path / "dir2").mkdir()
18 | (tmp_path / "dir2/test3.jpg").write_bytes(b"dummy data3")
19 | (tmp_path / "dir2/empty_subdir").mkdir()
20 | (tmp_path / "dir3").mkdir()
21 | return tmp_path
22 |
23 |
24 | @pytest.fixture
25 | def barecat_archive(temp_jpeg_dir):
26 | """
27 | Creates a standard Barecat archive for testing.
28 | """
29 | archive_file = temp_jpeg_dir / "mydata.barecat"
30 |
31 | create_cmd = [
32 | "barecat-create-recursive",
33 | "--file", str(archive_file),
34 | "--overwrite",
35 | str(temp_jpeg_dir / "dir1"),
36 | str(temp_jpeg_dir / "dir2"),
37 | str(temp_jpeg_dir / "dir3"),
38 | '--shard-size=22'
39 | ]
40 | subprocess.run(create_cmd, check=True)
41 |
42 | return archive_file
43 |
44 |
45 | def test_barecat_creation(temp_jpeg_dir):
46 | """
47 | Runs `find` with `barecat-create` and verifies output.
48 | """
49 | output_file = temp_jpeg_dir / "mydata.barecat"
50 | cmd = f"cd {temp_jpeg_dir}; find . -name '*.jpg' -print0 | sort | barecat-create --null --file={output_file} --overwrite --shard-size=22"
51 |
52 | result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
53 |
54 | with barecat.Barecat(output_file) as reader:
55 | file_list = list(reader)
56 | assert len(file_list) == 3, "Expected 3 files in the archive"
57 | assert "dir1/subdir1/test1.jpg" in file_list, "Expected dir1/subdir1/test1.jpg in the archive"
58 | assert "dir1/subdir2/test2.jpg" in file_list, "Expected dir1/subdir2/test2.jpg in the archive"
59 | assert "dir2/test3.jpg" in file_list, "Expected dir2/test3.jpg in the archive"
60 | assert reader[
61 | "dir1/subdir1/test1.jpg"] == b"dummy data1", "Expected dir1/subdir1/test1.jpg to contain 'dummy data1'"
62 | assert reader[
63 | "dir1/subdir2/test2.jpg"] == b"dummy data2", "Expected dir1/subdir2/test2.jpg to contain 'dummy data2'"
64 | assert reader[
65 | "dir2/test3.jpg"] == b"dummy data3", "Expected dir2/test3.jpg to contain 'dummy data3'"
66 | assert reader.sharder.num_shards == 2, "Expected 2 shards in the archive"
67 |
68 | assert result.returncode == 0, f"Command failed: {result.stderr}"
69 | assert (temp_jpeg_dir / "mydata.barecat-sqlite-index").exists(), "Output file was not created"
70 |
71 | def test_barecat_creation_workers(temp_jpeg_dir):
72 | """
73 |     Runs `find` with `barecat-create --workers=8` and verifies output.
74 | """
75 | output_file = temp_jpeg_dir / "mydata.barecat"
76 | cmd = f"cd {temp_jpeg_dir}; find . -name '*.jpg' -print0 | sort | barecat-create --null --file={output_file} --overwrite --shard-size=22 --workers=8"
77 |
78 | result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
79 |
80 | with barecat.Barecat(output_file) as reader:
81 | file_list = list(reader)
82 | assert len(file_list) == 3, "Expected 3 files in the archive"
83 | assert "dir1/subdir1/test1.jpg" in file_list, "Expected dir1/subdir1/test1.jpg in the archive"
84 | assert "dir1/subdir2/test2.jpg" in file_list, "Expected dir1/subdir2/test2.jpg in the archive"
85 | assert "dir2/test3.jpg" in file_list, "Expected dir2/test3.jpg in the archive"
86 | assert reader[
87 | "dir1/subdir1/test1.jpg"] == b"dummy data1", "Expected dir1/subdir1/test1.jpg to contain 'dummy data1'"
88 | assert reader[
89 | "dir1/subdir2/test2.jpg"] == b"dummy data2", "Expected dir1/subdir2/test2.jpg to contain 'dummy data2'"
90 | assert reader[
91 | "dir2/test3.jpg"] == b"dummy data3", "Expected dir2/test3.jpg to contain 'dummy data3'"
92 | assert reader.sharder.num_shards == 2, "Expected 2 shards in the archive"
93 |
94 | assert result.returncode == 0, f"Command failed: {result.stderr}"
95 | assert (temp_jpeg_dir / "mydata.barecat-sqlite-index").exists(), "Output file was not created"
96 |
97 |
98 | def test_extract_single(barecat_archive):
99 | """
100 | Tests `barecat-extract-single` to ensure a specific file is correctly extracted from the archive.
101 | """
102 | extract_cmd = [
103 | "barecat-extract-single",
104 | "--barecat-file", str(barecat_archive),
105 | "--path", "dir1/subdir1/test1.jpg"
106 | ]
107 |
108 | result = subprocess.run(extract_cmd, capture_output=True)
109 |
110 | assert result.stdout == b"dummy data1", "Unexpected content in extracted file"
111 | assert result.returncode == 0, f"Command failed: {result.stderr}"
112 |
113 |
114 | def test_defrag(barecat_archive):
115 | """
116 | Tests `barecat-defrag` to ensure the archive can be defragmented properly.
117 | """
118 |
119 |
120 | with barecat.Barecat(barecat_archive, readonly=False) as bc:
121 | first_file = next(iter(bc.index.iter_all_filepaths(barecat.Order.ADDRESS)))
122 |
123 | del bc[first_file]
124 | assert first_file not in bc
125 | assert bc.total_logical_size != bc.total_physical_size_seek
126 |
127 |
128 | defrag_cmd = [
129 | "barecat-defrag",
130 | str(barecat_archive)
131 | ]
132 |
133 | result = subprocess.run(defrag_cmd, capture_output=True, text=True)
134 |
135 | with barecat.Barecat(barecat_archive) as reader:
136 | assert reader.total_logical_size == reader.total_physical_size_seek
137 | assert reader.sharder.num_shards == 1
138 |
139 |
140 | assert result.returncode == 0, f"Command failed: {result.stderr}"
141 |
142 |
143 | def test_verify_integrity(barecat_archive):
144 | """
145 | Tests `barecat-verify` to ensure the archive's integrity.
146 | """
147 | verify_cmd = [
148 | "barecat-verify",
149 | str(barecat_archive)
150 | ]
151 |
152 | result = subprocess.run(verify_cmd, capture_output=True, text=True)
153 |
154 | assert result.returncode == 0, f"Command failed: {result.stderr}"
155 |
156 | # now edit the file and verify again
157 | with open(f'{barecat_archive}-shard-00000', "r+b") as f:
158 | f.seek(0)
159 | f.write(b"junk")
160 |
161 | result = subprocess.run(verify_cmd, capture_output=True, text=True)
162 | assert result.returncode != 0, f"Command should have failed: {result.stderr}"
163 | assert 'CRC32C' in result.stdout, "Expected CRC mismatch error message"
164 |
165 |
166 | def test_index_to_csv(barecat_archive):
167 | """
168 | Tests `barecat-index-to-csv` to ensure index can be dumped as CSV.
169 | """
170 | csv_cmd = [
171 | "barecat-index-to-csv",
172 | str(barecat_archive) + "-sqlite-index"
173 | ]
174 |
175 | result = subprocess.run(csv_cmd, capture_output=True, text=True)
176 |
177 | assert '"path","shard","offset","size","crc32c"' in result.stdout, "CSV output missing expected header"
178 | assert result.returncode == 0, f"Command failed: {result.stderr}"
179 |
--------------------------------------------------------------------------------
/src/barecat/archive_formats.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import shutil
3 | import tarfile
4 | import zipfile
5 | from datetime import datetime
6 |
7 | from barecat.core.index import BarecatDirInfo, BarecatFileInfo, BarecatEntryInfo
8 | from barecat.progbar import progressbar
9 |
10 |
11 | def iter_archive(src_path):
12 | if src_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')):
13 | return iter_tarfile(src_path)
14 | elif src_path.endswith('.zip'):
15 | return iter_zipfile(src_path)
16 | else:
17 | raise ValueError('Unsupported archive format')
18 |
19 |
20 | def iter_archive_nocontent(src_path):
21 | if src_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')):
22 | return iter_tarfile_nocontent(src_path)
23 | elif src_path.endswith('.zip'):
24 | return iter_zipfile_nocontent(src_path)
25 | else:
26 | raise ValueError('Unsupported archive format')
27 |
28 |
29 | def iter_zipfile(path):
30 | with zipfile.ZipFile(path, mode='r') as zipf:
31 | for member in progressbar(zipf.infolist(), desc='Packing files', unit=' files'):
32 | if member.is_dir():
33 | di = BarecatDirInfo(path=member.filename)
34 | di.mtime_dt = datetime(*member.date_time)
35 | yield di, None
36 | else:
37 | fi = BarecatFileInfo(path=member.filename, size=member.file_size)
38 | fi.mtime_dt = datetime(*member.date_time)
39 | with zipf.open(member) as file_in_zip:
40 | yield fi, file_in_zip
41 |
42 |
43 | def iter_zipfile_nocontent(path):
44 | with open(path, 'rb') as f:
45 | with zipfile.ZipFile(f, mode='r') as zipf:
46 | for member in progressbar(zipf.infolist(), desc='Packing files', unit=' files'):
47 | if member.is_dir():
48 | di = BarecatDirInfo(path=member.filename)
49 | di.mtime_dt = datetime(*member.date_time)
50 | yield di
51 | else:
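                    # A ZIP local file header has a fixed 30-byte part; the file-name
                    # length and extra-field length fields sit at byte offsets 26 and 28.
                    # Skipping the header plus both variable-length fields gives the
                    # offset of the member's raw payload within the archive.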
52 | f.seek(member.header_offset + 26)
53 | namelen = int.from_bytes(f.read(2), byteorder='little')
54 | extralen = int.from_bytes(f.read(2), byteorder='little')
55 | data_offset = member.header_offset + 30 + namelen + extralen
56 |
57 | fi = BarecatFileInfo(
58 | path=member.filename, shard=0, offset=data_offset, size=member.file_size
59 | )
60 | fi.mtime_dt = datetime(*member.date_time)
61 | yield fi
62 |
63 |
64 | def iter_tarfile(path):
65 | tar_file_size = osp.getsize(path) // 1024 // 1024
66 | pbar = progressbar(None, desc='Packing files', unit=' MB', total=tar_file_size)
67 | progpos = 0
68 |
69 | with tarfile.open(path, mode='r|*') as tar:
70 | for member in tar:
71 | if member.isdir():
72 | di = BarecatDirInfo(
73 | path=member.name,
74 | mode=member.mode,
75 | uid=member.uid,
76 | gid=member.gid,
77 | mtime_ns=member.mtime * 1_000_000_000,
78 | )
79 | yield di, None
80 | if member.isfile():
81 | fi = BarecatFileInfo(
82 | path=member.name,
83 | size=member.size,
84 | mode=member.mode,
85 | uid=member.uid,
86 | gid=member.gid,
87 | mtime_ns=member.mtime * 1_000_000_000,
88 | )
89 |
90 | with tar.extractfile(member) as file_in_tar:
91 | yield fi, file_in_tar
92 |
93 | new_pos = tar.fileobj.tell() // 1024 // 1024
94 | delta = new_pos - progpos
95 | pbar.update(delta)
96 | progpos += delta
97 |
98 |
99 | def iter_tarfile_nocontent(path):
100 | tar_file_size = osp.getsize(path) // 1024 // 1024
101 | pbar = progressbar(None, desc='Packing files', unit=' MB', total=tar_file_size)
102 | progpos = 0
103 |
104 | with tarfile.open(path, mode='r|*') as tar:
105 | for member in tar:
106 | if member.isdir():
107 | di = BarecatDirInfo(
108 | path=member.name,
109 | mode=member.mode,
110 | uid=member.uid,
111 | gid=member.gid,
112 | mtime_ns=member.mtime * 1_000_000_000,
113 | )
114 | yield di
115 | if member.isfile():
116 | fi = BarecatFileInfo(
117 | path=member.name,
118 | shard=0,
119 | offset=member.offset_data,
120 | size=member.size,
121 | mode=member.mode,
122 | uid=member.uid,
123 | gid=member.gid,
124 | mtime_ns=member.mtime * 1_000_000_000,
125 | )
126 | yield fi
127 | new_pos = tar.fileobj.tell() // 1024 // 1024
128 | delta = new_pos - progpos
129 | pbar.update(delta)
130 | progpos += delta
131 |
132 |
133 | def get_archive_writer(target_path):
134 | if target_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')):
135 | return TarWriter(target_path)
136 | elif target_path.endswith('.zip'):
137 | return ZipWriter(target_path)
138 | else:
139 | raise ValueError('Unsupported archive format')
140 |
141 |
142 | class ZipWriter:
143 | def __init__(self, target_path):
144 | self.zip = zipfile.ZipFile(target_path, mode='w')
145 |
146 | def add(self, info: BarecatEntryInfo, fileobj=None):
147 | if isinstance(info, BarecatDirInfo):
148 | zipinfo = zipfile.ZipInfo(info.path + '/')
149 | zipinfo.date_time = info.mtime_dt.timetuple()[:6]
150 | self.zip.writestr(zipinfo, '')
151 | else:
152 | zipinfo = zipfile.ZipInfo(info.path)
153 | zipinfo.date_time = info.mtime_dt.timetuple()[:6]
154 | zipinfo.file_size = info.size
155 | with self.zip.open(zipinfo, 'w') as file_in_zip:
156 | shutil.copyfileobj(fileobj, file_in_zip)
157 |
158 | def close(self):
159 | self.zip.close()
160 |
161 | def __enter__(self):
162 | return self
163 |
164 | def __exit__(self, *args):
165 | self.close()
166 |
167 |
168 | class TarWriter:
169 | def __init__(self, *args, **kwargs):
170 | if 'mode' not in kwargs:
171 | kwargs['mode'] = 'w'
172 | self.tar = tarfile.open(*args, **kwargs)
173 |
174 | def add(self, info: BarecatEntryInfo, fileobj=None):
175 | tarinfo = tarfile.TarInfo(info.path)
176 | tarinfo.uid = info.uid or 0
177 | tarinfo.gid = info.gid or 0
178 | if info.mtime_ns is not None:
179 | tarinfo.mtime = info.mtime_ns // 1_000_000_000
180 | if isinstance(info, BarecatDirInfo):
181 | tarinfo.type = tarfile.DIRTYPE
182 | tarinfo.mode = 0o755 if info.mode is None else info.mode
183 | self.tar.addfile(tarinfo)
184 | else:
185 | tarinfo.size = info.size
186 | tarinfo.mode = 0o644 if info.mode is None else info.mode
187 | self.tar.addfile(tarinfo, fileobj)
188 |
189 | def close(self):
190 | self.tar.close()
191 |
192 | def __enter__(self):
193 | return self
194 |
195 | def __exit__(self, *args):
196 | self.close()
197 |
--------------------------------------------------------------------------------
/src/barecat/defrag.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import dataclasses
4 | import os
5 | import time
6 | from typing import TYPE_CHECKING
7 |
8 | from barecat.core.index import Order
9 | from barecat.progbar import progressbar
10 |
11 | if TYPE_CHECKING:
12 | from barecat.core.barecat import Barecat
13 |
14 |
15 | class BarecatDefragger:
16 | def __init__(self, bc: Barecat):
17 | self.bc = bc
18 | self.index = bc.index
19 | self.shard_size_limit = bc.shard_size_limit
20 | self.readonly = bc.readonly
21 | self.shard_files = bc.sharder.shard_files
22 |
23 | def get_gaps(self):
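        # Compute the free byte ranges per shard entirely in SQL: `first_gaps` is the
        # hole before each shard's first file; `nonfirst_gaps` uses the lead() window
        # function to measure the hole between each file's end and the next file's
        # start (or the shard size limit after the last file).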
24 | gaps = self.index.fetch_all("""
25 | WITH x AS (
26 | SELECT config.value_int AS shard_size_limit
27 | FROM config
28 | WHERE config.key = 'shard_size_limit'
29 | ),
30 | first_gaps AS (
31 | SELECT
32 | f.shard,
33 | 0 AS offset,
34 | MIN(f.offset) AS size
35 | FROM files f
36 | GROUP BY f.shard
37 | ),
38 | nonfirst_gaps AS (
39 | SELECT
40 | f.shard,
41 | (f.offset + f.size) AS offset,
42 | coalesce(
43 | lead(f.offset, 1) OVER (PARTITION BY f.shard ORDER BY f.offset),
44 | x.shard_size_limit
45 | ) - (f.offset + f.size) AS size
46 | FROM files f, x
47 | ),
48 | all_gaps AS (SELECT * FROM first_gaps UNION ALL SELECT * FROM nonfirst_gaps)
49 | SELECT shard, offset, size
50 | FROM all_gaps
51 | WHERE size > 0
52 | ORDER BY shard, offset
53 | """, rowcls=FragmentGap)
54 |
55 | empty_shard_gaps = [
56 | FragmentGap(shard, 0, self.shard_size_limit)
57 | for shard in range(len(self.shard_files))
58 | if self.bc.index.logical_shard_end(shard) == 0]
59 | gaps.extend(empty_shard_gaps)
60 | gaps.sort(key=lambda gap: (gap.shard, gap.offset))
61 | return gaps
62 |
63 | # gaps = []
64 | # prev_end = 0
65 | # prev_shard = -1
66 | # for fi in self.index.iter_all_fileinfos(order=Order.ADDRESS):
67 | # if fi.shard > prev_shard:
68 | # if self.shard_size_limit > prev_end and prev_shard >= 0:
69 | # gaps.append(FragmentGap(prev_shard, prev_end, self.shard_size_limit -
70 | # prev_end))
71 | # for i in range(prev_shard + 1, fi.shard):
72 | # gaps.append(FragmentGap(i, 0, self.shard_size_limit))
73 | # prev_end = 0
74 | # if fi.offset > prev_end:
75 | # gaps.append(FragmentGap(fi.shard, prev_end, fi.offset - prev_end))
76 | # prev_shard = fi.shard
77 | # prev_end = fi.offset + fi.size
78 | # return gaps
79 |
80 | def needs_defrag(self):
81 | # check if total size of shards is larger than the sum of the sizes of the files in index
82 | # the getsize() function may not be fully up to date but this is only a heuristic anyway.
83 | return self.bc.total_physical_size_seek > self.bc.total_logical_size
84 |
85 | def get_defrag_info(self):
86 | return self.bc.total_physical_size_seek, self.bc.total_logical_size
87 |
88 | def defrag(self):
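        # Full defrag: rewrite files in address order, packing them tightly from
        # shard 0 upward, then truncate the shards and delete any that became empty.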
89 | if self.readonly:
90 | raise ValueError('Cannot defrag a read-only Barecat')
91 |
92 | new_shard = 0
93 | new_offset = 0
94 |
95 | old_total = self.bc.total_physical_size_seek
96 |
97 | try:
98 | for i in range(len(self.shard_files)):
99 | self.bc.sharder.reopen_shard(i, 'r+b')
100 |
101 | file_iter = self.index.iter_all_fileinfos(order=Order.ADDRESS)
102 | for fi in progressbar(file_iter, total=self.index.num_files, desc='Defragging'):
103 | if (self.shard_size_limit is not None and new_offset + fi.size >
104 | self.shard_size_limit):
105 | self.shard_files[new_shard].truncate(new_offset)
106 | self.bc.sharder.reopen_shard(new_shard, 'rb')
107 | new_shard += 1
108 | new_offset = 0
109 |
110 | if not (new_shard == fi.shard and new_offset == fi.offset):
111 | shift_n_bytes(
112 | self.shard_files[fi.shard], self.shard_files[new_shard],
113 | fi.offset, new_offset, fi.size)
114 | self.index.move_file(fi.path, new_shard, new_offset)
115 |
116 | new_offset += fi.size
117 |
118 | # Truncate the last shard to its real size (the others are truncated already)
119 | self.shard_files[new_shard].truncate(new_offset)
120 | # Close and delete all shards after the last one
121 | for i in range(new_shard + 1, len(self.shard_files)):
122 | self.shard_files[i].close()
123 | os.remove(self.shard_files[i].name)
124 | del self.shard_files[new_shard + 1:]
125 |
126 | new_total = self.bc.total_physical_size_seek
127 | return old_total - new_total
128 | finally:
129 | self.bc.sharder.reopen_shards()
130 |
131 | def defrag_quick(self, time_max_seconds=5):
132 | if self.readonly:
133 | raise ValueError('Cannot defrag a read-only Barecat')
134 |
135 | start_time = time.monotonic()
136 | # Collect all gaps in the shards
137 | gaps = self.get_gaps()
138 | freed_space = 0
139 | try:
140 | for i in range(len(self.shard_files)):
141 | self.bc.sharder.reopen_shard(i, 'r+b')
142 |
143 | for fi in self.index.iter_all_fileinfos(order=Order.ADDRESS | Order.DESC):
144 | moved = self.move_to_earlier_gap(fi, gaps)
145 | if not moved or time.monotonic() - start_time > time_max_seconds:
146 | # We stop when we reach the first file that cannot be moved to an earlier gap
147 | break
148 | freed_space += fi.size
149 |
150 | self.bc.truncate_all_to_logical_size()
151 | finally:
152 | self.bc.sharder.reopen_shards()
153 |
154 | return freed_space
155 |
156 | def move_to_earlier_gap(self, fi, gaps):
157 | for i_gap, gap in enumerate(gaps):
158 | if gap.shard > fi.shard or (gap.shard == fi.shard and gap.offset >= fi.offset):
159 | # reached the gap that is after the file, no move is possible
160 | return False
161 | if gap.size >= fi.size:
162 | shift_n_bytes(
163 | self.shard_files[fi.shard], self.shard_files[gap.shard], fi.offset,
164 | gap.offset, fi.size)
165 | self.index.move_file(fi.path, gap.shard, gap.offset)
166 | gap.size -= fi.size
167 | gap.offset += fi.size
168 | if gap.size == 0:
169 | # even though we are changing the list while in a for loop that is iterating
170 | # over it, this is safe because we are immediately returning in this iteration.
171 | del gaps[i_gap]
172 | return True
173 | return False
174 |
175 |
176 | def shift_n_bytes(src_file, dst_file, src_offset, dst_offset, length, bufsize=64 * 1024):
177 | if src_file == dst_file and src_offset < dst_offset:
178 | raise ValueError('This function can only shift left'
179 | ' because defragging is done towards the left')
180 |
181 | bytes_to_copy = length
182 | while bytes_to_copy > 0:
183 | src_file.seek(src_offset)
184 | data = src_file.read(min(bufsize, bytes_to_copy))
185 | if not data:
186 | raise ValueError('Unexpected EOF')
187 |
188 | dst_file.seek(dst_offset)
189 | dst_file.write(data)
190 |
191 | len_data = len(data)
192 | src_offset += len_data
193 | dst_offset += len_data
194 | bytes_to_copy -= len_data
195 |
196 |
197 | @dataclasses.dataclass
198 | class FragmentGap:
199 | shard: int
200 | offset: int
201 | size: int
202 |
203 | @classmethod
204 | def row_factory(cls, cursor, row):
205 | field_names = [d[0] for d in cursor.description]
206 | return cls(**dict(zip(field_names, row)))
207 |
--------------------------------------------------------------------------------
/src/barecat/sql/schema.sql:
--------------------------------------------------------------------------------
1 | -- Description: Schema for the barecat database
2 |
3 |
4 | --#################################### Tables
5 | CREATE TABLE files
6 | (
7 | path TEXT PRIMARY KEY NOT NULL,
8 | parent TEXT GENERATED ALWAYS AS ( -- Parent directory is computed automatically
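        -- rtrim(path, replace(path, '/', '')) strips the trailing basename (all
        -- trailing non-'/' characters); the outer rtrim then drops the trailing '/',
        -- e.g. 'a/b/c.jpg' -> 'a/b'.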
9 | rtrim(rtrim(path, replace(path, '/', '')), '/')
10 | ) VIRTUAL NOT NULL REFERENCES dirs (path) ON DELETE RESTRICT,
11 |
12 | shard INTEGER NOT NULL,
13 | offset INTEGER NOT NULL,
14 | size INTEGER DEFAULT 0,
15 | crc32c INTEGER DEFAULT NULL,
16 |
17 | mode INTEGER DEFAULT NULL,
18 | uid INTEGER DEFAULT NULL,
19 | gid INTEGER DEFAULT NULL,
20 | mtime_ns INTEGER DEFAULT NULL
21 | );
22 |
23 | CREATE TABLE dirs
24 | (
25 | path TEXT PRIMARY KEY,
26 | parent TEXT GENERATED ALWAYS AS (
27 | CASE
28 | WHEN path = '' THEN NULL
29 | ELSE rtrim(rtrim(path, replace(path, '/', '')), '/')
30 | END
31 | ) VIRTUAL REFERENCES dirs (path) ON DELETE RESTRICT,
32 |
33 | num_subdirs INTEGER DEFAULT 0, -- These are maintained by triggers
34 | num_files INTEGER DEFAULT 0,
35 | num_files_tree INTEGER DEFAULT 0,
36 | size_tree INTEGER DEFAULT 0,
37 |
38 | mode INTEGER DEFAULT NULL,
39 | uid INTEGER DEFAULT NULL,
40 | gid INTEGER DEFAULT NULL,
41 | mtime_ns INTEGER DEFAULT NULL
42 | );
43 |
44 | CREATE TABLE config -- For now, this table only holds the `shard_size_limit`
45 | (
46 | key TEXT PRIMARY KEY,
47 | value_text TEXT DEFAULT NULL,
48 | value_int INTEGER DEFAULT NULL
49 | ) WITHOUT ROWID;
50 |
51 | INSERT INTO config (key, value_int)
52 | VALUES ('use_triggers', 1),
53 | ('shard_size_limit', CAST(power(2, 63) - 1 AS INTEGER)),
54 | ('schema_version_major', 0),
55 | ('schema_version_minor', 2);
56 |
57 | -- Indexes
58 | CREATE INDEX idx_files_parent ON files (parent);
59 | CREATE INDEX idx_dirs_parent ON dirs (parent);
60 | CREATE INDEX idx_files_shard_offset ON files (shard, offset);
61 |
62 | --#################################### Triggers
63 | -- The idea is: we propagate changes up the tree with triggers, as this is cumbersome to do in
64 | -- the Python code. There is no propagation downwards (for example when moving a dir, we do not
65 | -- update all the children with triggers). This is because the Python code can do this
66 | -- quite easily. Furthermore, if we did it with triggers, the chain would start upward again
67 | -- with a circular mess. So we only propagate upwards the tree.
68 | -- We propagate two kinds of things:
69 | -- 1) statistics: direct and aggregate file count and aggregate size
70 | -- 2) modification time of the parent directory
71 | -- We don't update the modification time of the entity being inserted or modified,
72 | -- this can be simply done in the Python code. If the app doesn't supply mtime, presumably it
73 | -- doesn't care about it, so the overhead of triggering it makes no sense.
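-- Note: the expression CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
-- used in the triggers below converts the current time to Unix nanoseconds;
-- 2440587.5 is the Julian day number of the Unix epoch (1970-01-01 00:00 UTC).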
74 |
75 | ---- Files: add, del, move, resize
76 | CREATE TRIGGER add_file -- Upsert the parent when adding a file
77 | AFTER INSERT
78 | ON files
79 | WHEN (SELECT value_int
80 | FROM config
81 | WHERE key = 'use_triggers') = 1
82 | BEGIN
83 | -- Add the parent directory if it doesn't exist
84 | INSERT INTO dirs (path, num_files, num_files_tree, size_tree, mtime_ns)
85 | VALUES (NEW.parent, 1, 1, NEW.size,
86 | CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER))
87 | -- If the parent directory already exists, update it
88 | ON CONFLICT(path) DO UPDATE
89 | SET num_files = num_files + 1,
90 | num_files_tree = num_files_tree + 1,
91 | size_tree = size_tree + excluded.size_tree,
92 | mtime_ns = excluded.mtime_ns;
93 | END;
94 |
95 | CREATE TRIGGER del_file -- Update the parent when deleting a file
96 | AFTER DELETE
97 | ON files
98 | WHEN (SELECT value_int
99 | FROM config
100 | WHERE key = 'use_triggers') = 1
101 | BEGIN
102 | UPDATE dirs
103 | SET num_files = num_files - 1,
104 | num_files_tree = num_files_tree - 1,
105 | size_tree = size_tree - OLD.size,
106 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
107 | WHERE path = OLD.parent;
108 | END;
109 |
110 | CREATE TRIGGER move_file -- Update both parents when moving a file
111 | AFTER UPDATE OF path
112 | ON files
113 | WHEN NEW.parent != OLD.parent
114 | AND (SELECT value_int
115 | FROM config
116 | WHERE key = 'use_triggers') = 1
117 | BEGIN
118 | UPDATE dirs
119 | SET num_files = num_files + 1,
120 | num_files_tree = num_files_tree + 1,
121 | size_tree = size_tree + NEW.size,
122 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
123 | WHERE path = NEW.parent;
124 | UPDATE dirs
125 | SET num_files = num_files - 1,
126 | num_files_tree = num_files_tree - 1,
127 | size_tree = size_tree - OLD.size,
128 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
129 | WHERE path = OLD.parent;
130 | END;
131 |
132 | CREATE TRIGGER resize_file -- When file size changes
133 | AFTER UPDATE OF size
134 | ON files
135 | WHEN NEW.parent == OLD.parent -- and the file was not moved
136 | AND (SELECT value_int
137 | FROM config
138 | WHERE key = 'use_triggers') = 1
139 | BEGIN
140 | UPDATE dirs
141 | SET size_tree = size_tree + NEW.size - OLD.size
142 | WHERE path = OLD.parent;
143 | END;
144 |
145 | ---- Directories: add, del, move, resize
146 | CREATE TRIGGER add_subdir -- Upsert the parent when adding a directory
147 | AFTER INSERT
148 | ON dirs
149 | WHEN (SELECT value_int
150 | FROM config
151 | WHERE key = 'use_triggers') = 1
152 | BEGIN
153 | INSERT INTO dirs (path, num_subdirs, size_tree, num_files_tree, mtime_ns)
154 | VALUES (NEW.parent, 1, NEW.size_tree, NEW.num_files_tree,
155 | CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER))
156 | ON CONFLICT(path) DO UPDATE
157 | SET num_subdirs = num_subdirs + 1,
158 | size_tree = size_tree + excluded.size_tree,
159 | num_files_tree = num_files_tree + excluded.num_files_tree,
160 | mtime_ns= excluded.mtime_ns;
161 | END;
162 |
163 | CREATE TRIGGER del_subdir -- Update the parent when deleting a directory
164 | AFTER DELETE
165 | ON dirs
166 | WHEN (SELECT value_int
167 | FROM config
168 | WHERE key = 'use_triggers') = 1
169 | BEGIN
170 | UPDATE dirs
171 | SET num_subdirs = num_subdirs - 1,
172 | num_files = num_files - OLD.num_files,
173 | size_tree = size_tree - OLD.size_tree,
174 | num_files_tree = num_files_tree - OLD.num_files_tree,
175 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
176 | WHERE path = OLD.parent;
177 | END;
178 |
179 | CREATE TRIGGER move_subdir -- Update both parents when moving a directory
180 | AFTER UPDATE OF path
181 | ON dirs
182 | WHEN NEW.parent != OLD.parent
183 | AND (SELECT value_int
184 | FROM config
185 | WHERE key = 'use_triggers') = 1
186 | BEGIN
187 | UPDATE dirs
188 | SET num_subdirs = num_subdirs - 1,
189 | num_files = num_files - OLD.num_files,
190 | size_tree = size_tree - OLD.size_tree,
191 | num_files_tree = num_files_tree - OLD.num_files_tree,
192 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
193 | WHERE path = OLD.parent;
194 | UPDATE dirs
195 | SET num_subdirs = num_subdirs + 1,
196 | num_files = num_files + NEW.num_files,
197 | size_tree = size_tree + NEW.size_tree,
198 | num_files_tree = num_files_tree + NEW.num_files_tree,
199 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
200 | WHERE path = NEW.parent;
201 | END;
202 |
203 |
204 | CREATE TRIGGER resize_dir -- Update the parent when a directory changes size
205 | AFTER UPDATE OF size_tree, num_files_tree
206 | ON dirs
207 | WHEN NEW.parent = OLD.parent AND
208 | (NEW.size_tree != OLD.size_tree OR NEW.num_files_tree != OLD.num_files_tree)
209 | AND (SELECT value_int
210 | FROM config
211 | WHERE key = 'use_triggers') = 1
212 | BEGIN
213 | UPDATE dirs
214 | SET size_tree = size_tree + (NEW.size_tree - OLD.size_tree),
215 | num_files_tree = num_files_tree + (NEW.num_files_tree - OLD.num_files_tree)
216 | WHERE path = OLD.parent;
217 | END;
--------------------------------------------------------------------------------
/src/barecat/util.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import glob
3 | import itertools
4 | import os
5 | import os.path as osp
6 | import shutil
7 | from datetime import datetime
8 |
9 | import crc32c as crc32c_lib
10 |
11 |
12 | def read_file(input_path, mode='r'):
13 | with open(input_path, mode) as f:
14 | return f.read()
15 |
16 |
17 | def remove(path):
18 | index_path = f'{path}-sqlite-index'
19 | shard_paths = glob.glob(f'{path}-shard-?????')
20 |     for p in [index_path] + shard_paths:
21 |         os.remove(p)
22 |
23 |
24 | def exists(path):
25 | index_path = f'{path}-sqlite-index'
26 | shard_paths = glob.glob(f'{path}-shard-?????')
27 | return osp.exists(index_path) or len(shard_paths) > 0
28 |
29 |
30 | # From `more-itertools` package.
31 | def chunked(iterable, n, strict=False):
32 | """Break *iterable* into lists of length *n*:
33 |
34 | >>> list(chunked([1, 2, 3, 4, 5, 6], 3))
35 | [[1, 2, 3], [4, 5, 6]]
36 |
37 |     By default, the last yielded list will have fewer than *n* elements
38 | if the length of *iterable* is not divisible by *n*:
39 |
40 | >>> list(chunked([1, 2, 3, 4, 5, 6, 7, 8], 3))
41 | [[1, 2, 3], [4, 5, 6], [7, 8]]
42 |
43 | To use a fill-in value instead, see the :func:`grouper` recipe.
44 |
45 | If the length of *iterable* is not divisible by *n* and *strict* is
46 | ``True``, then ``ValueError`` will be raised before the last
47 | list is yielded.
48 |
49 | """
50 | iterator = iter(functools.partial(take, n, iter(iterable)), [])
51 | if strict:
52 | if n is None:
53 | raise ValueError('n must not be None when using strict mode.')
54 |
55 | def ret():
56 | for chunk in iterator:
57 | if len(chunk) != n:
58 | raise ValueError('iterable is not divisible by n.')
59 | yield chunk
60 |
61 | return iter(ret())
62 | else:
63 | return iterator
64 |
65 |
66 | def take(n, iterable):
67 | """Return first *n* items of the iterable as a list.
68 |
69 | >>> take(3, range(10))
70 | [0, 1, 2]
71 |
72 | If there are fewer than *n* items in the iterable, all of them are
73 | returned.
74 |
75 | >>> take(10, range(3))
76 | [0, 1, 2]
77 |
78 | """
79 | return list(itertools.islice(iterable, n))
80 |
81 |
82 | def copy_n_bytes(src_file, dest_file, n=None, bufsize=64 * 1024):
83 | if n is None:
84 | return shutil.copyfileobj(src_file, dest_file, bufsize)
85 |
86 | bytes_to_copy = n
87 | while bytes_to_copy > 0:
88 | data = src_file.read(min(bufsize, bytes_to_copy))
89 | if not data:
90 | raise ValueError('Unexpected EOF')
91 |
92 | dest_file.write(data)
93 | bytes_to_copy -= len(data)
94 |
95 |
96 | def normalize_path(path):
97 | x = osp.normpath(path).removeprefix('/')
98 | return '' if x == '.' else x
99 |
100 |
101 | def get_parent(path):
102 | if path == '':
103 | # root already, has no parent
104 | return b'\x00'
105 |
106 | partition = path.rpartition('/')
107 | return partition[0]
108 |
109 |
110 | def partition_path(path):
111 | if path == '':
112 | # root already, has no parent
113 | return b'\x00', path
114 |
115 | parts = path.rpartition('/')
116 | return parts[0], parts[2]
117 |
118 |
119 | def get_ancestors(path):
120 | yield ''
121 | for i in range(len(path)):
122 | if path[i] == '/':
123 | yield path[:i]
124 |
125 |
126 | def reopen(file, mode):
127 | if file.mode == mode:
128 | return file
129 | file.close()
130 | return open_(file.name, mode)
131 |
132 |
133 | def fileobj_crc32c_until_end(fileobj, bufsize=64 * 1024):
134 | crc32c = 0
135 | while chunk := fileobj.read(bufsize):
136 | crc32c = crc32c_lib.crc32c(chunk, crc32c)
137 | return crc32c
138 |
139 |
140 | def fileobj_crc32c(fileobj, size=-1, bufsize=64 * 1024):
141 | if size == -1 or size is None:
142 | return fileobj_crc32c_until_end(fileobj, bufsize)
143 |
144 | crc32c = 0
145 | n_full_bufs, remainder = divmod(size, bufsize)
146 |
147 | for _ in range(n_full_bufs):
148 | data = fileobj.read(bufsize)
149 | if len(data) != bufsize:
150 | raise ValueError('Unexpected EOF')
151 | crc32c = crc32c_lib.crc32c(data, crc32c)
152 |
153 | if remainder:
154 | data = fileobj.read(remainder)
155 | if len(data) != remainder:
156 | raise ValueError('Unexpected EOF')
157 | crc32c = crc32c_lib.crc32c(data, crc32c)
158 |
159 | return crc32c
160 |
161 |
162 | def copyfileobj_crc32c_until_end(src_file, dst_file, bufsize=64 * 1024):
163 | crc32c = 0
164 | size = 0
165 | while chunk := src_file.read(bufsize):
166 | dst_file.write(chunk)
167 | crc32c = crc32c_lib.crc32c(chunk, crc32c)
168 | size += len(chunk)
169 | return size, crc32c
170 |
171 |
172 | def copyfileobj_crc32c(src_file, dst_file, size=None, bufsize=64 * 1024):
173 | if size is None:
174 | return copyfileobj_crc32c_until_end(src_file, dst_file, bufsize)
175 |
176 | crc32c = 0
177 | n_bytes_transferred = 0
178 | n_full_bufs, remainder = divmod(size, bufsize)
179 |
180 | for _ in range(n_full_bufs):
181 | data = src_file.read(bufsize)
182 | if len(data) != bufsize:
183 | raise ValueError('Unexpected EOF')
184 |
185 | crc32c = crc32c_lib.crc32c(data, crc32c)
186 | n_written = dst_file.write(data)
187 | if n_written != len(data):
188 | raise ValueError('Unexpected write problem')
189 |
190 | n_bytes_transferred += n_written
191 |
192 | if remainder:
193 | data = src_file.read(remainder)
194 | if len(data) != remainder:
195 | raise ValueError('Unexpected EOF')
196 |
197 | crc32c = crc32c_lib.crc32c(data, crc32c)
198 | n_written = dst_file.write(data)
199 | if n_written != len(data):
200 | raise ValueError('Unexpected write problem')
201 |
202 | n_bytes_transferred += n_written
203 |
204 | return n_bytes_transferred, crc32c
205 |
206 |
207 | def copyfileobj(src_file, dst_file, size=None, bufsize=64 * 1024):
208 | if size is None:
209 | return shutil.copyfileobj(src_file, dst_file, bufsize)
210 |
211 | n_bytes_transferred = 0
212 | nreads, remainder = divmod(size, bufsize)
213 |
214 | for _ in range(nreads):
215 | data = src_file.read(bufsize)
216 | dst_file.write(data)
217 | n_bytes_transferred += len(data)
218 |
219 | if remainder:
220 | data = src_file.read(remainder)
221 | dst_file.write(data)
222 | n_bytes_transferred += len(data)
223 |
224 | return n_bytes_transferred
225 |
226 |
227 | def write_zeroes(file, n, bufsize=64 * 1024):
228 | n_written = 0
229 | if n >= bufsize:
230 | zeroes = bytearray(bufsize)
231 | while n >= bufsize:
232 | n_written += file.write(zeroes)
233 | n -= bufsize
234 | n_written += file.write(bytearray(n))
235 | return n_written
236 |
237 |
238 | def raise_if_readonly(method):
239 | @functools.wraps(method)
240 | def wrapper(self, *args, **kwargs):
241 | if self.readonly:
242 | raise PermissionError('This function is not allowed in readonly mode')
243 | return method(self, *args, **kwargs)
244 |
245 | return wrapper
246 |
247 |
248 | def raise_if_append_only(method):
249 | @functools.wraps(method)
250 | def wrapper(self, *args, **kwargs):
251 | if self.append_only:
252 | raise PermissionError('This function is not allowed in append-only mode')
253 | return method(self, *args, **kwargs)
254 |
255 | return wrapper
256 |
257 |
258 | def raise_if_readonly_or_append_only(method):
259 | @functools.wraps(method)
260 | def wrapper(self, *args, **kwargs):
261 | if self.readonly or self.append_only:
262 | raise PermissionError('This function is not allowed in readonly or append-only mode')
263 | return method(self, *args, **kwargs)
264 |
265 | return wrapper
266 |
267 |
268 | def parse_size(size):
269 | if size is None:
270 | return None
271 | units = dict(K=1024, M=1024**2, G=1024**3, T=1024**4)
272 | size = size.upper()
273 |
274 | for unit, factor in units.items():
275 | if unit in size:
276 | return int(float(size.replace(unit, '')) * factor)
277 |
278 | return int(size)
279 |
280 |
281 | def open_(path, mode, *args, **kwargs):
282 | # This is like open() but supports an additional mode 'ax+b' which is like
283 | # 'x+b' in that it fails if the file already exists, and creates it if it doesn't,
284 | # but it also opens the file in append mode, like 'a+b'
285 |
286 | if sorted(mode) == sorted('ax+b'):
287 | fd = os.open(path, os.O_APPEND | os.O_CREAT | os.O_EXCL | os.O_RDWR)
288 | return os.fdopen(fd, 'a+b', *args, **kwargs)
289 | return open(path, mode, *args, **kwargs)
290 |
291 |
292 | def datetime_to_ns(dt):
293 | return int(dt.timestamp() * 1e9)
294 |
295 |
296 | def ns_to_datetime(ns):
297 | return datetime.fromtimestamp(ns / 1e9)
298 |
--------------------------------------------------------------------------------
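A quick usage sketch for the helpers above (hedged: it relies only on the functions shown in util.py, imported via `barecat.util` as the rest of the codebase does, and the literal values are illustrative):

import io

from barecat.util import copy_n_bytes, fileobj_crc32c, parse_size

# parse_size accepts plain byte counts or K/M/G/T suffixes (powers of 1024)
assert parse_size('512') == 512
assert parse_size('1.5K') == 1536
assert parse_size('2G') == 2 * 1024**3

# copy_n_bytes copies exactly n bytes and raises ValueError on premature EOF
src, dst = io.BytesIO(b'abcdef'), io.BytesIO()
copy_n_bytes(src, dst, n=4)
assert dst.getvalue() == b'abcd'

# fileobj_crc32c checksums exactly `size` bytes from the current position
crc = fileobj_crc32c(io.BytesIO(b'hello world'), size=5)  # checksum of b'hello'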
/src/barecat/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import csv
3 | import pickle
4 | import sys
5 |
6 | import barecat
7 | import barecat.cli_impl as impl
8 | from barecat.common import Order
9 | from barecat.defrag import BarecatDefragger
10 | from barecat.util import parse_size
11 |
12 |
13 | def create():
14 | parser = argparse.ArgumentParser(
15 | description='Concatenate files to sharded blobs and create an sqlite index.'
16 | )
17 | parser.add_argument('--file', type=str, help='target path', required=True)
18 | parser.add_argument(
19 | '--null',
20 | action='store_true',
21 | help='read input paths from stdin, separated by null bytes as output by '
22 | 'the find command with the -print0 option (otherwise newlines are '
23 | 'interpreted as delimiters)',
24 | )
25 | parser.add_argument('--workers', type=int, default=None)
26 | parser.add_argument(
27 | '--shard-size-limit',
28 | type=str,
29 | default=None,
30 | help='maximum size of a shard in bytes (if not specified, '
31 | 'all files will be concatenated into a single shard)',
32 | )
33 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
34 |
35 | args = parser.parse_args()
36 | impl.create_from_stdin_paths(
37 | target_path=args.file,
38 | shard_size_limit=parse_size(args.shard_size_limit),
39 | zero_terminated=args.null,
40 | overwrite=args.overwrite,
41 | workers=args.workers,
42 | )
43 |
44 |
45 | def create_recursive():
46 | # Arguments are --file, --shard-size-limit, --workers and --overwrite; the positional
47 | # arguments are the paths to pack. If a single positional path is supplied, the
48 | # --strip-root flag may additionally be used to strip that root from the stored paths.
49 | parser = argparse.ArgumentParser(
50 | description='Concatenate files to sharded blobs and create an sqlite index.'
51 | )
52 | parser.add_argument('--file', type=str, help='target path', required=True)
53 | parser.add_argument('--workers', type=int, default=None)
54 | parser.add_argument(
55 | '--shard-size-limit',
56 | type=str,
57 | default=None,
58 | help='maximum size of a shard in bytes (if not specified, '
59 | 'all files will be concatenated into a single shard)',
60 | )
61 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
62 | parser.add_argument('paths', type=str, nargs='+', help='paths to pack')
63 | parser.add_argument(
64 | '--strip-root',
65 | action='store_true',
66 | help='strip the root from the paths (only applicable if a single path is provided)',
67 | )
68 |
69 | args = parser.parse_args()
70 | impl.create_recursive(
71 | target_path=args.file,
72 | shard_size_limit=parse_size(args.shard_size_limit),
73 | roots=args.paths,
74 | overwrite=args.overwrite,
75 | workers=args.workers,
76 | strip_root=args.strip_root,
77 | )
78 |
79 |
80 | def extract():
81 | parser = argparse.ArgumentParser(description='Extract files from a barecat archive.')
82 | parser.add_argument('--file', type=str, help='path to the archive file')
83 | parser.add_argument('--target-directory', type=str, help='path to the target directory')
84 | args = parser.parse_args()
85 | impl.extract(args.file, args.target_directory)
86 |
87 |
88 | def extract_single():
89 | parser = argparse.ArgumentParser(description='Extract a single file from a barecat archive.')
90 | parser.add_argument('--barecat-file', type=str, help='path to the archive file')
91 | parser.add_argument('--path', type=str, help='path to the file to extract, within the archive')
92 | args = parser.parse_args()
93 | with barecat.Barecat(args.barecat_file) as reader:
94 | sys.stdout.buffer.write(reader[args.path])
95 |
96 |
97 | def index_to_csv():
98 | parser = argparse.ArgumentParser(description='Dump the index contents as csv')
99 | parser.add_argument('file', type=str, help='path to the index file')
100 | args = parser.parse_args()
101 |
102 | writer = csv.writer(sys.stdout, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
103 | writer.writerow(['path', 'shard', 'offset', 'size', 'crc32c'])
104 | with barecat.Index(args.file) as index:
105 | for f in index.iter_all_fileinfos(order=Order.PATH):
106 | writer.writerow([f.path, f.shard, f.offset, f.size, f.crc32c])
107 |
108 |
109 | def index_to_pickledict():
110 | parser = argparse.ArgumentParser(description='Dump the index contents as a pickled dictionary')
111 | parser.add_argument('file', type=str, help='path to the index file')
112 | parser.add_argument('outfile', type=str, help='path to the result file')
113 | args = parser.parse_args()
114 |
115 | with barecat.Index(args.file) as index_reader:
116 | dicti = dict(index_reader.items())
117 |
118 | with open(args.outfile, 'xb') as outfile:
119 | pickle.dump(dicti, outfile)
120 |
121 |
122 | def merge():
123 | parser = argparse.ArgumentParser(description='Merge existing Barecat archives into one.')
124 | parser.add_argument(
125 | 'input_paths', metavar='N', type=str, nargs='+', help='paths to the archives to merge'
126 | )
127 | parser.add_argument('--output', required=True, help='output path')
128 | parser.add_argument(
129 | '--shard-size-limit',
130 | type=str,
131 | default=None,
132 | help='maximum size of a shard in bytes (if not specified, '
133 | 'all files will be concatenated into a single shard)',
134 | )
135 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
136 | parser.add_argument(
137 | '--ignore-duplicates',
138 | action='store_true',
139 | help='if a later file has the same path as an earlier one, skip it'
140 | ' instead of raising an error',
141 | )
142 |
143 | args = parser.parse_args()
144 | impl.merge(
145 | source_paths=args.input_paths,
146 | target_path=args.output,
147 | shard_size_limit=parse_size(args.shard_size_limit),
148 | overwrite=args.overwrite,
149 | ignore_duplicates=args.ignore_duplicates,
150 | )
151 |
152 |
153 | def merge_symlink():
154 | parser = argparse.ArgumentParser(description='Merge existing Barecat archives into one.')
155 | parser.add_argument(
156 | 'input_paths', metavar='N', type=str, nargs='+', help='paths to the archives to merge'
157 | )
158 | parser.add_argument('--output', required=True, help='output path')
159 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
160 | parser.add_argument(
161 | '--ignore-duplicates',
162 | action='store_true',
163 | help='if a later file has the same path as an earlier one, skip it'
164 | ' instead of raising an error',
165 | )
166 |
167 | args = parser.parse_args()
168 | impl.merge_symlink(
169 | source_paths=args.input_paths,
170 | target_path=args.output,
171 | overwrite=args.overwrite,
172 | ignore_duplicates=args.ignore_duplicates,
173 | )
174 |
175 |
176 | def verify_integrity():
177 | parser = argparse.ArgumentParser(
178 | description='Verify the integrity of a Barecat archive, including CRC32C checksums, '
179 | 'directory stats and the absence of gaps between stored files.'
180 | )
181 | parser.add_argument('file', type=str, help='path to the index file')
182 | parser.add_argument(
183 | '--quick', action='store_true', help='CRC32C is only verified on the last file'
184 | )
185 | args = parser.parse_args()
186 |
187 | with barecat.Barecat(args.file) as bc:
188 | if not bc.verify_integrity(quick=args.quick):
189 | print('Integrity errors were found.')
190 | sys.exit(1)
191 |
192 |
193 | def defrag():
194 | parser = argparse.ArgumentParser(
195 | description='Defragment a Barecat archive to remove gaps left by deleted files.'
196 | )
197 | parser.add_argument('file', type=str, help='path to the index file')
198 | parser.add_argument(
199 | '--quick',
200 | action='store_true',
201 | help='faster but less thorough attempt at defrag, using the best-fit '
202 | 'algorithm to move the last files into gaps.',
203 | )
204 |
205 | args = parser.parse_args()
206 | with barecat.Barecat(args.file, readonly=False, append_only=False) as bc:
207 | defragger = BarecatDefragger(bc)
208 | if defragger.needs_defrag():
209 | if args.quick:
210 | defragger.defrag_quick()
211 | else:
212 | defragger.defrag()
213 |
214 |
215 | def archive2barecat():
216 | parser = argparse.ArgumentParser(
217 | description='Convert a tar or zip archive to a Barecat archive.'
218 | )
219 | # 2 positional args are the tar file and the target barecat file
220 | parser.add_argument('archive_file', type=str, help='path to the tar or zip file')
221 | parser.add_argument('barecat_file', type=str, help='path to the target barecat file')
222 |
223 | parser.add_argument(
224 | '--shard-size-limit',
225 | type=str,
226 | default=None,
227 | help='maximum size of a shard in bytes (if not specified, '
228 | 'all files will be concatenated into a single shard)',
229 | )
230 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
231 | args = parser.parse_args()
232 | impl.archive2barecat(
233 | src_path=args.archive_file,
234 | target_path=args.barecat_file,
235 | shard_size_limit=parse_size(args.shard_size_limit),
236 | overwrite=args.overwrite,
237 | )
238 |
239 |
240 | def barecat2archive():
241 | parser = argparse.ArgumentParser(
242 | description='Convert a Barecat archive to a tar or zip archive.'
243 | )
244 | # 2 positional args are the barecat file and the target tar file
245 | parser.add_argument('barecat_file', type=str, help='path to the barecat file')
246 | parser.add_argument('archive_file', type=str, help='path to the target archive file')
247 |
248 | args = parser.parse_args()
249 | impl.barecat2archive(src_path=args.barecat_file, target_path=args.archive_file)
250 |
251 |
252 |
253 |
254 | def print_ncdu_json():
255 | parser = argparse.ArgumentParser(
256 | description='Print the contents of a Barecat as JSON in the format expected by ncdu.'
257 | )
258 | parser.add_argument('file', type=str, help='path to the index file')
259 | args = parser.parse_args()
260 | impl.print_ncdu_json(args.file)
261 |
--------------------------------------------------------------------------------
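The entry points above are thin argparse wrappers around the library API. A minimal programmatic sketch of the same operations (hedged: it uses only the `barecat.Barecat` and `barecat.Index` calls visible in this file, and both the archive path and the stored path 'images/0001.jpg' are hypothetical):

import barecat
from barecat.common import Order

# equivalent of extract_single: look up one member's bytes by stored path
with barecat.Barecat('archive.barecat') as reader:
    data = reader['images/0001.jpg']

# equivalent of index_to_csv: walk the index in path order
with barecat.Index('archive.barecat-sqlite-index') as index:
    for finfo in index.iter_all_fileinfos(order=Order.PATH):
        print(finfo.path, finfo.shard, finfo.offset, finfo.size, finfo.crc32c)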
/src/barecat/cli_impl.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import itertools
3 | import json
4 | import os
5 | import os.path as osp
6 | import shutil
7 | import stat
8 | import sys
9 | import time
10 |
11 | import barecat.util
12 | from barecat.archive_formats import (
13 | get_archive_writer,
14 | iter_archive,
15 | iter_archive_nocontent,
16 | TarWriter,
17 | )
18 | from barecat.consumed_threadpool import ConsumedThreadPool
19 | from barecat.core import barecat as barecat_
20 | from barecat.core.index import BarecatDirInfo, BarecatFileInfo, Order
21 | from barecat.core.sharder import Sharder
22 | from barecat.progbar import progressbar
23 |
24 |
25 | def create_from_stdin_paths(
26 | target_path, shard_size_limit, zero_terminated=False, overwrite=False, workers=None
27 | ):
28 | iterator = generate_from_stdin(zero_terminated)
29 | create(iterator, target_path, shard_size_limit, overwrite, workers)
30 |
31 |
32 | def create_recursive(target_path, shard_size_limit, roots, overwrite, strip_root, workers=None):
33 | iterator = generate_from_walks(roots, strip_root)
34 | create(iterator, target_path, shard_size_limit, overwrite, workers)
35 |
36 |
37 | def generate_from_stdin(zero_terminated=False):
38 | if zero_terminated:
39 | input_paths = iterate_zero_terminated(sys.stdin.buffer)
40 | else:
41 | input_paths = (line.rstrip('\n') for line in sys.stdin)
42 |
43 | for input_path in progressbar(input_paths, desc='Packing files', unit=' files'):
44 | yield input_path, input_path
45 |
46 |
47 | def generate_from_walks(roots, strip_root):
48 | for root in roots:
49 | if not strip_root:
50 | yield root, osp.basename(root)
51 |
52 | for dirpath, subdirnames, filenames in os.walk(root):
53 | for entryname in itertools.chain(filenames, subdirnames):
54 | full_path = osp.join(dirpath, entryname)
55 | relpath = osp.relpath(full_path, start=root)
56 | if not strip_root:
57 | store_path = osp.join(osp.basename(root), relpath)
58 | else:
59 | store_path = relpath
60 | yield full_path, store_path
61 |
62 |
63 | def create(
64 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False, workers=8
65 | ):
66 | if workers is None:
67 | create_without_workers(
68 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite
69 | )
70 | else:
71 | create_with_workers(
72 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite, workers
73 | )
74 |
75 |
76 | def create_without_workers(
77 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False
78 | ):
79 | with barecat_.Barecat(
80 | target_path,
81 | shard_size_limit=shard_size_limit,
82 | readonly=False,
83 | overwrite=overwrite,
84 | append_only=False,
85 | ) as writer:
86 | for filesys_path, store_path in filesys_and_store_path_pairs:
87 | writer.add_by_path(filesys_path, store_path)
88 |
89 |
90 | def create_with_workers(
91 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False, workers=8
92 | ):
93 | if overwrite and barecat.util.exists(target_path):
94 | barecat.util.remove(target_path)
95 |
96 | with (
97 | Sharder(
98 | target_path,
99 | shard_size_limit=shard_size_limit,
100 | readonly=False,
101 | append_only=False,
102 | threadsafe=True,
103 | allow_writing_symlinked_shard=False,
104 | ) as sharder,
105 | ConsumedThreadPool(
106 | index_writer_main, main_args=(f'{target_path}-sqlite-index',), max_workers=workers
107 | ) as ctp,
108 | ):
109 | for filesys_path, store_path in filesys_and_store_path_pairs:
110 | statresult = os.stat(filesys_path)
111 |
112 | if stat.S_ISDIR(statresult.st_mode):
113 | dinfo = BarecatDirInfo(path=store_path)
114 | dinfo.fill_from_statresult(statresult)
115 | ctp.submit(userdata=dinfo)
116 | else:
117 | finfo = BarecatFileInfo(path=store_path)
118 | finfo.fill_from_statresult(statresult)
119 | finfo.shard, finfo.offset = sharder.reserve(finfo.size)
120 | ctp.submit(
121 | sharder.add_by_path,
122 | userdata=finfo,
123 | args=(filesys_path, finfo.shard, finfo.offset, finfo.size),
124 | kwargs=dict(raise_if_cannot_fit=True),
125 | )
126 |
127 |
128 | def index_writer_main(target_path, future_iter):
129 | with barecat_.Index(target_path, readonly=False) as index_writer:
130 | for future in future_iter:
131 | info = future.userdata
132 | if isinstance(info, BarecatDirInfo):
133 | index_writer.add_dir(info)
134 | continue
135 |
136 | shard_real, offset_real, size_real, crc32c = future.result()
137 | info.shard = shard_real
138 | info.offset = offset_real
139 | info.crc32c = crc32c
140 |
141 | if info.size != size_real:
142 | raise ValueError('Size mismatch!')
143 | index_writer.add_file(info)
144 |
145 |
146 | def extract(barecat_path, target_directory):
147 | with barecat_.Barecat(barecat_path) as reader:
148 | for path_in_archive in progressbar(reader, desc='Extracting files', unit=' files'):
149 | target_path = osp.join(target_directory, path_in_archive)
150 | os.makedirs(osp.dirname(target_path), exist_ok=True)
151 | with open(target_path, 'wb') as output_file:
152 | shutil.copyfileobj(reader.open(path_in_archive), output_file)
153 |
154 |
155 | def merge(source_paths, target_path, shard_size_limit, overwrite=False, ignore_duplicates=False):
156 | with barecat_.Barecat(
157 | target_path, shard_size_limit=shard_size_limit, readonly=False, overwrite=overwrite
158 | ) as writer:
159 | for source_path in source_paths:
160 | print(f'Merging files from {source_path}')
161 | writer.merge_from_other_barecat(source_path, ignore_duplicates=ignore_duplicates)
162 |
163 |
164 | def merge_symlink(source_paths, target_path, overwrite=False, ignore_duplicates=False):
165 | index_path = f'{target_path}-sqlite-index'
166 | if overwrite and osp.exists(index_path):
167 | os.remove(index_path)
168 |
169 | with barecat_.Index(index_path, readonly=False) as index_writer:
170 | c = index_writer.cursor
171 | c.execute("COMMIT")
172 | c.execute('PRAGMA synchronous=OFF')
173 | c.execute('PRAGMA journal_mode=OFF')
174 |
175 | i_out_shard = 0
176 | for source_path in source_paths:
177 | index_writer.merge_from_other_barecat(
178 | f'{source_path}-sqlite-index', ignore_duplicates=ignore_duplicates
179 | )
180 | for shard_path in sorted(glob.glob(f'{source_path}-shard-*')):
181 | os.symlink(
182 | osp.relpath(shard_path, start=osp.dirname(target_path)),
183 | f'{target_path}-shard-{i_out_shard:05d}',
184 | )
185 | i_out_shard += 1
186 |
187 |
188 | def write_index(dictionary, target_path):
189 | with barecat_.Index(target_path, readonly=False) as index_writer:
190 | for path, (shard, offset, size) in dictionary.items():
191 | index_writer.add_file(
192 | BarecatFileInfo(path=path, shard=shard, offset=offset, size=size)
193 | )
194 |
195 |
196 | def read_index(path):
197 | with barecat_.Index(path) as reader:
198 | return dict(reader.items())
199 |
200 |
201 | def iterate_zero_terminated(fileobj):
202 | partial_path = b''
203 | while chunk := fileobj.read(4096):
204 | parts = chunk.split(b'\x00')
205 | parts[0] = partial_path + parts[0]
206 | partial_path = parts.pop()
207 |
208 | for input_path in parts:
209 | input_path = input_path.decode()
210 | yield input_path
211 |
212 | # a final path without a trailing NUL would otherwise be dropped silently
213 | if partial_path:
214 | yield partial_path.decode()
211 |
212 |
213 | def archive2barecat(src_path, target_path, shard_size_limit, overwrite=False):
214 | with barecat_.Barecat(
215 | target_path, shard_size_limit=shard_size_limit, readonly=False, overwrite=overwrite
216 | ) as writer:
217 | for file_or_dir_info, fileobj in iter_archive(src_path):
218 | writer.add(file_or_dir_info, fileobj=fileobj, dir_exist_ok=True)
219 |
220 |
221 | def wrap_archive(src_path, target_path, overwrite=False):
222 | index_path = f'{target_path}-sqlite-index'
223 | if overwrite and osp.exists(index_path):
224 | os.remove(index_path)
225 |
226 | with barecat_.Index(index_path, readonly=False) as index:
227 | for file_or_dir_info in iter_archive_nocontent(src_path):
228 | index.add(file_or_dir_info)
229 |
230 | os.symlink(src_path, f'{target_path}-shard-00000')
231 |
232 |
233 | def barecat2archive(src_path, target_path):
234 | with barecat_.Barecat(src_path, readonly=True) as bc:
235 | with get_archive_writer(target_path) as target_archive:
236 | infos = bc.index.iter_all_infos(order=Order.PATH)
237 | num_total = bc.index.num_files + bc.index.num_dirs
238 | for entry in progressbar(infos, total=num_total, desc='Writing', unit=' entries'):
239 | if isinstance(entry, BarecatDirInfo):
240 | target_archive.add(entry)
241 | else:
242 | with bc.open(entry.path) as file_in_barecat:
243 | target_archive.add(entry, fileobj=file_in_barecat)
244 |
245 |
246 | def print_ncdu_json(path):
247 | timestamp = time.time()
248 | import importlib.metadata
249 |
250 | progver = importlib.metadata.version('barecat')
251 | progver = '.'.join(progver.split('.')[:3])
252 |
253 | print(f'[1,1,{{"progname":"barecat","progver":"{progver}","timestamp":{timestamp}}},')
254 | with barecat_.Index(path) as index_reader:
255 | _print_ncdu_json(index_reader, '')
256 | print(']')
257 |
258 |
259 | def _print_ncdu_json(index_reader, dirpath):
260 | basename = '/' if dirpath == '' else osp.basename(dirpath)
261 |
262 | print('[', json.dumps(dict(name=basename, asize=4096, ino=0)), end='')
263 | infos = index_reader.listdir_infos(dirpath)
264 | file_infos = [f for f in infos if isinstance(f, BarecatFileInfo)]
265 | subdir_infos = [d for d in infos if isinstance(d, BarecatDirInfo)]
266 | del infos
267 |
268 | if file_infos:
269 | filedump = json.dumps(
270 | [dict(name=osp.basename(fi.path), asize=fi.size, dsize=fi.size, ino=0) for fi in file_infos]
271 | )
272 | print(',', filedump[1:-1], end='')
273 | del file_infos
274 |
275 | for subdir in subdir_infos:
276 | print(',')
277 | _print_ncdu_json(index_reader, subdir.path)
278 |
279 | print(']', end='')
280 |
--------------------------------------------------------------------------------
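iterate_zero_terminated is what makes `--null` (i.e. `find -print0`) input work: it re-chunks the byte stream into NUL-delimited paths, carrying a partial path across the 4096-byte read boundaries. A self-contained check over an in-memory stream (a sketch, not the CLI itself):

import io

from barecat.cli_impl import iterate_zero_terminated

# paths as find -print0 emits them: NUL-separated, in arbitrary chunk sizes
stream = io.BytesIO(b'a.jpg\x00dir/b.png\x00c\x00')
assert list(iterate_zero_terminated(stream)) == ['a.jpg', 'dir/b.png', 'c']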
/src/barecat/core/sharder.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import os.path as osp
4 | import shutil
5 | from contextlib import AbstractContextManager
6 |
7 | import crc32c as crc32c_lib
8 | from barecat.common import FileSection, SHARD_SIZE_UNLIMITED
9 | from barecat.util import (
10 | copyfileobj,
11 | copyfileobj_crc32c,
12 | open_,
13 | raise_if_readonly,
14 | reopen,
15 | write_zeroes,
16 | )
17 |
18 |
19 | class Sharder(AbstractContextManager):
20 | def __init__(
21 | self,
22 | path,
23 | shard_size_limit=None,
24 | readonly=True,
25 | append_only=False,
26 | threadsafe=False,
27 | allow_writing_symlinked_shard=False,
28 | ):
29 |
30 | self.path = path
31 | self.readonly = readonly
32 | self.append_only = append_only
33 | self.threadsafe = threadsafe
34 | self.allow_writing_symlinked_shard = allow_writing_symlinked_shard
35 |
36 | if shard_size_limit is None:
37 | shard_size_limit = SHARD_SIZE_UNLIMITED
38 | self.shard_size_limit = shard_size_limit
37 |
38 | if readonly:
39 | self.shard_mode_nonlast = 'rb'
40 | self.shard_mode_last_existing = 'rb'
41 | self.shard_mode_new = 'rb'
42 | elif append_only:
43 | self.shard_mode_nonlast = 'rb'
44 | self.shard_mode_last_existing = 'a+b'
45 | self.shard_mode_new = 'ax+b'
46 | else:
47 | self.shard_mode_nonlast = 'r+b'
48 | self.shard_mode_last_existing = 'r+b'
49 | self.shard_mode_new = 'x+b'
50 |
51 | self._shard_files = None
52 | if threadsafe:
53 | import multiprocessing_utils
54 |
55 | self.local = multiprocessing_utils.local()
56 | else:
57 | self.local = None
58 |
59 | # READING
60 | def readinto_from_address(self, shard, offset, buffer, expected_crc32c=None):
61 | shard_file = self.shard_files[shard]
62 | shard_file.seek(offset)
63 | num_read = shard_file.readinto(buffer)
64 | if expected_crc32c is not None and crc32c_lib.crc32c(buffer[:num_read]) != expected_crc32c:
65 | raise ValueError('CRC32C mismatch')
66 | return num_read
67 |
68 | def read_from_address(self, shard, offset, size, expected_crc32c=None):
69 | shard_file = self.shard_files[shard]
70 | shard_file.seek(offset)
71 | data = shard_file.read(size)
72 | if expected_crc32c is not None and crc32c_lib.crc32c(data) != expected_crc32c:
73 | raise ValueError('CRC32C mismatch')
74 | return data
75 |
76 | def open_from_address(self, shard, offset, size, mode='r'):
77 | return FileSection(self.shard_files[shard], offset, size, readonly=mode in ('r', 'rb'))
78 |
79 | # WRITING
80 | @raise_if_readonly
81 | def add_by_path(self, filesys_path, shard, offset, size, raise_if_cannot_fit=False):
82 | with open(filesys_path, 'rb') as in_file:
83 | return self.add(
84 | shard, offset, size, fileobj=in_file, raise_if_cannot_fit=raise_if_cannot_fit
85 | )
86 |
87 | @raise_if_readonly
88 | def reopen_current_shard(self, mode):
89 | return self.reopen_shard(self.num_shards - 1, mode)
90 |
91 | @raise_if_readonly
92 | def reopen_shard(self, shard_number, mode):
93 | if mode != 'rb' and shard_number != self.num_shards - 1:
94 | self.raise_if_append_only(
95 | 'Cannot change mode of non-last shard in an append-only Barecat'
96 | )
97 | self.shard_files[shard_number] = reopen(self.shard_files[shard_number], mode)
98 | return self.shard_files[shard_number]
99 |
100 | @raise_if_readonly
101 | def reopen_shards(self):
102 | for i in range(self.num_shards):
103 | if i == self.num_shards - 1:
104 | mode = self.shard_mode_last_existing
105 | else:
106 | mode = self.shard_mode_nonlast
107 | self.reopen_shard(i, mode)
108 |
109 | @raise_if_readonly
110 | def start_new_shard(self):
111 | self.reopen_current_shard(self.shard_mode_nonlast)
112 | new_shard_file = open_(f'{self.path}-shard-{self.num_shards:05d}', self.shard_mode_new)
113 | self.shard_files.append(new_shard_file)
114 | return new_shard_file
115 |
116 | @raise_if_readonly
117 | def start_new_shard_and_transfer_last_file(self, offset, size):
120 | old_shard_file = self.reopen_current_shard('r+b')
121 | new_shard_file = open_(f'{self.path}-shard-{self.num_shards:05d}', self.shard_mode_new)
122 | old_shard_file.seek(offset)
123 | copyfileobj(old_shard_file, new_shard_file, size)
124 | old_shard_file.truncate(offset)
125 | self.reopen_current_shard(self.shard_mode_nonlast)
126 |
127 | self.shard_files.append(new_shard_file)
128 | return new_shard_file
129 |
130 | @raise_if_readonly
131 | def add(
132 | self,
133 | shard=None,
134 | offset=None,
135 | size=None,
136 | data=None,
137 | fileobj=None,
138 | bufsize=shutil.COPY_BUFSIZE,
139 | raise_if_cannot_fit=False,
140 | ):
141 | if data is None and fileobj is None:
142 | raise ValueError('Either data or fileobj must be provided')
143 | if data is not None and fileobj is not None:
144 | raise ValueError('Both data and fileobj cannot be provided')
145 | if data is not None and size is not None and size != len(data):
146 | raise ValueError('Specified size does not match the length of the data')
147 | if shard is None and offset is not None:
148 | raise ValueError('Offset cannot be specified without a shard')
149 | if shard is not None and offset is None:
150 | raise ValueError('Shard cannot be specified without an offset')
151 |
152 | if size is None and data is not None:
153 | size = len(data)
154 |
155 | if shard is None:
156 | shard_file = self.shard_files[-1]
157 | shard = self.num_shards - 1
158 | offset = shard_file.seek(0, os.SEEK_END)
159 | else:
160 | self.ensure_open_shards(shard)
161 | shard_file = self.shard_files[shard]
162 | shard_file.seek(offset)
163 |
164 | offset_real = offset
165 | shard_real = shard
166 | if size is not None:
167 | if size > self.shard_size_limit:
168 | raise ValueError('File is too large to fit into a shard')
169 | if offset + size > self.shard_size_limit:
170 | if raise_if_cannot_fit:
171 | raise ValueError('File does not fit in the shard')
172 | shard_file = self.start_new_shard()
173 | offset_real = 0
174 | shard_real = self.num_shards - 1
175 |
176 | if data is not None:
177 | if not isinstance(data, (bytes, bytearray, memoryview)):
178 | raise ValueError(
179 | 'Data must be bytes, bytearray or memoryview. Are you using auto_codec/register_codec wrong?'
180 | )
181 | shard_file.write(data)
182 | crc32c = crc32c_lib.crc32c(data)
183 | size_real = len(data)
184 | else:
185 | size_real, crc32c = copyfileobj_crc32c(fileobj, shard_file, size, bufsize)
186 | if size is not None and size != size_real:
187 | raise ValueError(f'Size mismatch! Expected {size}, got only {size_real}')
188 |
189 | if offset_real + size_real > self.shard_size_limit:
190 | if raise_if_cannot_fit:
191 | raise ValueError('File does not fit in the shard')
193 | self.start_new_shard_and_transfer_last_file(offset_real, size_real)
194 | offset_real = 0
195 | shard_real = self.num_shards - 1
196 |
197 | return shard_real, offset_real, size_real, crc32c
198 |
199 | def reserve(self, size):
200 | if size > self.shard_size_limit:
201 | raise ValueError('File is too large to fit into a shard')
202 |
203 | shard_file = self.shard_files[-1]
204 | offset = shard_file.seek(0, os.SEEK_END)
205 | if offset + size > self.shard_size_limit:
206 | shard_file = self.start_new_shard()
207 | offset = 0
208 |
209 | shard_file.seek(offset)
210 | write_zeroes(shard_file, size)
211 | shard_file.flush()
212 | return self.num_shards - 1, offset
213 |
214 | def ensure_open_shards(self, shard_id):
215 | if self.num_shards < shard_id + 1:
216 | for i in range(self.num_shards, shard_id + 1):
217 | self.shard_files.append(
218 | open_(f'{self.path}-shard-{i:05d}', mode=self.shard_mode_nonlast)
219 | )
220 |
221 | def open_shard_files(self):
222 | shard_paths = sorted(glob.glob(f'{self.path}-shard-?????'))
223 | if (
224 | not self.readonly
225 | and not self.allow_writing_symlinked_shard
226 | and any(osp.islink(p) for p in shard_paths)
227 | ):
228 | raise ValueError(
229 | 'Writing symlinked shards is disabled for this Barecat '
230 | '(enable it with allow_writing_symlinked_shard in the constructor)'
231 | )
232 |
233 | shard_files_nonlast = [open_(p, mode=self.shard_mode_nonlast) for p in shard_paths[:-1]]
234 | last_shard_name = f'{self.path}-shard-{len(shard_files_nonlast):05d}'
235 | try:
236 | last_shard_file = open_(last_shard_name, mode=self.shard_mode_last_existing)
237 | except FileNotFoundError:
238 | if self.readonly:
239 | raise
240 | last_shard_file = open_(last_shard_name, mode=self.shard_mode_new)
241 |
242 | return shard_files_nonlast + [last_shard_file]
243 |
244 | def truncate_all_to_logical_size(self, logical_shard_ends):
245 | shard_files = self.shard_files
246 | for i in range(self.num_shards - 1, 0, -1):
247 | if logical_shard_ends[i] == 0:
248 | shard_files[i].truncate(0)
249 | shard_files[i].close()
250 | os.remove(shard_files[i].name)
251 | del shard_files[i]
252 | else:
253 | break
254 | for i, f in enumerate(self.shard_files):
255 | f.truncate(logical_shard_ends[i])
256 | self.reopen_current_shard(self.shard_mode_last_existing)
257 |
258 | def close(self):
259 | for f in self.shard_files:
260 | f.close()
261 |
262 | def raise_if_readonly(self, message):
263 | if self.readonly:
264 | raise ValueError(message)
265 |
266 | def raise_if_append_only(self, message):
267 | if self.append_only:
268 | raise ValueError(message)
269 |
270 | def physical_shard_end(self, shard_number):
271 | return self.shard_files[shard_number].seek(0, os.SEEK_END)
272 |
273 | @property
274 | def num_shards(self):
275 | return len(self.shard_files)
276 |
277 | @property
278 | def total_physical_size_seek(self):
279 | return sum(self.physical_shard_end(i) for i in range(self.num_shards))
280 |
281 | @property
282 | def total_physical_size_stat(self):
283 | return sum(osp.getsize(f.name) for f in self.shard_files)
284 |
285 | # THREADSAFE
286 | @property
287 | def shard_files(self):
288 | if self.local is None:
289 | if self._shard_files is None:
290 | self._shard_files = self.open_shard_files()
291 | return self._shard_files
292 | try:
293 | return self.local.shard_files
294 | except AttributeError:
295 | self.local.shard_files = self.open_shard_files()
296 | return self.local.shard_files
297 |
298 | def __exit__(self, exc_type, exc_val, exc_tb):
299 | self.close()
300 |
--------------------------------------------------------------------------------
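A round-trip sketch of the Sharder address model: `add` appends the payload to the current shard (rolling over to a new shard file when the size limit would be exceeded) and returns where the bytes landed; the read methods take that same (shard, offset, size) triple back. Hedged: it assumes 'demo' is a fresh path with no existing `demo-shard-*` files next to it:

from barecat.core.sharder import Sharder

with Sharder('demo', shard_size_limit=1024**2, readonly=False) as sharder:
    # add() returns the actual landing address plus the payload's CRC32C
    shard, offset, size, crc = sharder.add(data=b'hello barecat')
    # read_from_address verifies the checksum when expected_crc32c is given
    data = sharder.read_from_address(shard, offset, size, expected_crc32c=crc)
    assert data == b'hello barecat'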
/docs/abbrev_long.bib:
--------------------------------------------------------------------------------
1 | %%%%%%%%%%%%%%%%%%%%%% Journals %%%%%%%%%%%%%%%%
2 | @string{IJCV = "International Journal of Computer Vision (IJCV)"}
3 | @string{CVIU = "Computer Vision and Image Understanding (CVIU)"}
4 | @string{PR = "Pattern Recognition"}
5 | @string{PRL = "Pattern Recognition Letters"}
6 |
7 | @string{ML = "Machine Learning"}
8 | @string{AI = "Artificial Intelligence"}
9 | @string{AR = "Autonomous Robots"}
10 | @string{MVA = "Machine Vision and Applications"}
11 | @string{IVC = "Image and Vision Computing"}
12 | @string{BBS = "Behavioral and Brain Sciences (BBS)"}
13 | @string{VR = "Vision Research"}
14 | @string{IR = "Information Retrieval"}
15 | @string{NN = "Neural Networks"}
16 | @string{CAG = "Computers \& Graphics"}
17 | @string{CVGIP = "Computer Vision, Graphics, and Image Processing (CVGIP)"}
18 | @string{CVGIPIU = "CVGIP: Image Understanding"}
19 | @string{PP = "Perception \& Psychophysics"}
20 | @string{FTCGV = "Foundations and Trends in Computer Graphics and Vision"}
21 | @string{AdvRob = "Advanced Robotics"}
22 |
23 | @string{Nature = "Nature"}
24 | @string{Science = "Science"}
25 | @string{Mechatronics = "Mechatronics"}
26 | @string{NRN = "Nature Reviews Neuroscience"}
27 | @string{NM = "Nature Methods"}
28 | @string{PHY = "Physical Review E"}
29 | @string{PsychRev = "Psychological Review"}
30 |
31 | @string{JMLR = "Journal of Machine Learning Research (JMLR)"}
32 | @string{JSC = "Journal of Scientific Computing"}
33 | @string{JCN = "Journal of Cognitive Neuroscience"}
34 | @string{JEPHPP = "Journal of Experimental Psychology: Human Perception and Performance"}
35 | @string{JECP = "Journal of Experimental Child Psychology"}
36 | @string{JB = "Journal of Biomechanics"}
37 |
38 | @string{EURASIP = "EURASIP Journal on Advances in Signal Processing"}
39 | @string{PRESENCE = "Presence: Teleoperators and Virtual Environments"}
40 | @string{BMB = "The Bulletin of Mathematical Biophysics"}
41 |
42 | @string{TVC = "The Visual Computer"}
43 | @string{TJSC = "The Journal of Supercomputing"}
44 |
45 | % IEEE
46 | @string{PIEEE = "Proceedings of the IEEE"}
47 | @string{RAL = "IEEE Robotics and Automation Letters (RA-L)"}
48 | @string{CGA = "IEEE Computer Graphics and Applications"}
49 | @string{IEEEA = "IEEE Access"}
50 | @string{TPAMI = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"}
51 | @string{PAMI = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"}
52 | @string{TC = "IEEE Transactions on Communications"}
53 | @string{TCyb = "IEEE Transactions on Cybernetics"}
54 | @string{TSE = "IEEE Transactions on Software Engineering"}
55 | @string{TIV = "IEEE Transactions on Intelligent Vehicles"}
56 | @string{TIP = "IEEE Transactions on Image Processing"}
57 | @string{TOR = "IEEE Transactions on Robotics"}
58 | @string{TAC = "IEEE Transactions on Automatic Control"}
59 | @string{TITS = "IEEE Transactions on Intelligent Transportation Systems (T-ITS)"}
60 | @string{TOC = "IEEE Transactions on Computers"}
61 | @string{TVT = "IEEE Transactions on Vehicular Technologies"}
62 | @string{TNN = "IEEE Transactions on Neural Networks"}
63 | @string{THMS = "IEEE Transactions on Human-Machine Systems"}
64 | @string{TCSVT = "IEEE Transactions on Circuits and Systems for Video Technology"}
65 | @string{TBIOM = "IEEE Transactions on Biometrics, Behavior, and Identity Science (T-BIOM)"}
66 | @string{TIT = "IEEE Transactions on Information Theory"}
67 | @string{TVCG = "IEEE Transactions on Visualization and Computer Graphics (TVCG)"}
68 | @string{TSSC = "IEEE Transactions on Systems Science and Cybernetics"}
69 | @string{IRETIT= "IRE Transactions on Information Theory"}
70 | @string{IJTEHM= "IEEE Journal of Translational Engineering in Health and Medicine"}
71 |
72 |
73 | % ACM
74 | @string{TOCHI = "ACM Transactions on Computer-Human Interaction (TOCHI)"}
75 | @string{TOG = "ACM Transactions on Graphics (TOG)"}
76 | @string{CACM = "Communications of the ACM (CACM)"}
77 | @string{IMWUT = "Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)"}
78 | @string{CSUR = "ACM Computing Surveys (CSUR)"}
79 | @string{THRI = "ACM Transactions on Human-Robot Interaction"}
80 |
81 | @string{AnnStat = "Annals of Statistics"}
82 | @string{JC = "Journal of Classification"}
83 | @string{IJRR = "International Journal of Robotics Research (IJRR)"}
85 |
86 | @string{PLOSOne = "PLOS One"}
87 | @string{SMO = "Sports Medicine -- Open"}
88 | @string{IJMIR = "International Journal of Multimedia Information Retrieval (IJMIR)"}
89 |
90 | @string{BiolCyb = "Biological Cybernetics"}
91 | @string{Psychomet = "Psychometrika"}
92 | @string{Biotelem = "Biotelemetry"}
93 | @string{NC = "Neural Computation"}
94 | @string{Neurocomputing = "Neurocomputing"}
95 | @string{PhilosMag = "London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science"}
96 |
97 | @string{TST = "Tsinghua Science and Technology"}
98 | @string{VRIH = "Virtual Reality \& Intelligent Hardware (VRIH)"}
99 | @string{AR = "Autonomous Robots Journal"}
100 | @string{ISPRS = "ISPRS Journal of Photogrammetry and Remote Sensing (P\&RS)"}
101 | @string{MMS = "Multimedia Systems"}
102 | @string{SSS = "Social Studies of Science"}
103 | @string{SIREV = "SIAM Review"}
104 |
105 | @string{Sensors = "Sensors"}
106 | @string{Electronics = "Electronics"}
107 |
108 | @string{ARVC = "Annual Review of Vision Science"}
109 | @string{ARP = "Annual Review of Psychology"}
110 | @string{PRSLB = "Proceedings of the Royal Society of London. Series B, Biological Sciences"}
111 | @string{PRSA = "Proceedings of the Royal Society A"}
112 |
113 | @string{TJP = "The Journal of Physiology"}
114 | @string{USSRCMMP = "USSR Computational Mathematics and Mathematical Physics"}
115 | @string{CRHSAS = "Comptes rendus hebdomadaires des séances de l'Académie des sciences"}
116 |
117 |
118 | %%%%%%%%%%%%%%%%%%%%% Conferences %%%%%%%%%%%%%%
119 | @string{CVPR = "IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"}
120 | @string{ICCV = "IEEE/CVF International Conference on Computer Vision (ICCV)"}
121 | @string{WACV = "IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)"}
122 |
123 | @string{ECCV = "European Conference on Computer Vision (ECCV)"}
124 | @string{ACCV = "Asian Conference on Computer Vision (ACCV)"}
125 | @string{BMVC = "British Machine Vision Conference (BMVC)"}
126 | @string{DAGM = "DAGM Annual Pattern Recognition Symposium"}
127 | @string{GCPR = "DAGM German Conference on Pattern Recognition (GCPR)"}
128 |
129 | @string{NIPS = "Advances in Neural Information Processing Systems (NIPS)"}
130 | @string{NeurIPS = "Advances in Neural Information Processing Systems (NeurIPS)"}
131 | @string{NeurIPSDB = "Neural Information Processing Systems: Datasets and Benchmarks Track"}
132 |
133 | @string{TDV = "International Conference on 3D Vision (3DV)"}
134 | @string{ICML = "International Conference on Machine Learning (ICML)"}
135 | @string{ICLR = "International Conference on Learning Representations (ICLR)"}
136 | @string{ICPR = "International Conference on Pattern Recognition (ICPR)"}
137 | @string{CAIP = "International Conference on Analysis of Images and Patterns (CAIP)"}
138 | @string{ICIAP = "International Conference on Image Analysis and Processing (ICIAP)"}
139 | @string{ICIAR = "International Conference on Image Analysis and Recognition (ICIAR)"}
140 |
141 | @string{ISCS = "IEEE International Symposium on Circuits and Systems (ISCAS)"}
142 | @string{FG = "IEEE International Conference on Automatic Face and Gesture Recognition (FG)"}
143 | @string{CDC = "IEEE Conference on Decision and Control (CDC)"}
144 | @string{IROS = "IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)"}
145 | @string{ICRA = "IEEE International Conference on Robotics and Automation (ICRA)"}
146 | @string{IVS = "IEEE Intelligent Vehicles Symposium (IV)"}
147 | @string{ICASSP = "IEEE Conference on Acoustics, Speech and Signal Processing (ICASSP)"}
148 | @string{ITW = "IEEE Information Theory Workshop (ITW)"}
149 | @string{ICIP = "IEEE International Conference on Image Processing (ICIP)"}
150 | @string{ICME = "IEEE International Conference on Multimedia \& Expo (ICME)"}
151 | @string{CITS = "IEEE Conference on Intelligent Transportation Systems (ITSC)"}
152 | @string{RSS = "Robotics: Science and Systems (RSS)"}
153 |
154 | @string{SIGGRAPH = "ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH)"}
155 | @STRING{SIGGRAPHAsia = "ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH Asia)"}
156 | @string{CHI = "ACM Conference on Human Factors in Computing Systems (CHI)"}
157 | @string{MMSys = "ACM Multimedia Systems Conference (MMSys)"}
158 | @string{SIGMOD = "ACM SIGMOD International Conference on Management of Data"}
159 | @string{MM = "ACM International Conference on Multimedia"}
160 | @string{KDD = "ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)"}
161 | @string{AAAI = "AAAI Conference on Artificial Intelligence"}
163 | @string{IJCAI = "International Joint Conference on Artificial Intelligence (IJCAI)"}
164 |
165 | @string{ACC = "American Control Conference (ACC)"}
166 | @string{WAPCV = "International Workshop on Attention in Cognitive Systems (WAPCV)"}
167 | @string{COLT92 = "Annual Workshop on Computational Learning Theory (COLT)"}
168 |
169 | @string{SIBGRAPI = "SIBGRAPI Conference on Graphics, Patterns and Images"}
170 | @string{ICIRA = "International Conference on Intelligent Robotics and Applications (ICIRA)"}
171 |
172 | @string{AISTAT = "International Conference on Artificial Intelligence and Statistics (AISTATS)"}
173 | @string{AISTATS = "International Conference on Artificial Intelligence and Statistics (AISTATS)"}
174 |
175 | @string{SCIA = "Scandinavian Conference on Image Analysis (SCIA)"}
176 | @string{EUROCOLT = "European Conference on Computational Learning Theory (EuroCOLT)"}
177 | @string{ICVS = "International Conference on Computer Vision Systems (ICVS)"}
178 | @string{EMMCVPR = "International Conference on Energy Minimization Methods in Computer Vision and Pattern Recognition (EMMCVPR)"}
179 | @string{IJCNN = "International Joint Conference on Neural Networks (IJCNN)"}
180 |
181 | @string{MICCAI = "International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)"}
182 | @string{ICANN = "International Conference on Artificial Neural Networks (ICANN)"}
183 | @string{ISMIR = "International Society for Music Information Retrieval Conference (ISMIR)"}
184 | @string{AMDO = "International Conference on Articulated Motion and Deformable Objects (AMDO)"}
185 | @string{Allerton = "Annual Allerton Conference on Communication, Control, and Computing"}
186 | @string{OSDI = "USENIX Symposium on Operating Systems Design and Implementation (OSDI)"}
187 |
188 | @string{BRACIS = "Brazilian Conference on Intelligent Systems (BRACIS)"}
189 | @string{MIDL = "Medical Imaging with Deep Learning (MIDL)"}
190 | @string{TDBODYTECH = "International Conference and Exhibition on 3D Body Scanning and Processing Technologies (3DBODY.TECH)"}
191 | @string{IAS = "International Conference on Intelligent Autonomous Systems"}
192 | @string{CoRL = "Conference on Robot Learning"}
193 | @string{CRV = "Conference on Computer and Robot Vision"}
194 | @string{ICONIP = "International Conference on Neural Information Processing"}
195 | @string{SGP = "Symposium on Geometry Processing"}
196 |
197 |
198 | @string{WACV_until_2016 = "IEEE Workshop on Applications of Computer Vision (WACV)"}
199 | %%%%%%%%%%%%%%%%%%%%% Workshops %%%%%%%%%%%%%%
200 | @string{ICCVW = "IEEE International Conference on Computer Vision -- Workshops (ICCVW)"}
201 | @string{ECCVW = "European Conference on Computer Vision -- Workshops (ECCVW)"}
202 | @string{CVPRW = "IEEE Conference on Computer Vision and Pattern Recognition -- Workshops (CVPRW)"}
203 | @string{IROSW = "IEEE/RSJ International Conference on Intelligent Robots and Systems -- Workshops (IROSW)"}
204 | @string{WACVW = "IEEE Winter Conference on Applications of Computer Vision -- Workshops (WACVW)"}
205 | @string{MICCAIW = "International Conference on Medical Image Computing and Computer Assisted Intervention -- Workshops (MICCAIW)"}
206 |
207 | @string{MMWVSCC = "ACM Multimedia Conference (MM) -- Workshop on Visual Analysis in Smart and Connected Communities (VSCC)"}
208 |
--------------------------------------------------------------------------------
/src/barecat/common.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | from datetime import datetime
4 | from enum import Flag, auto
5 | from typing import Union, TYPE_CHECKING, Optional
6 | from barecat.util import datetime_to_ns, normalize_path, ns_to_datetime
7 |
8 | if TYPE_CHECKING:
9 | from barecat import BarecatEntryInfo
10 |
11 | SHARD_SIZE_UNLIMITED = (1 << 63) - 1 #: An extremely large integer, representing unlimited size
12 |
13 |
14 | class BarecatEntryInfo:
15 | """
16 | Base class for file and directory information classes.
17 |
18 | The two subclasses are :class:`barecat.BarecatFileInfo` and :class:`barecat.BarecatDirInfo`.
19 |
20 | Args:
21 | path: path to the file or directory
22 | mode: file mode, i.e. permissions
23 | uid: user ID
24 | gid: group ID
25 | mtime_ns: last modification time in nanoseconds since the Unix epoch
26 | """
27 |
28 | __slots__ = ('_path', 'mode', 'uid', 'gid', 'mtime_ns')
29 |
30 | def __init__(
31 | self,
32 | path: Optional[str] = None,
33 | mode: Optional[int] = None,
34 | uid: Optional[int] = None,
35 | gid: Optional[int] = None,
36 | mtime_ns: Optional[Union[int, datetime]] = None,
37 | ):
38 | self._path = normalize_path(path)
39 | self.mode = mode
40 | """File mode, i.e., permissions."""
41 |
42 | self.uid = uid
43 | """User ID."""
44 |
45 | self.gid = gid
46 | """Group ID."""
47 |
48 | self.mtime_ns = mtime_ns
49 | """Last modification time in nanoseconds since the Unix epoch."""
50 |
51 | if isinstance(self.mtime_ns, datetime):
52 | self.mtime_ns = datetime_to_ns(self.mtime_ns)
53 |
54 | @property
55 | def path(self):
56 | """Path to the file or directory. The path is normalized on assignment."""
57 | return self._path
58 |
59 | @path.setter
60 | def path(self, value):
61 | self._path = normalize_path(value)
62 |
63 | @property
64 | def mtime_dt(self) -> Optional[datetime]:
65 | """Last modification time as a datetime object."""
66 | return ns_to_datetime(self.mtime_ns) if self.mtime_ns else None
67 |
68 | @mtime_dt.setter
69 | def mtime_dt(self, dt: datetime):
70 | self.mtime_ns = datetime_to_ns(dt)
71 |
72 | def update_mtime(self):
73 | """Update the last modification time to the current time."""
74 | self.mtime_dt = datetime.now()
75 |
76 | def fill_from_statresult(self, s: os.stat_result):
77 | """Fills the metadata information from a stat result, obtained from the file system.
78 |
79 | Args:
80 | s: stat result object to fill the metadata from
81 | """
82 | self.mode = s.st_mode
83 | self.uid = s.st_uid
84 | self.gid = s.st_gid
85 | self.mtime_ns = s.st_mtime_ns
86 |
87 | @classmethod
88 | def row_factory(cls, cursor, row):
89 | """Factory method for creating instances from SQLite query results.
90 |
91 | Args:
92 | cursor: SQLite cursor object
93 | row: row from the query result
94 | """
95 |
96 | # Raw construction without any of that property business or validation, just for speed
97 | instance = cls.__new__(cls)
98 | for field, value in zip(cursor.description, row):
99 | fieldname = field[0]
100 | if fieldname == 'path':
101 | instance._path = value
102 | else:
103 | object.__setattr__(instance, fieldname, value)
104 | return instance
105 |
106 |
107 | class BarecatFileInfo(BarecatEntryInfo):
108 | """
109 | Describes file information such as path, location in the shards and metadata.
110 |
111 | This class is used both when retrieving existing file information and when adding new files.
112 |
113 | Args:
114 | path: path to the file inside the archive
115 | mode: file mode, i.e., permissions
116 | uid: user ID
117 | gid: group ID
118 | mtime_ns: last modification time in nanoseconds since the Unix epoch
119 | shard: shard number
120 | offset: offset within the shard in bytes
121 | size: size of the file in bytes
122 | crc32c: CRC32C checksum of the file contents
123 | """
124 |
125 | __slots__ = ('shard', 'offset', 'size', 'crc32c')
126 |
127 | def __init__(
128 | self,
129 | path: Optional[str] = None,
130 | mode: Optional[int] = None,
131 | uid: Optional[int] = None,
132 | gid: Optional[int] = None,
133 | mtime_ns: Optional[Union[int, datetime]] = None,
134 | shard: Optional[int] = None,
135 | offset: Optional[int] = None,
136 | size: Optional[int] = None,
137 | crc32c: Optional[int] = None,
138 | ):
139 | super().__init__(path, mode, uid, gid, mtime_ns)
140 | self.shard = shard
141 | """Shard number where the file is located."""
142 |
143 | self.offset = offset
144 | """Offset within the shard in bytes."""
145 |
146 | self.size = size
147 | """Size of the file in bytes."""
148 |
149 | self.crc32c = crc32c
150 | """CRC32C checksum of the file contents."""
151 |
152 | def asdict(self) -> dict:
153 | """Returns a dictionary representation of the file information.
154 |
155 | Returns:
156 | Dictionary with keys 'path', 'shard', 'offset', 'size', 'crc32c', 'mode', 'uid',
157 | 'gid', 'mtime_ns'
158 | """
159 | return dict(
160 | path=self.path,
161 | shard=self.shard,
162 | offset=self.offset,
163 | size=self.size,
164 | crc32c=self.crc32c,
165 | mode=self.mode,
166 | uid=self.uid,
167 | gid=self.gid,
168 | mtime_ns=self.mtime_ns,
169 | )
170 |
171 | def fill_from_statresult(self, s: os.stat_result):
172 | """Fills the file metadata information from a stat result, obtained from the file system.
173 |
174 | Args:
175 | s: stat result object to fill the metadata from
176 | """
177 | super().fill_from_statresult(s)
178 | self.size = s.st_size
179 |
180 | @property
181 | def end(self) -> int:
182 | """End position of the file in the shard."""
183 | return self.offset + self.size
184 |
185 |
186 | class BarecatDirInfo(BarecatEntryInfo):
187 | """
188 | Describes directory information such as path, metadata and statistics.
189 |
190 | This class is used both when retrieving existing directory information and when adding new
191 | directories.
192 |
193 | Args:
194 | path: path to the directory inside the archive
195 | mode: directory mode, i.e., permissions
196 | uid: user ID
197 | gid: group ID
198 | mtime_ns: last modification time in nanoseconds since the Unix epoch
199 | num_subdirs: number of subdirectories in the directory
200 | num_files: number of files in the directory
201 | size_tree: total size of the directory contents in bytes
202 | num_files_tree: total number of files in the directory and its subdirectories
203 | """
204 |
205 | __slots__ = ('num_subdirs', 'num_files', 'size_tree', 'num_files_tree')
206 |
207 | def __init__(
208 | self,
209 | path: Optional[str] = None,
210 | mode: Optional[int] = None,
211 | uid: Optional[int] = None,
212 | gid: Optional[int] = None,
213 | mtime_ns: Optional[Union[int, datetime]] = None,
214 | num_subdirs: Optional[int] = None,
215 | num_files: Optional[int] = None,
216 | size_tree: Optional[int] = None,
217 | num_files_tree: Optional[int] = None,
218 | ):
219 | super().__init__(path, mode, uid, gid, mtime_ns)
220 | self.num_subdirs = num_subdirs
221 | """Number of immediate subdirectories in the directory."""
222 |
223 | self.num_files = num_files
224 | """Number of immediate files in the directory."""
225 |
226 | self.size_tree = size_tree
227 | """Total size of the directory's contents (recursively) in bytes."""
228 |
229 | self.num_files_tree = num_files_tree
230 | """Total number of files in the directory and its subdirectories, recursively."""
231 |
232 | def asdict(self) -> dict:
233 | """Returns a dictionary representation of the directory information.
234 |
235 | Returns:
236 | Dictionary with keys 'path', 'num_subdirs', 'num_files', 'size_tree', 'num_files_tree',
237 | 'mode', 'uid', 'gid', 'mtime_ns'
238 | """
239 | return dict(
240 | path=self.path,
241 | num_subdirs=self.num_subdirs,
242 | num_files=self.num_files,
243 | size_tree=self.size_tree,
244 | num_files_tree=self.num_files_tree,
245 | mode=self.mode,
246 | uid=self.uid,
247 | gid=self.gid,
248 | mtime_ns=self.mtime_ns,
249 | )
250 |
251 | @property
252 | def num_entries(self) -> int:
253 | """Total number of entries in the directory, including subdirectories and files."""
254 | return self.num_subdirs + self.num_files
255 |
256 | def fill_from_statresult(self, s: os.stat_result):
257 | """Fills the directory metadata information from a stat result, from the file system.
258 |
259 | Args:
260 | s: stat result object to fill the metadata from
261 | """
262 | super().fill_from_statresult(s)
263 | self.num_subdirs = s.st_nlink - 2
264 |
265 |
266 | class Order(Flag):
267 | """Ordering specification for file and directory listings.
268 |
269 | The ordering can be by address (shard and offset), path, or random. The order can be ascending
270 | or descending. The default order is ANY, which is the order in which SQLite yields rows.
271 | """
272 |
273 | ANY = auto()
274 | """Default order, as returned by SQLite"""
275 |
276 | RANDOM = auto()
277 | """Randomized order"""
278 |
279 | ADDRESS = auto()
280 | """Order by shard and offset position"""
281 |
282 | PATH = auto()
283 | """Alphabetical order by path"""
284 |
285 | DESC = auto()
286 | """Descending order"""
287 |
288 | def as_query_text(self) -> str:
289 | """Returns the SQL ORDER BY clause corresponding to the ordering specification."""
290 |
291 | if self & Order.ADDRESS and self & Order.DESC:
292 | return ' ORDER BY shard DESC, offset DESC'
293 | elif self & Order.ADDRESS:
294 | return ' ORDER BY shard, offset'
295 | elif self & Order.PATH and self & Order.DESC:
296 | return ' ORDER BY path DESC'
297 | elif self & Order.PATH:
298 | return ' ORDER BY path'
299 | elif self & Order.RANDOM:
300 | return ' ORDER BY RANDOM()'
301 | return ''
302 |
303 |
304 | class FileSection(io.IOBase):
305 | """File-like object representing a section of a file.
306 |
307 | Args:
308 | file: file-like object to read from or write to
309 | start: start position of the section in the file
310 | size: size of the section
311 | readonly: whether the section should be read-only
312 | """
313 |
314 | def __init__(self, file: io.RawIOBase, start: int, size: int, readonly: bool = True):
315 | self.file = file
316 | self.start = start
317 | self.end = start + size
318 | self.position = start
319 | self.readonly = readonly
320 |
321 | def read(self, size: int = -1) -> bytes:
322 | """Read a from the section, starting from the current position.
323 |
324 | Args:
325 | size: number of bytes to read, or -1 to read until the end of the section
326 |
327 | Returns:
328 | Bytes read from the section.
329 | """
330 | if size == -1:
331 | size = self.end - self.position
332 |
333 | size = min(size, self.end - self.position)
334 | self.file.seek(self.position)
335 | data = self.file.read(size)
336 | self.position += len(data)
337 | return data
338 |
339 | def readinto(self, buffer: Union[bytearray, memoryview]) -> int:
340 | """Read bytes into a buffer from the section, starting from the current position.
341 |
342 | Will read up to the length of the buffer or until the end of the section.
343 |
344 | Args:
345 | buffer: destination buffer to read into
346 |
347 | Returns:
348 | Number of bytes read into the buffer.
349 | """
350 | size = min(len(buffer), self.end - self.position)
351 | if size == 0:
352 | return 0
353 |
354 | self.file.seek(self.position)
355 | # slice via a memoryview so the read fills the caller's buffer rather than a copy
356 | num_read = self.file.readinto(memoryview(buffer)[:size])
356 | self.position += num_read
357 | return num_read
358 |
359 | def readall(self) -> bytes:
360 | """Read all remaining bytes from the section.
361 |
362 | Returns:
363 | Bytes read from the section.
364 | """
365 |
366 | return self.read()
367 |
368 | def readable(self):
369 | """Always returns True, since the section is always readable."""
370 | return True
371 |
372 | def writable(self):
373 | return not self.readonly
374 |
375 | def write(self, data: Union[bytes, bytearray, memoryview]) -> int:
376 | """Write data to the section, starting from the current position.
377 |
378 | Args:
379 | data: data to write to the section
380 |
381 | Returns:
382 | Number of bytes written to the section.
383 |
384 | Raises:
385 | PermissionError: if the section is read-only
386 | EOFError: if the write would go past the end of the section
387 | """
388 |
389 | if self.readonly:
390 | raise PermissionError('Cannot write to a read-only file section')
391 |
392 | if self.position + len(data) > self.end:
393 | raise EOFError('Cannot write past the end of the section')
394 |
395 | self.file.seek(self.position)
396 | n_written = self.file.write(data)
397 | self.position += n_written
398 | return n_written
399 |
400 | def readline(self, size: int = -1) -> bytes:
401 | if size == -1:
402 | size = self.end - self.position
403 | size = min(size, self.end - self.position)
404 |
405 | self.file.seek(self.position)
406 | data = self.file.readline(size)
407 |
408 | self.position += len(data)
409 | return data
410 |
411 | def tell(self):
412 | return self.position - self.start
413 |
414 | def seek(self, offset, whence=0):
415 | if whence == io.SEEK_SET:
416 | new_position = self.start + offset
417 | elif whence == io.SEEK_CUR:
418 | new_position = self.position + offset
419 | elif whence == io.SEEK_END:
420 | new_position = self.end + offset
421 | else:
422 | raise ValueError(f"Invalid value for whence: {whence}")
423 |
424 | if new_position < self.start or new_position > self.end:
425 | raise EOFError("Seek position out of bounds")
426 |
427 | self.position = new_position
428 | return self.position - self.start
429 |
430 | def close(self):
431 | """Close the file section, this is a no-op, since the real shard file is not closed."""
432 | pass
433 |
434 | @property
435 | def size(self) -> int:
436 | """Size of the section in bytes."""
437 | return self.end - self.start
438 |
439 | def __exit__(self, exc_type, exc_val, exc_tb):
440 | self.close()
441 |
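# Editor's usage sketch (not part of the original source): FileSection exposes a
# byte range of a larger file as its own small file object. With a plain on-disk
# file standing in for a shard:
#
#     import tempfile
#
#     with tempfile.TemporaryFile() as f:
#         f.write(b'0123456789')
#         section = FileSection(f, start=2, size=5)  # covers bytes 2..6, i.e. b'23456'
#         assert section.read(3) == b'234'
#         assert section.tell() == 3                 # positions are section-relative
#         section.seek(0)
#         assert section.readall() == b'23456'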
--------------------------------------------------------------------------------
/src/barecat/viewerqt6.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import os.path as osp
4 | import pprint
5 | import re
6 | import shutil
7 | import sys
8 | from typing import List
9 |
10 | import msgpack_numpy
11 | from PyQt6.QtCore import QBuffer, QByteArray, QMimeData, QModelIndex, Qt, pyqtSlot
12 | from PyQt6.QtGui import (
13 | QClipboard,
14 | QFont,
15 | QFontMetrics,
16 | QImageReader,
17 | QPixmap,
18 | QStandardItem,
19 | QStandardItemModel,
20 | )
21 | from PyQt6.QtWidgets import (
22 | QAbstractItemView,
23 | QApplication,
24 | QFileDialog,
25 | QHBoxLayout,
26 | QHeaderView,
27 | QLabel,
28 | QMenu,
29 | QScrollArea,
30 | QSplitter,
31 | QStyleFactory,
32 | QTableView,
33 | QTreeView,
34 | QVBoxLayout,
35 | QWidget,
36 | )
37 |
38 | import barecat
39 | from barecat.common import BarecatDirInfo, BarecatFileInfo
40 |
41 |
42 | def main():
43 | app = QApplication(sys.argv)
44 | app.setStyle(QStyleFactory.create(QApplication.style().objectName()))
45 |
46 | parser = argparse.ArgumentParser(description='View images stored in a barecat archive.')
47 | parser.add_argument('path', type=str, help='path to load from')
48 | args = parser.parse_args()
49 | viewer = BarecatViewer(args.path)
50 | viewer.show()
51 | sys.exit(app.exec())
52 |
53 |
54 | class BarecatViewer(QWidget):
55 | def __init__(self, path):
56 | super().__init__()
57 | self.file_reader = barecat.Barecat(path)
58 | self.barecat_path = path
59 | self.tree = QTreeView()
60 | self.tree.setEditTriggers(QAbstractItemView.EditTrigger.NoEditTriggers)
61 |
62 | self.file_table = self.create_file_table()
63 | self.content_viewer = ContentViewer()
64 | self.content_viewer.label.setWordWrap(True)
65 | font = QFont("Courier New")  # monospace font for rendering text content
66 | self.content_viewer.label.setFont(font)
67 |
68 | splitter = QSplitter()
69 | splitter.addWidget(self.tree)
70 | splitter.addWidget(self.file_table)
71 | splitter.addWidget(self.content_viewer)
72 | splitter.setSizes([650, 650, 1000])
73 | layout = QHBoxLayout()
74 | layout.addWidget(splitter)
75 | self.setLayout(layout)
76 |
77 | self.resize(2400, 800)
78 |
79 | self.fill_tree()
80 | self.tree.selectionModel().selectionChanged.connect(self.update_file_table)
81 | self.tree.activated.connect(self.expand_tree_item)
82 | self.tree.doubleClicked.connect(self.expand_tree_item)
83 | self.tree.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
84 | self.tree.customContextMenuRequested.connect(self.show_tree_context_menu)
85 |
86 | root_index = self.tree.model().index(0, 0)
87 | self.tree.setCurrentIndex(root_index)
88 |
89 | def create_file_table(self):
90 | ft = QTableView()
91 | ft.verticalHeader().setVisible(False)
92 | ft.verticalHeader().setDefaultSectionSize(20)
93 | ft.setShowGrid(False)
94 | ft.setSelectionMode(QAbstractItemView.SelectionMode.SingleSelection)
95 | ft.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
96 | ft.setEditTriggers(QAbstractItemView.EditTrigger.NoEditTriggers)
97 | model = QStandardItemModel()
98 | model.setHorizontalHeaderLabels(['Name', 'Size'])
99 | ft.setModel(model)
100 | ft.selectionModel().selectionChanged.connect(self.show_selected_file)
101 | ft.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch)
102 | ft.horizontalHeader().setStyleSheet(
103 | "QHeaderView::section {font-weight: normal; text-align: left;}"
104 | )
105 | ft.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
106 | ft.customContextMenuRequested.connect(self.show_file_table_context_menu)
107 | return ft
108 |
109 | def fill_tree(self):
110 | root_item = TreeItem(self.file_reader)
111 | dinfo: BarecatDirInfo = self.file_reader.index.lookup_dir('')
112 | item = TreeItem(
113 | self.file_reader,
114 | path='',
115 | size=dinfo.size_tree,
116 | count=dinfo.num_files_tree,
117 | has_subdirs=dinfo.num_subdirs > 0,
118 | parent=root_item,
119 | )
120 | root_item.children.append(item)
121 | self.model = LazyItemModel(root_item)
122 | self.tree.setModel(self.model)
123 |
124 | root_index = self.tree.model().index(0, 0)
125 | self.tree.expand(root_index) # Expand the root item by default
126 | self.tree.setColumnWidth(0, 400)
127 | self.tree.setColumnWidth(1, 70)
128 | self.tree.setColumnWidth(2, 70)
129 |
130 | @pyqtSlot(QModelIndex)
131 | def expand_tree_item(self, index):
132 | if self.tree.isExpanded(index):
133 | self.tree.collapse(index)
134 | else:
135 | self.tree.expand(index)
136 |
137 | def update_file_table(self, selected, deselected):
138 | indexes = selected.indexes()
139 | if not indexes:
140 | return
141 |
142 | index = indexes[0] # Get the first selected index
143 | item = index.internalPointer()
144 |
145 | model = self.file_table.model()
146 | model.removeRows(0, model.rowCount())
147 | finfos: List[BarecatFileInfo] = self.file_reader.index.list_direct_fileinfos(item.path)
148 | finfos = sorted(finfos, key=lambda x: natural_sort_key(x.path))
149 | for finfo in finfos:
150 | file_item = QStandardItem(osp.basename(finfo.path))
151 | file_item.setData(finfo, Qt.ItemDataRole.UserRole) # Store the fileinfo as user data
152 | model.appendRow([file_item, QStandardItem(format_size(finfo.size))])
153 |
154 | if len(finfos) > 0:
155 | first_file_index = self.file_table.model().index(0, 0)
156 | self.file_table.setCurrentIndex(first_file_index)
157 | else:
158 | for dinfo, subdinfos, finfos in self.file_reader.index.walk_infos(item.path):
159 | finfo = next(iter(finfos), None)
160 | if finfo is not None:
161 | self.show_file(finfo)
162 | break
163 |
164 | def show_selected_file(self, selected, deselected):
165 | indexes = selected.indexes()
166 | if not indexes:
167 | return
168 | finfo = self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
169 | self.show_file(finfo)
170 |
171 | def show_file(self, finfo):
172 | content = self.file_reader.read(finfo)
173 | extension = osp.splitext(finfo.path)[1].lower()
174 | if extension in ('.jpg', '.jpeg', '.png', '.gif', '.bmp'):
175 | byte_array = QByteArray(content)
176 | buffer = QBuffer(byte_array)
177 | imageReader = QImageReader()
178 | imageReader.setDecideFormatFromContent(True)
179 | imageReader.setQuality(100)
180 | imageReader.setDevice(buffer)
181 | qim = imageReader.read()
182 |
183 | if not qim.isNull():
184 | pixmap = QPixmap.fromImage(qim)
185 | self.content_viewer.setPixmap(pixmap)
186 | elif extension == '.msgpack':
187 | data = msgpack_numpy.unpackb(content)
188 | self.content_viewer.setText(data)
189 | else:
190 | self.content_viewer.setText(repr(content))
191 |
192 | def update_image_label(self, pixmap):
193 | self.content_viewer.setPixmap(pixmap)
194 |
195 | def show_file_table_context_menu(self, position):
196 | menu = QMenu()
197 | extract_action = menu.addAction("Extract file...")
198 | copy_path_action = menu.addAction("Copy path")
199 |
200 | action = menu.exec(self.file_table.viewport().mapToGlobal(position))
201 |
202 | if action == extract_action:
203 | indexes = self.file_table.selectionModel().selectedRows()
204 | if indexes:
205 | finfo = (  # the stored user data is a BarecatFileInfo, not a path string
206 | self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
207 | )
208 | default_filename = osp.basename(finfo.path)
209 | target_filename, _ = QFileDialog.getSaveFileName(
210 | self, "Select Target File", default_filename
211 | )
212 | if target_filename:
213 | self.extract_file(finfo.path, target_filename)
214 | elif action == copy_path_action:
215 | indexes = self.file_table.selectionModel().selectedRows()
216 | if indexes:
217 | finfo = (
218 | self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
219 | )
220 | clipboard = QApplication.clipboard()
221 | clipboard.setText(finfo.path)
222 |
223 | def show_tree_context_menu(self, position):
224 | menu = QMenu()
225 | extract_action = menu.addAction("Extract directory...")
226 | copy_path_action = menu.addAction("Copy path")
227 |
228 | action = menu.exec(self.tree.viewport().mapToGlobal(position))
229 | if action == extract_action:
230 | index = self.tree.indexAt(position)
231 | if index.isValid():
232 | if target_directory := QFileDialog.getExistingDirectory(self, "Select Target Directory"):
233 | self.extract_directory(index.internalPointer().path, target_directory)
234 | elif action == copy_path_action:
235 | index = self.tree.indexAt(position)
236 | if index.isValid():
237 | clipboard = QApplication.clipboard()
238 | clipboard.setText(index.internalPointer().path)
239 |
240 | def extract_file(self, path_of_what_to_extract, target_filename):
241 | with open(target_filename, 'wb') as f:
242 | shutil.copyfileobj(self.file_reader.open(path_of_what_to_extract), f)
243 |
244 | def extract_directory(self, dir_in_archive, target_directory):
245 | basename = osp.basename(dir_in_archive)
246 | for dinfo, _, finfos in self.file_reader.index.walk_infos(dir_in_archive):
247 | for finfo in finfos:
248 | target_path = osp.join(
249 | target_directory, basename, osp.relpath(finfo.path, dir_in_archive)
250 | )
251 | os.makedirs(osp.dirname(target_path), exist_ok=True)
252 | with open(target_path, 'wb') as f:
253 | shutil.copyfileobj(self.file_reader.open(finfo.path), f)
254 |
255 |
256 | class ContentViewer(QWidget):
257 | def __init__(self):
258 | super().__init__()
259 | self.label = QLabel()
260 | self.originalPixmap = None
261 | self.originalText = None  # holds the original non-image content
262 | self.scrollArea = QScrollArea(self)
263 | self.scrollArea.setWidgetResizable(True)
264 | self.scrollArea.setWidget(self.label)
265 | layout = QVBoxLayout(self)
266 | layout.addWidget(self.scrollArea)
267 |
268 | self.label.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
269 | self.label.customContextMenuRequested.connect(self.show_context_menu)
270 |
271 | def setPixmap(self, pixmap):
272 | self.originalPixmap = pixmap
273 | self.originalText = None # Reset the original text
274 | self.updateImage()
275 |
276 | def setText(self, original_data):
277 | self.originalText = original_data # Store the original data
278 | self.originalPixmap = None # Reset the pixmap
279 | self.updateText()
280 |
281 | def updateImage(self):
282 | if self.originalPixmap:
283 | availableSize = self.scrollArea.size()
284 | if (
285 | self.originalPixmap.width() > availableSize.width()
286 | or self.originalPixmap.height() > availableSize.height()
287 | ):
288 | scaledPixmap = self.originalPixmap.scaled(
289 | availableSize,
290 | Qt.AspectRatioMode.KeepAspectRatio,
291 | Qt.TransformationMode.SmoothTransformation,
292 | )
293 | else:
294 | scaledPixmap = self.originalPixmap
295 | self.label.setPixmap(scaledPixmap)
296 | self.label.setAlignment(Qt.AlignmentFlag.AlignCenter)
297 |
298 | def updateText(self):
299 | if self.originalText:
300 | # Calculate the maximum line width
301 | width_pixels = self.scrollArea.width()
302 | fm = QFontMetrics(self.label.font())
303 | average_char_width_pixels = fm.averageCharWidth()
304 | max_line_width = width_pixels // average_char_width_pixels
305 |
306 | # Pretty-print the text
307 | pp = pprint.PrettyPrinter(
308 | indent=2, width=max_line_width, compact=True, sort_dicts=False
309 | )
310 | formatted_text = pp.pformat(self.originalText)
311 | self.label.setText(formatted_text)
312 | self.label.setAlignment(Qt.AlignmentFlag.AlignLeft | Qt.AlignmentFlag.AlignTop)
313 |
314 | def resizeEvent(self, event):
315 | if self.originalPixmap:
316 | self.updateImage()
317 | elif self.originalText:
318 | self.updateText()
319 | super().resizeEvent(event)
320 |
321 | def show_context_menu(self, position):
322 | menu = QMenu()
323 | copy_image_action = menu.addAction("Copy image")
324 |
325 | action = menu.exec(self.mapToGlobal(position))
326 |
327 | if action == copy_image_action and self.originalPixmap:
328 | clipboard = QApplication.clipboard()
329 | mime_data = QMimeData()
330 | mime_data.setImageData(self.originalPixmap.toImage())
331 | clipboard.setMimeData(mime_data, QClipboard.Mode.Clipboard)
332 |
333 |
334 | class LazyItemModel(QStandardItemModel):
335 | def __init__(self, root):
336 | super().__init__()
337 | self.root = root
338 |
339 | def index(self, row, column, parent=QModelIndex()):
340 | if not self.hasIndex(row, column, parent):
341 | return QModelIndex()
342 | parent_item = self.root if not parent.isValid() else parent.internalPointer()
343 | return (
344 | self.createIndex(row, column, parent_item.children[row])
345 | if row < len(parent_item.children)
346 | else QModelIndex()
347 | )
348 |
349 | def parent(self, index):
350 | if not index.isValid():
351 | return QModelIndex()
352 | parent_item = index.internalPointer().parent
353 | return self.createIndex(parent_item.row, 0, parent_item) if parent_item else QModelIndex()
354 |
355 | def rowCount(self, parent=QModelIndex()):
356 | parent_item = self.root if not parent.isValid() else parent.internalPointer()
357 | return len(parent_item.children)
358 |
359 | def columnCount(self, parent=QModelIndex()):
360 | return 3 # Name, Size, Count
361 |
362 | def headerData(self, section, orientation, role):
363 | if role == Qt.ItemDataRole.DisplayRole and orientation == Qt.Orientation.Horizontal:
364 | return ["Name", "Size", "Count"][section]
365 | return None
366 |
367 | def data(self, index, role):
368 | item = index.internalPointer()
369 | if role == Qt.ItemDataRole.DisplayRole:
370 | if index.column() == 0:
371 | if item.parent == self.root:
372 | return '[root]'
373 | return osp.basename(item.path)
374 | elif index.column() == 1:
375 | return format_size(item.size)
376 | elif index.column() == 2:
377 | return format_count(item.count)
378 | elif role == Qt.ItemDataRole.TextAlignmentRole:
379 | if index.column() in [1, 2]:
380 | return Qt.AlignmentFlag.AlignRight
381 | return None
382 |
383 | def canFetchMore(self, index):
384 | if not index.isValid():
385 | return False
386 | return not index.internalPointer().fetched
387 |
388 | def fetchMore(self, index):
389 | item = index.internalPointer()
390 | if item == self.root:
391 | return
392 | item.fetch_more()
393 | self.beginInsertRows(index, 0, len(item.children) - 1)
394 | self.endInsertRows()
395 |
396 | def hasChildren(self, index=QModelIndex()):
397 | if not index.isValid():
398 | return True
399 | return index.internalPointer().has_subdirs
400 |
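# Editorial note: LazyItemModel implements Qt's on-demand population protocol.
# When a directory node is expanded, QTreeView first calls canFetchMore(); only
# if that returns True does it call fetchMore(), which makes the TreeItem below
# query the Barecat index for its subdirectories. Only the visited part of the
# tree is ever materialized, which keeps the viewer responsive on large archives.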
401 |
402 | class TreeItem:
403 | def __init__(self, file_reader, path='', size=0, count=0, has_subdirs=True, parent=None):
404 | self.file_reader = file_reader
405 |
406 | self.path = path
407 | self.parent = parent
408 | self.children = []
409 |
410 | self.size = size
411 | self.count = count
412 | self.has_subdirs = has_subdirs
413 | self.fetched = False
414 |
415 | def fetch_more(self):
416 | if self.fetched:
417 | return
418 | subdir_infos = self.file_reader.index.list_subdir_dirinfos(self.path)
419 | subdir_infos = sorted(subdir_infos, key=lambda x: natural_sort_key(x.path))
420 | for dinfo in subdir_infos:
421 | self.children.append(
422 | TreeItem(
423 | self.file_reader,
424 | path=dinfo.path,
425 | size=dinfo.size_tree,
426 | count=dinfo.num_files_tree,
427 | has_subdirs=dinfo.num_subdirs > 0,
428 | parent=self,
429 | )
430 | )
431 |
432 | self.fetched = True
433 |
434 | @property
435 | def row(self):
436 | return self.parent.children.index(self) if self.parent else 0
437 |
438 |
439 | def format_size(size):
440 | units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
441 | index = 0
442 | while size >= 1024:
443 | index += 1
444 | size /= 1024
445 | return f'{size:.2f} {units[index]}'
446 |
447 |
448 | def format_count(count):
449 | units = ['', ' K', ' M', ' B']
450 | unit_index = 0
451 | while count >= 1000 and unit_index < len(units) - 1:
452 | count /= 1000
453 | unit_index += 1
454 | if unit_index == 0:
455 | return str(count)
456 | return f'{count:.1f}{units[unit_index]}'
457 |
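# Editor's sketch of the expected output (illustrative, not original code):
#
#     format_size(3)            # '3.00 B'
#     format_size(1536)         # '1.50 KB'
#     format_count(950)         # '950'
#     format_count(1_500_000)   # '1.5 M'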
458 |
459 | def natural_sort_key(s):
460 | """Normal string sort puts '10' before '2'. Natural sort puts '2' before '10'."""
461 | return [int(t) if t.isdigit() else t for t in re.split('([0-9]+)', s)]  # int avoids float precision loss on long digit runs
462 |
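# Editor's sketch (illustrative, not original code): with this key, numeric runs
# compare numerically instead of lexicographically:
#
#     names = ['img10.jpg', 'img2.jpg', 'img1.jpg']
#     sorted(names, key=natural_sort_key)  # ['img1.jpg', 'img2.jpg', 'img10.jpg']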
463 |
464 | if __name__ == '__main__':
465 | main()
466 |
--------------------------------------------------------------------------------