├── docs ├── references.bib ├── _templates │ ├── autoapi │ │ ├── python │ │ │ ├── attribute.rst │ │ │ ├── exception.rst │ │ │ ├── package.rst │ │ │ ├── property.rst │ │ │ ├── method.rst │ │ │ ├── function.rst │ │ │ ├── data.rst │ │ │ ├── class.rst │ │ │ └── module.rst │ │ └── index.rst │ └── copyright.html ├── requirements.txt ├── Makefile ├── make.bat ├── _static │ └── styles │ │ └── my_theme.css ├── index.rst ├── conf.py └── abbrev_long.bib ├── src └── barecat │ ├── core │ ├── __init__.py │ └── sharder.py │ ├── threadsafe.py │ ├── progbar.py │ ├── codecs.py │ ├── to_tar_stream.py │ ├── from_tar_stream.py │ ├── exceptions.py │ ├── __init__.py │ ├── upgrade_database2.py │ ├── upgrade_database.py │ ├── consumed_threadpool.py │ ├── glob_to_regex.py │ ├── archive_formats.py │ ├── defrag.py │ ├── sql │ └── schema.sql │ ├── util.py │ ├── cli.py │ ├── cli_impl.py │ ├── common.py │ └── viewerqt6.py ├── MANIFEST.in ├── figure.png ├── .readthedocs.yaml ├── .github └── workflows │ └── python-publish.yml ├── LICENSE ├── pyproject.toml ├── tests ├── test_barecat.py └── test_cli.py ├── .gitignore └── README.md /docs/references.bib: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/barecat/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/barecat/sql/*.sql 2 | -------------------------------------------------------------------------------- /figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/isarandi/barecat/HEAD/figure.png -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/attribute.rst: -------------------------------------------------------------------------------- 1 | {% extends "python/data.rst" %} 2 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/exception.rst: -------------------------------------------------------------------------------- 1 | {% extends "python/class.rst" %} 2 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/package.rst: -------------------------------------------------------------------------------- 1 | {% extends "python/module.rst" %} 2 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/index.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Start at :class:`barecat.Barecat` to explore the API. 5 | 6 | .. toctree:: 7 | :titlesonly: 8 | 9 | {% for page in pages|selectattr("is_top_level_object") %} 10 | {{ page.include_path }} 11 | {% endfor %} 12 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-24.04 5 | tools: 6 | python: "3.10" 7 | commands: 8 | - python -m pip install . 
9 | - python -m pip install --no-cache-dir -r docs/requirements.txt 10 | - python -m sphinx -E -b html docs $READTHEDOCS_OUTPUT/html 11 | 12 | sphinx: 13 | configuration: docs/conf.py -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinxcontrib-bibtex 3 | sphinx-autoapi 4 | sphinx-autobuild 5 | sphinx-autodoc-typehints 6 | sphinxcontrib-prettyspecialmethods 7 | sphinx-autodoc-napoleon-typehints 8 | sphinx-codeautolink 9 | sphinx-rtd-theme 10 | pydata-sphinx-theme 11 | sphinxcontrib-napoleon 12 | Cython 13 | numpy 14 | setuptools-scm 15 | toml -------------------------------------------------------------------------------- /docs/_templates/copyright.html: -------------------------------------------------------------------------------- 1 | {# Displays the copyright information (which is defined in conf.py). #} 2 | {% if show_copyright and copyright %} 3 | 12 | {% endif %} -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/property.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:property:: {% if is_own_page %}{{ obj.id}}{% else %}{{ obj.short_name }}{% endif %} 10 | {% if obj.annotation %} 11 | 12 | :type: {{ obj.annotation }} 13 | {% endif %} 14 | {% for property in obj.properties %} 15 | 16 | :{{ property }}: 17 | {% endfor %} 18 | 19 | {% if obj.docstring %} 20 | 21 | {{ obj.docstring|indent(3) }} 22 | {% endif %} 23 | {% endif %} 24 | 25 | .. footbibliography:: -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /src/barecat/threadsafe.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import multiprocessing_utils 4 | 5 | from barecat.core import barecat as barecat 6 | 7 | 8 | def threadlocal_decorate(decorator): 9 | def my_decorator(fun): 10 | local = multiprocessing_utils.local() 11 | 12 | @functools.wraps(fun) 13 | def wrapper(*args, **kwargs): 14 | if not hasattr(local, 'fn'): 15 | local.fn = decorator(fun) 16 | return local.fn(*args, **kwargs) 17 | 18 | return wrapper 19 | 20 | return my_decorator 21 | 22 | 23 | @threadlocal_decorate(functools.lru_cache()) 24 | def get_cached_reader(path, auto_codec=True): 25 | return barecat.Barecat(path, readonly=True, auto_codec=auto_codec) 26 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/method.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:method:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ obj.args }}){% if obj.return_annotation is not none %} -> {{ obj.return_annotation }}{% endif %} 10 | {% for (args, return_annotation) in obj.overloads %} 11 | 12 | {%+ if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ args }}){% if return_annotation is not none %} -> {{ return_annotation }}{% endif %} 13 | {% endfor %} 14 | {% for property in obj.properties %} 15 | 16 | :{{ property }}: 17 | {% endfor %} 18 | 19 | {% if obj.docstring %} 20 | 21 | {{ obj.docstring|indent(3) }} 22 | {% endif %} 23 | {% endif %} 24 | 25 | .. footbibliography:: -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/function.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:function:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ obj.args }}){% if obj.return_annotation is not none %} -> {{ obj.return_annotation }}{% endif %} 10 | {% for (args, return_annotation) in obj.overloads %} 11 | 12 | {%+ if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}({{ args }}){% if return_annotation is not none %} -> {{ return_annotation }}{% endif %} 13 | {% endfor %} 14 | {% for property in obj.properties %} 15 | 16 | :{{ property }}: 17 | {% endfor %} 18 | 19 | {% if obj.docstring %} 20 | 21 | {{ obj.docstring|indent(3) }} 22 | {% endif %} 23 | {% endif %} 24 | 25 | .. 
footbibliography:: -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | pypi-publish: 12 | name: Upload release to PyPI 13 | runs-on: ubuntu-latest 14 | environment: pypi 15 | permissions: 16 | id-token: write 17 | steps: 18 | - name: Check out repository 19 | uses: actions/checkout@v4 20 | with: 21 | fetch-depth: 0 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: "3.x" 27 | 28 | - name: Install build dependencies 29 | run: python -m pip install --upgrade build 30 | 31 | - name: Build package distribution 32 | run: python -m build --sdist 33 | 34 | - name: Publish package distributions to PyPI 35 | uses: pypa/gh-action-pypi-publish@release/v1 36 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/data.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | .. py:{{ obj.type }}:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.name }}{% endif %} 10 | {% if obj.annotation is not none %} 11 | 12 | :type: {% if obj.annotation %} {{ obj.annotation }}{% endif %} 13 | {% endif %} 14 | {% if obj.value is not none %} 15 | 16 | {% if obj.value.splitlines()|count > 1 %} 17 | :value: Multiline-String 18 | 19 | .. raw:: html 20 | 21 |
<details><summary>Show Value</summary> 22 | 23 | .. code-block:: python 24 | 25 | {{ obj.value|indent(width=6,blank=true) }} 26 | 27 | .. raw:: html 28 | 29 | </details>
30 | 31 | {% else %} 32 | :value: {{ obj.value|truncate(100) }} 33 | {% endif %} 34 | {% endif %} 35 | 36 | {% if obj.docstring %} 37 | 38 | {{ obj.docstring|indent(3) }} 39 | {% endif %} 40 | {% endif %} 41 | 42 | .. footbibliography:: -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 István Sárándi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/barecat/progbar.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def is_running_in_jupyter_notebook(): 5 | try: 6 | # noinspection PyUnresolvedReferences 7 | shell = get_ipython().__class__.__name__ 8 | if shell == 'ZMQInteractiveShell': 9 | return True # Jupyter notebook or qtconsole 10 | elif shell == 'TerminalInteractiveShell': 11 | return False # Terminal running IPython 12 | else: 13 | return False # Other type (?) 
14 | except NameError: 15 | return False # Probably standard Python interpreter 16 | 17 | 18 | def progressbar(iterable=None, *args, **kwargs): 19 | import tqdm 20 | 21 | if is_running_in_jupyter_notebook(): 22 | return tqdm.notebook.tqdm(iterable, *args, **kwargs) 23 | elif sys.stdout.isatty(): 24 | return tqdm.tqdm(iterable, *args, dynamic_ncols=True, **kwargs) 25 | elif iterable is None: 26 | 27 | class X: 28 | def update(self, *a, **kw): 29 | pass 30 | 31 | return X() 32 | else: 33 | return iterable 34 | 35 | 36 | def progressbar_items(dictionary, *args, **kwargs): 37 | return progressbar(dictionary.items(), total=len(dictionary), *args, **kwargs) 38 | -------------------------------------------------------------------------------- /src/barecat/codecs.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | 4 | def encode_jpeg(data): 5 | import imageio.v2 as imageio 6 | 7 | with io.BytesIO() as f: 8 | imageio.imwrite(f, data, format='jpeg', quality=95) 9 | return f.getvalue() 10 | 11 | 12 | def decode_jpeg(data): 13 | import jpeg4py 14 | import numpy as np 15 | 16 | return jpeg4py.JPEG(np.frombuffer(data, np.uint8)).decode() 17 | 18 | 19 | def encode_msgpack_np(data): 20 | import msgpack_numpy 21 | 22 | return msgpack_numpy.packb(data) 23 | 24 | 25 | def decode_msgpack_np(data): 26 | import msgpack_numpy 27 | 28 | return msgpack_numpy.unpackb(data) 29 | 30 | 31 | def encode_npy(data): 32 | import numpy as np 33 | 34 | with io.BytesIO() as f: 35 | np.save(f, data) 36 | return f.getvalue() 37 | 38 | 39 | def decode_npy(data): 40 | import numpy as np 41 | 42 | with io.BytesIO(data) as f: 43 | return np.load(f) 44 | 45 | 46 | def encode_npz(data): 47 | import numpy as np 48 | 49 | with io.BytesIO() as f: 50 | np.savez(f, **data) 51 | return f.getvalue() 52 | 53 | 54 | def decode_npz(data): 55 | import numpy as np 56 | 57 | with io.BytesIO(data) as f: 58 | return dict(np.load(f)) 59 | -------------------------------------------------------------------------------- /src/barecat/to_tar_stream.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | import barecat.core.barecat as barecat_ 5 | from barecat.archive_formats import TarWriter 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser(description='Convert a barecat file to a tar stream on stdout') 10 | parser.add_argument('barecat_file', type=str, help='path to the source barecat file') 11 | parser.add_argument("args", nargs=argparse.REMAINDER, help="Ordered --in and --ex arguments") 12 | 13 | args = parser.parse_args() 14 | patterns = parse_in_ex_patterns(args) 15 | 16 | with ( 17 | barecat_.Barecat(args.barecat_file, readonly=True) as bc_reader, 18 | TarWriter(fileobj=sys.stdout.buffer, mode='w|') as tar_writer, 19 | ): 20 | for finfo in bc_reader.index.raw_iterglob_infos_incl_excl( 21 | patterns=patterns, only_files=True 22 | ): 23 | with bc_reader.open(finfo.path) as fileobj: 24 | tar_writer.add(finfo, fileobj) 25 | 26 | 27 | def parse_in_ex_patterns(args): 28 | patterns = [] 29 | i = 0 30 | while i < len(args.args): 31 | arg = args.args[i] 32 | 33 | if arg.startswith("--in="): 34 | patterns.append((True, arg.split("=", 1)[1])) 35 | 36 | elif arg.startswith("--ex="): 37 | patterns.append((False, arg.split("=", 1)[1])) 38 | 39 | elif arg == "--in": 40 | if i + 1 < len(args.args): 41 | patterns.append((True, args.args[i + 1])) 42 | i += 1 43 | 44 | elif arg == "--ex": 45 | if i + 1 < len(args.args): 46 |
patterns.append((False, args.args[i + 1])) 47 | i += 1 48 | 49 | i += 1 50 | 51 | return patterns 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /docs/_static/styles/my_theme.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | @import url("https://fonts.googleapis.com/css2?family=Mona+Sans:ital,wght@0,200..900;1,200..900&family=Geist:wght@100..900&family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&family=Outfit:wght@100..900&display=swap"); 3 | 4 | /*@media (min-width: 960px) { 5 | .bd-page-width { 6 | max-width: 120rem; 7 | } 8 | }*/ 9 | 10 | #rtd-footer-container { 11 | margin-top: 0 !important; 12 | } 13 | 14 | html[data-theme="light"] { 15 | --pst-color-table-row-hover-bg: #dfc6ff; 16 | --pst-color-link-hover: #845818; 17 | } 18 | 19 | html[data-theme="dark"] { 20 | --pst-color-table-row-hover-bg: #41296c; 21 | --pst-color-inline-code: #dd8cd4; 22 | } 23 | 24 | 25 | html[data-theme="dark"] dt:target { 26 | background-color: #4f4500; 27 | } 28 | 29 | html[data-theme="dark"] .linkcode-link { 30 | color: #9090ff; 31 | } 32 | 33 | html[data-theme="dark"] table.indextable tr.cap { 34 | background-color: #464646; 35 | } 36 | 37 | html[data-theme="dark"] a:visited { 38 | color: #9E67D0; 39 | } 40 | 41 | .navbar-brand .logo__title { 42 | font-family: "Mona Sans", sans-serif; 43 | font-size: 2.5rem; 44 | font-weight: 400; 45 | font-style: normal; 46 | } 47 | 48 | :root { 49 | --pst-font-family-monospace: "JetBrains Mono", monospace; 50 | --pst-font-family-heading: "Mona Sans", sans-serif; 51 | --pst-font-family-base: "Mona Sans", sans-serif; 52 | } 53 | 54 | body { 55 | font-weight: 450; 56 | } 57 | 58 | .bd-main .bd-content .bd-article-container { 59 | max-width: 100%; /* default is 60em */ 60 | } 61 | 62 | /*.bd-sidebar-primary { 63 | max-width: 20%; 64 | }*/ 65 | 66 | /* Ensure links in code blocks are underlined */ 67 | .highlight a { 68 | text-decoration: underline; 69 | color: #394198; /* Adjust color as needed */ 70 | } 71 | 72 | /* For additional emphasis, change hover effect */ 73 | .highlight a:hover { 74 | text-decoration: underline; 75 | color: #9090ff; 76 | } 77 | -------------------------------------------------------------------------------- /src/barecat/from_tar_stream.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import tarfile 4 | 5 | import barecat.core.barecat as barecat_ 6 | from barecat.common import BarecatDirInfo, BarecatFileInfo 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(description='Convert a tar stream to a barecat file') 11 | parser.add_argument('barecat_file', type=str, help='path to the target barecat file') 12 | parser.add_argument( 13 | '--shard-size-limit', 14 | type=str, 15 | default=None, 16 | help='maximum size of a shard in bytes (if not specified, ' 17 | 'all files will be concatenated into a single shard)', 18 | ) 19 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files') 20 | args = parser.parse_args() 21 | 22 | with barecat_.Barecat( 23 | args.barecat_file, 24 | shard_size_limit=args.shard_size_limit, 25 | readonly=False, 26 | overwrite=args.overwrite, 27 | ) as writer: 28 | with tarfile.open(fileobj=sys.stdin.buffer, mode='r|') as tar: 29 | for member in tar: 30 | if member.isdir(): 31 | dinfo = BarecatDirInfo( 32 | path=member.name, 33 | mode=member.mode, 34 |
uid=member.uid, 35 | gid=member.gid, 36 | mtime_ns=member.mtime * 1_000_000_000, 37 | ) 38 | writer.add(dinfo, dir_exist_ok=True) 39 | if member.isfile(): 40 | finfo = BarecatFileInfo( 41 | path=member.name, 42 | size=member.size, 43 | mode=member.mode, 44 | uid=member.uid, 45 | gid=member.gid, 46 | mtime_ns=member.mtime * 1_000_000_000, 47 | ) 48 | with tar.extractfile(member) as file_in_tar: 49 | writer.add(finfo, fileobj=file_in_tar) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /src/barecat/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions indicating various errors related to the use of Barecat archives""" 2 | 3 | 4 | class BarecatError(Exception): 5 | """Base class for all exceptions in Barecat""" 6 | 7 | def __init__(self, message: str): 8 | super().__init__(message) 9 | 10 | 11 | class FileExistsBarecatError(BarecatError): 12 | """Exception raised when trying to create a file that already exists 13 | 14 | Analogous to FileExistsError 15 | 16 | Args: 17 | path: path to the file that already exists 18 | """ 19 | 20 | def __init__(self, path: str): 21 | super().__init__(f'File already exists: {path}') 22 | 23 | 24 | class FileNotFoundBarecatError(BarecatError): 25 | """Exception raised when trying to access a file that does not exist 26 | 27 | Analogous to FileNotFoundError 28 | 29 | Args: 30 | path: path to the file that does not exist 31 | 32 | """ 33 | 34 | def __init__(self, path: str): 35 | super().__init__(f'File not found: {path}') 36 | 37 | 38 | class DirectoryNotEmptyBarecatError(BarecatError): 39 | """Exception raised when trying to delete a non-empty directory 40 | 41 | Args: 42 | path: path to the non-empty directory 43 | """ 44 | 45 | def __init__(self, path: str): 46 | super().__init__(f'Directory not empty: {path}') 47 | 48 | 49 | class IsADirectoryBarecatError(BarecatError): 50 | """Exception raised when trying to access a directory as a file. 
51 | 52 | Args: 53 | path: path to the directory 54 | 55 | """ 56 | 57 | def __init__(self, path: str): 58 | super().__init__(f'Is a directory: {path}') 59 | 60 | 61 | class NotADirectoryBarecatError(BarecatError): 62 | """Exception raised when trying to access a file as a directory.""" 63 | 64 | def __init__(self, message: str): 65 | super().__init__(message) 66 | 67 | 68 | class BarecatIntegrityError(BarecatError): 69 | """Exception raised when the CRC32C checksum of a file does not match the expected checksum""" 70 | 71 | def __init__(self, message: str): 72 | super().__init__(message) 73 | 74 | 75 | class NotEnoughSpaceBarecatError(BarecatError): 76 | """Exception raised when there is not enough space to write a file to the archive""" 77 | 78 | def __init__(self, message: str): 79 | super().__init__(message) 80 | -------------------------------------------------------------------------------- /src/barecat/__init__.py: -------------------------------------------------------------------------------- 1 | """Barecat is a fast random-access, mountable archive format for storing and accessing many small 2 | files.""" 3 | 4 | from .core.barecat import Barecat 5 | from .core.index import Index 6 | 7 | from .cli_impl import ( 8 | archive2barecat, 9 | barecat2archive, 10 | extract, 11 | merge, 12 | merge_symlink, 13 | read_index, 14 | write_index, 15 | ) 16 | from .common import ( 17 | BarecatFileInfo, 18 | BarecatDirInfo, 19 | BarecatEntryInfo, 20 | FileSection, 21 | Order, 22 | SHARD_SIZE_UNLIMITED, 23 | ) 24 | 25 | from .exceptions import ( 26 | BarecatError, 27 | BarecatIntegrityError, 28 | FileExistsBarecatError, 29 | FileNotFoundBarecatError, 30 | IsADirectoryBarecatError, 31 | NotEnoughSpaceBarecatError, 32 | DirectoryNotEmptyBarecatError, 33 | ) 34 | 35 | from .threadsafe import get_cached_reader 36 | 37 | 38 | def open(path, mode='r', auto_codec=False, threadsafe_reader=True): 39 | if mode == 'r': 40 | return Barecat(path, readonly=True, threadsafe=threadsafe_reader, auto_codec=auto_codec) 41 | elif mode == 'w+': 42 | return Barecat( 43 | path, 44 | readonly=False, 45 | overwrite=True, 46 | exist_ok=True, 47 | append_only=False, 48 | auto_codec=auto_codec, 49 | ) 50 | elif mode == 'r+': 51 | return Barecat( 52 | path, 53 | readonly=False, 54 | overwrite=False, 55 | exist_ok=True, 56 | append_only=False, 57 | auto_codec=auto_codec, 58 | ) 59 | elif mode == 'a+': 60 | return Barecat( 61 | path, 62 | readonly=False, 63 | overwrite=False, 64 | exist_ok=True, 65 | append_only=True, 66 | auto_codec=auto_codec, 67 | ) 68 | elif mode == 'ax+': 69 | return Barecat( 70 | path, 71 | readonly=False, 72 | overwrite=False, 73 | exist_ok=False, 74 | append_only=True, 75 | auto_codec=auto_codec, 76 | ) 77 | elif mode == 'x+': 78 | return Barecat( 79 | path, 80 | readonly=False, 81 | overwrite=False, 82 | exist_ok=False, 83 | append_only=False, 84 | auto_codec=auto_codec, 85 | ) 86 | else: 87 | raise ValueError(f"Invalid mode: {mode}") 88 | -------------------------------------------------------------------------------- /src/barecat/upgrade_database2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | 4 | import barecat 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser(description='Migrate index database to new version') 9 | parser.add_argument('path_in', type=str, help='Path to the old barecat') 10 | parser.add_argument('path_out', type=str, help='Path to the new barecat') 11 | 12 | args = parser.parse_args() 13 | 
upgrade_schema(args.path_in, args.path_out) 14 | 15 | 16 | def upgrade_schema(path_in: str, path_out: str): 17 | if os.path.exists(path_out + '-sqlite-index'): 18 | raise FileExistsError(f'Output path {path_out}-sqlite-index already exists') 19 | with barecat.Index(path_out + '-sqlite-index', readonly=False) as index_out: 20 | c = index_out.cursor 21 | c.execute('COMMIT') 22 | c.execute('PRAGMA foreign_keys=OFF') 23 | c.execute('PRAGMA synchronous=OFF') 24 | c.execute('PRAGMA journal_mode=OFF') 25 | c.execute(f'ATTACH DATABASE "file:{path_in}-sqlite-index?mode=ro" AS source') 26 | 27 | with index_out.no_triggers(), index_out.no_foreign_keys(): 28 | print('Migrating dir metadata...') 29 | c.execute( 30 | """ 31 | INSERT INTO dirs ( 32 | path, num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, 33 | mtime_ns) 34 | SELECT path, num_subdirs, num_files, num_files_tree, size_tree, mode, uid, 35 | gid, mtime_ns 36 | FROM source.dirs 37 | WHERE path != '' 38 | """ 39 | ) 40 | c.execute(""" 41 | UPDATE dirs 42 | SET (num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, mtime_ns) = 43 | (SELECT num_subdirs, num_files, num_files_tree, size_tree, mode, uid, gid, mtime_ns 44 | FROM source.dirs WHERE path = '') 45 | WHERE path = '' 46 | """) 47 | 48 | 49 | print('Migrating file metadata...') 50 | c.execute( 51 | f""" 52 | INSERT INTO files ( 53 | path, shard, offset, size, crc32c, mode, uid, gid, mtime_ns) 54 | SELECT path, shard, offset, size, crc32c, mode, uid, gid, mtime_ns 55 | FROM source.files 56 | """ 57 | ) 58 | 59 | c.execute( 60 | f""" 61 | INSERT OR REPLACE INTO config (key, value_text, value_int) 62 | SELECT key, value_text, value_int 63 | FROM source.config 64 | """ 65 | ) 66 | 67 | index_out.conn.commit() 68 | c.execute("DETACH DATABASE source") 69 | index_out.optimize() 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=64", 4 | "wheel", 5 | "setuptools_scm[toml]>=8" 6 | ] 7 | build-backend = "setuptools.build_meta" 8 | 9 | [project] 10 | name = "barecat" 11 | dynamic = ["version"] 12 | description = "Scalable archive format for storing millions of small files with random access and SQLite indexing." 
13 | readme = "README.md" 14 | requires-python = ">=3.9" 15 | license = { file = "LICENSE" } 16 | 17 | authors = [ 18 | { name = "István Sárándi", email = "istvan.sarandi@gmail.com" } 19 | ] 20 | 21 | dependencies = [ 22 | "multiprocessing-utils", 23 | "tqdm", 24 | "crc32c" 25 | ] 26 | 27 | keywords = [ 28 | "sqlite", 29 | "dataset", 30 | "storage", 31 | "archive", 32 | "random-access", 33 | "image-dataset", 34 | "filesystem", 35 | "key-value-store", 36 | "deep-learning", 37 | "data-loader", 38 | "file-indexing" 39 | ] 40 | 41 | classifiers = [ 42 | "Development Status :: 4 - Beta", 43 | "Intended Audience :: Developers", 44 | "Intended Audience :: Science/Research", 45 | "Topic :: Scientific/Engineering :: Information Analysis", 46 | "Topic :: Software Development :: Libraries", 47 | "Topic :: System :: Archiving", 48 | "Topic :: System :: Filesystems", 49 | "License :: OSI Approved :: MIT License", 50 | "Programming Language :: Python", 51 | "Programming Language :: Python :: 3", 52 | "Programming Language :: Cython", 53 | "Operating System :: POSIX :: Linux" 54 | ] 55 | 56 | [project.scripts] 57 | barecat-create = "barecat.cli:create" 58 | barecat-extract = "barecat.cli:extract" 59 | barecat-merge = "barecat.cli:merge" 60 | barecat-merge-symlink = "barecat.cli:merge_symlink" 61 | barecat-extract-single = "barecat.cli:extract_single" 62 | barecat-index-to-csv = "barecat.cli:index_to_csv" 63 | barecat-verify = "barecat.cli:verify_integrity" 64 | barecat-to-ncdu-json = "barecat.cli:print_ncdu_json" 65 | archive2barecat = "barecat.cli:archive2barecat" 66 | barecat2archive = "barecat.cli:barecat2archive" 67 | barecat-defrag = "barecat.cli:defrag" 68 | barecat-create-recursive = "barecat.cli:create_recursive" 69 | barecat-viewer = "barecat.viewerqt6:main" 70 | barecat-upgrade-database = "barecat.upgrade_database:main" 71 | 72 | [project.urls] 73 | Homepage = "https://github.com/isarandi/barecat" 74 | Documentation = "https://istvansarandi.com/docs/barecat/api/barecat/Barecat.html" 75 | Repository = "https://github.com/isarandi/barecat" 76 | Issues = "https://github.com/isarandi/barecat/issues" 77 | Author = "https://istvansarandi.com" 78 | 79 | [tool.setuptools_scm] 80 | version_scheme = "guess-next-dev" 81 | local_scheme = "no-local-version" 82 | write_to = "src/barecat/_version.py" 83 | 84 | [tool.setuptools] 85 | package-dir = { "" = "src" } 86 | 87 | [tool.setuptools.packages.find] 88 | where = ["src"] 89 | 90 | [tool.black] 91 | line-length = 99 92 | skip-string-normalization = true 93 | -------------------------------------------------------------------------------- /tests/test_barecat.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import barecat 4 | from barecat import Barecat, BarecatFileInfo, BarecatDirInfo 5 | import pytest 6 | import tempfile 7 | import os.path as osp 8 | 9 | 10 | def test_barecat(): 11 | tempdir = tempfile.mkdtemp() 12 | filepath = osp.join(tempdir, 'test.barecat') 13 | with barecat.Barecat(filepath, readonly=False) as bc: 14 | bc['some/path.txt'] = b'hello' 15 | 16 | with barecat.Barecat(filepath, readonly=True) as bc: 17 | assert bc['some/path.txt'] == b'hello' 18 | 19 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 20 | bc.add(BarecatFileInfo(path='some/path.txt', mode=0o666), data=b'hello world') 21 | bc.add(BarecatDirInfo(path='some/dir', mode=0o777)) 22 | 23 | with barecat.Barecat(filepath, readonly=True) as bc: 24 | assert bc['some/path.txt'] == b'hello world' 25 | assert 
bc.listdir('some/dir') == [] 26 | 27 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 28 | bc['some/path.txt'] = b'hello world' 29 | assert bc['some/path.txt'] == b'hello world' 30 | del bc['some/path.txt'] 31 | with pytest.raises(KeyError): 32 | a = bc['some/path.txt'] 33 | 34 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 35 | bc['some/path.txt'] = b'hello world' 36 | 37 | with barecat.Barecat(filepath, readonly=True) as bc: 38 | with bc.open('some/path.txt') as f: 39 | f.seek(6) 40 | assert f.read() == b'world' 41 | 42 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 43 | bc['dir/file.txt'] = b'Hello, world!' 44 | bc['dir/subdir/file2.txt'] = b'Hello, world2!' 45 | 46 | with barecat.Barecat(filepath, readonly=True) as bc: 47 | assert bc.listdir('dir/subdir') == ['file2.txt'] 48 | 49 | assert list(bc.walk('dir')) == [ 50 | ('dir', ['subdir'], ['file.txt']), 51 | ('dir/subdir', [], ['file2.txt']), 52 | ] 53 | 54 | with open(osp.join(tempdir, 'file.txt'), 'wb') as f: 55 | f.write(b'Hello, world!') 56 | os.mkdir(osp.join(tempdir, 'dir2')) 57 | 58 | with barecat.Barecat(filepath, readonly=False, overwrite=True) as bc: 59 | bc.add_by_path(osp.join(tempdir, 'file.txt')) 60 | bc.add_by_path(osp.join(tempdir, 'dir2'), store_path='dir') 61 | 62 | with barecat.Barecat(filepath, readonly=True) as bc: 63 | assert bc[osp.join(tempdir, 'file.txt')] == b'Hello, world!' 64 | assert bc.listdir('dir') == [] 65 | 66 | with Barecat(filepath, readonly=False, overwrite=True) as bc: 67 | bc.add(BarecatFileInfo(path='file.txt', mode=0o666), data=b'Hello, world!') 68 | bc.add(BarecatDirInfo(path='dir', mode=0o777)) 69 | 70 | with Barecat(filepath, readonly=True) as bc: 71 | assert bc['file.txt'] == b'Hello, world!' 72 | assert bc.listdir('dir') == [] 73 | 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | *_cython.c 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | -------------------------------------------------------------------------------- /src/barecat/upgrade_database.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path 3 | import sqlite3 4 | 5 | import barecat 6 | import barecat_cython 7 | from barecat.consumed_threadpool import ConsumedThreadPool 8 | from barecat.progbar import progressbar 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(description='Migrate index database to new version') 13 | parser.add_argument('path', type=str, help='Path to the old barecat') 14 | parser.add_argument( 15 | '--workers', type=int, default=8, help='Number of workers for calculating crc32c' 16 | ) 17 | 18 | args = parser.parse_args() 19 | dbase_path = args.path + '-sqlite-index' 20 | if not os.path.exists(dbase_path): 21 | raise FileNotFoundError(f'{dbase_path} does not exist!') 22 | 23 | os.rename(args.path + '-sqlite-index', args.path + '-sqlite-index.old') 24 | upgrade_schema(args.path) 25 | update_crc32c(args.path, workers=args.workers) 26 | 27 | 28 | def upgrade_schema(path: str): 29 | with barecat.Index(path + '-sqlite-index', readonly=False) as index_out: 30 | c = index_out.cursor 31 | c.execute('COMMIT') 32 | c.execute('PRAGMA foreign_keys=OFF') 33 | c.execute('PRAGMA synchronous=OFF') 34 | c.execute('PRAGMA journal_mode=OFF') 35 | c.execute('PRAGMA recursive_triggers=ON') 36 | c.execute(f'ATTACH DATABASE "file:{path}-sqlite-index.old?mode=ro" AS source') 37 | print('Migrating dir metadata...') 38 | c.execute( 39 | """ 40 | INSERT INTO dirs (path) 41 | SELECT path FROM source.directories 42 | WHERE path != '' 43 | """ 44 | ) 45 | print('Migrating file metadata...') 46 | c.execute( 47 | f""" 48 | INSERT INTO files (path, shard, offset, size) 49 | SELECT path, shard, offset, size 50 | FROM source.files 51 | """ 52 | ) 53 | 54 | c.execute('COMMIT') 55 | c.execute("DETACH DATABASE source") 56 | 57 | 58 | def update_crc32c(path_out: str, workers=8): 59 | with ( 60 | barecat_cython.BarecatMmapCython(path_out) as sh, 61 | barecat.Index(path_out + '-sqlite-index', readonly=False) as index, 62 | ): 63 | c = index.cursor 64 | c.execute('COMMIT') 65 | c.execute('PRAGMA synchronous=OFF') 66 | c.execute('PRAGMA journal_mode=OFF') 67 | index._triggers_enabled = False 68 | 69 | print('Calculating crc32c for all files to separate database...') 70 | path_newcrc_temp = f'{path_out}-sqlite-index-newcrc-temp' 71 | with ConsumedThreadPool( 72 | temp_crc_writer_main, 73 | main_args=(path_newcrc_temp,), 74 | max_workers=workers, 75 | queue_size=1024, 76 | ) as ctp: 77 | for fi in progressbar( 78 | index.iter_all_fileinfos(order=barecat.Order.ADDRESS), total=index.num_files 79 | ): 80 | ctp.submit( 81 | sh.crc32c_from_address, userdata=fi.path, args=(fi.shard, fi.offset, fi.size) 82 | ) 83 | 84 | print('Updating crc32c in the barecat index...') 85 | c.execute(f'ATTACH DATABASE "file:{path_newcrc_temp}?mode=ro" AS newdb') 86 | c.execute( 87 | """ 88 | UPDATE files 89 | SET crc32c=newdb.crc32c.crc32c 90 | FROM newdb.crc32c 91 | WHERE files.path=newdb.crc32c.path 92 | """ 93 | ) 94 | c.execute('COMMIT') 95 | c.execute('DETACH DATABASE newdb') 96 | 97 | os.remove(path_newcrc_temp) 98 | 99 | 100 | def temp_crc_writer_main(dbpath, future_iter): 101 | with sqlite3.connect(dbpath) as conn: 102 | c = conn.cursor() 103 | c.execute('PRAGMA synchronous=OFF') 104 | c.execute('PRAGMA journal_mode=OFF') 105 | c.execute("CREATE TABLE IF NOT EXISTS crc32c (path TEXT PRIMARY KEY, crc32c 
INTEGER)") 106 | for future in future_iter: 107 | path = future.userdata 108 | crc32c = future.result() 109 | c.execute("INSERT INTO crc32c (path, crc32c) VALUES (?, ?)", (path, crc32c)) 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /src/barecat/consumed_threadpool.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import os 3 | import queue 4 | import threading 5 | 6 | 7 | class ConsumedThreadPool: 8 | """This class solves a form of the producer-consumer problem. 9 | There is one main producer, whose items need to be processed in parallel by one of several 10 | workers, and finally the processed items are consumed by a single consumer thread. 11 | 12 | So the three steps are: 13 | 14 | 1. The main thread constructs this object, then iterates and calls submit() for each item, 15 | passing the appropriate processing function and arguments to submit(). 16 | 2. The workers process the items in parallel threads, these are the threads created by a 17 | ThreadPoolExecutor. 18 | 3. The consumer thread consumes the items, in the form of futures, running the consumer_main 19 | function originally passed to the constructor. 20 | 21 | The main producer's loop is meant to be computationally inexpensive, something that generates 22 | "tasks". The worker threads do the heavy lifting. 23 | The consumer does something that must happen in a serial manner or otherwise must happen in the 24 | same, single thread. 25 | 26 | Example: 27 | 28 | def producer_main(): 29 | with ConsumedThreadPool(consumer_main, main_args=('hello',), max_workers=8) as pool: 30 | for i in range(100): 31 | pool.submit(process_fn, userdata='anything', args=(i,)) 32 | 33 | def process_fn(i): 34 | return i * 2 35 | 36 | def consumer_main(greeting, future_iter): 37 | print(greeting) 38 | for future in future_iter: 39 | print(future.userdata) 40 | print(future.result()) 41 | """ 42 | 43 | def __init__( 44 | self, consumer_main, main_args=None, main_kwargs=None, max_workers=None, queue_size=None 45 | ): 46 | if max_workers is None: 47 | max_workers = len(os.sched_getaffinity(0)) 48 | if queue_size is None: 49 | queue_size = max_workers * 2 50 | self.q = queue.Queue(queue_size) 51 | self.semaphore = threading.Semaphore(queue_size) 52 | self.executor = concurrent.futures.ThreadPoolExecutor(max_workers) 53 | 54 | self.consumer_error_queue = queue.Queue() 55 | self.consumer_main = consumer_main 56 | 57 | if main_kwargs is None: 58 | main_kwargs = {} 59 | self.consumer_thread = threading.Thread( 60 | target=self._safe_consumer_main, args=(main_args, main_kwargs) 61 | ) 62 | self.consumer_thread.start() 63 | 64 | def _safe_consumer_main(self, main_args, main_kwargs): 65 | try: 66 | main_kwargs = {**main_kwargs, 'future_iter': IterableQueue(self.q)} 67 | self.consumer_main(*main_args, **main_kwargs) 68 | except Exception as e: 69 | self.consumer_error_queue.put(e) 70 | 71 | def submit(self, fn=None, userdata=None, args=None, kwargs=None): 72 | if not self.consumer_error_queue.empty(): 73 | consumer_exception = self.consumer_error_queue.get() 74 | raise RuntimeError('Consumer thread raised an exception') from consumer_exception 75 | 76 | self.semaphore.acquire() 77 | if args is None: 78 | args = () 79 | if kwargs is None: 80 | kwargs = {} 81 | if fn is None: 82 | fn = noop 83 | future = self.executor.submit(fn, *args, **kwargs) 84 | future.userdata = userdata 85 | 
future.add_done_callback(lambda f: self.semaphore.release()) 86 | future.add_done_callback(self.q.put) 87 | 88 | def close(self): 89 | self.executor.shutdown(wait=True) 90 | self.q.put(None) 91 | self.q.join() 92 | self.consumer_thread.join() 93 | 94 | if not self.consumer_error_queue.empty(): 95 | consumer_exception = self.consumer_error_queue.get() 96 | raise RuntimeError('Consumer thread raised an exception') from consumer_exception 97 | 98 | def __enter__(self): 99 | return self 100 | 101 | def __exit__(self, exc_type, exc_val, exc_tb): 102 | self.close() 103 | 104 | 105 | class IterableQueue: 106 | def __init__(self, q): 107 | self.q = q 108 | 109 | def __iter__(self): 110 | while (item := self.q.get()) is not None: 111 | yield item 112 | self.q.task_done() 113 | self.q.task_done() 114 | 115 | 116 | def noop(): 117 | pass 118 | -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/class.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.name }} 6 | {{ "=" * obj.name | length }} 7 | 8 | {% endif %} 9 | {% set visible_children = obj.children|selectattr("display")|list %} 10 | {% set own_page_children = visible_children|selectattr("type", "in", own_page_types)|list %} 11 | {% if is_own_page and own_page_children %} 12 | .. toctree:: 13 | :hidden: 14 | 15 | {% for child in own_page_children %} 16 | {{ child.include_path }} 17 | {% endfor %} 18 | 19 | {% endif %} 20 | .. py:{{ obj.type }}:: {% if is_own_page %}{{ obj.id }}{% else %}{{ obj.short_name }}{% endif %}{% if obj.args %}({{ obj.args }}){% endif %} 21 | 22 | {% for (args, return_annotation) in obj.overloads %} 23 | {{ " " * (obj.type | length) }} {{ obj.short_name }}{% if args %}({{ args }}){% endif %} 24 | 25 | {% endfor %} 26 | {% if obj.bases %} 27 | {% if "show-inheritance" in autoapi_options %} 28 | 29 | Bases: {% for base in obj.bases %}{{ base|link_objs }}{% if not loop.last %}, {% endif %}{% endfor %} 30 | {% endif %} 31 | 32 | 33 | {% if "show-inheritance-diagram" in autoapi_options and obj.bases != ["object"] %} 34 | .. autoapi-inheritance-diagram:: {{ obj.obj["full_name"] }} 35 | :parts: 1 36 | {% if "private-members" in autoapi_options %} 37 | :private-bases: 38 | {% endif %} 39 | 40 | {% endif %} 41 | {% endif %} 42 | {% if obj.docstring %} 43 | 44 | {{ obj.docstring|indent(3) }} 45 | {% endif %} 46 | {% for obj_item in visible_children %} 47 | {% if obj_item.type not in own_page_types %} 48 | 49 | {{ obj_item.render()|indent(3) }} 50 | {% endif %} 51 | {% endfor %} 52 | {% if is_own_page and own_page_children %} 53 | {% set visible_attributes = own_page_children|selectattr("type", "equalto", "attribute")|list %} 54 | {% if visible_attributes %} 55 | Attributes 56 | ---------- 57 | 58 | .. autoapisummary:: 59 | 60 | {% for attribute in visible_attributes %} 61 | {{ attribute.id }} 62 | {% endfor %} 63 | 64 | 65 | {% endif %} 66 | {% set visible_properties = own_page_children|selectattr("type", "equalto", "property")|list %} 67 | {% if visible_properties %} 68 | Properties 69 | ---------- 70 | 71 | .. autoapisummary:: 72 | 73 | {% for property in visible_properties %} 74 | {{ property.id }} 75 | {% endfor %} 76 | 77 | 78 | {% endif %} 79 | {% set visible_exceptions = own_page_children|selectattr("type", "equalto", "exception")|list %} 80 | {% if visible_exceptions %} 81 | Exceptions 82 | ---------- 83 | 84 | .. 
autoapisummary:: 85 | 86 | {% for exception in visible_exceptions %} 87 | {{ exception.id }} 88 | {% endfor %} 89 | 90 | 91 | {% endif %} 92 | {% set visible_classes = own_page_children|selectattr("type", "equalto", "class")|list %} 93 | {% if visible_classes %} 94 | Classes 95 | ------- 96 | 97 | .. autoapisummary:: 98 | 99 | {% for klass in visible_classes %} 100 | {{ klass.id }} 101 | {% endfor %} 102 | 103 | 104 | {% endif %} 105 | 106 | {% set static_methods = own_page_children|selectattr("type", "equalto", "method")|selectattr("properties", "defined")|selectattr("properties", "equalto", ["staticmethod"])|list %} 107 | {% set class_methods = own_page_children|selectattr("type", "equalto", "method")|selectattr("properties", "defined")|selectattr("properties", "equalto", ["classmethod"])|list %} 108 | {% set instance_methods = own_page_children|selectattr("type", "equalto", "method")|rejectattr("properties", "equalto", ["staticmethod"])|rejectattr("properties", "equalto", ["classmethod"])|list %} 109 | 110 | {% if instance_methods %} 111 | Instance Methods 112 | ---------------- 113 | 114 | .. autoapisummary:: 115 | 116 | {% for method in instance_methods %} 117 | {{ method.id }} 118 | {% endfor %} 119 | 120 | 121 | {% endif %} 122 | {% if class_methods %} 123 | Class Methods 124 | ------------- 125 | 126 | .. autoapisummary:: 127 | 128 | {% for method in class_methods %} 129 | {{ method.id }} 130 | {% endfor %} 131 | 132 | 133 | {% endif %} 134 | {% if static_methods %} 135 | Static Methods 136 | -------------- 137 | 138 | .. autoapisummary:: 139 | 140 | {% for method in static_methods %} 141 | {{ method.id }} 142 | {% endfor %} 143 | 144 | 145 | {% endif %} 146 | {% endif %} 147 | {% endif %} 148 | 149 | 150 | .. footbibliography:: -------------------------------------------------------------------------------- /docs/_templates/autoapi/python/module.rst: -------------------------------------------------------------------------------- 1 | :html_theme.sidebar_secondary.remove: true 2 | 3 | {% if obj.display %} 4 | {% if is_own_page %} 5 | {{ obj.id }} 6 | {{ "=" * obj.id|length }} 7 | 8 | .. py:module:: {{ obj.name }} 9 | 10 | {% if obj.docstring %} 11 | .. autoapi-nested-parse:: 12 | 13 | {{ obj.docstring|indent(3) }} 14 | 15 | {% endif %} 16 | 17 | {% block submodules %} 18 | {% set visible_subpackages = obj.subpackages|selectattr("display")|list %} 19 | {% set visible_submodules = obj.submodules|selectattr("display")|list %} 20 | {% set visible_submodules = (visible_subpackages + visible_submodules)|sort %} 21 | {% if visible_submodules %} 22 | Submodules 23 | ---------- 24 | 25 | .. toctree:: 26 | :maxdepth: 1 27 | 28 | {% for submodule in visible_submodules %} 29 | {{ submodule.include_path }} 30 | {% endfor %} 31 | 32 | 33 | {% endif %} 34 | {% endblock %} 35 | {% block content %} 36 | {% set visible_children = obj.children|selectattr("display")|list %} 37 | {% if visible_children %} 38 | {% set visible_attributes = visible_children|selectattr("type", "equalto", "data")|list %} 39 | {% if visible_attributes %} 40 | {% if "attribute" in own_page_types or "show-module-summary" in autoapi_options %} 41 | Attributes 42 | ---------- 43 | 44 | {% if "attribute" in own_page_types %} 45 | .. toctree:: 46 | :hidden: 47 | 48 | {% for attribute in visible_attributes %} 49 | {{ attribute.include_path }} 50 | {% endfor %} 51 | 52 | {% endif %} 53 | .. 
autoapisummary:: 54 | 55 | {% for attribute in visible_attributes %} 56 | {{ attribute.id }} 57 | {% endfor %} 58 | {% endif %} 59 | 60 | 61 | {% endif %} 62 | {% set visible_exceptions = visible_children|selectattr("type", "equalto", "exception")|list %} 63 | {% if visible_exceptions %} 64 | {% if "exception" in own_page_types or "show-module-summary" in autoapi_options %} 65 | Exceptions 66 | ---------- 67 | 68 | {% if "exception" in own_page_types %} 69 | .. toctree:: 70 | :hidden: 71 | 72 | {% for exception in visible_exceptions %} 73 | {{ exception.include_path }} 74 | {% endfor %} 75 | 76 | {% endif %} 77 | .. autoapisummary:: 78 | 79 | {% for exception in visible_exceptions %} 80 | {{ exception.id }} 81 | {% endfor %} 82 | {% endif %} 83 | 84 | 85 | {% endif %} 86 | {% set visible_classes = visible_children|selectattr("type", "equalto", "class")|list %} 87 | {% if visible_classes %} 88 | {% if "class" in own_page_types or "show-module-summary" in autoapi_options %} 89 | Classes 90 | ------- 91 | 92 | {% if "class" in own_page_types %} 93 | .. toctree:: 94 | :hidden: 95 | 96 | {% for klass in visible_classes %} 97 | {{ klass.include_path }} 98 | {% endfor %} 99 | 100 | {% endif %} 101 | .. autoapisummary:: 102 | 103 | {% for klass in visible_classes %} 104 | {{ klass.id }} 105 | {% endfor %} 106 | {% endif %} 107 | 108 | 109 | {% endif %} 110 | {% set visible_functions = visible_children|selectattr("type", "equalto", "function")|list %} 111 | {% if visible_functions %} 112 | {% if "function" in own_page_types or "show-module-summary" in autoapi_options %} 113 | Functions 114 | --------- 115 | 116 | {% if "function" in own_page_types %} 117 | .. toctree:: 118 | :hidden: 119 | 120 | {% for function in visible_functions %} 121 | {{ function.include_path }} 122 | {% endfor %} 123 | 124 | {% endif %} 125 | .. autoapisummary:: 126 | 127 | {% for function in visible_functions %} 128 | {{ function.id }} 129 | {% endfor %} 130 | {% endif %} 131 | 132 | 133 | {% endif %} 134 | {% set this_page_children = visible_children|rejectattr("type", "in", own_page_types)|list %} 135 | {% if this_page_children %} 136 | {{ obj.type|title }} Contents 137 | {{ "-" * obj.type|length }}--------- 138 | 139 | {% for obj_item in this_page_children %} 140 | {{ obj_item.render()|indent(0) }} 141 | {% endfor %} 142 | {% endif %} 143 | {% endif %} 144 | {% endblock %} 145 | {% else %} 146 | .. py:module:: {{ obj.name }} 147 | 148 | {% if obj.docstring %} 149 | .. autoapi-nested-parse:: 150 | 151 | {{ obj.docstring|indent(6) }} 152 | 153 | {% endif %} 154 | {% for obj_item in visible_children %} 155 | {{ obj_item.render()|indent(3) }} 156 | {% endfor %} 157 | {% endif %} 158 | {% endif %} 159 | 160 | .. footbibliography:: -------------------------------------------------------------------------------- /src/barecat/glob_to_regex.py: -------------------------------------------------------------------------------- 1 | # This is copied from CPython main branch as of 2024-12-07. 2 | import re 3 | import os.path 4 | import functools 5 | 6 | _re_setops_sub = re.compile(r'([&~|])').sub 7 | _re_escape = functools.lru_cache(maxsize=512)(re.escape) 8 | 9 | 10 | def glob_to_regex(pat, *, recursive=False, include_hidden=False, seps=None): 11 | """Translate a pathname with shell wildcards to a regular expression. 12 | 13 | If `recursive` is true, the pattern segment '**' will match any number of 14 | path segments. 15 | 16 | If `include_hidden` is true, wildcards can match path segments beginning 17 | with a dot ('.'). 
18 | 19 | If a sequence of separator characters is given to `seps`, they will be 20 | used to split the pattern into segments and match path separators. If not 21 | given, os.path.sep and os.path.altsep (where available) are used. 22 | """ 23 | if not seps: 24 | if os.path.altsep: 25 | seps = (os.path.sep, os.path.altsep) 26 | else: 27 | seps = os.path.sep 28 | escaped_seps = ''.join(map(re.escape, seps)) 29 | any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps 30 | not_sep = f'[^{escaped_seps}]' 31 | if include_hidden: 32 | one_last_segment = f'{not_sep}+' 33 | one_segment = f'{one_last_segment}{any_sep}' 34 | any_segments = f'(?:.+{any_sep})?' 35 | any_last_segments = '.*' 36 | else: 37 | one_last_segment = f'[^{escaped_seps}.]{not_sep}*' 38 | one_segment = f'{one_last_segment}{any_sep}' 39 | any_segments = f'(?:{one_segment})*' 40 | any_last_segments = f'{any_segments}(?:{one_last_segment})?' 41 | 42 | results = [] 43 | parts = re.split(any_sep, pat) 44 | last_part_idx = len(parts) - 1 45 | for idx, part in enumerate(parts): 46 | if part == '*': 47 | results.append(one_segment if idx < last_part_idx else one_last_segment) 48 | elif recursive and part == '**': 49 | if idx < last_part_idx: 50 | if parts[idx + 1] != '**': 51 | results.append(any_segments) 52 | else: 53 | results.append(any_last_segments) 54 | else: 55 | if part: 56 | if not include_hidden and part[0] in '*?': 57 | results.append(r'(?!\.)') 58 | results.extend(_translate(part, f'{not_sep}*', not_sep)[0]) 59 | if idx < last_part_idx: 60 | results.append(any_sep) 61 | res = ''.join(results) 62 | return fr'(?s:{res})\Z' 63 | 64 | 65 | def _translate(pat, star, question_mark): 66 | res = [] 67 | add = res.append 68 | star_indices = [] 69 | 70 | i, n = 0, len(pat) 71 | while i < n: 72 | c = pat[i] 73 | i = i + 1 74 | if c == '*': 75 | # store the position of the wildcard 76 | star_indices.append(len(res)) 77 | add(star) 78 | # compress consecutive `*` into one 79 | while i < n and pat[i] == '*': 80 | i += 1 81 | elif c == '?': 82 | add(question_mark) 83 | elif c == '[': 84 | j = i 85 | if j < n and pat[j] == '!': 86 | j = j + 1 87 | if j < n and pat[j] == ']': 88 | j = j + 1 89 | while j < n and pat[j] != ']': 90 | j = j + 1 91 | if j >= n: 92 | add('\\[') 93 | else: 94 | stuff = pat[i:j] 95 | if '-' not in stuff: 96 | stuff = stuff.replace('\\', r'\\') 97 | else: 98 | chunks = [] 99 | k = i + 2 if pat[i] == '!' else i + 1 100 | while True: 101 | k = pat.find('-', k, j) 102 | if k < 0: 103 | break 104 | chunks.append(pat[i:k]) 105 | i = k + 1 106 | k = k + 3 107 | chunk = pat[i:j] 108 | if chunk: 109 | chunks.append(chunk) 110 | else: 111 | chunks[-1] += '-' 112 | # Remove empty ranges -- invalid in RE. 113 | for k in range(len(chunks) - 1, 0, -1): 114 | if chunks[k - 1][-1] > chunks[k][0]: 115 | chunks[k - 1] = chunks[k - 1][:-1] + chunks[k][1:] 116 | del chunks[k] 117 | # Escape backslashes and hyphens for set difference (--). 118 | # Hyphens that create ranges shouldn't be escaped. 119 | stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') for s in chunks) 120 | i = j + 1 121 | if not stuff: 122 | # Empty range: never match. 123 | add('(?!)') 124 | elif stuff == '!': 125 | # Negated empty range: match any character. 126 | add('.') 127 | else: 128 | # Escape set operations (&&, ~~ and ||). 
129 |                     stuff = _re_setops_sub(r'\\\1', stuff)
130 |                     if stuff[0] == '!':
131 |                         stuff = '^' + stuff[1:]
132 |                     elif stuff[0] in ('^', '['):
133 |                         stuff = '\\' + stuff
134 |                     add(f'[{stuff}]')
135 |         else:
136 |             add(_re_escape(c))
137 |     assert i == n
138 |     return res, star_indices
139 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Barecat
2 | 
3 | **[Full API Reference Docs](https://istvansarandi.com/docs/barecat/api/barecat/Barecat.html)**
4 | 
5 | Barecat (**bare** con**cat**enation) is a highly scalable, simple aggregate storage format for
6 | storing many (tens of millions and more) small files, with a focus on fast random access and
7 | minimal overhead.
8 | 
9 | Barecat can be thought of as a simple filesystem, or as something akin to an indexed tarball, or a
10 | key-value store. Indeed, it can be [mounted via FUSE](https://github.com/isarandi/barecat-mount), converted to a tarball, or used like a dictionary
11 | within Python.
12 | 
13 | Barecat associates strings (file paths) with binary data (file contents). It's like a dictionary,
14 | but it has some special handling for '/' characters in the keys, supporting a filesystem-like
15 | experience (`listdir`, `walk`, `glob`, etc.).
16 | 
17 | Internally, all the data is simply concatenated one after another into one or more data shard files.
18 | Additionally, an index is maintained in an SQLite database, which stores the shard number, the offset
19 | and the size of each inner file (as well as a checksum, and further filesystem-like metadata
20 | such as the modification time). Barecat also maintains aggregate statistics for each directory, such as the
21 | total number of files and total file size.
22 | 
23 | 
24 | ![Architecture](./figure.png)
25 | 
26 | As you can see, the Barecat format is very simple. Readers/writers are easy to write in any language, since
27 | SQLite is a widely supported format.
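As an illustration, a complete standalone reader fits in a few lines of plain Python. The sketch below is not part of the package; it assumes the index/shard file naming shown in the CLI section further down (`mydata.barecat-sqlite-index`, `mydata.barecat-shard-00000`, ...) and the `files` table defined in `src/barecat/sql/schema.sql`. Note that stored paths have no leading slash.

```python
import sqlite3


def read_stored_file(archive_path, inner_path):
    # Look up which shard holds the file, and at what offset and size.
    conn = sqlite3.connect(f'{archive_path}-sqlite-index')
    try:
        row = conn.execute(
            'SELECT shard, offset, size FROM files WHERE path = ?',
            (inner_path,)).fetchone()
    finally:
        conn.close()
    if row is None:
        raise FileNotFoundError(inner_path)
    shard, offset, size = row
    # A shard is just raw concatenated file contents: seek and read.
    with open(f'{archive_path}-shard-{shard:05d}', 'rb') as f:
        f.seek(offset)
        return f.read(size)
```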
28 | 
29 | 
30 | ## Background
31 | 
32 | A typical use case for Barecat is storing image files for training deep learning models, where the
33 | files are accessed randomly during training. The files are typically stored on a network file
34 | system, where accessing many small files can be slow, and clusters often limit the number of
35 | files a user may store. It is therefore necessary to merge the small files into larger ones.
36 | However, typical archive formats such as tar are not suitable, since they don't allow fast random
37 | lookups. In tar, one has to scan the entire archive, as there is no central directory.
38 | Zip is better, but still requires scanning the central directory, which can be slow for very large
39 | archives with millions or tens of millions of files.
40 | 
41 | To support very large datasets, we need an index into the archive, and the index itself must not
42 | have to be loaded entirely into memory.
43 | 
44 | Therefore, in this format the metadata is indexed separately in an SQLite database for fast lookup
45 | by path. The index also allows fast listing of directory contents and contains aggregate
46 | statistics (total file size, number of files) for each directory.
47 | 
48 | ## Features
49 | 
50 | - **Fast random access**: The archive can be accessed randomly, addressed by filepath,
51 | without having to scan the entire archive or all the metadata.
52 | The index is stored in a separate SQLite database file, which itself does not need to be loaded
53 | entirely into memory. Ideal for storing training image data for deep learning jobs.
54 | - **Sharding**: To make it easier to move the data around or to distribute it across multiple
55 | storage devices, the archive can be split into multiple files of equal size (shards, or volumes).
56 | The shards do not have to be concatenated to be used; the library keeps all shard files open
57 | and loads data from the appropriate one during normal operations.
58 | - **Fast browsing**: The SQLite database contains an index on the parent directories, allowing
59 | fast listing of directory contents and aggregate statistics (total file size, number of files).
60 | - **Intuitive API**: Familiar filesystem-like API, as well as a dictionary-like one.
61 | - **Mountable**: The archive can be efficiently mounted in readonly or read-write mode.
62 | - **Simple storage format**: The files are simply concatenated after each other, and the index contains
63 | the offsets and sizes of each file. There is no header format to understand. The index can be
64 | dumped into any format with simple SQL queries.
65 | 
66 | ## Command line interface
67 | 
68 | To create a Barecat archive, use the `barecat-create` or `barecat-create-recursive` commands, which
69 | are executables installed automatically with the pip package.
70 | 
71 | ```bash
72 | barecat-create --file=mydata.barecat --shard-size=100G < path_of_paths.txt
73 | 
74 | find dirname -name '*.jpg' -print0 | barecat-create --null --file=mydata.barecat --shard-size=100G
75 | 
76 | barecat-create-recursive dir1 dir2 dir3 --file=mydata.barecat --shard-size=100G
77 | ```
78 | 
79 | This may yield the following files:
80 | 
81 | ```
82 | mydata.barecat-shard-00000
83 | mydata.barecat-shard-00001
84 | mydata.barecat-sqlite-index
85 | ```
86 | 
87 | The files can be extracted again. Unix-like permissions, modification times and owner info are
88 | preserved.
89 | 
90 | ```bash
91 | barecat-extract --file=mydata.barecat --target-directory=targetdir/
92 | ```
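Besides creation and extraction, the pip package installs further maintenance commands (entry points defined in `src/barecat/cli.py`); for example, the ones exercised by the test suite:

```bash
barecat-verify mydata.barecat        # check CRC32C checksums, directory stats and gaps
barecat-defrag mydata.barecat        # reclaim gaps left by deleted files
barecat-index-to-csv mydata.barecat-sqlite-index > index.csv
barecat-extract-single --barecat-file=mydata.barecat --path=path/inside/archive.jpg > out.jpg
```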
93 | 
94 | ## Python API
95 | 
96 | ```python
97 | 
98 | import barecat
99 | 
100 | with barecat.Barecat('mydata.barecat', readonly=False) as bc:
101 |     bc['path/to/file/as/stored.jpg'] = binary_file_data
102 |     bc.add_by_path('path/to/file/on/disk.jpg')
103 | 
104 |     with open('path', 'rb') as f:
105 |         bc.add('path/to/file/on/disk.jpg', fileobj=f)
106 | 
107 | with barecat.Barecat('mydata.barecat') as bc:
108 |     binary_file_data = bc['path/to/file.jpg']
109 |     entrynames = bc.listdir('path/to')
110 |     for root, dirs, files in bc.walk('path/to/something'):
111 |         print(root, dirs, files)
112 | 
113 |     paths = bc.glob('path/to/**/*.jpg', recursive=True)
114 | 
115 |     with bc.open('path/to/file.jpg', 'rb') as f:
116 |         data = f.read(123)
117 | ```
118 | 
119 | ## Image viewer
120 | 
121 | Barecat comes with a simple image viewer that can be used to browse the contents of a Barecat
122 | archive.
123 | 
124 | ```bash
125 | barecat-image-viewer mydata.barecat
126 | ```
127 | 
128 | 
129 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Barecat
2 | =======
3 | 
4 | Barecat (**bare** con**cat**enation) is a highly scalable, simple aggregate storage format for
5 | storing many (tens of millions and more) small files, with a focus on fast random access and
6 | minimal overhead.
7 | 
8 | Barecat can be thought of as a simple filesystem, or as something akin to an indexed tarball, or a
9 | key-value store. Indeed, it can be `mounted via FUSE <https://github.com/isarandi/barecat-mount>`_, converted to a tarball, or used like a dictionary
10 | within Python.
11 | 
12 | Barecat associates strings (file paths) with binary data (file contents). It's like a dictionary,
13 | but it has some special handling for '/' characters in the keys, supporting a filesystem-like
14 | experience (``listdir``, ``walk``, ``glob``, etc.).
15 | 
16 | Internally, all the data is simply concatenated one after another into one or more data shard files.
17 | Additionally, an index is maintained in an SQLite database, which stores the shard number, the offset
18 | and the size of each inner file (as well as a checksum, and further filesystem-like metadata
19 | such as the modification time). Barecat also maintains aggregate statistics for each directory, such as the
20 | total number of files and total file size.
21 | 
22 | .. image:: ../figure.png
23 | 
24 | As you can see, the Barecat format is very simple. Readers/writers are easy to write in any language, since
25 | SQLite is a widely supported format.
26 | 
27 | Background
28 | ----------
29 | 
30 | A typical use case for Barecat is storing image files for training deep learning models, where the
31 | files are accessed randomly during training. The files are typically stored on a network file
32 | system, where accessing many small files can be slow, and clusters often limit the number of
33 | files a user may store. It is therefore necessary to merge the small files into larger ones.
34 | However, typical archive formats such as tar are not suitable, since they don't allow fast random
35 | lookups. In tar, one has to scan the entire archive, as there is no central directory.
36 | Zip is better, but still requires scanning the central directory, which can be slow for very large
37 | archives with millions or tens of millions of files.
38 | 
39 | To support very large datasets, we need an index into the archive, and the index itself must not
40 | have to be loaded entirely into memory.
41 | 
42 | Therefore, in this format the metadata is indexed separately in an SQLite database for fast lookup
43 | by path. The index also allows fast listing of directory contents and contains aggregate
44 | statistics (total file size, number of files) for each directory.
45 | 
46 | Features
47 | --------
48 | 
49 | - **Fast random access**: The archive can be accessed randomly, addressed by filepath,
50 | without having to scan the entire archive or all the metadata.
51 | The index is stored in a separate SQLite database file, which itself does not need to be loaded
52 | entirely into memory. Ideal for storing training image data for deep learning jobs.
53 | - **Sharding**: To make it easier to move the data around or to distribute it across multiple
54 | storage devices, the archive can be split into multiple files of equal size (shards, or volumes).
55 | The shards do not have to be concatenated to be used; the library keeps all shard files open
56 | and loads data from the appropriate one during normal operations.
57 | - **Fast browsing**: The SQLite database contains an index on the parent directories, allowing
58 | fast listing of directory contents and aggregate statistics (total file size, number of files).
59 | - **Intuitive API**: Familiar filesystem-like API, as well as a dictionary-like one.
60 | - **Mountable**: The archive can be efficiently mounted in readonly or read-write mode.
61 | - **Simple storage format**: The files are simply concatenated after each other, and the index contains
62 | the offsets and sizes of each file. There is no header format to understand. The index can be
63 | dumped into any format with simple SQL queries, as shown in the example below.
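For example, a CSV dump of the whole index needs nothing beyond the stock ``sqlite3`` shell (a sketch; the ``files`` table and its columns are defined in ``src/barecat/sql/schema.sql``):

.. code-block:: bash

    sqlite3 -header -csv mydata.barecat-sqlite-index \
        'SELECT path, shard, offset, size, crc32c FROM files ORDER BY path' > index.csv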
64 | 
65 | Command line interface
66 | ----------------------
67 | 
68 | To create a Barecat archive, use the ``barecat-create`` or ``barecat-create-recursive`` commands, which
69 | are executables installed automatically with the pip package.
70 | 
71 | .. code-block:: bash
72 | 
73 |     barecat-create --file=mydata.barecat --shard-size=100G < path_of_paths.txt
74 | 
75 |     find dirname -name '*.jpg' -print0 | barecat-create --null --file=mydata.barecat --shard-size=100G
76 | 
77 |     barecat-create-recursive dir1 dir2 dir3 --file=mydata.barecat --shard-size=100G
78 | 
79 | This may yield the following files:
80 | 
81 | .. code-block:: text
82 | 
83 |     mydata.barecat-shard-00000
84 |     mydata.barecat-shard-00001
85 |     mydata.barecat-sqlite-index
86 | 
87 | The files can be extracted again. Unix-like permissions, modification times and owner info are
88 | preserved.
89 | 
90 | .. code-block:: bash
91 | 
92 |     barecat-extract --file=mydata.barecat --target-directory=targetdir/
93 | 
94 | Python API
95 | ----------
96 | 
97 | .. code-block:: python
98 | 
99 |     import barecat
100 | 
101 |     with barecat.Barecat('mydata.barecat', readonly=False) as bc:
102 |         bc['path/to/file/as/stored.jpg'] = binary_file_data
103 |         bc.add_by_path('path/to/file/on/disk.jpg')
104 | 
105 |         with open('path', 'rb') as f:
106 |             bc.add('path/to/file/on/disk.jpg', fileobj=f)
107 | 
108 |     with barecat.Barecat('mydata.barecat') as bc:
109 |         binary_file_data = bc['path/to/file.jpg']
110 |         entrynames = bc.listdir('path/to')
111 |         for root, dirs, files in bc.walk('path/to/something'):
112 |             print(root, dirs, files)
113 | 
114 |         paths = bc.glob('path/to/**/*.jpg', recursive=True)
115 | 
116 |         with bc.open('path/to/file.jpg', 'rb') as f:
117 |             data = f.read(123)
118 | 
119 | Image viewer
120 | ------------
121 | 
122 | Barecat comes with a simple image viewer that can be used to browse the contents of a Barecat
123 | archive.
124 | 
125 | .. code-block:: bash
126 | 
127 |     barecat-image-viewer mydata.barecat
128 | 
129 | Sitemap
130 | -------
131 | 
132 | ..
toctree:: 133 | :maxdepth: 3 134 | :caption: Contents 135 | 136 | 137 | * :ref:`genindex` 138 | * :ref:`modindex` 139 | * :ref:`search` 140 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import types 2 | import contextlib 3 | import importlib 4 | import inspect 5 | import os 6 | import re 7 | import sys 8 | from enum import Enum 9 | 10 | import setuptools_scm 11 | import toml 12 | 13 | sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) 14 | 15 | 16 | pyproject_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "pyproject.toml")) 17 | 18 | with open(pyproject_path) as f: 19 | data = toml.load(f) 20 | 21 | project_info = data["project"] 22 | project_slug = project_info["name"].replace(" ", "-").lower() 23 | tool_urls = project_info.get("urls", {}) 24 | 25 | repo_url = tool_urls.get("Repository", "") 26 | author_url = tool_urls.get("Author", "") 27 | github_username = re.match(r"https://github\.com/([^/]+)/?", repo_url)[1] 28 | 29 | project = project_info["name"] 30 | release = setuptools_scm.get_version('..') 31 | version = ".".join(release.split(".")[:2]) 32 | main_module_name = project_slug.replace('-', '_') 33 | repo_name = project_slug 34 | module = importlib.import_module(main_module_name) 35 | globals()[main_module_name] = module 36 | 37 | 38 | # -- Project information ----------------------------------------------------- 39 | linkcode_url = repo_url 40 | 41 | author = project_info["authors"][0]["name"] 42 | copyright = f'%Y' 43 | 44 | # -- General configuration --------------------------------------------------- 45 | add_module_names = False 46 | python_use_unqualified_type_names = True 47 | extensions = [ 48 | 'sphinx.ext.autodoc', 49 | 'sphinx.ext.napoleon', 50 | 'sphinx.ext.autosummary', 51 | 'sphinx.ext.intersphinx', 52 | 'sphinx.ext.linkcode', 53 | 'sphinx.ext.autodoc.typehints', 54 | 'sphinxcontrib.bibtex', 55 | 'autoapi.extension', 56 | 'sphinx.ext.inheritance_diagram', 57 | 'sphinx_codeautolink', 58 | ] 59 | bibtex_bibfiles = ['abbrev_long.bib', 'references.bib'] 60 | bibtex_footbibliography_header = ".. 
rubric:: References" 61 | intersphinx_mapping = { 62 | 'python': ('https://docs.python.org/3', None), 63 | 'torch': ('https://pytorch.org/docs/main/', None), 64 | 'numpy': ('https://numpy.org/doc/stable/', None), 65 | 'scipy': ('https://docs.scipy.org/doc/scipy/', None), 66 | } 67 | 68 | github_username = github_username 69 | github_repository = repo_name 70 | autodoc_show_sourcelink = False 71 | html_show_sourcelink = False 72 | 73 | templates_path = ['_templates'] 74 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 75 | python_display_short_literal_types = True 76 | 77 | html_title = project 78 | html_theme = 'pydata_sphinx_theme' 79 | html_theme_options = { 80 | "show_toc_level": 3, 81 | "icon_links": [ 82 | { 83 | "name": "GitHub", 84 | "url": repo_url, 85 | "icon": "fa-brands fa-square-github", 86 | "type": "fontawesome", 87 | } 88 | ], 89 | } 90 | html_static_path = ['_static'] 91 | html_css_files = ['styles/my_theme.css'] 92 | 93 | html_context = { 94 | "author_url": author_url, 95 | "author": author, 96 | } 97 | 98 | toc_object_entries_show_parents = "hide" 99 | 100 | autoapi_root = 'api' 101 | autoapi_member_order = 'bysource' 102 | autodoc_typehints = 'description' 103 | autoapi_own_page_level = 'attribute' 104 | autoapi_type = 'python' 105 | autodoc_default_options = { 106 | 'members': True, 107 | 'inherited-members': True, 108 | 'undoc-members': False, 109 | 'exclude-members': '__init__, __weakref__, __repr__, __str__', 110 | } 111 | autoapi_options = ['members', 'show-inheritance', 'special-members', 'show-module-summary'] 112 | autoapi_add_toctree_entry = True 113 | autoapi_dirs = ['../src'] 114 | autoapi_template_dir = '_templates/autoapi' 115 | 116 | autodoc_member_order = 'bysource' 117 | autoclass_content = 'class' 118 | 119 | autosummary_generate = True 120 | autosummary_imported_members = False 121 | 122 | 123 | def autodoc_skip_member(app, what, name, obj, skip, options): 124 | """ 125 | Skip members (functions, classes, modules) without docstrings. 
126 | """ 127 | # Check if the object has a __doc__ attribute 128 | if not getattr(obj, 'docstring', None): 129 | print('no docstring', name) 130 | return True # Skip if there's no docstring 131 | elif what in ('class', 'function', 'attribute'): 132 | # Check if the module of the class has a docstring 133 | print('checking module', name) 134 | module_name = '.'.join(name.split('.')[:-1]) 135 | 136 | try: 137 | module = importlib.import_module(module_name) 138 | return not getattr(module, '__doc__', None) 139 | except ModuleNotFoundError as e: 140 | print('module not found', module_name, str(e)) 141 | return None 142 | 143 | 144 | def linkcode_resolve(domain, info): 145 | if domain != 'py': 146 | return None 147 | 148 | file, start, end = get_line_numbers(eval(info['fullname'])) 149 | relpath = os.path.relpath(file, os.path.dirname(module.__file__)) 150 | return f'{repo_url}/blob/v{release}/src/{main_module_name}/{relpath}#L{start}-L{end}' 151 | 152 | 153 | def get_line_numbers(obj): 154 | if isinstance(obj, property): 155 | obj = obj.fget 156 | 157 | if isinstance(obj, Enum): 158 | return get_enum_member_line_numbers(obj) 159 | 160 | if inspect.ismemberdescriptor(obj): 161 | return get_member_line_numbers(obj) 162 | 163 | with module_restored(obj): 164 | lines = inspect.getsourcelines(obj) 165 | file = inspect.getsourcefile(obj) 166 | 167 | start, end = lines[1], lines[1] + len(lines[0]) - 1 168 | return file, start, end 169 | 170 | 171 | def get_enum_member_line_numbers(obj): 172 | class_ = obj.__class__ 173 | with module_restored(class_): 174 | source_lines, start_line = inspect.getsourcelines(class_) 175 | 176 | for i, line in enumerate(source_lines): 177 | if f"{obj.name} =" in line: 178 | return inspect.getsourcefile(class_), start_line + i, start_line + i 179 | else: 180 | raise ValueError(f"Enum member {obj.name} not found in {class_}") 181 | 182 | 183 | def get_member_line_numbers(obj: types.MemberDescriptorType): 184 | class_ = obj.__objclass__ 185 | with module_restored(class_): 186 | source_lines, start_line = inspect.getsourcelines(class_) 187 | 188 | for i, line in enumerate(source_lines): 189 | if f"{obj.__name__} = " in line: 190 | return inspect.getsourcefile(class_), start_line + i, start_line + i 191 | else: 192 | raise ValueError(f"Member {obj.__name__} not found in {class_}") 193 | 194 | 195 | @contextlib.contextmanager 196 | def module_restored(obj): 197 | if not hasattr(obj, '_module_original_'): 198 | yield 199 | else: 200 | fake_module = obj.__module__ 201 | obj.__module__ = obj._module_original_ 202 | yield 203 | obj.__module__ = fake_module 204 | 205 | 206 | def setup(app): 207 | app.connect('autoapi-skip-member', autodoc_skip_member) 208 | app.connect('autodoc-skip-member', autodoc_skip_member) 209 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import barecat 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def temp_jpeg_dir(tmp_path): 9 | """ 10 | Creates a complex temporary directory with sample JPEG files. 
11 |     """
12 |     (tmp_path / "dir1").mkdir()
13 |     (tmp_path / "dir1/subdir1").mkdir()
14 |     (tmp_path / "dir1/subdir1/test1.jpg").write_bytes(b"dummy data1")
15 |     (tmp_path / "dir1/subdir2").mkdir()
16 |     (tmp_path / "dir1/subdir2/test2.jpg").write_bytes(b"dummy data2")
17 |     (tmp_path / "dir2").mkdir()
18 |     (tmp_path / "dir2/test3.jpg").write_bytes(b"dummy data3")
19 |     (tmp_path / "dir2/empty_subdir").mkdir()
20 |     (tmp_path / "dir3").mkdir()
21 |     return tmp_path
22 | 
23 | 
24 | @pytest.fixture
25 | def barecat_archive(temp_jpeg_dir):
26 |     """
27 |     Creates a standard Barecat archive for testing.
28 |     """
29 |     archive_file = temp_jpeg_dir / "mydata.barecat"
30 | 
31 |     create_cmd = [
32 |         "barecat-create-recursive",
33 |         "--file", str(archive_file),
34 |         "--overwrite",
35 |         str(temp_jpeg_dir / "dir1"),
36 |         str(temp_jpeg_dir / "dir2"),
37 |         str(temp_jpeg_dir / "dir3"),
38 |         '--shard-size=22'
39 |     ]
40 |     subprocess.run(create_cmd, check=True)
41 | 
42 |     return archive_file
43 | 
44 | 
45 | def test_barecat_creation(temp_jpeg_dir):
46 |     """
47 |     Runs `find` with `barecat-create` and verifies the output.
48 |     """
49 |     output_file = temp_jpeg_dir / "mydata.barecat"
50 |     cmd = f"cd {temp_jpeg_dir}; find . -name '*.jpg' -print0 | sort -z | barecat-create --null --file={output_file} --overwrite --shard-size=22"
51 | 
52 |     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
53 | 
54 |     with barecat.Barecat(output_file) as reader:
55 |         file_list = list(reader)
56 |         assert len(file_list) == 3, "Expected 3 files in the archive"
57 |         assert "dir1/subdir1/test1.jpg" in file_list, "Expected dir1/subdir1/test1.jpg in the archive"
58 |         assert "dir1/subdir2/test2.jpg" in file_list, "Expected dir1/subdir2/test2.jpg in the archive"
59 |         assert "dir2/test3.jpg" in file_list, "Expected dir2/test3.jpg in the archive"
60 |         assert reader[
61 |             "dir1/subdir1/test1.jpg"] == b"dummy data1", "Expected dir1/subdir1/test1.jpg to contain 'dummy data1'"
62 |         assert reader[
63 |             "dir1/subdir2/test2.jpg"] == b"dummy data2", "Expected dir1/subdir2/test2.jpg to contain 'dummy data2'"
64 |         assert reader[
65 |             "dir2/test3.jpg"] == b"dummy data3", "Expected dir2/test3.jpg to contain 'dummy data3'"
66 |         assert reader.sharder.num_shards == 2, "Expected 2 shards in the archive"
67 | 
68 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
69 |     assert (temp_jpeg_dir / "mydata.barecat-sqlite-index").exists(), "Output file was not created"
70 | 
71 | def test_barecat_creation_workers(temp_jpeg_dir):
72 |     """
73 |     Runs `find` with `barecat-create --workers=8` and verifies the output.
74 |     """
75 |     output_file = temp_jpeg_dir / "mydata.barecat"
76 |     cmd = f"cd {temp_jpeg_dir}; find . -name '*.jpg' -print0 | sort -z | barecat-create --null --file={output_file} --overwrite --shard-size=22 --workers=8"
77 | 
78 |     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
79 | 
80 |     with barecat.Barecat(output_file) as reader:
81 |         file_list = list(reader)
82 |         assert len(file_list) == 3, "Expected 3 files in the archive"
83 |         assert "dir1/subdir1/test1.jpg" in file_list, "Expected dir1/subdir1/test1.jpg in the archive"
84 |         assert "dir1/subdir2/test2.jpg" in file_list, "Expected dir1/subdir2/test2.jpg in the archive"
85 |         assert "dir2/test3.jpg" in file_list, "Expected dir2/test3.jpg in the archive"
86 |         assert reader[
87 |             "dir1/subdir1/test1.jpg"] == b"dummy data1", "Expected dir1/subdir1/test1.jpg to contain 'dummy data1'"
88 |         assert reader[
89 |             "dir1/subdir2/test2.jpg"] == b"dummy data2", "Expected dir1/subdir2/test2.jpg to contain 'dummy data2'"
90 |         assert reader[
91 |             "dir2/test3.jpg"] == b"dummy data3", "Expected dir2/test3.jpg to contain 'dummy data3'"
92 |         assert reader.sharder.num_shards == 2, "Expected 2 shards in the archive"
93 | 
94 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
95 |     assert (temp_jpeg_dir / "mydata.barecat-sqlite-index").exists(), "Output file was not created"
96 | 
97 | 
98 | def test_extract_single(barecat_archive):
99 |     """
100 |     Tests `barecat-extract-single` to ensure a specific file is correctly extracted from the archive.
101 |     """
102 |     extract_cmd = [
103 |         "barecat-extract-single",
104 |         "--barecat-file", str(barecat_archive),
105 |         "--path", "dir1/subdir1/test1.jpg"
106 |     ]
107 | 
108 |     result = subprocess.run(extract_cmd, capture_output=True)
109 | 
110 |     assert result.stdout == b"dummy data1", "Unexpected content in extracted file"
111 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
112 | 
113 | 
114 | def test_defrag(barecat_archive):
115 |     """
116 |     Tests `barecat-defrag` to ensure the archive can be defragmented properly.
117 |     """
118 | 
119 | 
120 |     with barecat.Barecat(barecat_archive, readonly=False) as bc:
121 |         first_file = next(iter(bc.index.iter_all_filepaths(barecat.Order.ADDRESS)))
122 | 
123 |         del bc[first_file]
124 |         assert first_file not in bc
125 |         assert bc.total_logical_size != bc.total_physical_size_seek
126 | 
127 | 
128 |     defrag_cmd = [
129 |         "barecat-defrag",
130 |         str(barecat_archive)
131 |     ]
132 | 
133 |     result = subprocess.run(defrag_cmd, capture_output=True, text=True)
134 | 
135 |     with barecat.Barecat(barecat_archive) as reader:
136 |         assert reader.total_logical_size == reader.total_physical_size_seek
137 |         assert reader.sharder.num_shards == 1
138 | 
139 | 
140 |     assert result.returncode == 0, f"Command failed: {result.stderr}"
141 | 
142 | 
143 | def test_verify_integrity(barecat_archive):
144 |     """
145 |     Tests `barecat-verify` to ensure the archive's integrity.
146 | """ 147 | verify_cmd = [ 148 | "barecat-verify", 149 | str(barecat_archive) 150 | ] 151 | 152 | result = subprocess.run(verify_cmd, capture_output=True, text=True) 153 | 154 | assert result.returncode == 0, f"Command failed: {result.stderr}" 155 | 156 | # now edit the file and verify again 157 | with open(f'{barecat_archive}-shard-00000', "r+b") as f: 158 | f.seek(0) 159 | f.write(b"junk") 160 | 161 | result = subprocess.run(verify_cmd, capture_output=True, text=True) 162 | assert result.returncode != 0, f"Command should have failed: {result.stderr}" 163 | assert 'CRC32C' in result.stdout, "Expected CRC mismatch error message" 164 | 165 | 166 | def test_index_to_csv(barecat_archive): 167 | """ 168 | Tests `barecat-index-to-csv` to ensure index can be dumped as CSV. 169 | """ 170 | csv_cmd = [ 171 | "barecat-index-to-csv", 172 | str(barecat_archive) + "-sqlite-index" 173 | ] 174 | 175 | result = subprocess.run(csv_cmd, capture_output=True, text=True) 176 | 177 | assert '"path","shard","offset","size","crc32c"' in result.stdout, "CSV output missing expected header" 178 | assert result.returncode == 0, f"Command failed: {result.stderr}" 179 | -------------------------------------------------------------------------------- /src/barecat/archive_formats.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import shutil 3 | import tarfile 4 | import zipfile 5 | from datetime import datetime 6 | 7 | from barecat.core.index import BarecatDirInfo, BarecatFileInfo, BarecatEntryInfo 8 | from barecat.progbar import progressbar 9 | 10 | 11 | def iter_archive(src_path): 12 | if src_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')): 13 | return iter_tarfile(src_path) 14 | elif src_path.endswith('.zip'): 15 | return iter_zipfile(src_path) 16 | else: 17 | raise ValueError('Unsupported archive format') 18 | 19 | 20 | def iter_archive_nocontent(src_path): 21 | if src_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')): 22 | return iter_tarfile_nocontent(src_path) 23 | elif src_path.endswith('.zip'): 24 | return iter_zipfile_nocontent(src_path) 25 | else: 26 | raise ValueError('Unsupported archive format') 27 | 28 | 29 | def iter_zipfile(path): 30 | with zipfile.ZipFile(path, mode='r') as zipf: 31 | for member in progressbar(zipf.infolist(), desc='Packing files', unit=' files'): 32 | if member.is_dir(): 33 | di = BarecatDirInfo(path=member.filename) 34 | di.mtime_dt = datetime(*member.date_time) 35 | yield di, None 36 | else: 37 | fi = BarecatFileInfo(path=member.filename, size=member.file_size) 38 | fi.mtime_dt = datetime(*member.date_time) 39 | with zipf.open(member) as file_in_zip: 40 | yield fi, file_in_zip 41 | 42 | 43 | def iter_zipfile_nocontent(path): 44 | with open(path, 'rb') as f: 45 | with zipfile.ZipFile(f, mode='r') as zipf: 46 | for member in progressbar(zipf.infolist(), desc='Packing files', unit=' files'): 47 | if member.is_dir(): 48 | di = BarecatDirInfo(path=member.filename) 49 | di.mtime_dt = datetime(*member.date_time) 50 | yield di 51 | else: 52 | f.seek(member.header_offset + 26) 53 | namelen = int.from_bytes(f.read(2), byteorder='little') 54 | extralen = int.from_bytes(f.read(2), byteorder='little') 55 | data_offset = member.header_offset + 30 + namelen + extralen 56 | 57 | fi = BarecatFileInfo( 58 | path=member.filename, shard=0, offset=data_offset, size=member.file_size 59 | ) 60 | fi.mtime_dt = datetime(*member.date_time) 61 | yield fi 62 | 63 | 64 | def iter_tarfile(path): 65 | tar_file_size = 
osp.getsize(path) // 1024 // 1024 66 | pbar = progressbar(None, desc='Packing files', unit=' MB', total=tar_file_size) 67 | progpos = 0 68 | 69 | with tarfile.open(path, mode='r|*') as tar: 70 | for member in tar: 71 | if member.isdir(): 72 | di = BarecatDirInfo( 73 | path=member.name, 74 | mode=member.mode, 75 | uid=member.uid, 76 | gid=member.gid, 77 | mtime_ns=member.mtime * 1_000_000_000, 78 | ) 79 | yield di, None 80 | if member.isfile(): 81 | fi = BarecatFileInfo( 82 | path=member.name, 83 | size=member.size, 84 | mode=member.mode, 85 | uid=member.uid, 86 | gid=member.gid, 87 | mtime_ns=member.mtime * 1_000_000_000, 88 | ) 89 | 90 | with tar.extractfile(member) as file_in_tar: 91 | yield fi, file_in_tar 92 | 93 | new_pos = tar.fileobj.tell() // 1024 // 1024 94 | delta = new_pos - progpos 95 | pbar.update(delta) 96 | progpos += delta 97 | 98 | 99 | def iter_tarfile_nocontent(path): 100 | tar_file_size = osp.getsize(path) // 1024 // 1024 101 | pbar = progressbar(None, desc='Packing files', unit=' MB', total=tar_file_size) 102 | progpos = 0 103 | 104 | with tarfile.open(path, mode='r|*') as tar: 105 | for member in tar: 106 | if member.isdir(): 107 | di = BarecatDirInfo( 108 | path=member.name, 109 | mode=member.mode, 110 | uid=member.uid, 111 | gid=member.gid, 112 | mtime_ns=member.mtime * 1_000_000_000, 113 | ) 114 | yield di 115 | if member.isfile(): 116 | fi = BarecatFileInfo( 117 | path=member.name, 118 | shard=0, 119 | offset=member.offset_data, 120 | size=member.size, 121 | mode=member.mode, 122 | uid=member.uid, 123 | gid=member.gid, 124 | mtime_ns=member.mtime * 1_000_000_000, 125 | ) 126 | yield fi 127 | new_pos = tar.fileobj.tell() // 1024 // 1024 128 | delta = new_pos - progpos 129 | pbar.update(delta) 130 | progpos += delta 131 | 132 | 133 | def get_archive_writer(target_path): 134 | if target_path.endswith(('.tar', '.tar.gz', '.tar.bz2', '.tar.xz')): 135 | return TarWriter(target_path) 136 | elif target_path.endswith('.zip'): 137 | return ZipWriter(target_path) 138 | else: 139 | raise ValueError('Unsupported archive format') 140 | 141 | 142 | class ZipWriter: 143 | def __init__(self, target_path): 144 | self.zip = zipfile.ZipFile(target_path, mode='w') 145 | 146 | def add(self, info: BarecatEntryInfo, fileobj=None): 147 | if isinstance(info, BarecatDirInfo): 148 | zipinfo = zipfile.ZipInfo(info.path + '/') 149 | zipinfo.date_time = info.mtime_dt.timetuple()[:6] 150 | self.zip.writestr(zipinfo, '') 151 | else: 152 | zipinfo = zipfile.ZipInfo(info.path) 153 | zipinfo.date_time = info.mtime_dt.timetuple()[:6] 154 | zipinfo.file_size = info.size 155 | with self.zip.open(zipinfo, 'w') as file_in_zip: 156 | shutil.copyfileobj(fileobj, file_in_zip) 157 | 158 | def close(self): 159 | self.zip.close() 160 | 161 | def __enter__(self): 162 | return self 163 | 164 | def __exit__(self, *args): 165 | self.close() 166 | 167 | 168 | class TarWriter: 169 | def __init__(self, *args, **kwargs): 170 | if 'mode' not in kwargs: 171 | kwargs['mode'] = 'w' 172 | self.tar = tarfile.open(*args, **kwargs) 173 | 174 | def add(self, info: BarecatEntryInfo, fileobj=None): 175 | tarinfo = tarfile.TarInfo(info.path) 176 | tarinfo.uid = info.uid or 0 177 | tarinfo.gid = info.gid or 0 178 | if info.mtime_ns is not None: 179 | tarinfo.mtime = info.mtime_ns // 1_000_000_000 180 | if isinstance(info, BarecatDirInfo): 181 | tarinfo.type = tarfile.DIRTYPE 182 | tarinfo.mode = 0o755 if info.mode is None else info.mode 183 | self.tar.addfile(tarinfo) 184 | else: 185 | tarinfo.size = info.size 186 | tarinfo.mode = 
0o644 if info.mode is None else info.mode 187 | self.tar.addfile(tarinfo, fileobj) 188 | 189 | def close(self): 190 | self.tar.close() 191 | 192 | def __enter__(self): 193 | return self 194 | 195 | def __exit__(self, *args): 196 | self.close() 197 | -------------------------------------------------------------------------------- /src/barecat/defrag.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | import os 5 | import time 6 | from typing import TYPE_CHECKING 7 | 8 | from barecat.core.index import Order 9 | from barecat.progbar import progressbar 10 | 11 | if TYPE_CHECKING: 12 | from barecat.core.barecat import Barecat 13 | 14 | 15 | class BarecatDefragger: 16 | def __init__(self, bc: Barecat): 17 | self.bc = bc 18 | self.index = bc.index 19 | self.shard_size_limit = bc.shard_size_limit 20 | self.readonly = bc.readonly 21 | self.shard_files = bc.sharder.shard_files 22 | 23 | def get_gaps(self): 24 | gaps = self.index.fetch_all(""" 25 | WITH x AS ( 26 | SELECT config.value_int AS shard_size_limit 27 | FROM config 28 | WHERE config.key = 'shard_size_limit' 29 | ), 30 | first_gaps AS ( 31 | SELECT 32 | f.shard, 33 | 0 AS offset, 34 | MIN(f.offset) AS size 35 | FROM files f 36 | GROUP BY f.shard 37 | ), 38 | nonfirst_gaps AS ( 39 | SELECT 40 | f.shard, 41 | (f.offset + f.size) AS offset, 42 | coalesce( 43 | lead(f.offset, 1) OVER (PARTITION BY f.shard ORDER BY f.offset), 44 | x.shard_size_limit 45 | ) - (f.offset + f.size) AS size 46 | FROM files f, x 47 | ), 48 | all_gaps AS (SELECT * FROM first_gaps UNION ALL SELECT * FROM nonfirst_gaps) 49 | SELECT shard, offset, size 50 | FROM all_gaps 51 | WHERE size > 0 52 | ORDER BY shard, offset 53 | """, rowcls=FragmentGap) 54 | 55 | empty_shard_gaps = [ 56 | FragmentGap(shard, 0, self.shard_size_limit) 57 | for shard in range(len(self.shard_files)) 58 | if self.bc.index.logical_shard_end(shard) == 0] 59 | gaps.extend(empty_shard_gaps) 60 | gaps.sort(key=lambda gap: (gap.shard, gap.offset)) 61 | return gaps 62 | 63 | # gaps = [] 64 | # prev_end = 0 65 | # prev_shard = -1 66 | # for fi in self.index.iter_all_fileinfos(order=Order.ADDRESS): 67 | # if fi.shard > prev_shard: 68 | # if self.shard_size_limit > prev_end and prev_shard >= 0: 69 | # gaps.append(FragmentGap(prev_shard, prev_end, self.shard_size_limit - 70 | # prev_end)) 71 | # for i in range(prev_shard + 1, fi.shard): 72 | # gaps.append(FragmentGap(i, 0, self.shard_size_limit)) 73 | # prev_end = 0 74 | # if fi.offset > prev_end: 75 | # gaps.append(FragmentGap(fi.shard, prev_end, fi.offset - prev_end)) 76 | # prev_shard = fi.shard 77 | # prev_end = fi.offset + fi.size 78 | # return gaps 79 | 80 | def needs_defrag(self): 81 | # check if total size of shards is larger than the sum of the sizes of the files in index 82 | # the getsize() function may not be fully up to date but this is only a heuristic anyway. 
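        # (A physical size above the logical size means the shards contain byte
        # ranges that no index entry references, i.e. gaps left by deleted or
        # overwritten files, which is exactly what defragmentation reclaims.)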
83 | return self.bc.total_physical_size_seek > self.bc.total_logical_size 84 | 85 | def get_defrag_info(self): 86 | return self.bc.total_physical_size_seek, self.bc.total_logical_size 87 | 88 | def defrag(self): 89 | if self.readonly: 90 | raise ValueError('Cannot defrag a read-only Barecat') 91 | 92 | new_shard = 0 93 | new_offset = 0 94 | 95 | old_total = self.bc.total_physical_size_seek 96 | 97 | try: 98 | for i in range(len(self.shard_files)): 99 | self.bc.sharder.reopen_shard(i, 'r+b') 100 | 101 | file_iter = self.index.iter_all_fileinfos(order=Order.ADDRESS) 102 | for fi in progressbar(file_iter, total=self.index.num_files, desc='Defragging'): 103 | if (self.shard_size_limit is not None and new_offset + fi.size > 104 | self.shard_size_limit): 105 | self.shard_files[new_shard].truncate(new_offset) 106 | self.bc.sharder.reopen_shard(new_shard, 'rb') 107 | new_shard += 1 108 | new_offset = 0 109 | 110 | if not (new_shard == fi.shard and new_offset == fi.offset): 111 | shift_n_bytes( 112 | self.shard_files[fi.shard], self.shard_files[new_shard], 113 | fi.offset, new_offset, fi.size) 114 | self.index.move_file(fi.path, new_shard, new_offset) 115 | 116 | new_offset += fi.size 117 | 118 | # Truncate the last shard to its real size (the others are truncated already) 119 | self.shard_files[new_shard].truncate(new_offset) 120 | # Close and delete all shards after the last one 121 | for i in range(new_shard + 1, len(self.shard_files)): 122 | self.shard_files[i].close() 123 | os.remove(self.shard_files[i].name) 124 | del self.shard_files[new_shard + 1:] 125 | 126 | new_total = self.bc.total_physical_size_seek 127 | return old_total - new_total 128 | finally: 129 | self.bc.sharder.reopen_shards() 130 | 131 | def defrag_quick(self, time_max_seconds=5): 132 | if self.readonly: 133 | raise ValueError('Cannot defrag a read-only Barecat') 134 | 135 | start_time = time.monotonic() 136 | # Collect all gaps in the shards 137 | gaps = self.get_gaps() 138 | freed_space = 0 139 | try: 140 | for i in range(len(self.shard_files)): 141 | self.bc.sharder.reopen_shard(i, 'r+b') 142 | 143 | for fi in self.index.iter_all_fileinfos(order=Order.ADDRESS | Order.DESC): 144 | moved = self.move_to_earlier_gap(fi, gaps) 145 | if not moved or time.monotonic() - start_time > time_max_seconds: 146 | # We stop when we reach the first file that cannot be moved to an earlier gap 147 | break 148 | freed_space += fi.size 149 | 150 | self.bc.truncate_all_to_logical_size() 151 | finally: 152 | self.bc.sharder.reopen_shards() 153 | 154 | return freed_space 155 | 156 | def move_to_earlier_gap(self, fi, gaps): 157 | for i_gap, gap in enumerate(gaps): 158 | if gap.shard > fi.shard or (gap.shard == fi.shard and gap.offset >= fi.offset): 159 | # reached the gap that is after the file, no move is possible 160 | return False 161 | if gap.size >= fi.size: 162 | shift_n_bytes( 163 | self.shard_files[fi.shard], self.shard_files[gap.shard], fi.offset, 164 | gap.offset, fi.size) 165 | self.index.move_file(fi.path, gap.shard, gap.offset) 166 | gap.size -= fi.size 167 | gap.offset += fi.size 168 | if gap.size == 0: 169 | # even though we are changing the list while in a for loop that is iterating 170 | # over it, this is safe because we are immediately returning in this iteration. 
171 | del gaps[i_gap] 172 | return True 173 | return False 174 | 175 | 176 | def shift_n_bytes(src_file, dst_file, src_offset, dst_offset, length, bufsize=64 * 1024): 177 | if src_file == dst_file and src_offset < dst_offset: 178 | raise ValueError('This function can only shift left' 179 | ' because defragging is done towards the left') 180 | 181 | bytes_to_copy = length 182 | while bytes_to_copy > 0: 183 | src_file.seek(src_offset) 184 | data = src_file.read(min(bufsize, bytes_to_copy)) 185 | if not data: 186 | raise ValueError('Unexpected EOF') 187 | 188 | dst_file.seek(dst_offset) 189 | dst_file.write(data) 190 | 191 | len_data = len(data) 192 | src_offset += len_data 193 | dst_offset += len_data 194 | bytes_to_copy -= len_data 195 | 196 | 197 | @dataclasses.dataclass 198 | class FragmentGap: 199 | shard: int 200 | offset: int 201 | size: int 202 | 203 | @classmethod 204 | def row_factory(cls, cursor, row): 205 | field_names = [d[0] for d in cursor.description] 206 | return cls(**dict(zip(field_names, row))) 207 | -------------------------------------------------------------------------------- /src/barecat/sql/schema.sql: -------------------------------------------------------------------------------- 1 | -- Description: Schema for the barecat database 2 | 3 | 4 | --#################################### Tables 5 | CREATE TABLE files 6 | ( 7 | path TEXT PRIMARY KEY NOT NULL, 8 | parent TEXT GENERATED ALWAYS AS ( -- Parent directory is computed automatically 9 | rtrim(rtrim(path, replace(path, '/', '')), '/') 10 | ) VIRTUAL NOT NULL REFERENCES dirs (path) ON DELETE RESTRICT, 11 | 12 | shard INTEGER NOT NULL, 13 | offset INTEGER NOT NULL, 14 | size INTEGER DEFAULT 0, 15 | crc32c INTEGER DEFAULT NULL, 16 | 17 | mode INTEGER DEFAULT NULL, 18 | uid INTEGER DEFAULT NULL, 19 | gid INTEGER DEFAULT NULL, 20 | mtime_ns INTEGER DEFAULT NULL 21 | ); 22 | 23 | CREATE TABLE dirs 24 | ( 25 | path TEXT PRIMARY KEY, 26 | parent TEXT GENERATED ALWAYS AS ( 27 | CASE 28 | WHEN path = '' THEN NULL 29 | ELSE rtrim(rtrim(path, replace(path, '/', '')), '/') 30 | END 31 | ) VIRTUAL REFERENCES dirs (path) ON DELETE RESTRICT, 32 | 33 | num_subdirs INTEGER DEFAULT 0, -- These are maintained by triggers 34 | num_files INTEGER DEFAULT 0, 35 | num_files_tree INTEGER DEFAULT 0, 36 | size_tree INTEGER DEFAULT 0, 37 | 38 | mode INTEGER DEFAULT NULL, 39 | uid INTEGER DEFAULT NULL, 40 | gid INTEGER DEFAULT NULL, 41 | mtime_ns INTEGER DEFAULT NULL 42 | ); 43 | 44 | CREATE TABLE config -- For now, this table only holds the `shard_size_limit` 45 | ( 46 | key TEXT PRIMARY KEY, 47 | value_text TEXT DEFAULT NULL, 48 | value_int INTEGER DEFAULT NULL 49 | ) WITHOUT ROWID; 50 | 51 | INSERT INTO config (key, value_int) 52 | VALUES ('use_triggers', 1), 53 | ('shard_size_limit', CAST(power(2, 63) - 1 AS INTEGER)), 54 | ('schema_version_major', 0), 55 | ('schema_version_minor', 2); 56 | 57 | -- Indexes 58 | CREATE INDEX idx_files_parent ON files (parent); 59 | CREATE INDEX idx_dirs_parent ON dirs (parent); 60 | CREATE INDEX idx_files_shard_offset ON files (shard, offset); 61 | 62 | --#################################### Triggers 63 | -- The idea is: we propagate changes up the tree with triggers, as this is cumbersome to do in 64 | -- the Python code. There is no propagation downwards (for example when moving a dir, we do not 65 | -- update all the children with triggers). This is because the Python code can do this 66 | -- quite easily. 
Furthermore, if we did it with triggers, the chain would start upward again 67 | -- with a circular mess. So we only propagate upwards the tree. 68 | -- We propagate two kinds of things: 69 | -- 1) statistics: direct and aggregate file count and aggregate size 70 | -- 2) modification time of the parent directory 71 | -- We don't update the modification time of the entity being inserted or modified, 72 | -- this can be simply done in the Python code. If the app doesn't supply mtime, presumably it 73 | -- doesn't care about it, so the overhead of triggering it makes no sense. 74 | 75 | ---- Files: add, del, move, resize 76 | CREATE TRIGGER add_file -- Upsert the parent when adding a file 77 | AFTER INSERT 78 | ON files 79 | WHEN (SELECT value_int 80 | FROM config 81 | WHERE key = 'use_triggers') = 1 82 | BEGIN 83 | -- Add the parent directory if it doesn't exist 84 | INSERT INTO dirs (path, num_files, num_files_tree, size_tree, mtime_ns) 85 | VALUES (NEW.parent, 1, 1, NEW.size, 86 | CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)) 87 | -- If the parent directory already exists, update it 88 | ON CONFLICT(path) DO UPDATE 89 | SET num_files = num_files + 1, 90 | num_files_tree = num_files_tree + 1, 91 | size_tree = size_tree + excluded.size_tree, 92 | mtime_ns = excluded.mtime_ns; 93 | END; 94 | 95 | CREATE TRIGGER del_file -- Update the parent when deleting a file 96 | AFTER DELETE 97 | ON files 98 | WHEN (SELECT value_int 99 | FROM config 100 | WHERE key = 'use_triggers') = 1 101 | BEGIN 102 | UPDATE dirs 103 | SET num_files = num_files - 1, 104 | num_files_tree = num_files_tree - 1, 105 | size_tree = size_tree - OLD.size, 106 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER) 107 | WHERE path = OLD.parent; 108 | END; 109 | 110 | CREATE TRIGGER move_file -- Update both parents when moving a file 111 | AFTER UPDATE OF path 112 | ON files 113 | WHEN NEW.parent != OLD.parent 114 | AND (SELECT value_int 115 | FROM config 116 | WHERE key = 'use_triggers') = 1 117 | BEGIN 118 | UPDATE dirs 119 | SET num_files = num_files + 1, 120 | num_files_tree = num_files_tree + 1, 121 | size_tree = size_tree + NEW.size, 122 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER) 123 | WHERE path = NEW.parent; 124 | UPDATE dirs 125 | SET num_files = num_files - 1, 126 | num_files_tree = num_files_tree - 1, 127 | size_tree = size_tree - OLD.size, 128 | mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER) 129 | WHERE path = OLD.parent; 130 | END; 131 | 132 | CREATE TRIGGER resize_file -- When file size changes 133 | AFTER UPDATE OF size 134 | ON files 135 | WHEN NEW.parent == OLD.parent -- and the file was not moved 136 | AND (SELECT value_int 137 | FROM config 138 | WHERE key = 'use_triggers') = 1 139 | BEGIN 140 | UPDATE dirs 141 | SET size_tree = size_tree + NEW.size - OLD.size 142 | WHERE path = OLD.parent; 143 | END; 144 | 145 | ---- Directories: add, del, move, resize 146 | CREATE TRIGGER add_subdir -- Upsert the parent when adding a directory 147 | AFTER INSERT 148 | ON dirs 149 | WHEN (SELECT value_int 150 | FROM config 151 | WHERE key = 'use_triggers') = 1 152 | BEGIN 153 | INSERT INTO dirs (path, num_subdirs, size_tree, num_files_tree, mtime_ns) 154 | VALUES (NEW.parent, 1, NEW.size_tree, NEW.num_files_tree, 155 | CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)) 156 | ON CONFLICT(path) DO UPDATE 157 | SET num_subdirs = num_subdirs + 1, 158 | size_tree = size_tree + excluded.size_tree, 159 | 
num_files_tree = num_files_tree + excluded.num_files_tree,
160 |         mtime_ns = excluded.mtime_ns;
161 | END;
162 | 
163 | CREATE TRIGGER del_subdir -- Update the parent when deleting a directory
164 |     AFTER DELETE
165 |     ON dirs
166 |     WHEN (SELECT value_int
167 |           FROM config
168 |           WHERE key = 'use_triggers') = 1
169 | BEGIN
170 |     UPDATE dirs
171 |     SET num_subdirs = num_subdirs - 1,
172 |         num_files = num_files - OLD.num_files,
173 |         size_tree = size_tree - OLD.size_tree,
174 |         num_files_tree = num_files_tree - OLD.num_files_tree,
175 |         mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
176 |     WHERE path = OLD.parent;
177 | END;
178 | 
179 | CREATE TRIGGER move_subdir -- Update both parents when moving a directory
180 |     AFTER UPDATE OF path
181 |     ON dirs
182 |     WHEN NEW.parent != OLD.parent
183 |         AND (SELECT value_int
184 |              FROM config
185 |              WHERE key = 'use_triggers') = 1
186 | BEGIN
187 |     UPDATE dirs
188 |     SET num_subdirs = num_subdirs - 1,
189 |         num_files = num_files - OLD.num_files,
190 |         size_tree = size_tree - OLD.size_tree,
191 |         num_files_tree = num_files_tree - OLD.num_files_tree,
192 |         mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
193 |     WHERE path = OLD.parent;
194 |     UPDATE dirs
195 |     SET num_subdirs = num_subdirs + 1,
196 |         num_files = num_files + NEW.num_files,
197 |         size_tree = size_tree + NEW.size_tree,
198 |         num_files_tree = num_files_tree + NEW.num_files_tree,
199 |         mtime_ns = CAST((julianday('now') - 2440587.5) * 86400.0 * 1e9 AS INTEGER)
200 |     WHERE path = NEW.parent;
201 | END;
202 | 
203 | 
204 | CREATE TRIGGER resize_dir -- Update the parent when a directory changes size
205 |     AFTER UPDATE OF size_tree, num_files_tree
206 |     ON dirs
207 |     WHEN NEW.parent = OLD.parent AND
208 |         (NEW.size_tree != OLD.size_tree OR NEW.num_files_tree != OLD.num_files_tree)
209 |         AND (SELECT value_int
210 |              FROM config
211 |              WHERE key = 'use_triggers') = 1
212 | BEGIN
213 |     UPDATE dirs
214 |     SET size_tree = size_tree + (NEW.size_tree - OLD.size_tree),
215 |         num_files_tree = num_files_tree + (NEW.num_files_tree - OLD.num_files_tree)
216 |     WHERE path = OLD.parent;
217 | END;
--------------------------------------------------------------------------------
/src/barecat/util.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import glob
3 | import itertools
4 | import os
5 | import os.path as osp
6 | import shutil
7 | from datetime import datetime
8 | 
9 | import crc32c as crc32c_lib
10 | 
11 | 
12 | def read_file(input_path, mode='r'):
13 |     with open(input_path, mode) as f:
14 |         return f.read()
15 | 
16 | 
17 | def remove(path):
18 |     index_path = f'{path}-sqlite-index'
19 |     shard_paths = glob.glob(f'{path}-shard-?????')
20 |     for path in [index_path] + shard_paths:
21 |         os.remove(path)
22 | 
23 | 
24 | def exists(path):
25 |     index_path = f'{path}-sqlite-index'
26 |     shard_paths = glob.glob(f'{path}-shard-?????')
27 |     return osp.exists(index_path) or len(shard_paths) > 0
28 | 
29 | 
30 | # From `more-itertools` package.
31 | def chunked(iterable, n, strict=False):
32 |     """Break *iterable* into lists of length *n*:
33 | 
34 |     >>> list(chunked([1, 2, 3, 4, 5, 6], 3))
35 |     [[1, 2, 3], [4, 5, 6]]
36 | 
37 |     By default, the last yielded list will have fewer than *n* elements
38 |     if the length of *iterable* is not divisible by *n*:
39 | 
40 |     >>> list(chunked([1, 2, 3, 4, 5, 6, 7, 8], 3))
41 |     [[1, 2, 3], [4, 5, 6], [7, 8]]
42 | 
43 |     To use a fill-in value instead, see the :func:`grouper` recipe.
44 | 45 | If the length of *iterable* is not divisible by *n* and *strict* is 46 | ``True``, then ``ValueError`` will be raised before the last 47 | list is yielded. 48 | 49 | """ 50 | iterator = iter(functools.partial(take, n, iter(iterable)), []) 51 | if strict: 52 | if n is None: 53 | raise ValueError('n must not be None when using strict mode.') 54 | 55 | def ret(): 56 | for chunk in iterator: 57 | if len(chunk) != n: 58 | raise ValueError('iterable is not divisible by n.') 59 | yield chunk 60 | 61 | return iter(ret()) 62 | else: 63 | return iterator 64 | 65 | 66 | def take(n, iterable): 67 | """Return first *n* items of the iterable as a list. 68 | 69 | >>> take(3, range(10)) 70 | [0, 1, 2] 71 | 72 | If there are fewer than *n* items in the iterable, all of them are 73 | returned. 74 | 75 | >>> take(10, range(3)) 76 | [0, 1, 2] 77 | 78 | """ 79 | return list(itertools.islice(iterable, n)) 80 | 81 | 82 | def copy_n_bytes(src_file, dest_file, n=None, bufsize=64 * 1024): 83 | if n is None: 84 | return shutil.copyfileobj(src_file, dest_file, bufsize) 85 | 86 | bytes_to_copy = n 87 | while bytes_to_copy > 0: 88 | data = src_file.read(min(bufsize, bytes_to_copy)) 89 | if not data: 90 | raise ValueError('Unexpected EOF') 91 | 92 | dest_file.write(data) 93 | bytes_to_copy -= len(data) 94 | 95 | 96 | def normalize_path(path): 97 | x = osp.normpath(path).removeprefix('/') 98 | return '' if x == '.' else x 99 | 100 | 101 | def get_parent(path): 102 | if path == '': 103 | # root already, has no parent 104 | return b'\x00' 105 | 106 | partition = path.rpartition('/') 107 | return partition[0] 108 | 109 | 110 | def partition_path(path): 111 | if path == '': 112 | # root already, has no parent 113 | return b'\x00', path 114 | 115 | parts = path.rpartition('/') 116 | return parts[0], parts[2] 117 | 118 | 119 | def get_ancestors(path): 120 | yield '' 121 | for i in range(len(path)): 122 | if path[i] == '/': 123 | yield path[:i] 124 | 125 | 126 | def reopen(file, mode): 127 | if file.mode == mode: 128 | return file 129 | file.close() 130 | return open_(file.name, mode) 131 | 132 | 133 | def fileobj_crc32c_until_end(fileobj, bufsize=64 * 1024): 134 | crc32c = 0 135 | while chunk := fileobj.read(bufsize): 136 | crc32c = crc32c_lib.crc32c(chunk, crc32c) 137 | return crc32c 138 | 139 | 140 | def fileobj_crc32c(fileobj, size=-1, bufsize=64 * 1024): 141 | if size == -1 or size is None: 142 | return fileobj_crc32c_until_end(fileobj, bufsize) 143 | 144 | crc32c = 0 145 | n_full_bufs, remainder = divmod(size, bufsize) 146 | 147 | for _ in range(n_full_bufs): 148 | data = fileobj.read(bufsize) 149 | if len(data) != bufsize: 150 | raise ValueError('Unexpected EOF') 151 | crc32c = crc32c_lib.crc32c(data, crc32c) 152 | 153 | if remainder: 154 | data = fileobj.read(remainder) 155 | if len(data) != remainder: 156 | raise ValueError('Unexpected EOF') 157 | crc32c = crc32c_lib.crc32c(data, crc32c) 158 | 159 | return crc32c 160 | 161 | 162 | def copyfileobj_crc32c_until_end(src_file, dst_file, bufsize=64 * 1024): 163 | crc32c = 0 164 | size = 0 165 | while chunk := src_file.read(bufsize): 166 | dst_file.write(chunk) 167 | crc32c = crc32c_lib.crc32c(chunk, crc32c) 168 | size += len(chunk) 169 | return size, crc32c 170 | 171 | 172 | def copyfileobj_crc32c(src_file, dst_file, size=None, bufsize=64 * 1024): 173 | if size is None: 174 | return copyfileobj_crc32c_until_end(src_file, dst_file, bufsize) 175 | 176 | crc32c = 0 177 | n_bytes_transferred = 0 178 | n_full_bufs, remainder = divmod(size, bufsize) 179 | 180 | for _ in 
range(n_full_bufs):
181 |         data = src_file.read(bufsize)
182 |         if len(data) != bufsize:
183 |             raise ValueError('Unexpected EOF')
184 | 
185 |         crc32c = crc32c_lib.crc32c(data, crc32c)
186 |         n_written = dst_file.write(data)
187 |         if n_written != len(data):
188 |             raise ValueError('Unexpected write problem')
189 | 
190 |         n_bytes_transferred += n_written
191 | 
192 |     if remainder:
193 |         data = src_file.read(remainder)
194 |         if len(data) != remainder:
195 |             raise ValueError('Unexpected EOF')
196 | 
197 |         crc32c = crc32c_lib.crc32c(data, crc32c)
198 |         n_written = dst_file.write(data)
199 |         if n_written != len(data):
200 |             raise ValueError('Unexpected write problem')
201 | 
202 |         n_bytes_transferred += n_written
203 | 
204 |     return n_bytes_transferred, crc32c
205 | 
206 | 
207 | def copyfileobj(src_file, dst_file, size=None, bufsize=64 * 1024):
208 |     if size is None:
209 |         return shutil.copyfileobj(src_file, dst_file, bufsize)
210 | 
211 |     n_bytes_transferred = 0
212 |     nreads, remainder = divmod(size, bufsize)
213 | 
214 |     for _ in range(nreads):
215 |         data = src_file.read(bufsize)
216 |         dst_file.write(data)
217 |         n_bytes_transferred += len(data)
218 | 
219 |     if remainder:
220 |         data = src_file.read(remainder)
221 |         dst_file.write(data)
222 |         n_bytes_transferred += len(data)
223 | 
224 |     return n_bytes_transferred
225 | 
226 | 
227 | def write_zeroes(file, n, bufsize=64 * 1024):
228 |     n_written = 0
229 |     if n >= bufsize:
230 |         zeroes = bytearray(bufsize)
231 |         while n >= bufsize:
232 |             n_written += file.write(zeroes)
233 |             n -= bufsize
234 |     n_written += file.write(bytearray(n))
235 |     return n_written
236 | 
237 | 
238 | def raise_if_readonly(method):
239 |     @functools.wraps(method)
240 |     def wrapper(self, *args, **kwargs):
241 |         if self.readonly:
242 |             raise PermissionError('This function is not allowed in readonly mode')
243 |         return method(self, *args, **kwargs)
244 | 
245 |     return wrapper
246 | 
247 | 
248 | def raise_if_append_only(method):
249 |     @functools.wraps(method)
250 |     def wrapper(self, *args, **kwargs):
251 |         if self.append_only:
252 |             raise PermissionError('This function is not allowed in append-only mode')
253 |         return method(self, *args, **kwargs)
254 | 
255 |     return wrapper
256 | 
257 | 
258 | def raise_if_readonly_or_append_only(method):
259 |     @functools.wraps(method)
260 |     def wrapper(self, *args, **kwargs):
261 |         if self.readonly or self.append_only:
262 |             raise PermissionError('This function is not allowed in readonly or append-only mode')
263 |         return method(self, *args, **kwargs)
264 | 
265 |     return wrapper
266 | 
267 | 
268 | def parse_size(size):
269 |     if size is None:
270 |         return None
271 |     units = dict(K=1024, M=1024**2, G=1024**3, T=1024**4)
272 |     size = size.upper()
273 | 
274 |     for unit, factor in units.items():
275 |         if unit in size:
276 |             return int(float(size.replace(unit, '')) * factor)
277 | 
278 |     return int(size)
279 | 
280 | 
281 | def open_(path, mode, *args, **kwargs):
282 |     # This is like open() but supports an additional mode 'ax+b', which is like
283 |     # 'x+b' in that it fails if the file already exists and creates it if it doesn't,
284 |     # but it also opens the file in append mode, like 'a+b'
285 | 
286 |     if sorted(mode) == sorted('ax+b'):
287 |         fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_RDWR | os.O_APPEND)  # exclusive creation ('x') plus append ('a')
288 |         return os.fdopen(fd, 'a+b', *args, **kwargs)
289 |     return open(path, mode, *args, **kwargs)
290 | 
291 | 
292 | def datetime_to_ns(dt):
293 |     return int(dt.timestamp() * 1e9)
294 | 
295 | 
296 | def ns_to_datetime(ns):
297 |     return datetime.fromtimestamp(ns / 1e9)
298 | 
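# --- Illustration (not part of the library) ------------------------------------
# A sketch of how the CRC32C helpers above can pair with the SQLite index to
# re-verify a single stored file, assuming the index/shard naming used by the
# CLI ({path}-sqlite-index, {path}-shard-00000, ...) and the `files` table from
# src/barecat/sql/schema.sql (error handling elided):
#
#     import sqlite3
#
#     def verify_one_file(archive_path, inner_path):
#         conn = sqlite3.connect(f'{archive_path}-sqlite-index')
#         try:
#             shard, offset, size, crc_expected = conn.execute(
#                 'SELECT shard, offset, size, crc32c FROM files WHERE path = ?',
#                 (inner_path,)).fetchone()
#         finally:
#             conn.close()
#         with open(f'{archive_path}-shard-{shard:05d}', 'rb') as f:
#             f.seek(offset)
#             return fileobj_crc32c(f, size) == crc_expected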
--------------------------------------------------------------------------------
/src/barecat/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import csv
3 | import pickle
4 | import sys
5 | 
6 | import barecat
7 | import barecat.cli_impl as impl
8 | from barecat.common import Order
9 | from barecat.defrag import BarecatDefragger
10 | from barecat.util import parse_size
11 | 
12 | 
13 | def create():
14 |     parser = argparse.ArgumentParser(
15 |         description='Concatenate files to sharded blobs and create an sqlite index.'
16 |     )
17 |     parser.add_argument('--file', type=str, help='target path', required=True)
18 |     parser.add_argument(
19 |         '--null',
20 |         action='store_true',
21 |         help='read input paths from stdin, separated by null bytes as output by '
22 |         'the find command with the -print0 option (otherwise newlines are '
23 |         'interpreted as delimiters)',
24 |     )
25 |     parser.add_argument('--workers', type=int, default=None)
26 |     parser.add_argument(
27 |         '--shard-size-limit',
28 |         type=str,
29 |         default=None,
30 |         help='maximum size of a shard in bytes (if not specified, '
31 |         'all files will be concatenated into a single shard)',
32 |     )
33 |     parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
34 | 
35 |     args = parser.parse_args()
36 |     impl.create_from_stdin_paths(
37 |         target_path=args.file,
38 |         shard_size_limit=parse_size(args.shard_size_limit),
39 |         zero_terminated=args.null,
40 |         overwrite=args.overwrite,
41 |         workers=args.workers,
42 |     )
43 | 
44 | 
45 | def create_recursive():
46 |     # Accepts --file, --shard-size-limit, --workers and --overwrite; the positional
47 |     # arguments are the paths to pack. If a single positional path is supplied, the
48 |     # --strip-root flag strips that root from the paths stored in the archive.
49 |     parser = argparse.ArgumentParser(
50 |         description='Concatenate files to sharded blobs and create an sqlite index.'
51 | ) 52 | parser.add_argument('--file', type=str, help='target path', required=True) 53 | parser.add_argument('--workers', type=int, default=None) 54 | parser.add_argument( 55 | '--shard-size-limit', 56 | type=str, 57 | default=None, 58 | help='maximum size of a shard in bytes (if not specified, ' 59 | 'all files will be concatenated into a single shard)', 60 | ) 61 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing files') 62 | parser.add_argument('paths', type=str, nargs='+', help='paths to pack') 63 | parser.add_argument( 64 | '--strip-root', 65 | action='store_true', 66 | help='strip the root from the paths (only applicable if a single path is provided)', 67 | ) 68 | 69 | args = parser.parse_args() 70 | impl.create_recursive( 71 | target_path=args.file, 72 | shard_size_limit=parse_size(args.shard_size_limit), 73 | roots=args.paths, 74 | overwrite=args.overwrite, 75 | workers=args.workers, 76 | strip_root=args.strip_root, 77 | ) 78 | 79 | 80 | def extract(): 81 | parser = argparse.ArgumentParser(description='Extract files from a barecat archive.') 82 | parser.add_argument('--file', type=str, help='path to the archive file') 83 | parser.add_argument('--target-directory', type=str, help='path to the target directory') 84 | args = parser.parse_args() 85 | impl.extract(args.file, args.target_directory) 86 | 87 | 88 | def extract_single(): 89 | parser = argparse.ArgumentParser(description='Extract a single file from a barecat archive.') 90 | parser.add_argument('--barecat-file', type=str, help='path to the archive file') 91 | parser.add_argument('--path', type=str, help='path to the file to extract, within the archive') 92 | args = parser.parse_args() 93 | with barecat.Barecat(args.barecat_file) as reader: 94 | sys.stdout.buffer.write(reader[args.path]) 95 | 96 | 97 | def index_to_csv(): 98 | parser = argparse.ArgumentParser(description='Dump the index contents as csv') 99 | parser.add_argument('file', type=str, help='path to the index file') 100 | args = parser.parse_args() 101 | 102 | writer = csv.writer(sys.stdout, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) 103 | writer.writerow(['path', 'shard', 'offset', 'size', 'crc32c']) 104 | with barecat.Index(args.file) as index: 105 | for f in index.iter_all_fileinfos(order=Order.PATH): 106 | writer.writerow([f.path, f.shard, f.offset, f.size, f.crc32c]) 107 | 108 | 109 | def index_to_pickledict(): 110 | parser = argparse.ArgumentParser(description='Dump the index contents as a pickled dictionary') 111 | parser.add_argument('file', type=str, help='path to the index file') 112 | parser.add_argument('outfile', type=str, help='path to the result file') 113 | args = parser.parse_args() 114 | 115 | with barecat.Index(args.file) as index_reader: 116 | dicti = dict(index_reader.items()) 117 | 118 | with open(args.outfile, 'xb') as outfile: 119 | pickle.dump(dicti, outfile) 120 | 121 | 122 | def merge(): 123 | parser = argparse.ArgumentParser(description='Merge existing Barecat archives into one.') 124 | parser.add_argument( 125 | 'input_paths', metavar='N', type=str, nargs='+', help='paths to the archives to merge' 126 | ) 127 | parser.add_argument('--output', required=True, help='output path') 128 | parser.add_argument( 129 | '--shard-size-limit', 130 | type=str, 131 | default=None, 132 | help='maximum size of a shard in bytes (if not specified, ' 133 | 'all files will be concatenated into a single shard)', 134 | ) 135 | parser.add_argument('--overwrite', action='store_true', help='overwrite existing 
files')
136 |     parser.add_argument(
137 |         '--ignore-duplicates',
138 |         action='store_true',
139 |         help='if true then if a later file has the same path as an earlier one,'
140 |         ' skip it; if false then raise an error',
141 |     )
142 | 
143 |     args = parser.parse_args()
144 |     impl.merge(
145 |         source_paths=args.input_paths,
146 |         target_path=args.output,
147 |         shard_size_limit=parse_size(args.shard_size_limit),
148 |         overwrite=args.overwrite,
149 |         ignore_duplicates=args.ignore_duplicates,
150 |     )
151 | 
152 | 
153 | def merge_symlink():
154 |     parser = argparse.ArgumentParser(description='Merge existing Barecat archives into one, symlinking the shards instead of copying them.')
155 |     parser.add_argument(
156 |         'input_paths', metavar='N', type=str, nargs='+', help='paths to the archives to merge'
157 |     )
158 |     parser.add_argument('--output', required=True, help='output path')
159 |     parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
160 |     parser.add_argument(
161 |         '--ignore-duplicates',
162 |         action='store_true',
163 |         help='if true then if a later file has the same path as an earlier one,'
164 |         ' skip it; if false then raise an error',
165 |     )
166 | 
167 |     args = parser.parse_args()
168 |     impl.merge_symlink(
169 |         source_paths=args.input_paths,
170 |         target_path=args.output,
171 |         overwrite=args.overwrite,
172 |         ignore_duplicates=args.ignore_duplicates,
173 |     )
174 | 
175 | 
176 | def verify_integrity():
177 |     parser = argparse.ArgumentParser(
178 |         description='Verify the integrity of a Barecat archive, including CRC32C, directory '
179 |         'stats and no gaps between stored files.'
180 |     )
181 |     parser.add_argument('file', type=str, help='path to the index file')
182 |     parser.add_argument(
183 |         '--quick', action='store_true', help='CRC32C is only verified on the last file'
184 |     )
185 |     args = parser.parse_args()
186 | 
187 |     with barecat.Barecat(args.file) as bc:
188 |         if not bc.verify_integrity(quick=args.quick):
189 |             print('Integrity errors were found.')
190 |             sys.exit(1)
191 | 
192 | 
193 | def defrag():
194 |     parser = argparse.ArgumentParser(
195 |         description='Defragment a Barecat archive to remove gaps left by deleted files.'
196 |     )
197 |     parser.add_argument('file', type=str, help='path to the index file')
198 |     parser.add_argument(
199 |         '--quick',
200 |         action='store_true',
201 |         help='faster but less thorough attempt at defrag, using the best-fit '
202 |         'algorithm to move the last files into gaps.',
203 |     )
204 | 
205 |     args = parser.parse_args()
206 |     with barecat.Barecat(args.file, readonly=False, append_only=False) as bc:
207 |         defragger = BarecatDefragger(bc)
208 |         if defragger.needs_defrag():
209 |             if args.quick:
210 |                 defragger.defrag_quick()
211 |             else:
212 |                 defragger.defrag()
213 | 
214 | 
215 | def archive2barecat():
216 |     parser = argparse.ArgumentParser(
217 |         description='Convert a tar or zip archive to a Barecat archive.'
218 |     )
219 |     # 2 positional args are the tar file and the target barecat file
220 |     parser.add_argument('archive_file', type=str, help='path to the tar or zip file')
221 |     parser.add_argument('barecat_file', type=str, help='path to the target barecat file')
222 | 
223 |     parser.add_argument(
224 |         '--shard-size-limit',
225 |         type=str,
226 |         default=None,
227 |         help='maximum size of a shard in bytes (if not specified, '
228 |         'all files will be concatenated into a single shard)',
229 |     )
230 |     parser.add_argument('--overwrite', action='store_true', help='overwrite existing files')
231 |     args = parser.parse_args()
232 |     impl.archive2barecat(
233 |         src_path=args.archive_file,
234 |         target_path=args.barecat_file,
235 |         shard_size_limit=parse_size(args.shard_size_limit),
236 |         overwrite=args.overwrite,
237 |     )
238 | 
239 | 
240 | def barecat2archive():
241 |     parser = argparse.ArgumentParser(
242 |         description='Convert a Barecat archive to a tar or zip archive.'
243 |     )
244 |     # 2 positional args are the barecat file and the target tar file
245 |     parser.add_argument('barecat_file', type=str, help='path to the barecat file')
246 |     parser.add_argument('archive_file', type=str, help='path to the target archive file')
247 | 
248 |     args = parser.parse_args()
249 |     impl.barecat2archive(src_path=args.barecat_file, target_path=args.archive_file)
250 | 
251 | 
252 | 
253 | 
254 | def print_ncdu_json():
255 |     parser = argparse.ArgumentParser(
256 |         description='Print the contents of a Barecat as JSON in the format expected by ncdu.'
257 |     )
258 |     parser.add_argument('file', type=str, help='path to the index file')
259 |     args = parser.parse_args()
260 |     impl.print_ncdu_json(args.file)
261 | 
--------------------------------------------------------------------------------
/src/barecat/cli_impl.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import itertools
3 | import json
4 | import os
5 | import os.path as osp
6 | import shutil
7 | import stat
8 | import sys
9 | import time
10 | 
11 | import barecat.util
12 | from barecat.archive_formats import (
13 |     get_archive_writer,
14 |     iter_archive,
15 |     iter_archive_nocontent,
16 |     TarWriter,
17 | )
18 | from barecat.consumed_threadpool import ConsumedThreadPool
19 | from barecat.core import barecat as barecat_
20 | from barecat.core.index import BarecatDirInfo, BarecatFileInfo, Order
21 | from barecat.core.sharder import Sharder
22 | from barecat.progbar import progressbar
23 | 
24 | 
25 | def create_from_stdin_paths(
26 |     target_path, shard_size_limit, zero_terminated=False, overwrite=False, workers=None
27 | ):
28 |     iterator = generate_from_stdin(zero_terminated)
29 |     create(iterator, target_path, shard_size_limit, overwrite, workers)
30 | 
31 | 
32 | def create_recursive(target_path, shard_size_limit, roots, overwrite, strip_root, workers=None):
33 |     iterator = generate_from_walks(roots, strip_root)
34 |     create(iterator, target_path, shard_size_limit, overwrite, workers)
35 | 
36 | 
37 | def generate_from_stdin(zero_terminated=False):
38 |     if zero_terminated:
39 |         input_paths = iterate_zero_terminated(sys.stdin.buffer)
40 |     else:
41 |         input_paths = (l.rstrip('\n') for l in sys.stdin)
42 | 
43 |     for input_path in progressbar(input_paths, desc='Packing files', unit=' files'):
44 |         yield input_path, input_path
45 | 
46 | 
47 | def generate_from_walks(roots, strip_root):
48 |     for root in roots:
49 |         if not strip_root:
50 |             yield root, osp.basename(root)
51 | 
52 |         for dirpath, subdirnames, filenames in os.walk(root):
53 | 
for entryname in itertools.chain(filenames, subdirnames): 54 | full_path = osp.join(dirpath, entryname) 55 | relpath = osp.relpath(full_path, start=root) 56 | if not strip_root: 57 | store_path = osp.join(osp.basename(root), relpath) 58 | else: 59 | store_path = relpath 60 | yield full_path, store_path 61 | 62 | 63 | def create( 64 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False, workers=8 65 | ): 66 | if workers is None: 67 | create_without_workers( 68 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite 69 | ) 70 | else: 71 | create_with_workers( 72 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite, workers 73 | ) 74 | 75 | 76 | def create_without_workers( 77 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False 78 | ): 79 | with barecat_.Barecat( 80 | target_path, 81 | shard_size_limit=shard_size_limit, 82 | readonly=False, 83 | overwrite=overwrite, 84 | append_only=False, 85 | ) as writer: 86 | for filesys_path, store_path in filesys_and_store_path_pairs: 87 | writer.add_by_path(filesys_path, store_path) 88 | 89 | 90 | def create_with_workers( 91 | filesys_and_store_path_pairs, target_path, shard_size_limit, overwrite=False, workers=8 92 | ): 93 | if overwrite and barecat.util.exists(target_path): 94 | barecat.util.remove(target_path) 95 | 96 | with ( 97 | Sharder( 98 | target_path, 99 | shard_size_limit=shard_size_limit, 100 | readonly=False, 101 | append_only=False, 102 | threadsafe=True, 103 | allow_writing_symlinked_shard=False, 104 | ) as sharder, 105 | ConsumedThreadPool( 106 | index_writer_main, main_args=(f'{target_path}-sqlite-index',), max_workers=workers 107 | ) as ctp, 108 | ): 109 | for filesys_path, store_path in filesys_and_store_path_pairs: 110 | statresult = os.stat(filesys_path) 111 | 112 | if stat.S_ISDIR(statresult.st_mode): 113 | dinfo = BarecatDirInfo(path=store_path) 114 | dinfo.fill_from_statresult(statresult) 115 | ctp.submit(userdata=dinfo) 116 | else: 117 | finfo = BarecatFileInfo(path=store_path) 118 | finfo.fill_from_statresult(statresult) 119 | finfo.shard, finfo.offset = sharder.reserve(finfo.size) 120 | ctp.submit( 121 | sharder.add_by_path, 122 | userdata=finfo, 123 | args=(filesys_path, finfo.shard, finfo.offset, finfo.size), 124 | kwargs=dict(raise_if_cannot_fit=True), 125 | ) 126 | 127 | 128 | def index_writer_main(target_path, future_iter): 129 | with barecat_.Index(target_path, readonly=False) as index_writer: 130 | for future in future_iter: 131 | info = future.userdata 132 | if isinstance(info, BarecatDirInfo): 133 | index_writer.add_dir(info) 134 | continue 135 | 136 | shard_real, offset_real, size_real, crc32c = future.result() 137 | info.shard = shard_real 138 | info.offset = offset_real 139 | info.crc32c = crc32c 140 | 141 | if info.size != size_real: 142 | raise ValueError('Size mismatch!') 143 | index_writer.add_file(info) 144 | 145 | 146 | def extract(barecat_path, target_directory): 147 | with barecat_.Barecat(barecat_path) as reader: 148 | for path_in_archive in progressbar(reader, desc='Extracting files', unit=' files'): 149 | target_path = osp.join(target_directory, path_in_archive) 150 | os.makedirs(osp.dirname(target_path), exist_ok=True) 151 | with open(target_path, 'wb') as output_file: 152 | shutil.copyfileobj(reader.open(path_in_archive), output_file) 153 | 154 | 155 | def merge(source_paths, target_path, shard_size_limit, overwrite=False, ignore_duplicates=False): 156 | with barecat_.Barecat( 157 | target_path, 
shard_size_limit=shard_size_limit, readonly=False, overwrite=overwrite
158 |     ) as writer:
159 |         for source_path in source_paths:
160 |             print(f'Merging files from {source_path}')
161 |             writer.merge_from_other_barecat(source_path, ignore_duplicates=ignore_duplicates)
162 | 
163 | 
164 | def merge_symlink(source_paths, target_path, overwrite=False, ignore_duplicates=False):
165 |     index_path = f'{target_path}-sqlite-index'
166 |     if overwrite and osp.exists(index_path):
167 |         os.remove(index_path)
168 | 
169 |     with barecat_.Index(index_path, readonly=False) as index_writer:
170 |         c = index_writer.cursor
171 |         c.execute("COMMIT")
172 |         c.execute('PRAGMA synchronous=OFF')
173 |         c.execute('PRAGMA journal_mode=OFF')
174 | 
175 |         i_out_shard = 0
176 |         for source_path in source_paths:
177 |             index_writer.merge_from_other_barecat(
178 |                 f'{source_path}-sqlite-index', ignore_duplicates=ignore_duplicates
179 |             )
180 |             for shard_path in sorted(glob.glob(f'{source_path}-shard-*')):
181 |                 os.symlink(
182 |                     osp.relpath(shard_path, start=osp.dirname(target_path)),
183 |                     f'{target_path}-shard-{i_out_shard:05d}',
184 |                 )
185 |                 i_out_shard += 1
186 | 
187 | 
188 | def write_index(dictionary, target_path):
189 |     with barecat_.Index(target_path, readonly=False) as index_writer:
190 |         for path, (shard, offset, size) in dictionary.items():
191 |             index_writer.add_file(
192 |                 BarecatFileInfo(path=path, shard=shard, offset=offset, size=size)
193 |             )
194 | 
195 | 
196 | def read_index(path):
197 |     with barecat_.Index(path) as reader:
198 |         return dict(reader.items())
199 | 
200 | 
201 | def iterate_zero_terminated(fileobj):
202 |     partial_path = b''
203 |     while chunk := fileobj.read(4096):
204 |         parts = chunk.split(b'\x00')
205 |         parts[0] = partial_path + parts[0]
206 |         partial_path = parts.pop()
207 | 
208 |         for input_path in parts:
209 |             input_path = input_path.decode()
210 |             yield input_path
211 | 
212 | 
213 | def archive2barecat(src_path, target_path, shard_size_limit, overwrite=False):
214 |     with barecat_.Barecat(
215 |         target_path, shard_size_limit=shard_size_limit, readonly=False, overwrite=overwrite
216 |     ) as writer:
217 |         for file_or_dir_info, fileobj in iter_archive(src_path):
218 |             writer.add(file_or_dir_info, fileobj=fileobj, dir_exist_ok=True)
219 | 
220 | 
221 | def wrap_archive(src_path, target_path, overwrite=False):
222 |     index_path = f'{target_path}-sqlite-index'
223 |     if overwrite and osp.exists(index_path):
224 |         os.remove(index_path)
225 | 
226 |     with barecat_.Index(index_path, readonly=False) as index:
227 |         for file_or_dir_info in iter_archive_nocontent(src_path):
228 |             index.add(file_or_dir_info)
229 | 
230 |     os.symlink(src_path, f'{target_path}-shard-00000')
231 | 
232 | 
233 | def barecat2archive(src_path, target_path):
234 |     with barecat_.Barecat(src_path, readonly=True) as bc:
235 |         with get_archive_writer(target_path) as target_archive:
236 |             infos = bc.index.iter_all_infos(order=Order.PATH)
237 |             num_total = bc.index.num_files + bc.index.num_dirs
238 |             for entry in progressbar(infos, total=num_total, desc='Writing', unit=' entries'):
239 |                 if isinstance(entry, BarecatDirInfo):
240 |                     target_archive.add(entry)
241 |                 else:
242 |                     with bc.open(entry.path) as file_in_barecat:
243 |                         target_archive.add(entry, fileobj=file_in_barecat)
244 | 
245 | 
246 | def print_ncdu_json(path):
247 |     timestamp = time.time()
248 |     import importlib.metadata
249 | 
250 |     progver = importlib.metadata.version('barecat')
251 |     progver = '.'.join(progver.split('.')[:3])
252 | 
253 |     print(f'[1,1,{{"progname":"barecat","progver": "{progver}","timestamp":{timestamp}}},')
254 |     with barecat_.Index(path) as index_reader:
255 |         _print_ncdu_json(index_reader, '')
256 |     print(']')
257 | 
258 | 
259 | def _print_ncdu_json(index_reader, dirpath):
260 |     basename = '/' if dirpath == '' else osp.basename(dirpath)
261 | 
262 |     print('[', json.dumps(dict(name=basename, asize=4096, ino=0)), end='')
263 |     infos = index_reader.listdir_infos(dirpath)
264 |     file_infos = [f for f in infos if isinstance(f, BarecatFileInfo)]
265 |     subdir_infos = [d for d in infos if isinstance(d, BarecatDirInfo)]
266 |     del infos
267 | 
268 |     if file_infos:
269 |         filedump = json.dumps(
270 |             [dict(name=osp.basename(fi.path), asize=fi.size, dsize=fi.size, ino=0) for fi in file_infos]
271 |         )
272 |         print(',', filedump[1:-1], end='')
273 |         del file_infos
274 | 
275 |     for subdir in subdir_infos:
276 |         print(',')
277 |         _print_ncdu_json(index_reader, subdir.path)
278 | 
279 |     print(']', end='')
280 | 
--------------------------------------------------------------------------------
/src/barecat/core/sharder.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import os.path as osp
4 | import shutil
5 | from contextlib import AbstractContextManager
6 | 
7 | import crc32c as crc32c_lib
8 | from barecat.common import FileSection
9 | from barecat.util import (
10 |     copyfileobj,
11 |     copyfileobj_crc32c,
12 |     open_,
13 |     raise_if_readonly,
14 |     reopen,
15 |     write_zeroes,
16 | )
17 | 
18 | 
19 | class Sharder(AbstractContextManager):
20 |     def __init__(
21 |         self,
22 |         path,
23 |         shard_size_limit=None,
24 |         readonly=True,
25 |         append_only=False,
26 |         threadsafe=False,
27 |         allow_writing_symlinked_shard=False,
28 |     ):
29 | 
30 |         self.path = path
31 |         self.readonly = readonly
32 |         self.append_only = append_only
33 |         self.threadsafe = threadsafe
34 |         self.allow_writing_symlinked_shard = allow_writing_symlinked_shard
35 | 
36 |         self.shard_size_limit = shard_size_limit
37 | 
38 |         if readonly:
39 |             self.shard_mode_nonlast = 'rb'
40 |             self.shard_mode_last_existing = 'rb'
41 |             self.shard_mode_new = 'rb'
42 |         elif append_only:
43 |             self.shard_mode_nonlast = 'rb'
44 |             self.shard_mode_last_existing = 'a+b'
45 |             self.shard_mode_new = 'ax+b'
46 |         else:
47 |             self.shard_mode_nonlast = 'r+b'
48 |             self.shard_mode_last_existing = 'r+b'
49 |             self.shard_mode_new = 'x+b'
50 | 
51 |         self._shard_files = None
52 |         if threadsafe:
53 |             import multiprocessing_utils
54 | 
55 |             self.local = multiprocessing_utils.local()
56 |         else:
57 |             self.local = None
58 | 
59 |     # READING
60 |     def readinto_from_address(self, shard, offset, buffer, expected_crc32c=None):
61 |         shard_file = self.shard_files[shard]
62 |         shard_file.seek(offset)
63 |         num_read = shard_file.readinto(buffer)
64 |         if expected_crc32c is not None and crc32c_lib.crc32c(buffer[:num_read]) != expected_crc32c:
65 |             raise ValueError('CRC32C mismatch')
66 |         return num_read
67 | 
68 |     def read_from_address(self, shard, offset, size, expected_crc32c=None):
69 |         shard_file = self.shard_files[shard]
70 |         shard_file.seek(offset)
71 |         data = shard_file.read(size)
72 |         if expected_crc32c is not None and crc32c_lib.crc32c(data) != expected_crc32c:
73 |             raise ValueError('CRC32C mismatch')
74 |         return data
75 | 
76 |     def open_from_address(self, shard, offset, size, mode='r'):
77 |         return FileSection(self.shard_files[shard], offset, size, readonly=mode in ('r', 'rb'))
78 | 
79 |     # WRITING
80 |     @raise_if_readonly
81 |     def add_by_path(self, filesys_path, shard, offset, size, raise_if_cannot_fit=False):
82 |         with 
open(filesys_path, 'rb') as in_file: 83 | return self.add( 84 | shard, offset, size, fileobj=in_file, raise_if_cannot_fit=raise_if_cannot_fit 85 | ) 86 | 87 | @raise_if_readonly 88 | def reopen_current_shard(self, mode): 89 | return self.reopen_shard(self.num_shards - 1, mode) 90 | 91 | @raise_if_readonly 92 | def reopen_shard(self, shard_number, mode): 93 | if mode != 'rb' and shard_number != self.num_shards - 1: 94 | self.raise_if_append_only( 95 | 'Cannot change mode of non-last shard in an append-only Barecat' 96 | ) 97 | self.shard_files[shard_number] = reopen(self.shard_files[shard_number], mode) 98 | return self.shard_files[shard_number] 99 | 100 | @raise_if_readonly 101 | def reopen_shards(self): 102 | for i in range(self.num_shards): 103 | if i == self.num_shards - 1: 104 | mode = self.shard_mode_last_existing 105 | else: 106 | mode = self.shard_mode_nonlast 107 | self.reopen_shard(i, mode) 108 | 109 | @raise_if_readonly 110 | def start_new_shard(self): 111 | self.reopen_current_shard(self.shard_mode_nonlast) 112 | new_shard_file = open_(f'{self.path}-shard-{self.num_shards:05d}', self.shard_mode_new) 113 | self.shard_files.append(new_shard_file) 114 | return new_shard_file 115 | 116 | @raise_if_readonly 117 | def start_new_shard_and_transfer_last_file(self, offset, size): 118 | self.raise_if_readonly('Cannot add to a read-only Barecat') 119 | 120 | old_shard_file = self.reopen_current_shard('r+b') 121 | new_shard_file = open_(f'{self.path}-shard-{self.num_shards:05d}', self.shard_mode_new) 122 | old_shard_file.seek(offset) 123 | copyfileobj(old_shard_file, new_shard_file, size) 124 | old_shard_file.truncate(offset) 125 | self.reopen_current_shard(self.shard_mode_nonlast) 126 | 127 | self.shard_files.append(new_shard_file) 128 | return new_shard_file 129 | 130 | @raise_if_readonly 131 | def add( 132 | self, 133 | shard=None, 134 | offset=None, 135 | size=None, 136 | data=None, 137 | fileobj=None, 138 | bufsize=shutil.COPY_BUFSIZE, 139 | raise_if_cannot_fit=False, 140 | ): 141 | if data is None and fileobj is None: 142 | raise ValueError('Either data or fileobj must be provided') 143 | if data is not None and fileobj is not None: 144 | raise ValueError('Both data and fileobj cannot be provided') 145 | if data is not None and size is not None and size != len(data): 146 | raise ValueError('Specified size does not match the length of the data') 147 | if shard is None and offset is not None: 148 | raise ValueError('Offset cannot be specified without a shard') 149 | if shard is not None and offset is None: 150 | raise ValueError('Shard cannot be specified without an offset') 151 | 152 | if size is None and data is not None: 153 | size = len(data) 154 | 155 | if shard is None: 156 | shard_file = self.shard_files[-1] 157 | shard = self.num_shards - 1 158 | offset = shard_file.seek(0, os.SEEK_END) 159 | else: 160 | self.ensure_open_shards(shard) 161 | shard_file = self.shard_files[shard] 162 | shard_file.seek(offset) 163 | 164 | offset_real = offset 165 | shard_real = shard 166 | if size is not None: 167 | if size > self.shard_size_limit: 168 | raise ValueError(f'File is too large to fit into a shard') 169 | if offset + size > self.shard_size_limit: 170 | if raise_if_cannot_fit: 171 | raise ValueError(f'File does not fit in the shard') 172 | shard_file = self.start_new_shard() 173 | offset_real = 0 174 | shard_real = self.num_shards - 1 175 | 176 | if data is not None: 177 | if not isinstance(data, (bytes, bytearray, memoryview)): 178 | raise ValueError( 179 | 'Data must be bytes, 
bytearray or memoryview. Are you using auto_codec/register_codec wrong?' 180 | ) 181 | shard_file.write(data) 182 | crc32c = crc32c_lib.crc32c(data) 183 | size_real = len(data) 184 | else: 185 | size_real, crc32c = copyfileobj_crc32c(fileobj, shard_file, size, bufsize) 186 | if size is not None and size != size_real: 187 | raise ValueError(f'Size mismatch! Expected {size}, got only {size_real}') 188 | 189 | if offset_real + size_real > self.shard_size_limit: 190 | if raise_if_cannot_fit: 191 | 192 | raise ValueError('File does not fit in the shard') 193 | self.start_new_shard_and_transfer_last_file(offset_real, size_real) 194 | offset_real = 0 195 | shard_real = self.num_shards - 1 196 | 197 | return shard_real, offset_real, size_real, crc32c 198 | 199 | def reserve(self, size): 200 | if size > self.shard_size_limit: 201 | raise ValueError(f'File is too large to fit into a shard') 202 | 203 | shard_file = self.shard_files[-1] 204 | offset = shard_file.seek(0, os.SEEK_END) 205 | if offset + size > self.shard_size_limit: 206 | shard_file = self.start_new_shard() 207 | offset = 0 208 | 209 | shard_file.seek(offset) 210 | write_zeroes(shard_file, size) 211 | shard_file.flush() 212 | return self.num_shards - 1, offset 213 | 214 | def ensure_open_shards(self, shard_id): 215 | if self.num_shards < shard_id + 1: 216 | for i in range(self.num_shards, shard_id + 1): 217 | self.shard_files.append( 218 | open_(f'{self.path}-shard-{i:05d}', mode=self.shard_mode_nonlast) 219 | ) 220 | 221 | def open_shard_files(self): 222 | shard_paths = sorted(glob.glob(f'{self.path}-shard-?????')) 223 | if ( 224 | not self.readonly 225 | and not self.allow_writing_symlinked_shard 226 | and any(osp.islink(p) for p in shard_paths) 227 | ): 228 | raise ValueError( 229 | 'Writing symlinked shards was disabled in this Barecat ' 230 | '(allow_writing_symlinked_shard on the constructor)' 231 | ) 232 | 233 | shard_files_nonlast = [open_(p, mode=self.shard_mode_nonlast) for p in shard_paths[:-1]] 234 | last_shard_name = f'{self.path}-shard-{len(shard_files_nonlast):05d}' 235 | try: 236 | last_shard_file = open_(last_shard_name, mode=self.shard_mode_last_existing) 237 | except FileNotFoundError: 238 | if self.readonly: 239 | raise 240 | last_shard_file = open_(last_shard_name, mode=self.shard_mode_new) 241 | 242 | return shard_files_nonlast + [last_shard_file] 243 | 244 | def truncate_all_to_logical_size(self, logical_shard_ends): 245 | shard_files = self.shard_files 246 | for i in range(self.num_shards - 1, 0, -1): 247 | if logical_shard_ends[i] == 0: 248 | shard_files[i].truncate(0) 249 | shard_files[i].close() 250 | os.remove(shard_files[i].name) 251 | del shard_files[i] 252 | else: 253 | break 254 | for i, f in enumerate(self.shard_files): 255 | f.truncate(logical_shard_ends[i]) 256 | self.reopen_current_shard(self.shard_mode_last_existing) 257 | 258 | def close(self): 259 | for f in self.shard_files: 260 | f.close() 261 | 262 | def raise_if_readonly(self, message): 263 | if self.readonly: 264 | raise ValueError(message) 265 | 266 | def raise_if_append_only(self, message): 267 | if self.append_only: 268 | raise ValueError(message) 269 | 270 | def physical_shard_end(self, shard_number): 271 | return self.shard_files[shard_number].seek(0, os.SEEK_END) 272 | 273 | @property 274 | def num_shards(self): 275 | return len(self.shard_files) 276 | 277 | @property 278 | def total_physical_size_seek(self): 279 | return sum(self.physical_shard_end(i) for i in range(self.num_shards)) 280 | 281 | @property 282 | def 
total_physical_size_stat(self): 283 | return sum(osp.getsize(f.name) for f in self.shard_files) 284 | 285 | # THREADSAFE 286 | @property 287 | def shard_files(self): 288 | if self.local is None: 289 | if self._shard_files is None: 290 | self._shard_files = self.open_shard_files() 291 | return self._shard_files 292 | try: 293 | return self.local.shard_files 294 | except AttributeError: 295 | self.local.shard_files = self.open_shard_files() 296 | return self.local.shard_files 297 | 298 | def __exit__(self, exc_type, exc_val, exc_tb): 299 | self.close() 300 | -------------------------------------------------------------------------------- /docs/abbrev_long.bib: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%% Journals %%%%%%%%%%%%%%%% 2 | @string{IJCV = "International Journal of Computer Vision (IJCV)"} 3 | @string{CVIU = "Computer Vision and Image Understanding (CVIU)"} 4 | @string{PR = "Pattern Recognition"} 5 | @string{PRL = "Pattern Recognition Letters"} 6 | 7 | @string{ML = "Machine Learning"} 8 | @string{AI = "Artificial Intelligence"} 9 | @string{AR = "Autonomous Robots"} 10 | @string{MVA = "Machine Vision and Applications"} 11 | @string{IVC = "Image and Vision Computing"} 12 | @string{BBS = "Behavioral and Brain Sciences (BBS)"} 13 | @string{VR = "Vision Research"} 14 | @string{IR = "Information Retrieval"} 15 | @string{NN = "Neural Networks"} 16 | @string{CAG = "Computers \& Graphics"} 17 | @string{CVGIP = "Computer Vision, Graphics, and Image Processing (CVGIP)"} 18 | @string{CVGIPIU = "CVGIP: Image Understanding"} 19 | @string{PP = "Perception \& Psychophysics"} 20 | @string{FTCGV = "Foundations and Trends in Computer Graphics and Vision"} 21 | @string{AdvRob = "Advanced Robotics"} 22 | 23 | @string{Nature = "Nature"} 24 | @string{Science = "Science"} 25 | @string{Mechatronics = "Mechatronics"} 26 | @string{NRN = "Nature Reviews Neuroscience"} 27 | @string{NM = "Nature Methods"} 28 | @string{PHY = "Physical Review E"} 29 | @string{PsychRev = "Psychological Review"} 30 | 31 | @string{JMLR = "Journal of Machine Learning Research (JMLR)"} 32 | @string{JSC = "Journal of Scientific Computing"} 33 | @string{JCN = "Journal of Cognitive Neuroscience"} 34 | @string{JEPHPP = "Journal of Experimental Psychology: Human Perception and Performance"} 35 | @string{JECP = "Journal of Experimental Child Psychology"} 36 | @string{JB = "Journal of Biomechanics"} 37 | 38 | @string{EURASIP = "EURASIP Journal on Advances in Signal Processing"} 39 | @string{PRESENCE = "Presence: Teleoperators and Virtual Environments"} 40 | @string{BMB = "The Bulletin of Mathematical Biophysics"} 41 | 42 | @string{TVC = "The Visual Computer"} 43 | @string{TJSC = "The Journal of Supercomputing"} 44 | 45 | % IEEE 46 | @string{PIEEE = "Proceedings of the IEEE"} 47 | @string{RAL = "IEEE Robotics and Automation Letters (RA-L)"} 48 | @string{CGA = "IEEE Computer Graphics and Applications"} 49 | @string{IEEEA = "IEEE Access"} 50 | @string{TPAMI = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"} 51 | @string{PAMI = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"} 52 | @string{TC = "IEEE Transactions on Communications"} 53 | @string{TCyb = "IEEE Transactions on Cybernetics"} 54 | @string{TSE = "IEEE Transactions on Software Engineering"} 55 | @string{TIV = "IEEE Transactions on Intelligent Vehicles"} 56 | @string{TIP = "IEEE Transactions on Image Processing"} 57 | @string{TOR = "IEEE Transactions on Robotics"} 58 | 
@string{TAC = "IEEE Transactions on Automatic Control"} 59 | @string{TITS = "IEEE Transactions on Intelligent Transportation Systems (T-ITS)"} 60 | @string{TOC = "IEEE Transactions on Computers"} 61 | @string{TVT = "IEEE Transactions on Vehicular Technologies"} 62 | @string{TNN = "IEEE Transactions on Neural Networks"} 63 | @string{THMS = "IEEE Transactions on Human-Machine Systems"} 64 | @string{TCSVT = "IEEE Transactions on Circuits and Systems for Video Technology"} 65 | @string{TBIOM = "IEEE Transactions on Biometrics, Behavior, and Identity Science (T-BIOM)"} 66 | @string{TIT = "IEEE Transactions on Information Theory"} 67 | @string{TVCG = "IEEE Transactions on Visualization and Computer Graphics (TVCG)"} 68 | @string{TSSC = "IEEE Transactions on Systems Science and Cybernetics"} 69 | @string{IRETIT= "IRE Transactions on Information Theory"} 70 | @string{IJTEHM= "IEEE Journal of Translational Engineering in Health and Medicine"} 71 | 72 | 73 | % ACM 74 | @string{TOCHI = "ACM Transactions on Computer-Human Interaction (TOCHI)"} 75 | @string{TOG = "ACM Transactions on Graphics (TOG)"} 76 | @string{CACM = "Communications of the ACM (CACM)"} 77 | @string{IMWUT = "Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)"} 78 | @string{CSUR = "ACM Computing Surveys (CSUR)"} 79 | @string{THRI = "ACM Transactions on Human-Robot Interaction"} 80 | 81 | @string{AnnStat = "Annals of Statistics"} 82 | @string{JC = "Journal of Classification"} 83 | @string{IJRR = "International Journal of Robotics Research (IJRR)"} 84 | @string{RSS = "Robotics: Science and Systems (RSS)"} 85 | 86 | @string{PLOSOne = "PLOS One"} 87 | @string{SMO = "Sports Medicine -- Open"} 88 | @string{IJMIR = "International Journal of Multimedia Information Retrieval (IJMIR)"} 89 | 90 | @string{BiolCyb = "Biological Cybernetics"} 91 | @string{Psychomet = "Psychometrika"} 92 | @string{Biotelem = "Biotelemetry"} 93 | @string{NC = "Neural Computation"} 94 | @string{Neurocomputing = "Neurocomputing"} 95 | @string{PhilosMag = "London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science"} 96 | 97 | @string{TST = "Tsinghua Science and Technology"} 98 | @string{VRIH = "Virtual Reality \& Intelligent Hardware (VRIH)"} 99 | @string{AR = "Autonomous Robots Journal"} 100 | @string{ISPRS = "ISPRS Journal of Photogrammetry and Remote Sensing (P\&RS)"} 101 | @string{MMS = "Multimedia Systems"} 102 | @string{SSS = "Social Studies of Science"} 103 | @string{SIREV = "SIAM Review"} 104 | 105 | @string{Sensors = "Sensors"} 106 | @string{Electronics = "Electronics"} 107 | 108 | @string{ARVC = "Annual Review of Vision Science"} 109 | @string{ARP = "Annual Review of Psychology"} 110 | @string{PRSLB = "Proceedings of the Royal Society of London. 
Series B, Biological Sciences"} 111 | @string{PRSA = "Proceedings of the Royal Society A"} 112 | 113 | @string{TJP = "The Journal of Physiology"} 114 | @string{USSRCMMP = "USSR Computational Mathematics and Mathematical Physics"} 115 | @string{CRHSAS = "Comptes rendus hebdomadaires des séances de l'Académie des sciences"} 116 | 117 | 118 | %%%%%%%%%%%%%%%%%%%%% Conferences %%%%%%%%%%%%%% 119 | @string{CVPR = "IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"} 120 | @string{ICCV = "IEEE/CVF International Conference on Computer Vision (ICCV)"} 121 | @string{WACV = "IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)"} 122 | 123 | @string{ECCV = "European Conference on Computer Vision (ECCV)"} 124 | @string{ACCV = "Asian Conference on Computer Vision (ACCV)"} 125 | @string{BMVC = "British Machine Vision Conference (BMVC)"} 126 | @string{DAGM = "DAGM Annual Pattern Recognition Symposium"} 127 | @string{GCPR = "DAGM German Conference on Pattern Recognition (GCPR)"} 128 | 129 | @string{NIPS = "Advances in Neural Information Processing Systems (NIPS)"} 130 | @string{NeurIPS = "Advances in Neural Information Processing Systems (NeurIPS)"} 131 | @string{NeurIPSDB = "Neural Information Processing Systems: Datasets and Benchmarks Track"} 132 | 133 | @string{TDV = "International Conference on 3D Vision (3DV)"} 134 | @string{ICML = "International Conference on Machine Learning (ICML)"} 135 | @string{ICLR = "International Conference on Learning Representations (ICLR)"} 136 | @string{ICPR = "International Conference on Pattern Recogntion (ICPR)"} 137 | @string{CAIP = "International Conference on Analysis of Images and Patterns (CAIP)"} 138 | @string{ICIAP = "International Conference on Image Analysis and Processing (ICIAP)"} 139 | @string{ICIAR = "International Conference on Image Analysis and Recognition (ICIAR)"} 140 | 141 | @string{ISCS = "IEEE International Symposium on Circuits and Systems (ISCAS)"} 142 | @string{FG = "IEEE International Conference on Automatic Face and Gesture Recognition (FG)"} 143 | @string{CDC = "IEEE Conference on Decision and Control (CDC)"} 144 | @string{IROS = "IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)"} 145 | @string{ICRA = "IEEE International Conference on Robotics and Automation (ICRA)"} 146 | @string{IVS = "IEEE Intelligent Vehicles Symposium (IV)"} 147 | @string{ICASSP = "IEEE Conference on Acoustics, Speech and Signal Processing (ICASSP)"} 148 | @string{ITW = "IEEE Information Theory Workshop (ITW)"} 149 | @string{ICIP = "IEEE International Conference on Image Processing (ICIP)"} 150 | @string{ICME = "IEEE International Conference on Multimedia \& Expo (ICME)"} 151 | @string{CITS = "IEEE Conference on Intelligent Transportation Systems (ITSC)"} 152 | @string{RSS = "Robotics: Science and Systems (RSS)"} 153 | 154 | @string{SIGGRAPH = "ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH)"} 155 | @STRING{SIGGRAPHAsia = "ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH Asia)"} 156 | @string{CHI = "ACM Conference on Human Factors in Computing Systems (CHI)"} 157 | @string{MMSys = "ACM Multimedia Systems Conference (MMSys)"} 158 | @string{SIGMOD = "ACM SIGMOD International Conference on Management of Data"} 159 | @string{MM = "ACM International Conference on Multimedia"} 160 | @string{KDD = "ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)"} 161 | @string{AAAI = "AAAI Conference on Artificial Intelligence"} 162 | @string{AAAI = "AAAI Conference on Artificial Intelligence"} 
163 | @string{IJCAI = "International Joint Conference on Artificial Intelligence (IJCAI)"} 164 | 165 | @string{ACC = "American Control Conference (ACC)"} 166 | @string{WAPCV = "International Workshop on Attention in Cognitive Systems (WAPCV)"} 167 | @string{COLT92 = "Annual Workshop on Computational Learning Theory (COLT)"} 168 | 169 | @string{SIBGRAPI = "SIBGRAPI Conference on Graphics, Patterns and Images"} 170 | @string{ICIRA = "International Conference on Intelligent Robotics and Applications (ICIRA)"} 171 | 172 | @string{AISTAT = "International Conference on Artificial Intelligence and Statistics (AISTATS)"} 173 | @string{AISTATS = "International Conference on Artificial Intelligence and Statistics (AISTATS)"} 174 | 175 | @string{SCIA = "Scandinavian Conference on Image Analysis (SCIA)"} 176 | @string{EUROCOLT = "European Conference on Computational Learning Theory (EuroCOLT)"} 177 | @string{ICVS = "International Conference on Computer Vision Systems (ICVS)"} 178 | @string{EMMCVPR = "International Conference on Energy Minimization Methods in Computer Vision and Pattern Recognition (EMMCVPR)"} 179 | @string{IJCNN = "International Joint Conference on Neural Networks (IJCNN)"} 180 | 181 | @string{MICCAI = "International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)"} 182 | @string{ICANN = "International Conference on Artificial Neural Networks (ICANN)"} 183 | @string{ISMIR = "International Society for Music Information Retrieval Conference (ISMIR)"} 184 | @string{AMDO = "International Conference on Articulated Motion and Deformable Objects (AMDO)"} 185 | @string{Allerton = "Annual Allerton Conference on Communication, Control, and Computing"} 186 | @string{OSDI = "USENIX Symposium on Operating Systems Design and Implementation (OSDI)"} 187 | 188 | @string{BRACIS = "Brazilian Conference on Intelligent Systems (BRACIS)"} 189 | @string{MIDL = "Medical Imaging with Deep Learning (MIDL)"} 190 | @string{TDBODYTECH = "International Conference and Exhibition on 3D Body Scanning and Processing Technologies (3DBODY.TECH)"} 191 | @string{IAS = "International Conference on Intelligent Autonomous Systems"} 192 | @string{CoRL = "Conference on Robot Learning"} 193 | @string{CRV = "Conference on Computer and Robot Vision"} 194 | @string{ICONIP = "International Conference on Neural Information Processing"} 195 | @string{SGP = "Symposium on Geometry Processing"} 196 | 197 | 198 | @string{WACV_until_2016 = "IEEE Workshop on Applications of Computer Vision (WACV)"} 199 | %%%%%%%%%%%%%%%%%%%%% Workshops %%%%%%%%%%%%%% 200 | @string{ICCVW = "IEEE International Conference on Computer Vision -- Workshops (ICCVW)"} 201 | @string{ECCVW = "European Conference on Computer Vision -- Workshops (ECCVW)"} 202 | @string{CVPRW = "IEEE Conference on Computer Vision and Pattern Recognition -- Workshops (CVPRW)"} 203 | @string{IROSW = "IEEE/RSJ International Conference on Intelligent Robots and Systems -- Workshops (IROSW)"} 204 | @string{WACVW = "IEEE Winter Conference on Applications of Computer Vision -- Workshops (WACVW)"} 205 | @string{MICCAIW = "International Conference on Medical Image Computing and Computer Assisted Intervention -- Workshops (MICCAIW)"} 206 | 207 | @string{MMWVSCC = "ACM Multimedia Conference (MM) -- Workshop on Visual Analysis in Smart and Connected Communities (VSCC)"} 208 | -------------------------------------------------------------------------------- /src/barecat/common.py: -------------------------------------------------------------------------------- 1 | 
import io 2 | import os 3 | from datetime import datetime 4 | from enum import Flag, auto 5 | from typing import Union, TYPE_CHECKING, Optional 6 | from barecat.util import datetime_to_ns, normalize_path, ns_to_datetime 7 | 8 | if TYPE_CHECKING: 9 | from barecat import BarecatEntryInfo 10 | 11 | SHARD_SIZE_UNLIMITED = (1 << 63) - 1 #: An extremely large integer, representing unlimited size 12 | 13 | 14 | class BarecatEntryInfo: 15 | """ 16 | Base class for file and directory information classes. 17 | 18 | The two subclasses are :class:`barecat.BarecatFileInfo` and :class:`barecat.BarecatDirInfo`. 19 | 20 | Args: 21 | path: path to the file or directory 22 | mode: file mode, i.e. permissions 23 | uid: user ID 24 | gid: group ID 25 | mtime_ns: last modification time in nanoseconds since the Unix epoch 26 | """ 27 | 28 | __slots__ = ('_path', 'mode', 'uid', 'gid', 'mtime_ns') 29 | 30 | def __init__( 31 | self, 32 | path: Optional[str] = None, 33 | mode: Optional[int] = None, 34 | uid: Optional[int] = None, 35 | gid: Optional[int] = None, 36 | mtime_ns: Optional[Union[int, datetime]] = None, 37 | ): 38 | self._path = normalize_path(path) 39 | self.mode = mode 40 | """File mode, i.e., permissions.""" 41 | 42 | self.uid = uid 43 | """User ID.""" 44 | 45 | self.gid = gid 46 | """Group ID.""" 47 | 48 | self.mtime_ns = mtime_ns 49 | """Last modification time in nanoseconds since the Unix epoch.""" 50 | 51 | if isinstance(self.mtime_ns, datetime): 52 | self.mtime_ns = datetime_to_ns(self.mtime_ns) 53 | 54 | @property 55 | def path(self): 56 | """Path to the file or directory. The path is normalized on assignment.""" 57 | return self._path 58 | 59 | @path.setter 60 | def path(self, value): 61 | self._path = normalize_path(value) 62 | 63 | @property 64 | def mtime_dt(self) -> Optional[datetime]: 65 | """Last modification time as a datetime object.""" 66 | return ns_to_datetime(self.mtime_ns) if self.mtime_ns else None 67 | 68 | @mtime_dt.setter 69 | def mtime_dt(self, dt: datetime): 70 | self.mtime_ns = datetime_to_ns(dt) 71 | 72 | def update_mtime(self): 73 | """Update the last modification time to the current time.""" 74 | self.mtime_dt = datetime.now() 75 | 76 | def fill_from_statresult(self, s: os.stat_result): 77 | """Fills the metadata information from a stat result, obtained from the file system. 78 | 79 | Args: 80 | s: stat result object to fill the metadata from 81 | """ 82 | self.mode = s.st_mode 83 | self.uid = s.st_uid 84 | self.gid = s.st_gid 85 | self.mtime_ns = s.st_mtime_ns 86 | 87 | @classmethod 88 | def row_factory(cls, cursor, row): 89 | """Factory method for creating instances from SQLite query results. 90 | 91 | Args: 92 | cursor: SQLite cursor object 93 | row: row from the query result 94 | """ 95 | 96 | # Raw construction without any of that property business or validation, just for speed 97 | instance = cls.__new__(cls) 98 | for field, value in zip(cursor.description, row): 99 | fieldname = field[0] 100 | if fieldname == 'path': 101 | instance._path = value 102 | else: 103 | object.__setattr__(instance, fieldname, value) 104 | return instance 105 | 106 | 107 | class BarecatFileInfo(BarecatEntryInfo): 108 | """ 109 | Describes file information such as path, location in the shards and metadata. 110 | 111 | This class is used both when retrieving existing file information and when adding new files. 
112 | 113 | Args: 114 | path: path to the file inside the archive 115 | mode: file mode, i.e., permissions 116 | uid: user ID 117 | gid: group ID 118 | mtime_ns: last modification time in nanoseconds since the Unix epoch 119 | shard: shard number 120 | offset: offset within the shard in bytes 121 | size: size of the file in bytes 122 | crc32c: CRC32C checksum of the file contents 123 | """ 124 | 125 | __slots__ = ('shard', 'offset', 'size', 'crc32c') 126 | 127 | def __init__( 128 | self, 129 | path: Optional[str] = None, 130 | mode: Optional[int] = None, 131 | uid: Optional[int] = None, 132 | gid: Optional[int] = None, 133 | mtime_ns: Optional[Union[int, datetime]] = None, 134 | shard: Optional[int] = None, 135 | offset: Optional[int] = None, 136 | size: Optional[int] = None, 137 | crc32c: Optional[int] = None, 138 | ): 139 | super().__init__(path, mode, uid, gid, mtime_ns) 140 | self.shard = shard 141 | """Shard number where the file is located.""" 142 | 143 | self.offset = offset 144 | """Offset within the shard in bytes.""" 145 | 146 | self.size = size 147 | """Size of the file in bytes.""" 148 | 149 | self.crc32c = crc32c 150 | """CRC32C checksum of the file contents.""" 151 | 152 | def asdict(self) -> dict: 153 | """Returns a dictionary representation of the file information. 154 | 155 | Returns: 156 | Dictionary with keys 'path', 'shard', 'offset', 'size', 'crc32c', 'mode', 'uid', 157 | 'gid', 'mtime_ns' 158 | """ 159 | return dict( 160 | path=self.path, 161 | shard=self.shard, 162 | offset=self.offset, 163 | size=self.size, 164 | crc32c=self.crc32c, 165 | mode=self.mode, 166 | uid=self.uid, 167 | gid=self.gid, 168 | mtime_ns=self.mtime_ns, 169 | ) 170 | 171 | def fill_from_statresult(self, s: os.stat_result): 172 | """Fills the file metadata information from a stat result, obtained from the file system. 173 | 174 | Args: 175 | s: stat result object to fill the metadata from 176 | """ 177 | super().fill_from_statresult(s) 178 | self.size = s.st_size 179 | 180 | @property 181 | def end(self) -> int: 182 | """End position of the file in the shard.""" 183 | return self.offset + self.size 184 | 185 | 186 | class BarecatDirInfo(BarecatEntryInfo): 187 | """ 188 | Describes directory information such as path, metadata and statistics. 189 | 190 | This class is used both when retrieving existing directory information and when adding new 191 | directories. 
192 | 
193 |     Args:
194 |         path: path to the directory inside the archive
195 |         mode: directory mode, i.e., permissions
196 |         uid: user ID
197 |         gid: group ID
198 |         mtime_ns: last modification time in nanoseconds since the Unix epoch
199 |         num_subdirs: number of subdirectories in the directory
200 |         num_files: number of files in the directory
201 |         size_tree: total size of the directory contents in bytes
202 |         num_files_tree: total number of files in the directory and its subdirectories
203 |     """
204 | 
205 |     __slots__ = ('num_subdirs', 'num_files', 'size_tree', 'num_files_tree')
206 | 
207 |     def __init__(
208 |         self,
209 |         path: Optional[str] = None,
210 |         mode: Optional[int] = None,
211 |         uid: Optional[int] = None,
212 |         gid: Optional[int] = None,
213 |         mtime_ns: Optional[Union[int, datetime]] = None,
214 |         num_subdirs: Optional[int] = None,
215 |         num_files: Optional[int] = None,
216 |         size_tree: Optional[int] = None,
217 |         num_files_tree: Optional[int] = None,
218 |     ):
219 |         super().__init__(path, mode, uid, gid, mtime_ns)
220 |         self.num_subdirs = num_subdirs
221 |         """Number of immediate subdirectories in the directory."""
222 | 
223 |         self.num_files = num_files
224 |         """Number of immediate files in the directory."""
225 | 
226 |         self.size_tree = size_tree
227 |         """Total size of the directory's contents (recursively) in bytes."""
228 | 
229 |         self.num_files_tree = num_files_tree
230 |         """Total number of files in the directory and its subdirectories, recursively."""
231 | 
232 |     def asdict(self) -> dict:
233 |         """Returns a dictionary representation of the directory information.
234 | 
235 |         Returns:
236 |             Dictionary with keys 'path', 'num_subdirs', 'num_files', 'size_tree', 'num_files_tree',
237 |             'mode', 'uid', 'gid', 'mtime_ns'
238 |         """
239 |         return dict(
240 |             path=self.path,
241 |             num_subdirs=self.num_subdirs,
242 |             num_files=self.num_files,
243 |             size_tree=self.size_tree,
244 |             num_files_tree=self.num_files_tree,
245 |             mode=self.mode,
246 |             uid=self.uid,
247 |             gid=self.gid,
248 |             mtime_ns=self.mtime_ns,
249 |         )
250 | 
251 |     @property
252 |     def num_entries(self) -> int:
253 |         """Total number of entries in the directory, including subdirectories and files."""
254 |         return self.num_subdirs + self.num_files
255 | 
256 |     def fill_from_statresult(self, s: os.stat_result):
257 |         """Fills the directory metadata information from a stat result, from the file system.
258 | 
259 |         Args:
260 |             s: stat result object to fill the metadata from
261 |         """
262 |         super().fill_from_statresult(s)
263 |         self.num_subdirs = s.st_nlink - 2
264 | 
265 | 
266 | class Order(Flag):
267 |     """Ordering specification for file and directory listings.
268 | 
269 |     The ordering can be by address (shard and offset), path, or random. The order can be ascending
270 |     or descending. The default order is ANY, which is the order in which SQLite yields rows.
271 |     """
272 | 
273 |     ANY = auto()
274 |     """Default order, as returned by SQLite"""
275 | 
276 |     RANDOM = auto()
277 |     """Randomized order"""
278 | 
279 |     ADDRESS = auto()
280 |     """Order by shard and offset position"""
281 | 
282 |     PATH = auto()
283 |     """Alphabetical order by path"""
284 | 
285 |     DESC = auto()
286 |     """Descending order"""
287 | 
288 |     def as_query_text(self) -> str:
289 |         """Returns the SQL ORDER BY clause corresponding to the ordering specification."""
290 | 
291 |         if self & Order.ADDRESS and self & Order.DESC:
292 |             return ' ORDER BY shard DESC, offset DESC'
293 |         elif self & Order.ADDRESS:
294 |             return ' ORDER BY shard, offset'
295 |         elif self & Order.PATH and self & Order.DESC:
296 |             return ' ORDER BY path DESC'
297 |         elif self & Order.PATH:
298 |             return ' ORDER BY path'
299 |         elif self & Order.RANDOM:
300 |             return ' ORDER BY RANDOM()'
301 |         return ''
302 | 
303 | 
304 | class FileSection(io.IOBase):
305 |     """File-like object representing a section of a file.
306 | 
307 |     Args:
308 |         file: file-like object to read from or write to
309 |         start: start position of the section in the file
310 |         size: size of the section
311 |         readonly: whether the section should be read-only
312 |     """
313 | 
314 |     def __init__(self, file: io.RawIOBase, start: int, size: int, readonly: bool = True):
315 |         self.file = file
316 |         self.start = start
317 |         self.end = start + size
318 |         self.position = start
319 |         self.readonly = readonly
320 | 
321 |     def read(self, size: int = -1) -> bytes:
322 |         """Read bytes from the section, starting from the current position.
323 | 
324 |         Args:
325 |             size: number of bytes to read, or -1 to read until the end of the section
326 | 
327 |         Returns:
328 |             Bytes read from the section.
329 |         """
330 |         if size == -1:
331 |             size = self.end - self.position
332 | 
333 |         size = min(size, self.end - self.position)
334 |         self.file.seek(self.position)
335 |         data = self.file.read(size)
336 |         self.position += len(data)
337 |         return data
338 | 
339 |     def readinto(self, buffer: Union[bytearray, memoryview]) -> int:
340 |         """Read bytes into a buffer from the section, starting from the current position.
341 | 
342 |         Will read up to the length of the buffer or until the end of the section.
343 | 
344 |         Args:
345 |             buffer: destination buffer to read into
346 | 
347 |         Returns:
348 |             Number of bytes read into the buffer.
349 |         """
350 |         size = min(len(buffer), self.end - self.position)
351 |         if size == 0:
352 |             return 0
353 | 
354 |         self.file.seek(self.position)
355 |         num_read = self.file.readinto(buffer[:size])
356 |         self.position += num_read
357 |         return num_read
358 | 
359 |     def readall(self) -> bytes:
360 |         """Read all remaining bytes from the section.
361 | 
362 |         Returns:
363 |             Bytes read from the section.
364 |         """
365 | 
366 |         return self.read()
367 | 
368 |     def readable(self):
369 |         """Always returns True, since the section is always readable."""
370 |         return True
371 | 
372 |     def writable(self):
373 |         return not self.readonly
374 | 
375 |     def write(self, data: Union[bytes, bytearray, memoryview]) -> int:
376 |         """Write data to the section, starting from the current position.
377 | 
378 |         Args:
379 |             data: data to write to the section
380 | 
381 |         Returns:
382 |             Number of bytes written to the section.
383 | 384 | Raises: 385 | PermissionError: if the section is read-only 386 | EOFError: if the write would go past the end of the section 387 | """ 388 | 389 | if self.readonly: 390 | raise PermissionError('Cannot write to a read-only file section') 391 | 392 | if self.position + len(data) > self.end: 393 | raise EOFError('Cannot write past the end of the section') 394 | 395 | self.file.seek(self.position) 396 | n_written = self.file.write(data) 397 | self.position += n_written 398 | return n_written 399 | 400 | def readline(self, size: int = -1) -> bytes: 401 | size = min(size, self.end - self.position) 402 | if size == -1: 403 | size = self.end - self.position 404 | 405 | self.file.seek(self.position) 406 | data = self.file.readline(size) 407 | 408 | self.position += len(data) 409 | return data 410 | 411 | def tell(self): 412 | return self.position - self.start 413 | 414 | def seek(self, offset, whence=0): 415 | if whence == io.SEEK_SET: 416 | new_position = self.start + offset 417 | elif whence == io.SEEK_CUR: 418 | new_position = self.position + offset 419 | elif whence == io.SEEK_END: 420 | new_position = self.end + offset 421 | else: 422 | raise ValueError(f"Invalid value for whence: {whence}") 423 | 424 | if new_position < self.start or new_position > self.end: 425 | raise EOFError("Seek position out of bounds") 426 | 427 | self.position = new_position 428 | return self.position - self.start 429 | 430 | def close(self): 431 | """Close the file section, this is a no-op, since the real shard file is not closed.""" 432 | pass 433 | 434 | @property 435 | def size(self) -> int: 436 | """Size of the section in bytes.""" 437 | return self.end - self.start 438 | 439 | def __exit__(self, exc_type, exc_val, exc_tb): 440 | self.close() 441 | -------------------------------------------------------------------------------- /src/barecat/viewerqt6.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import os.path as osp 4 | import pprint 5 | import re 6 | import shutil 7 | import sys 8 | from typing import List 9 | 10 | import msgpack_numpy 11 | from PyQt6.QtCore import QBuffer, QByteArray, QMimeData, QModelIndex, Qt, pyqtSlot 12 | from PyQt6.QtGui import ( 13 | QClipboard, 14 | QFont, 15 | QFontMetrics, 16 | QImageReader, 17 | QPixmap, 18 | QStandardItem, 19 | QStandardItemModel, 20 | ) 21 | from PyQt6.QtWidgets import ( 22 | QAbstractItemView, 23 | QApplication, 24 | QFileDialog, 25 | QHBoxLayout, 26 | QHeaderView, 27 | QLabel, 28 | QMenu, 29 | QScrollArea, 30 | QSplitter, 31 | QStyleFactory, 32 | QTableView, 33 | QTreeView, 34 | QVBoxLayout, 35 | QWidget, 36 | ) 37 | 38 | import barecat 39 | from barecat.common import BarecatDirInfo, BarecatFileInfo 40 | 41 | 42 | def main(): 43 | app = QApplication(sys.argv) 44 | app.setStyle(QStyleFactory.create(QApplication.style().objectName())) 45 | 46 | parser = argparse.ArgumentParser(description='View images stored in a barecat archive.') 47 | parser.add_argument('path', type=str, help='path to load from') 48 | args = parser.parse_args() 49 | viewer = BarecatViewer(args.path) 50 | viewer.show() 51 | sys.exit(app.exec()) 52 | 53 | 54 | class BarecatViewer(QWidget): 55 | def __init__(self, path): 56 | super().__init__() 57 | self.file_reader = barecat.Barecat(path) 58 | self.barecat_path = path 59 | self.tree = QTreeView() 60 | self.tree.setEditTriggers(QAbstractItemView.EditTrigger.NoEditTriggers) 61 | 62 | self.file_table = self.create_file_table() 63 | self.content_viewer = 
ContentViewer() 64 | self.content_viewer.label.setWordWrap(True) 65 | font = QFont("Courier New") # Replace with the desired monospace font 66 | self.content_viewer.label.setFont(font) 67 | 68 | splitter = QSplitter() 69 | splitter.addWidget(self.tree) 70 | splitter.addWidget(self.file_table) 71 | splitter.addWidget(self.content_viewer) 72 | splitter.setSizes([650, 650, 1000]) 73 | layout = QHBoxLayout() 74 | layout.addWidget(splitter) 75 | self.setLayout(layout) 76 | 77 | self.resize(2400, 800) 78 | 79 | self.fill_tree() 80 | self.tree.selectionModel().selectionChanged.connect(self.update_file_table) 81 | self.tree.activated.connect(self.expand_tree_item) 82 | self.tree.doubleClicked.connect(self.expand_tree_item) 83 | self.tree.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu) 84 | self.tree.customContextMenuRequested.connect(self.show_tree_context_menu) 85 | 86 | root_index = self.tree.model().index(0, 0) 87 | self.tree.setCurrentIndex(root_index) 88 | 89 | def create_file_table(self): 90 | ft = QTableView() 91 | ft.verticalHeader().setVisible(False) 92 | ft.verticalHeader().setDefaultSectionSize(20) 93 | ft.setShowGrid(False) 94 | ft.setSelectionMode(QAbstractItemView.SelectionMode.SingleSelection) 95 | ft.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows) 96 | ft.setEditTriggers(QAbstractItemView.EditTrigger.NoEditTriggers) 97 | model = QStandardItemModel() 98 | model.setHorizontalHeaderLabels(['Name', 'Size']) 99 | ft.setModel(model) 100 | ft.selectionModel().selectionChanged.connect(self.show_selected_file) 101 | ft.horizontalHeader().setSectionResizeMode(0, QHeaderView.ResizeMode.Stretch) 102 | ft.horizontalHeader().setStyleSheet( 103 | "QHeaderView::section {font-weight: normal; text-align: left;}" 104 | ) 105 | ft.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu) 106 | ft.customContextMenuRequested.connect(self.show_file_table_context_menu) 107 | return ft 108 | 109 | def fill_tree(self): 110 | root_item = TreeItem(self.file_reader) 111 | dinfo: BarecatDirInfo = self.file_reader.index.lookup_dir('') 112 | item = TreeItem( 113 | self.file_reader, 114 | path='', 115 | size=dinfo.size_tree, 116 | count=dinfo.num_files_tree, 117 | has_subdirs=dinfo.num_subdirs > 0, 118 | parent=root_item, 119 | ) 120 | root_item.children.append(item) 121 | self.model = LazyItemModel(root_item) 122 | self.tree.setModel(self.model) 123 | 124 | root_index = self.tree.model().index(0, 0) 125 | self.tree.expand(root_index) # Expand the root item by default 126 | self.tree.setColumnWidth(0, 400) 127 | self.tree.setColumnWidth(1, 70) 128 | self.tree.setColumnWidth(2, 70) 129 | 130 | @pyqtSlot(QModelIndex) 131 | def expand_tree_item(self, index): 132 | if self.tree.isExpanded(index): 133 | self.tree.collapse(index) 134 | else: 135 | self.tree.expand(index) 136 | 137 | def update_file_table(self, selected, deselected): 138 | indexes = selected.indexes() 139 | if not indexes: 140 | return 141 | 142 | index = indexes[0] # Get the first selected index 143 | item = index.internalPointer() 144 | 145 | model = self.file_table.model() 146 | model.removeRows(0, model.rowCount()) 147 | finfos: List[BarecatFileInfo] = self.file_reader.index.list_direct_fileinfos(item.path) 148 | finfos = sorted(finfos, key=lambda x: natural_sort_key(x.path)) 149 | for finfo in finfos: 150 | file_item = QStandardItem(osp.basename(finfo.path)) 151 | file_item.setData(finfo, Qt.ItemDataRole.UserRole) # Store the fileinfo as user data 152 | model.appendRow([file_item, 
    def show_selected_file(self, selected, deselected):
        indexes = selected.indexes()
        if not indexes:
            return
        finfo = self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
        self.show_file(finfo)

    def show_file(self, finfo):
        content = self.file_reader.read(finfo)
        extension = osp.splitext(finfo.path)[1].lower()
        if extension in ('.jpg', '.jpeg', '.png', '.gif', '.bmp'):
            byte_array = QByteArray(content)
            buffer = QBuffer(byte_array)
            imageReader = QImageReader()
            imageReader.setDecideFormatFromContent(True)
            imageReader.setQuality(100)
            imageReader.setDevice(buffer)
            qim = imageReader.read()

            if not qim.isNull():
                pixmap = QPixmap.fromImage(qim)
                self.content_viewer.setPixmap(pixmap)
        elif extension == '.msgpack':
            data = msgpack_numpy.unpackb(content)
            self.content_viewer.setText(data)
        else:
            self.content_viewer.setText(repr(content))

    def update_image_label(self, pixmap):
        self.content_viewer.setPixmap(pixmap)

    def show_file_table_context_menu(self, position):
        menu = QMenu()
        extract_action = menu.addAction("Extract file...")
        copy_path_action = menu.addAction("Copy path")

        action = menu.exec(self.file_table.viewport().mapToGlobal(position))

        if action == extract_action:
            indexes = self.file_table.selectionModel().selectedRows()
            if indexes:
                finfo = (
                    self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
                )
                default_filename = osp.basename(finfo.path)
                target_filename, _ = QFileDialog.getSaveFileName(
                    self, "Select Target File", default_filename
                )
                if target_filename:
                    self.extract_file(finfo.path, target_filename)
        elif action == copy_path_action:
            indexes = self.file_table.selectionModel().selectedRows()
            if indexes:
                finfo = (
                    self.file_table.model().item(indexes[0].row(), 0).data(Qt.ItemDataRole.UserRole)
                )
                clipboard = QApplication.clipboard()
                clipboard.setText(finfo.path)

    def show_tree_context_menu(self, position):
        menu = QMenu()
        extract_action = menu.addAction("Extract directory...")
        copy_path_action = menu.addAction("Copy path")

        action = menu.exec(self.tree.viewport().mapToGlobal(position))
        if action == extract_action:
            index = self.tree.indexAt(position)
            if index.isValid():
                if target_directory := QFileDialog.getExistingDirectory(
                    self, "Select Target Directory"
                ):
                    self.extract_directory(index.internalPointer().path, target_directory)
        elif action == copy_path_action:
            index = self.tree.indexAt(position)
            if index.isValid():
                clipboard = QApplication.clipboard()
                clipboard.setText(index.internalPointer().path)
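
    # Extraction streams archive contents with shutil.copyfileobj, so files
    # are copied out in chunks rather than loaded into memory whole.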
    def extract_file(self, path_of_what_to_extract, target_filename):
        with self.file_reader.open(path_of_what_to_extract) as src:
            with open(target_filename, 'wb') as f:
                shutil.copyfileobj(src, f)

    def extract_directory(self, dir_in_archive, target_directory):
        basename = osp.basename(dir_in_archive)
        for dinfo, _, finfos in self.file_reader.index.walk_infos(dir_in_archive):
            for finfo in finfos:
                target_path = osp.join(
                    target_directory, basename, osp.relpath(finfo.path, dir_in_archive)
                )
                os.makedirs(osp.dirname(target_path), exist_ok=True)
                with self.file_reader.open(finfo.path) as src:
                    with open(target_path, 'wb') as f:
                        shutil.copyfileobj(src, f)


class ContentViewer(QWidget):
    def __init__(self):
        super().__init__()
        self.label = QLabel()
        self.originalPixmap = None
        self.originalText = None  # Holds the original (unformatted) text content
        self.scrollArea = QScrollArea(self)
        self.scrollArea.setWidgetResizable(True)
        self.scrollArea.setWidget(self.label)
        layout = QVBoxLayout(self)
        layout.addWidget(self.scrollArea)

        self.label.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.label.customContextMenuRequested.connect(self.show_context_menu)

    def setPixmap(self, pixmap):
        self.originalPixmap = pixmap
        self.originalText = None  # Reset the original text
        self.updateImage()

    def setText(self, original_data):
        self.originalText = original_data  # Store the original data
        self.originalPixmap = None  # Reset the pixmap
        self.updateText()

    def updateImage(self):
        if self.originalPixmap:
            availableSize = self.scrollArea.size()
            if (
                self.originalPixmap.width() > availableSize.width()
                or self.originalPixmap.height() > availableSize.height()
            ):
                scaledPixmap = self.originalPixmap.scaled(
                    availableSize,
                    Qt.AspectRatioMode.KeepAspectRatio,
                    Qt.TransformationMode.SmoothTransformation,
                )
            else:
                scaledPixmap = self.originalPixmap
            self.label.setPixmap(scaledPixmap)
            self.label.setAlignment(Qt.AlignmentFlag.AlignCenter)

    def updateText(self):
        if self.originalText:
            # Calculate the maximum line width in characters
            width_pixels = self.scrollArea.width()
            fm = QFontMetrics(self.label.font())
            average_char_width_pixels = fm.averageCharWidth()
            max_line_width = width_pixels // average_char_width_pixels

            # Pretty-print the text
            pp = pprint.PrettyPrinter(
                indent=2, width=max_line_width, compact=True, sort_dicts=False
            )
            formatted_text = pp.pformat(self.originalText)
            self.label.setText(formatted_text)
            self.label.setAlignment(Qt.AlignmentFlag.AlignLeft | Qt.AlignmentFlag.AlignTop)

    def resizeEvent(self, event):
        if self.originalPixmap:
            self.updateImage()
        elif self.originalText:
            self.updateText()
        super().resizeEvent(event)

    def show_context_menu(self, position):
        menu = QMenu()
        copy_image_action = menu.addAction("Copy image")

        # The signal comes from the label, so the position is in label coordinates.
        action = menu.exec(self.label.mapToGlobal(position))

        if action == copy_image_action and self.originalPixmap:
            clipboard = QApplication.clipboard()
            mime_data = QMimeData()
            mime_data.setImageData(self.originalPixmap.toImage())
            clipboard.setMimeData(mime_data, QClipboard.Mode.Clipboard)
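

# LazyItemModel populates the directory tree lazily, following Qt's
# fetch-on-demand protocol: when a node is expanded, the view first asks
# canFetchMore(), and if that returns True it calls fetchMore(), which loads
# the node's subdirectory infos from the barecat index.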
class LazyItemModel(QStandardItemModel):
    def __init__(self, root):
        super().__init__()
        self.root = root

    def index(self, row, column, parent=QModelIndex()):
        if not self.hasIndex(row, column, parent):
            return QModelIndex()
        parent_item = self.root if not parent.isValid() else parent.internalPointer()
        return (
            self.createIndex(row, column, parent_item.children[row])
            if row < len(parent_item.children)
            else QModelIndex()
        )

    def parent(self, index):
        if not index.isValid():
            return QModelIndex()
        parent_item = index.internalPointer().parent
        return self.createIndex(parent_item.row, 0, parent_item) if parent_item else QModelIndex()

    def rowCount(self, parent=QModelIndex()):
        parent_item = self.root if not parent.isValid() else parent.internalPointer()
        return len(parent_item.children)

    def columnCount(self, parent=QModelIndex()):
        return 3  # Name, Size, Count

    def headerData(self, section, orientation, role):
        if role == Qt.ItemDataRole.DisplayRole and orientation == Qt.Orientation.Horizontal:
            return ["Name", "Size", "Count"][section]
        return None

    def data(self, index, role):
        item = index.internalPointer()
        if role == Qt.ItemDataRole.DisplayRole:
            if index.column() == 0:
                if item.parent == self.root:
                    return '[root]'
                return osp.basename(item.path)
            elif index.column() == 1:
                return format_size(item.size)
            elif index.column() == 2:
                return format_count(item.count)
        elif role == Qt.ItemDataRole.TextAlignmentRole:
            if index.column() in [1, 2]:
                return Qt.AlignmentFlag.AlignRight
        return None

    def canFetchMore(self, index):
        if not index.isValid():
            return False
        return not index.internalPointer().fetched

    def fetchMore(self, index):
        item = index.internalPointer()
        if item == self.root:
            return
        item.fetch_more()
        if item.children:
            # Announce the newly loaded children to attached views.
            self.beginInsertRows(index, 0, len(item.children) - 1)
            self.endInsertRows()

    def hasChildren(self, index=QModelIndex()):
        if not index.isValid():
            return True
        return index.internalPointer().has_subdirs


class TreeItem:
    def __init__(self, file_reader, path='', size=0, count=0, has_subdirs=True, parent=None):
        self.file_reader = file_reader

        self.path = path
        self.parent = parent
        self.children = []

        self.size = size
        self.count = count
        self.has_subdirs = has_subdirs
        self.fetched = False

    def fetch_more(self):
        if self.fetched:
            return
        subdir_infos = self.file_reader.index.list_subdir_dirinfos(self.path)
        subdir_infos = sorted(subdir_infos, key=lambda x: natural_sort_key(x.path))
        for dinfo in subdir_infos:
            self.children.append(
                TreeItem(
                    self.file_reader,
                    path=dinfo.path,
                    size=dinfo.size_tree,
                    count=dinfo.num_files_tree,
                    has_subdirs=dinfo.num_subdirs > 0,
                    parent=self,
                )
            )

        self.fetched = True

    @property
    def row(self):
        return self.parent.children.index(self) if self.parent else 0


def format_size(size):
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    index = 0
    while size >= 1024 and index < len(units) - 1:
        index += 1
        size /= 1024
    return f'{size:.2f} {units[index]}'
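

# For example:
#   format_size(1536)       -> '1.50 KB'
#   format_count(2_500_000) -> '2.5 M'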
def format_count(count):
    units = ['', ' K', ' M', ' B']
    unit_index = 0
    while count >= 1000 and unit_index < len(units) - 1:
        count /= 1000
        unit_index += 1
    if unit_index == 0:
        return str(count)
    return f'{count:.1f}{units[unit_index]}'


def natural_sort_key(s):
    """Normal string sort puts '10' before '2'. Natural sort puts '2' before '10'."""
    return [float(t) if t.isdigit() else t for t in re.split('([0-9]+)', s)]


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------