├── .cruft.json ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── benchmark.yml │ ├── release.yml │ ├── tests.yml │ └── update-template.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.rst ├── CONTRIBUTING.rst ├── LICENSE ├── README.rst ├── noxfile.py ├── pyproject.toml ├── src └── dvc_data │ ├── __init__.py │ ├── __main__.py │ ├── callbacks.py │ ├── cli.py │ ├── compat.py │ ├── fs.py │ ├── fsutils.py │ ├── hashfile │ ├── __init__.py │ ├── _ignore.py │ ├── _progress.py │ ├── build.py │ ├── cache.py │ ├── checkout.py │ ├── db │ │ ├── __init__.py │ │ ├── index.py │ │ ├── local.py │ │ ├── migrate.py │ │ └── reference.py │ ├── diff.py │ ├── gc.py │ ├── hash.py │ ├── hash_info.py │ ├── istextfile.py │ ├── meta.py │ ├── obj.py │ ├── state.py │ ├── status.py │ ├── transfer.py │ ├── tree.py │ └── utils.py │ ├── index │ ├── __init__.py │ ├── add.py │ ├── build.py │ ├── checkout.py │ ├── collect.py │ ├── diff.py │ ├── fetch.py │ ├── index.py │ ├── push.py │ ├── save.py │ ├── serialize.py │ ├── update.py │ └── view.py │ ├── json_compat.py │ ├── py.typed │ └── repo.py └── tests ├── __init__.py ├── benchmarks ├── __init__.py └── test_checkout.py ├── conftest.py ├── hashfile ├── __init__.py ├── test_build.py ├── test_cache.py ├── test_checkout.py ├── test_db.py ├── test_db_index.py ├── test_diff.py ├── test_hash.py ├── test_hash_stream.py ├── test_istextfile.py ├── test_obj.py ├── test_state.py ├── test_tree.py └── test_utils.py └── index ├── __init__.py ├── conftest.py ├── test_build.py ├── test_checkout.py ├── test_diff.py ├── test_fs.py ├── test_index.py └── test_storage.py /.cruft.json: -------------------------------------------------------------------------------- 1 | { 2 | "template": "https://github.com/iterative/py-template", 3 | "commit": "15ee26df315020399731c6291d61bef81a3fc5d3", 4 | "checkout": null, 5 | "context": { 6 | "cookiecutter": { 7 | "project_name": "dvc-data", 8 | "package_name": "dvc_data", 9 | "friendly_name": "DVC data", 10 | "author": "Iterative", 11 | "email": "support@dvc.org", 12 | "github_user": "iterative", 13 | "version": "0.0.0", 14 | "copyright_year": "2022", 15 | "license": "Apache-2.0", 16 | "docs": "False", 17 | "short_description": "dvc data", 18 | "development_status": "Development Status :: 4 - Beta", 19 | "_template": "https://github.com/iterative/py-template" 20 | } 21 | }, 22 | "directory": null, 23 | "skip": [ 24 | ".git" 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - directory: "/" 5 | package-ecosystem: "pip" 6 | schedule: 7 | interval: "weekly" 8 | labels: 9 | - "maintenance" 10 | 11 | - directory: "/" 12 | package-ecosystem: "github-actions" 13 | schedule: 14 | interval: "weekly" 15 | labels: 16 | - "maintenance" 17 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Benchmark 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | workflow_dispatch: 7 | 8 | env: 9 | FORCE_COLOR: "1" 10 | PY_COLORS: "1" 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ 
github.head_ref || github.run_id }} 14 | cancel-in-progress: true 15 | 16 | permissions: {} 17 | 18 | jobs: 19 | benchmark: 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | os: [ubuntu-latest, macos-latest] 25 | steps: 26 | - name: Set up Python 3.12 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: '3.12' 30 | 31 | - uses: actions/checkout@v4 32 | with: 33 | ref: ${{ github.event.pull_request.base.sha }} 34 | fetch-depth: 0 35 | 36 | - uses: astral-sh/setup-uv@v6 37 | - name: Install nox 38 | run: uv pip install --system nox --upgrade 39 | 40 | - name: Benchmark on base branch 41 | run: nox -s bench -- --benchmark-save=base 42 | 43 | - uses: actions/checkout@v4 44 | with: 45 | fetch-depth: 0 46 | clean: false 47 | 48 | - name: Benchmark on pull request 49 | run: nox -s bench -- --benchmark-save=${GITHUB_SHA::7} --benchmark-compare=0001 --benchmark-compare-fail=mean:10% 50 | 51 | - name: Compare benchmark 52 | if: always() 53 | run: uvx pytest-benchmark compare --group-by name 54 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | env: 9 | FORCE_COLOR: "1" 10 | 11 | jobs: 12 | release: 13 | environment: pypi 14 | permissions: 15 | contents: read 16 | id-token: write 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Check out the repository 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Set up Python 3.12 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: '3.12' 28 | 29 | - uses: astral-sh/setup-uv@v6 30 | - name: Install nox 31 | run: uv pip install --system nox --upgrade 32 | 33 | - name: Build package 34 | run: nox -s build 35 | 36 | - name: Upload package 37 | if: github.event_name == 'release' 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | workflow_dispatch: 8 | 9 | env: 10 | FORCE_COLOR: "1" 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | tests: 18 | timeout-minutes: 30 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [ubuntu-latest, windows-latest, macos-latest] 24 | pyv: ['3.9', '3.10', '3.11', '3.12', '3.13'] 25 | include: 26 | - {os: ubuntu-latest, pyv: 'pypy3.9'} 27 | 28 | steps: 29 | - name: Check out the repository 30 | uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 0 33 | 34 | - name: Set up Python ${{ matrix.pyv }} 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.pyv }} 38 | allow-prereleases: true 39 | 40 | - uses: astral-sh/setup-uv@v6 41 | with: 42 | enable-cache: true 43 | cache-suffix: ${{ matrix.pyv }} 44 | cache-dependency-glob: pyproject.toml 45 | - name: Install nox 46 | run: uv pip install --system nox --upgrade 47 | 48 | - name: Cache pre-commit hooks 49 | uses: actions/cache@v4 50 | with: 51 | path: ~/.cache/pre-commit 52 | key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }} 53 | 54 | - name: Lint code 55 | run: nox -s lint 56 | 57 | - 
name: Run tests 58 | run: nox -s tests-${{ matrix.nox_pyv || matrix.pyv }} -- --cov-report=xml 59 | 60 | - name: Upload coverage report 61 | uses: codecov/codecov-action@v5 62 | 63 | - name: Build package 64 | run: nox -s build 65 | -------------------------------------------------------------------------------- /.github/workflows/update-template.yaml: -------------------------------------------------------------------------------- 1 | name: Update template 2 | 3 | on: 4 | schedule: 5 | - cron: '5 1 * * *' # every day at 01:05 6 | workflow_dispatch: 7 | 8 | jobs: 9 | update: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out the repository 13 | uses: actions/checkout@v4 14 | 15 | - name: Update template 16 | uses: iterative/py-template@main 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | .benchmarks/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # vim 142 | *.swp 143 | .dvc/ 144 | 145 | .DS_Store 146 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v5.0.0 6 | hooks: 7 | - id: check-added-large-files 8 | - id: check-case-conflict 9 | - id: check-docstring-first 10 | - id: check-executables-have-shebangs 11 | - id: check-json 12 | - id: check-merge-conflict 13 | args: ['--assume-in-merge'] 14 | - id: check-toml 15 | - id: check-yaml 16 | - id: debug-statements 17 | - id: end-of-file-fixer 18 | - id: mixed-line-ending 19 | args: ['--fix=lf'] 20 | - id: sort-simple-yaml 21 | - id: trailing-whitespace 22 | - repo: https://github.com/astral-sh/ruff-pre-commit 23 | rev: 'v0.11.13' 24 | hooks: 25 | - id: ruff 26 | args: [--fix, --exit-non-zero-on-fix] 27 | - id: ruff-format 28 | - repo: https://github.com/codespell-project/codespell 29 | rev: v2.4.1 30 | hooks: 31 | - id: codespell 32 | additional_dependencies: ["tomli"] 33 | - repo: https://github.com/asottile/pyupgrade 34 | rev: v3.20.0 35 | hooks: 36 | - id: pyupgrade 37 | args: [--py38-plus] 38 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.rst: -------------------------------------------------------------------------------- 1 | Contributor Covenant Code of Conduct 2 | ==================================== 3 | 4 | Our Pledge 5 | ---------- 6 | 7 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 8 | 9 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
10 | 11 | 12 | Our Standards 13 | ------------- 14 | 15 | Examples of behavior that contributes to a positive environment for our community include: 16 | 17 | - Demonstrating empathy and kindness toward other people 18 | - Being respectful of differing opinions, viewpoints, and experiences 19 | - Giving and gracefully accepting constructive feedback 20 | - Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 21 | - Focusing on what is best not just for us as individuals, but for the overall community 22 | 23 | Examples of unacceptable behavior include: 24 | 25 | - The use of sexualized language or imagery, and sexual attention or 26 | advances of any kind 27 | - Trolling, insulting or derogatory comments, and personal or political attacks 28 | - Public or private harassment 29 | - Publishing others' private information, such as a physical or email 30 | address, without their explicit permission 31 | - Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | Enforcement Responsibilities 35 | ---------------------------- 36 | 37 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 38 | 39 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 40 | 41 | 42 | Scope 43 | ----- 44 | 45 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 46 | 47 | 48 | Enforcement 49 | ----------- 50 | 51 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at support@dvc.org. All complaints will be reviewed and investigated promptly and fairly. 52 | 53 | All community leaders are obligated to respect the privacy and security of the reporter of any incident. 54 | 55 | 56 | Enforcement Guidelines 57 | ---------------------- 58 | 59 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: 60 | 61 | 62 | 1. Correction 63 | ~~~~~~~~~~~~~ 64 | 65 | **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. 66 | 67 | **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. 68 | 69 | 70 | 2. Warning 71 | ~~~~~~~~~~ 72 | 73 | **Community Impact**: A violation through a single incident or series of actions. 74 | 75 | **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. 
This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. 76 | 77 | 78 | 3. Temporary Ban 79 | ~~~~~~~~~~~~~~~~ 80 | 81 | **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. 82 | 83 | **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. 84 | 85 | 86 | 4. Permanent Ban 87 | ~~~~~~~~~~~~~~~~ 88 | 89 | **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 90 | 91 | **Consequence**: A permanent ban from any sort of public interaction within the community. 92 | 93 | 94 | Attribution 95 | ----------- 96 | 97 | This Code of Conduct is adapted from the `Contributor Covenant `__, version 2.0, 98 | available at https://www.contributor-covenant.org/version/2/0/code_of_conduct/. 99 | 100 | Community Impact Guidelines were inspired by `Mozilla’s code of conduct enforcement ladder `__. 101 | 102 | .. _homepage: https://www.contributor-covenant.org 103 | 104 | For answers to common questions about this code of conduct, see the FAQ at 105 | https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributor Guide 2 | ================= 3 | 4 | Thank you for your interest in improving this project. 5 | This project is open-source under the `Apache 2.0 license`_ and 6 | welcomes contributions in the form of bug reports, feature requests, and pull requests. 7 | 8 | Here is a list of important resources for contributors: 9 | 10 | - `Source Code`_ 11 | - `Issue Tracker`_ 12 | - `Code of Conduct`_ 13 | 14 | .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0 15 | .. _Source Code: https://github.com/iterative/dvc-data 16 | .. _Issue Tracker: https://github.com/iterative/dvc-data/issues 17 | 18 | How to report a bug 19 | ------------------- 20 | 21 | Report bugs on the `Issue Tracker`_. 22 | 23 | When filing an issue, make sure to answer these questions: 24 | 25 | - Which operating system and Python version are you using? 26 | - Which version of this project are you using? 27 | - What did you do? 28 | - What did you expect to see? 29 | - What did you see instead? 30 | 31 | The best way to get your bug fixed is to provide a test case, 32 | and/or steps to reproduce the issue. 33 | 34 | 35 | How to request a feature 36 | ------------------------ 37 | 38 | Request features on the `Issue Tracker`_. 39 | 40 | 41 | How to set up your development environment 42 | ------------------------------------------ 43 | 44 | You need Python 3.8+ and the following tools: 45 | 46 | - Nox_ 47 | 48 | Install the package with development requirements: 49 | 50 | .. code:: console 51 | 52 | $ pip install nox 53 | 54 | .. 
_Nox: https://nox.thea.codes/ 55 | 56 | 57 | How to test the project 58 | ----------------------- 59 | 60 | Run the full test suite: 61 | 62 | .. code:: console 63 | 64 | $ nox 65 | 66 | List the available Nox sessions: 67 | 68 | .. code:: console 69 | 70 | $ nox --list-sessions 71 | 72 | You can also run a specific Nox session. 73 | For example, invoke the unit test suite like this: 74 | 75 | .. code:: console 76 | 77 | $ nox --session=tests 78 | 79 | Unit tests are located in the ``tests`` directory, 80 | and are written using the pytest_ testing framework. 81 | 82 | .. _pytest: https://pytest.readthedocs.io/ 83 | 84 | 85 | How to submit changes 86 | --------------------- 87 | 88 | Open a `pull request`_ to submit changes to this project. 89 | 90 | Your pull request needs to meet the following guidelines for acceptance: 91 | 92 | - The Nox test suite must pass without errors and warnings. 93 | - Include unit tests. This project maintains 100% code coverage. 94 | - If your changes add functionality, update the documentation accordingly. 95 | 96 | Feel free to submit early, though—we can always iterate on this. 97 | 98 | To run linting and code formatting checks, you can invoke a `lint` session in nox: 99 | 100 | .. code:: console 101 | 102 | $ nox -s lint 103 | 104 | It is recommended to open an issue before starting work on anything. 105 | This will allow a chance to talk it over with the owners and validate your approach. 106 | 107 | .. _pull request: https://github.com/iterative/dvc-data/pulls 108 | .. github-only 109 | .. _Code of Conduct: CODE_OF_CONDUCT.rst 110 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DVC data 2 | ======== 3 | 4 | |PyPI| |Status| |Python Version| |License| 5 | 6 | |Tests| |Codecov| |pre-commit| |Black| 7 | 8 | .. |PyPI| image:: https://img.shields.io/pypi/v/dvc-data.svg 9 | :target: https://pypi.org/project/dvc-data/ 10 | :alt: PyPI 11 | .. |Status| image:: https://img.shields.io/pypi/status/dvc-data.svg 12 | :target: https://pypi.org/project/dvc-data/ 13 | :alt: Status 14 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/dvc-data 15 | :target: https://pypi.org/project/dvc-data 16 | :alt: Python Version 17 | .. |License| image:: https://img.shields.io/pypi/l/dvc-data 18 | :target: https://opensource.org/licenses/Apache-2.0 19 | :alt: License 20 | .. |Tests| image:: https://github.com/iterative/dvc-data/workflows/Tests/badge.svg 21 | :target: https://github.com/iterative/dvc-data/actions?workflow=Tests 22 | :alt: Tests 23 | .. |Codecov| image:: https://codecov.io/gh/iterative/dvc-data/branch/main/graph/badge.svg 24 | :target: https://app.codecov.io/gh/iterative/dvc-data 25 | :alt: Codecov 26 | .. |pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white 27 | :target: https://github.com/pre-commit/pre-commit 28 | :alt: pre-commit 29 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 30 | :target: https://github.com/psf/black 31 | :alt: Black 32 | 33 | 34 | Features 35 | -------- 36 | 37 | * TODO 38 | 39 | 40 | Requirements 41 | ------------ 42 | 43 | * TODO 44 | 45 | 46 | Installation 47 | ------------ 48 | 49 | You can install *DVC data* via pip_ from PyPI_: 50 | 51 | .. 
code:: console
52 | 
53 |    $ pip install dvc-data
54 | 
55 | 
56 | Usage
57 | -----
58 | 
59 | HashFile
60 | ^^^^^^^^
61 | 
62 | HashFile
63 | """"""""
64 | 
65 | Based on dvc-objects' `Object`, this is an object that has a particular hash that can be used to verify its contents. Similar to git's `ShaFile`.
66 | 
67 | .. code:: python
68 | 
69 |    from dvc_data.hashfile.obj import HashFile
70 | 
71 |    obj = HashFile("/path/to/file", fs, HashInfo("md5", "36eba1e1e343279857ea7f69a597324e"))
72 | 
73 | HashFileDB
74 | """"""""""
75 | 
76 | Based on dvc-objects' `ObjectDB`, but stores `HashFile` objects and so is able to verify their contents by their `hash_info`. Similar to git's `ObjectStore`. A short end-to-end example is included at the bottom of this README.
77 | 
78 | .. code:: python
79 | 
80 |    from dvc_data.hashfile.db import HashFileDB
81 | 
82 |    odb = HashFileDB(fs, "/path/to/odb")
83 | 
84 | Index
85 | ^^^^^
86 | 
87 | Index
88 | """""
89 | 
90 | A trie-like structure that represents data files and directories.
91 | 
92 | .. code:: python
93 | 
94 |    from dvc_data.index import DataIndex, DataIndexEntry
95 | 
96 |    index = DataIndex()
97 |    index[("foo",)] = DataIndexEntry(hash_info=hash_info, meta=meta)
98 | 
99 | 
100 | Storage
101 | """""""
102 | 
103 | A mapping that describes where to find data contents for index entries. Can be either `ObjectStorage` for `HashFileDB`-based storage or `FileStorage` for backup-like plain file storage.
104 | 
105 | .. code:: python
106 | 
107 |    index.storage_map[("foo",)] = ObjectStorage(...)
108 | 
109 | Contributing
110 | ------------
111 | 
112 | Contributions are very welcome.
113 | To learn more, see the `Contributor Guide`_.
114 | 
115 | 
116 | License
117 | -------
118 | 
119 | Distributed under the terms of the `Apache 2.0 license`_,
120 | *DVC data* is free and open source software.
121 | 
122 | 
123 | Issues
124 | ------
125 | 
126 | If you encounter any problems,
127 | please `file an issue`_ along with a detailed description.
128 | 
129 | 
130 | .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
131 | .. _PyPI: https://pypi.org/
132 | .. _file an issue: https://github.com/iterative/dvc-data/issues
133 | .. _pip: https://pip.pypa.io/
134 | .. github-only
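
Putting these APIs together, the following is a minimal sketch (illustrative only, with `/path/to/file` and `/path/to/odb` as placeholders): it hashes a local file, stores it in a `HashFileDB`, then verifies and loads it back. It assumes the optional `state` argument of `hash_file` can be left at its default.

.. code:: python

   from dvc_objects.fs.local import LocalFileSystem

   from dvc_data.hashfile.db import HashFileDB
   from dvc_data.hashfile.hash import hash_file

   fs = LocalFileSystem()
   odb = HashFileDB(fs, "/path/to/odb")

   # hash the file and register the result in the ODB under its oid
   meta, hash_info = hash_file("/path/to/file", fs, "md5")
   odb.add("/path/to/file", fs, hash_info.value)

   # verify the cached copy and load it back as a HashFile
   odb.check(hash_info.value, check_hash=True)
   obj = odb.get(hash_info.value)

135 | ..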
_Contributor Guide: CONTRIBUTING.rst 136 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Automation using nox.""" 2 | 3 | import glob 4 | import os 5 | 6 | import nox 7 | 8 | nox.options.default_venv_backend = "uv|virtualenv" 9 | nox.options.reuse_existing_virtualenvs = True 10 | nox.options.sessions = "lint", "tests" 11 | 12 | 13 | @nox.session( 14 | python=["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.8", "pypy3.9"] 15 | ) 16 | def tests(session: nox.Session) -> None: 17 | session.install(".[tests,cli]") 18 | session.run( 19 | "pytest", 20 | "--cov", 21 | "--cov-config=pyproject.toml", 22 | *session.posargs, 23 | env={"COVERAGE_FILE": f".coverage.{session.python}"}, 24 | ) 25 | 26 | 27 | @nox.session 28 | def bench(session: nox.Session) -> None: 29 | session.install(".[tests,cli]") 30 | storage = os.getenv("PYTEST_BENCHMARK_STORAGE", "file://.benchmarks") 31 | session.run( 32 | "pytest", 33 | "--benchmark-storage", 34 | storage, 35 | "--benchmark-only", 36 | *session.posargs, 37 | ) 38 | 39 | 40 | @nox.session 41 | def lint(session: nox.Session) -> None: 42 | session.install("pre-commit") 43 | session.install("-e", ".[dev]") 44 | 45 | args = *(session.posargs or ("--show-diff-on-failure",)), "--all-files" 46 | session.run("pre-commit", "run", *args) 47 | session.run("python", "-m", "mypy") 48 | 49 | 50 | @nox.session 51 | def build(session: nox.Session) -> None: 52 | session.install("twine", "uv") 53 | session.run("uv", "build") 54 | dists = glob.glob("dist/*") 55 | session.run("twine", "check", *dists, silent=True) 56 | 57 | 58 | @nox.session 59 | def dev(session: nox.Session) -> None: 60 | """Sets up a python development environment for the project.""" 61 | args = session.posargs or ("venv",) 62 | venv_dir = os.fsdecode(os.path.abspath(args[0])) 63 | 64 | session.log(f"Setting up virtual environment in {venv_dir}") 65 | session.install("virtualenv") 66 | session.run("virtualenv", venv_dir, silent=True) 67 | 68 | python = os.path.join(venv_dir, "bin/python") 69 | session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True) 70 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=48", "setuptools_scm[toml]>=6.3.1"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | 7 | [project] 8 | name = "dvc-data" 9 | description = "DVC's data management subsystem" 10 | readme = "README.rst" 11 | license = {text = "Apache-2.0"} 12 | authors = [{ name = "Iterative", email = "support@dvc.org" }] 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Programming Language :: Python :: 3.13", 20 | "Development Status :: 4 - Beta", 21 | ] 22 | requires-python = ">=3.9" 23 | dynamic = ["version"] 24 | dependencies = [ 25 | "attrs>=21.3.0", 26 | "dictdiffer>=0.8.1", 27 | "diskcache>=5.2.1", 28 | "dvc-objects>=4.0.1,<6", 29 | "fsspec>=2024.2.0", 30 | "funcy>=1.14; python_version < '3.12'", 31 | "pygtrie>=2.3.2", 32 | "sqltrie>=0.11.0,<1", 33 | "tqdm>=4.63.1,<5", 34 | "orjson>=3,<4; implementation_name=='cpython'", 35 | ] 36 | 37 | [project.urls] 38 | Issues 
= "https://github.com/iterative/dvc-data/issues" 39 | Source = "https://github.com/iterative/dvc-data" 40 | 41 | [project.optional-dependencies] 42 | cli = [ 43 | "typer-slim>=0.12", 44 | ] 45 | all = [ 46 | "dvc-data[cli]", 47 | ] 48 | tests = [ 49 | "pytest>=7,<9", 50 | "pytest-sugar", 51 | "pytest-cov>=4.1.0", 52 | "pytest-mock", 53 | "pytest-benchmark>=4", 54 | "pytest-servers==0.5.10", 55 | ] 56 | dev = [ 57 | "dvc-data[all]", 58 | "dvc-data[tests]", 59 | "blake3>=0.3.1", 60 | "mypy==1.15.0", 61 | "types-tqdm", 62 | ] 63 | 64 | [project.scripts] 65 | dvc-data = "dvc_data.__main__:main" 66 | 67 | [tool.setuptools.package-data] 68 | dvc_data = ["py.typed"] 69 | 70 | [tool.setuptools.packages.find] 71 | where = ["src"] 72 | namespaces = false 73 | 74 | [tool.pytest.ini_options] 75 | addopts = "-ra --benchmark-skip" 76 | filterwarnings = [ 77 | "error", 78 | "ignore:datetime.datetime.*:DeprecationWarning", 79 | ] 80 | 81 | [tool.coverage.run] 82 | branch = true 83 | source = ["dvc_data", "tests"] 84 | 85 | [tool.coverage.paths] 86 | source = ["src", "*/site-packages"] 87 | 88 | [tool.coverage.report] 89 | show_missing = true 90 | exclude_lines = [ 91 | "pragma: no cover", 92 | "if __name__ == .__main__.:", 93 | "if typing.TYPE_CHECKING:", 94 | "if TYPE_CHECKING:", 95 | "raise NotImplementedError", 96 | "raise AssertionError", 97 | "@overload", 98 | ] 99 | 100 | [tool.mypy] 101 | # Error output 102 | show_column_numbers = true 103 | show_error_codes = true 104 | show_error_context = true 105 | show_traceback = true 106 | pretty = true 107 | check_untyped_defs = true 108 | # Warnings 109 | warn_no_return = true 110 | warn_redundant_casts = true 111 | warn_unreachable = true 112 | strict_equality = true 113 | no_implicit_optional = true 114 | warn_unused_configs = true 115 | files = ["src", "tests"] 116 | 117 | [[tool.mypy.overrides]] 118 | ignore_missing_imports = true 119 | module = [ 120 | "fsspec.*", 121 | "funcy", 122 | "diskcache", 123 | "pygtrie", 124 | "dictdiffer", 125 | "shortuuid.*", 126 | ] 127 | 128 | [tool.codespell] 129 | ignore-words-list = "fo" 130 | skip = "CODE_OF_CONDUCT.rst" 131 | 132 | [tool.ruff] 133 | show-fixes = true 134 | 135 | [tool.ruff.lint] 136 | preview = true 137 | explicit-preview-rules = true 138 | ignore = [ 139 | "A005", # stdlib-module-shadowing 140 | "PLR2004", # magic-value-comparison 141 | "PLW2901", # redefined-loop-name 142 | "RET501", # unnecessary-return-none 143 | "RET502", # implicit-return-value 144 | "RET503", # implicit-return 145 | "S101", # assert 146 | "SIM105", # suppressible-exception 147 | "SIM108", # if-else-block-instead-of-if-exp 148 | "SIM117", # multiple-with-statements 149 | ] 150 | select = [ 151 | "A", # flake8-buitlins 152 | "ASYNC", # flake8-async 153 | "B", # flake8-bugbear 154 | "BLE", # flake8-blind-except 155 | "C4", # flake8-comprehensions 156 | "C90", # mccabe 157 | "DTZ", # flake8-datetimez 158 | "E", # pycodestyle - Error 159 | "EXE", # flake8-executable 160 | "F", # pyflakes 161 | "FLY", # flynt-rules 162 | "G", # flake8-logging-format 163 | "I", # isort 164 | "ICN", # flake8-import-conventions 165 | "INP", # flake8-no-pep420 166 | "ISC", # flake8-implicit-str-concat 167 | "N", # pep8-naming 168 | "PERF101", # perflint 169 | "PGH", # pygrep-hooks 170 | "PIE", # flake8-pie 171 | "PL", # pylint 172 | "PT", # flake8-pytest-style 173 | "PYI", # flake8-pyi 174 | "Q", # flae8-quotes 175 | "RET", # flake8-return 176 | "RSE", # flake8-raise 177 | "RUF", # ruff 178 | "S", # flake8-bandit 179 | "SIM", # flake8-simplify 180 | 
"SLOT", # flake8-slots 181 | "T10", # flake8-debugger 182 | "T20", # flake8-print 183 | "TCH", # flake8-type-checking 184 | "TCH", # flake8-type-checking 185 | "TID", # flake8-tidy-imports 186 | "UP", # pyupgrade 187 | "W", # pycodestyle - Warning 188 | "YTT", # flake8-2020 189 | ] 190 | 191 | [tool.ruff.lint.flake8-pytest-style] 192 | fixture-parentheses = false 193 | mark-parentheses = false 194 | parametrize-names-type = "csv" 195 | 196 | [tool.ruff.lint.flake8-tidy-imports] 197 | [tool.ruff.lint.flake8-tidy-imports.banned-api] 198 | "funcy.cached_property" = {msg = "use `from dvc_data.compat import cached_property` instead."} 199 | "functools.cached_property" = {msg = "use `from dvc_data.compat import cached_property` instead."} 200 | 201 | [tool.ruff.lint.flake8-type-checking] 202 | strict = true 203 | 204 | [tool.ruff.lint.isort] 205 | known-first-party = ["dvc_data"] 206 | 207 | [tool.ruff.lint.pylint] 208 | max-args = 10 209 | 210 | [tool.ruff.lint.per-file-ignores] 211 | "src/dvc_data/cli.py" = ["T201", "B008"] 212 | -------------------------------------------------------------------------------- /src/dvc_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/src/dvc_data/__init__.py -------------------------------------------------------------------------------- /src/dvc_data/__main__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .cli import main 3 | except ImportError: # pragma: no cover 4 | 5 | def main(): # type: ignore[misc] 6 | import sys 7 | 8 | print( # noqa: T201 9 | "dvc-data could not run because the required " 10 | "dependencies are not installed.\n" 11 | "Please install it with: pip install 'dvc-data[cli]'" 12 | ) 13 | sys.exit(1) 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | -------------------------------------------------------------------------------- /src/dvc_data/callbacks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | import sys 5 | from typing import Any, BinaryIO, ClassVar, Optional, Union 6 | 7 | import fsspec 8 | from tqdm import tqdm 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def env2bool(var, undefined=False): 14 | """ 15 | undefined: return value if env var is unset 16 | """ 17 | var = os.getenv(var, None) 18 | if var is None: 19 | return undefined 20 | return bool(re.search("1|y|yes|true", var, flags=re.I)) 21 | 22 | 23 | class Tqdm(tqdm): 24 | """ 25 | maximum-compatibility tqdm-based progressbars 26 | """ 27 | 28 | BAR_FMT_DEFAULT = ( 29 | "{percentage:3.0f}% {desc}|{bar}|" 30 | "{postfix[info]}{n_fmt}/{total_fmt}" 31 | " [{elapsed}<{remaining}, {rate_fmt:>11}]" 32 | ) 33 | # nested bars should have fixed bar widths to align nicely 34 | BAR_FMT_DEFAULT_NESTED = ( 35 | "{percentage:3.0f}%|{bar:10}|{desc:{ncols_desc}.{ncols_desc}}" 36 | "{postfix[info]}{n_fmt}/{total_fmt}" 37 | " [{elapsed}<{remaining}, {rate_fmt:>11}]" 38 | ) 39 | BAR_FMT_NOTOTAL = "{desc}{bar:b}|{postfix[info]}{n_fmt} [{elapsed}, {rate_fmt:>11}]" 40 | BYTES_DEFAULTS: ClassVar[dict[str, Any]] = { 41 | "unit": "B", 42 | "unit_scale": True, 43 | "unit_divisor": 1024, 44 | "miniters": 1, 45 | } 46 | 47 | def __init__( 48 | self, 49 | iterable=None, 50 | disable=None, 51 | level=logging.ERROR, 52 | desc=None, 53 | leave=False, 54 | bar_format=None, 55 | bytes=False, # noqa: A002 
56 |         file=None,
57 |         total=None,
58 |         postfix=None,
59 |         **kwargs,
60 |     ):
61 |         """
62 |         bytes : shortcut for
63 |             `unit='B', unit_scale=True, unit_divisor=1024, miniters=1`
64 |         desc : persists after `close()`
65 |         level : effective logging level for determining `disable`;
66 |             used only if `disable` is unspecified
67 |         disable : If None (default) or False,
68 |             will be determined by logging level.
69 |             May be overridden to `True` due to non-TTY status.
70 |             Skip override by specifying env var `DVC_IGNORE_ISATTY`.
71 |         kwargs : anything accepted by `tqdm.tqdm()`
72 |         """
73 |         kwargs = kwargs.copy()
74 |         if bytes:
75 |             kwargs = {**self.BYTES_DEFAULTS, **kwargs}
76 |         else:
77 |             kwargs.setdefault("unit_scale", total > 999 if total else True)
78 |         if file is None:
79 |             file = sys.stderr
80 |         # auto-disable based on `logger.level`
81 |         if not disable:
82 |             disable = logger.getEffectiveLevel() > level
83 |         # auto-disable based on TTY
84 |         if (
85 |             not disable
86 |             and not env2bool("DVC_IGNORE_ISATTY")
87 |             and hasattr(file, "isatty")
88 |         ):
89 |             disable = not file.isatty()
90 |         super().__init__(
91 |             iterable=iterable,
92 |             disable=disable,
93 |             leave=leave,
94 |             desc=desc,
95 |             bar_format="!",
96 |             lock_args=(False,),
97 |             total=total,
98 |             **kwargs,
99 |         )
100 |         self.postfix = postfix or {"info": ""}
101 |         if bar_format is None:
102 |             if self.__len__():
103 |                 self.bar_format = (
104 |                     self.BAR_FMT_DEFAULT_NESTED if self.pos else self.BAR_FMT_DEFAULT
105 |                 )
106 |             else:
107 |                 self.bar_format = self.BAR_FMT_NOTOTAL
108 |         else:
109 |             self.bar_format = bar_format
110 |         self.refresh()
111 | 
112 |     def close(self):
113 |         self.postfix["info"] = ""
114 |         # remove ETA (either unknown or zero); remove completed bar
115 |         self.bar_format = self.bar_format.replace("<{remaining}", "").replace(
116 |             "|{bar:10}|", " "
117 |         )
118 |         super().close()
119 | 
120 |     @property
121 |     def format_dict(self):
122 |         """inject `ncols_desc` to fill the display width (`ncols`)"""
123 |         d = super().format_dict
124 |         ncols = d["ncols"] or 80
125 |         # assumes `bar_format` has max one of ("ncols_desc" & "ncols_info")
126 | 
127 |         meter = self.format_meter(  # type: ignore[call-arg]
128 |             ncols_desc=1, ncols_info=1, **d
129 |         )
130 |         ncols_left = ncols - len(meter) + 1
131 |         ncols_left = max(ncols_left, 0)
132 |         if ncols_left:
133 |             d["ncols_desc"] = d["ncols_info"] = ncols_left
134 |         else:
135 |             # work-around for zero-width description
136 |             d["ncols_desc"] = d["ncols_info"] = 1
137 |             d["prefix"] = ""
138 |         return d
139 | 
140 | 
141 | class TqdmCallback(fsspec.callbacks.TqdmCallback):
142 |     def __init__(
143 |         self,
144 |         size: Optional[int] = None,
145 |         value: int = 0,
146 |         progress_bar: Optional["tqdm"] = None,
147 |         tqdm_cls: Optional[type["tqdm"]] = None,
148 |         **tqdm_kwargs,
149 |     ):
150 |         tqdm_kwargs.pop("total", None)
151 |         super().__init__(
152 |             tqdm_kwargs=tqdm_kwargs, tqdm_cls=tqdm_cls or Tqdm, size=size, value=value
153 |         )
154 |         if progress_bar is not None:
155 |             self.tqdm = progress_bar
156 | 
157 |     def branched(self, path_1: "Union[str, BinaryIO]", path_2: str, **kwargs):
158 |         desc = path_1 if isinstance(path_1, str) else path_2
159 |         return TqdmCallback(bytes=True, desc=desc)
160 | 
-------------------------------------------------------------------------------- /src/dvc_data/compat.py: --------------------------------------------------------------------------------
1 | import sys
2 | from collections.abc import Iterable, Iterator
3 | from itertools import islice
4 | from typing import TYPE_CHECKING, TypeVar
5 | 
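# `cached_property` is re-exported from here so the rest of the codebase has a
# single import location (see the `banned-api` rules in pyproject.toml): the
# stdlib `functools` version on Python >= 3.12 or while type checking, and
# funcy's implementation otherwise.
# `batched` below mirrors `itertools.batched` from Python 3.12: it yields
# fixed-size tuples with a shorter final chunk, e.g. list(batched("ABCDE", 2))
# gives [("A", "B"), ("C", "D"), ("E",)], and raises ValueError when n < 1.
6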
| if sys.version_info >= (3, 12) or TYPE_CHECKING: 7 | from functools import cached_property # noqa: TID251 8 | else: 9 | from funcy import cached_property # noqa: TID251 10 | 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | def batched(iterable: Iterable[T], n: int) -> Iterator[tuple[T, ...]]: 16 | if n < 1: 17 | raise ValueError("n must be at least one") 18 | it = iter(iterable) 19 | while batch := tuple(islice(it, n)): 20 | yield batch 21 | 22 | 23 | __all__ = ["batched", "cached_property"] 24 | -------------------------------------------------------------------------------- /src/dvc_data/fs.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import errno 3 | import logging 4 | import os 5 | import posixpath 6 | import typing 7 | from collections import deque 8 | from typing import Any, BinaryIO, NamedTuple, Optional 9 | 10 | from fsspec import AbstractFileSystem 11 | from fsspec.callbacks import DEFAULT_CALLBACK, NoOpCallback 12 | 13 | if typing.TYPE_CHECKING: 14 | from dvc_objects.fs.base import AnyFSPath, FileSystem 15 | from fsspec import Callback 16 | 17 | from dvc_data.hashfile.db import HashFileDB 18 | 19 | from .hashfile.hash_info import HashInfo 20 | from .index import DataIndex, DataIndexEntry, ObjectStorage 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class _WrappedCallback(NoOpCallback): 26 | # check `_get_file` for more details 27 | def branched(self, path_1, path_2, **kwargs): 28 | # NOTE: only safe for a single use 29 | return self.kw.get("callback", DEFAULT_CALLBACK) 30 | 31 | 32 | class FileInfo(NamedTuple): 33 | typ: str 34 | storage: "ObjectStorage" 35 | cache_storage: "ObjectStorage" 36 | hash_info: Optional["HashInfo"] 37 | fs: "FileSystem" 38 | fs_path: "AnyFSPath" 39 | 40 | 41 | class DataFileSystem(AbstractFileSystem): 42 | root_marker = "/" 43 | 44 | def __init__(self, index: "DataIndex", **kwargs: Any): 45 | super().__init__(**kwargs) 46 | self.index = index 47 | 48 | @classmethod 49 | def join(cls, *parts: str) -> str: 50 | return posixpath.join(*parts) 51 | 52 | @classmethod 53 | def parts(cls, path: str) -> tuple[str, ...]: 54 | ret = [] 55 | while True: 56 | path, part = posixpath.split(path) 57 | 58 | if part: 59 | ret.append(part) 60 | continue 61 | 62 | if path: 63 | ret.append(path) 64 | 65 | break 66 | 67 | ret.reverse() 68 | 69 | return tuple(ret) 70 | 71 | def getcwd(self) -> str: 72 | return self.root_marker 73 | 74 | def normpath(self, path: str) -> str: 75 | return posixpath.normpath(path) 76 | 77 | def abspath(self, path: str) -> str: 78 | if not posixpath.isabs(path): 79 | path = self.join(self.getcwd(), path) 80 | return self.normpath(path) 81 | 82 | def relpath(self, path: str, start: Optional[str] = None) -> str: 83 | if start is None: 84 | start = "." 
85 | return posixpath.relpath(self.abspath(path), start=self.abspath(start)) 86 | 87 | def relparts(self, path: str, start: Optional[str] = None) -> tuple[str, ...]: 88 | return self.parts(self.relpath(path, start=start)) 89 | 90 | def _get_key(self, path: str) -> tuple[str, ...]: 91 | path = self.abspath(path) 92 | if path == self.root_marker: 93 | return () 94 | 95 | key = self.relparts(path, self.root_marker) 96 | if key in ((".",), ("",)): 97 | key = () 98 | 99 | return key 100 | 101 | def _get_fs_path(self, path: "AnyFSPath", info=None) -> FileInfo: 102 | from .index import StorageKeyError 103 | 104 | info = info or self.info(path) 105 | if info["type"] == "directory": 106 | raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), path) 107 | 108 | entry: Optional[DataIndexEntry] = info["entry"] 109 | 110 | assert entry 111 | hash_info: Optional[HashInfo] = entry.hash_info 112 | 113 | for typ in ["cache", "remote", "data"]: 114 | try: 115 | info = self.index.storage_map[entry.key] 116 | storage = getattr(info, typ) 117 | if not storage: 118 | continue 119 | data = storage.get(entry) 120 | except (ValueError, StorageKeyError): 121 | continue 122 | if data: 123 | fs, fs_path = data 124 | if fs.exists(fs_path): 125 | return FileInfo(typ, storage, info.cache, hash_info, fs, fs_path) 126 | 127 | raise FileNotFoundError(errno.ENOENT, "No storage files available", path) 128 | 129 | def _cache_remote_file( 130 | self, 131 | cache_storage: "ObjectStorage", 132 | fs: "FileSystem", 133 | path: "AnyFSPath", 134 | hash_info: Optional["HashInfo"], 135 | ) -> tuple["FileSystem", "AnyFSPath"]: 136 | from dvc_objects.fs.local import LocalFileSystem 137 | 138 | odb: HashFileDB = cache_storage.odb 139 | oid = hash_info.value if hash_info else None 140 | hash_name = hash_info.name if hash_info else None 141 | assert odb.hash_name 142 | 143 | if isinstance(fs, LocalFileSystem) or not oid or odb.hash_name != hash_name: 144 | return fs, path 145 | 146 | odb.add(path, fs, oid) 147 | return odb.fs, odb.oid_to_path(oid) 148 | 149 | def _open(self, path: "AnyFSPath", **kwargs: Any) -> "BinaryIO": 150 | typ, _, cache_storage, hi, fs, fspath = self._get_fs_path(path) 151 | 152 | if kwargs.get("cache", False) and typ == "remote" and cache_storage: 153 | fs, fspath = self._cache_remote_file(cache_storage, fs, fspath, hi) 154 | 155 | return fs.open(fspath, mode="rb") 156 | 157 | def ls(self, path: "AnyFSPath", detail: bool = True, **kwargs: Any): 158 | root_key = self._get_key(path) 159 | try: 160 | info = self.index.info(root_key) 161 | if info["type"] == "file": 162 | info["name"] = path = self.join(*root_key) 163 | return [info] if detail else [path] 164 | if not detail: 165 | return [ 166 | self.join(path, key[-1]) 167 | for key in self.index.ls(root_key, detail=False) 168 | ] 169 | 170 | entries = [] 171 | for key, info in self.index.ls(root_key, detail=True): 172 | info["name"] = self.join(path, key[-1]) 173 | entries.append(info) 174 | return entries 175 | except KeyError as exc: 176 | raise FileNotFoundError( 177 | errno.ENOENT, os.strerror(errno.ENOENT), path 178 | ) from exc 179 | 180 | def info(self, path: "AnyFSPath", **kwargs: Any): 181 | key = self._get_key(path) 182 | 183 | try: 184 | info = self.index.info(key) 185 | except KeyError as exc: 186 | raise FileNotFoundError( 187 | errno.ENOENT, 188 | os.strerror(errno.ENOENT), 189 | path, 190 | ) from exc 191 | 192 | info["name"] = path 193 | return info 194 | 195 | def get_file( 196 | self, 197 | rpath: "AnyFSPath", 198 | lpath: "AnyFSPath", 199 
| callback: "Callback" = DEFAULT_CALLBACK, 200 | info: Optional[dict[str, Any]] = None, 201 | **kwargs: Any, 202 | ) -> None: 203 | from dvc_objects.fs.generic import transfer 204 | from dvc_objects.fs.local import LocalFileSystem 205 | 206 | from dvc_data.index import ObjectStorage 207 | 208 | try: 209 | typ, storage, cache_storage, hi, fs, path = self._get_fs_path(rpath, info) 210 | except IsADirectoryError: 211 | os.makedirs(lpath, exist_ok=True) 212 | return None 213 | 214 | cache = kwargs.pop("cache", False) 215 | if cache and typ == "remote" and cache_storage: 216 | fs, path = self._cache_remote_file(cache_storage, fs, path, hi) 217 | storage = cache_storage 218 | 219 | if ( 220 | isinstance(storage, ObjectStorage) 221 | and isinstance(fs, LocalFileSystem) 222 | and storage.odb.cache_types 223 | ): 224 | try: 225 | transfer( 226 | fs, 227 | path, 228 | fs, 229 | os.fspath(lpath), 230 | # `transfer` supports uploading multiple files, so it uses the 231 | # passed callback to iterate for no. of files. 232 | # So, we wrap the given callback in a `NoOpCallback` and return it 233 | # in `branch` so that file copy callback gets properly called. 234 | # This is safe for transferring a single file. 235 | callback=_WrappedCallback(callback=callback), 236 | links=copy.copy(storage.odb.cache_types), 237 | ) 238 | return 239 | except OSError: 240 | pass 241 | 242 | fs.get_file(path, lpath, callback=callback, **kwargs) 243 | 244 | def checksum(self, path: str) -> str: 245 | info = self.info(path) 246 | md5 = info.get("md5") 247 | if md5: 248 | assert isinstance(md5, str) 249 | return md5 250 | raise NotImplementedError 251 | 252 | def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs): 253 | if maxdepth is not None: 254 | raise NotImplementedError 255 | 256 | sizes = {} 257 | todo = deque([self.info(path)]) 258 | while todo: 259 | info = todo.popleft() 260 | 261 | sizes[info["name"]] = info["size"] or 0 262 | 263 | if info["type"] != "directory": 264 | continue 265 | 266 | entry = info.get("entry") 267 | if entry is not None and entry.size is not None: 268 | continue 269 | 270 | todo.extend(self.ls(info["name"], detail=True)) 271 | 272 | if total: 273 | return sum(sizes.values()) 274 | 275 | return sizes 276 | -------------------------------------------------------------------------------- /src/dvc_data/fsutils.py: -------------------------------------------------------------------------------- 1 | from os import readlink, stat 2 | from stat import S_ISDIR, S_ISLNK, S_ISREG 3 | from typing import Any 4 | 5 | 6 | def _localfs_info(path: str) -> dict[str, Any]: 7 | out = stat(path, follow_symlinks=False) 8 | if link := S_ISLNK(out.st_mode): 9 | out = stat(path, follow_symlinks=True) 10 | if S_ISDIR(out.st_mode): 11 | t = "directory" 12 | elif S_ISREG(out.st_mode): 13 | t = "file" 14 | else: 15 | t = "other" 16 | 17 | result = { 18 | "name": path, 19 | "size": out.st_size, 20 | "type": t, 21 | "created": out.st_ctime, 22 | "islink": link, 23 | "mode": out.st_mode, 24 | "uid": out.st_uid, 25 | "gid": out.st_gid, 26 | "mtime": out.st_mtime, 27 | "ino": out.st_ino, 28 | "nlink": out.st_nlink, 29 | } 30 | if link: 31 | result["destination"] = readlink(path) 32 | return result 33 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/__init__.py: -------------------------------------------------------------------------------- 1 | """DVC data.""" 2 | 3 | import logging 4 | from collections.abc import Iterator 5 | from typing import 
TYPE_CHECKING, Union, cast 6 | 7 | from .tree import Tree 8 | 9 | if TYPE_CHECKING: 10 | from .db import HashFileDB 11 | from .hash_info import HashInfo 12 | from .obj import HashFile 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def check(odb: "HashFileDB", obj: "HashFile", **kwargs): 18 | if isinstance(obj, Tree): 19 | for _, _, hash_info in obj: 20 | odb.check(hash_info.value, **kwargs) 21 | 22 | odb.check(obj.oid, **kwargs) 23 | 24 | 25 | def load(odb: "HashFileDB", hash_info: "HashInfo") -> "HashFile": 26 | if hash_info.isdir: 27 | return Tree.load(odb, hash_info) 28 | return odb.get(cast("str", hash_info.value)) 29 | 30 | 31 | def iterobjs(obj: Union["Tree", "HashFile"]) -> Iterator[Union["Tree", "HashFile"]]: 32 | if isinstance(obj, Tree): 33 | yield from (entry_obj for _, entry_obj in obj) 34 | yield obj 35 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/_ignore.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from typing import TYPE_CHECKING, Any 3 | 4 | from typing_extensions import Protocol 5 | 6 | if TYPE_CHECKING: 7 | from dvc_objects.fs.base import AnyFSPath, FileSystem 8 | 9 | 10 | class Ignore(Protocol): 11 | def find(self, fs: "FileSystem", path: "AnyFSPath") -> Iterator["AnyFSPath"]: ... 12 | 13 | def walk(self, fs: "FileSystem", path: "AnyFSPath", **kwargs: Any): ... 14 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/_progress.py: -------------------------------------------------------------------------------- 1 | from dvc_data.callbacks import Tqdm 2 | 3 | 4 | class QueryingProgress(Tqdm): 5 | def __init__(self, iterable=None, total=None, name=None, phase="Querying"): 6 | msg_part = "cache in " + f"'{name}'" if name else "remote cache" 7 | msg_fmt = "{phase} " + msg_part 8 | 9 | self._estimating_msg = msg_fmt.format(phase="Estimating size of") 10 | self._listing_msg = msg_fmt.format(phase="Querying") 11 | self.desc = desc = msg_fmt.format(phase=phase) 12 | super().__init__( 13 | iterable=iterable, 14 | desc=desc, 15 | total=total, 16 | unit="files", 17 | unit_scale=False, 18 | bar_format=self.BAR_FMT_DEFAULT, 19 | ) 20 | 21 | def callback(self, phase, *args): 22 | total = args[0] if args else self.total 23 | completed = args[1] if len(args) > 1 else self.n 24 | if phase == "estimating": 25 | self.desc = self._estimating_msg 26 | elif phase == "querying": 27 | self.desc = self._listing_msg 28 | if total: 29 | self.total = total 30 | self.update(completed - self.n) 31 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sqlite3 4 | from collections.abc import Iterable, Iterator, Sequence 5 | from functools import wraps 6 | from itertools import zip_longest 7 | from typing import Any, ClassVar, Literal, Optional 8 | 9 | import diskcache 10 | from diskcache import Disk as _Disk 11 | from diskcache import ( 12 | Index, # noqa: F401 13 | Timeout, # noqa: F401 14 | ) 15 | 16 | from dvc_data.compat import batched 17 | 18 | 19 | class DiskError(Exception): 20 | def __init__(self, directory: str, type: str) -> None: # noqa: A002 21 | self.directory = directory 22 | self.type = type 23 | super().__init__(f"Could not open disk '{type}' in {directory}") 24 | 25 | 26 | def 
translate_pickle_error(fn): 27 | @wraps(fn) 28 | def wrapped(self, *args, **kwargs): 29 | try: 30 | return fn(self, *args, **kwargs) 31 | except (pickle.PickleError, ValueError) as e: 32 | if isinstance(e, ValueError) and "pickle protocol" not in str(e): 33 | raise 34 | 35 | raise DiskError(self._directory, type=self._type) from e 36 | 37 | return wrapped 38 | 39 | 40 | class Disk(_Disk): 41 | """Reraise pickle-related errors as DiskError.""" 42 | 43 | # we need type to differentiate cache for better error messages 44 | _type: str 45 | 46 | put = translate_pickle_error(_Disk.put) 47 | get = translate_pickle_error(_Disk.get) 48 | store = translate_pickle_error(_Disk.store) 49 | fetch = translate_pickle_error(_Disk.fetch) 50 | 51 | 52 | class Cache(diskcache.Cache): 53 | """Extended to handle pickle errors and use a constant pickle protocol.""" 54 | 55 | def __init__( 56 | self, 57 | directory: Optional[str] = None, 58 | timeout: int = 60, 59 | disk: _Disk = Disk, 60 | type: Optional[str] = None, # noqa: A002 61 | **settings: Any, 62 | ) -> None: 63 | settings.setdefault("disk_pickle_protocol", 4) 64 | settings.setdefault("cull_limit", 0) 65 | super().__init__(directory=directory, timeout=timeout, disk=disk, **settings) 66 | self.disk._type = self._type = type or os.path.basename(self.directory) 67 | 68 | def __getstate__(self): 69 | return (*super().__getstate__(), self._type) 70 | 71 | 72 | class HashesCache(Cache): 73 | SUPPORTS_UPSERT = sqlite3.sqlite_version_info >= (3, 24, 0) 74 | SQLITE_MAX_VARIABLE_NUMBER: ClassVar[Literal[999]] = 999 75 | """The maximum number of host parameters is 999 for SQLite versions prior to 3.32.0 76 | (2020-05-22) or 32766 for SQLite versions after 3.32.0. 77 | 78 | Increasing this number does not yield any performance improvement, so we leave it at 79 | the old default. 80 | """ 81 | 82 | def get_many( 83 | self, keys: Iterable[str], default=None 84 | ) -> Iterator[tuple[str, Optional[str]]]: 85 | if self.is_empty(): 86 | yield from zip_longest(keys, []) 87 | return 88 | 89 | for chunk in batched(keys, self.SQLITE_MAX_VARIABLE_NUMBER): 90 | params = ", ".join("?" * len(chunk)) 91 | query = f"SELECT key, value FROM Cache WHERE key IN ({params}) and raw = 1" # noqa: S608 92 | d = dict(self._sql(query, chunk).fetchall()) 93 | for key in chunk: 94 | yield key, d.get(key, default) 95 | 96 | def set_many(self, items: Sequence[tuple[str, str]], retry: bool = False) -> None: 97 | if not items: 98 | return 99 | 100 | if self.SUPPORTS_UPSERT: 101 | query = ( 102 | "INSERT INTO Cache(" 103 | " key, raw, store_time, expire_time, access_time," 104 | " tag, mode, filename, value" 105 | ") VALUES (?, 1, 0, null, 0, null, 1, null, ?)" 106 | " ON CONFLICT(key, raw) DO UPDATE SET value = excluded.value" 107 | ) 108 | else: 109 | query = ( 110 | "INSERT OR REPLACE INTO Cache(" 111 | " key, raw, store_time, expire_time, access_time," 112 | " tag, mode, filename, value" 113 | ") VALUES (?, 1, 0, null, 0, null, 1, null, ?)" 114 | ) 115 | with self.transact(retry): 116 | self._con.executemany(query, items) 117 | 118 | def is_empty(self) -> bool: 119 | res = self._sql("SELECT EXISTS (SELECT 1 FROM Cache)") 120 | ((exists,),) = res 121 | return exists == 0 122 | 123 | def get( 124 | self, key, default=None, read=False, expire_time=False, tag=False, retry=False 125 | ): 126 | cursor = self._sql("SELECT value FROM Cache WHERE key = ? 
and raw = 1", (key,)) 127 | if rows := cursor.fetchall(): 128 | return rows[0][0] 129 | return default 130 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import suppress 3 | from copy import copy 4 | from typing import TYPE_CHECKING, Callable, ClassVar, Optional, Union 5 | 6 | from dvc_objects.db import ObjectDB 7 | from dvc_objects.errors import ObjectFormatError 8 | from fsspec.callbacks import DEFAULT_CALLBACK 9 | 10 | from dvc_data.hashfile.hash_info import HashInfo 11 | from dvc_data.hashfile.obj import HashFile 12 | 13 | if TYPE_CHECKING: 14 | from dvc_objects.fs.base import AnyFSPath, FileSystem 15 | from fsspec import Callback 16 | 17 | from dvc_data.hashfile.meta import Meta 18 | from dvc_data.hashfile.state import StateBase 19 | from dvc_data.hashfile.tree import Tree 20 | 21 | from .index import ObjectDBIndexBase 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def get_odb(fs, path, **config): 28 | from dvc_objects.fs import Schemes 29 | 30 | from .local import LocalHashFileDB 31 | 32 | if fs.protocol == Schemes.LOCAL: 33 | return LocalHashFileDB(fs, path, **config) 34 | 35 | return HashFileDB(fs, path, **config) 36 | 37 | 38 | def get_index(odb) -> "ObjectDBIndexBase": 39 | import hashlib 40 | 41 | from .index import ObjectDBIndex, ObjectDBIndexNoop 42 | 43 | cls = ObjectDBIndex if odb.tmp_dir else ObjectDBIndexNoop 44 | return cls( 45 | odb.tmp_dir, 46 | hashlib.sha256(odb.fs.unstrip_protocol(odb.path).encode("utf-8")).hexdigest(), 47 | ) 48 | 49 | 50 | class HashFileDB(ObjectDB): 51 | DEFAULT_VERIFY = False 52 | DEFAULT_CACHE_TYPES: ClassVar[list[str]] = ["copy"] 53 | CACHE_MODE: ClassVar[Optional[int]] = None 54 | 55 | def __init__(self, fs: "FileSystem", path: str, read_only: bool = False, **config): 56 | from dvc_data.hashfile.state import StateNoop 57 | 58 | super().__init__(fs, path, read_only=read_only) 59 | self.state: StateBase = config.get("state", StateNoop()) 60 | self.verify = config.get("verify", self.DEFAULT_VERIFY) 61 | self.cache_types = config.get("type") or copy(self.DEFAULT_CACHE_TYPES) 62 | self.slow_link_warning = config.get("slow_link_warning", True) 63 | self.tmp_dir = config.get("tmp_dir") 64 | self.hash_name = config.get("hash_name", self.fs.PARAM_CHECKSUM) 65 | 66 | def get(self, oid: str) -> HashFile: 67 | return HashFile( 68 | self.oid_to_path(oid), 69 | self.fs, 70 | HashInfo(self.hash_name, oid), 71 | ) 72 | 73 | def add( 74 | self, 75 | path: Union["AnyFSPath", list["AnyFSPath"]], 76 | fs: "FileSystem", 77 | oid: Union[str, list[str]], 78 | hardlink: bool = False, 79 | callback: "Callback" = DEFAULT_CALLBACK, 80 | check_exists: bool = True, 81 | on_error: Optional[Callable[[str, BaseException], None]] = None, 82 | **kwargs, 83 | ) -> int: 84 | verify = kwargs.get("verify") 85 | if verify is None: 86 | verify = self.verify 87 | 88 | paths = [path] if isinstance(path, str) else path 89 | oids = [oid] if isinstance(oid, str) else oid 90 | assert len(paths) == len(oids) 91 | 92 | if verify: 93 | for o in oids: 94 | try: 95 | self.check(o, check_hash=True) 96 | except (ObjectFormatError, FileNotFoundError): 97 | pass 98 | 99 | transferred = super().add( 100 | paths, 101 | fs, 102 | oids, 103 | hardlink=hardlink, 104 | callback=callback, 105 | check_exists=check_exists, 106 | on_error=on_error, 107 | **kwargs, 108 | ) 109 | 110 | 
oid_cache_paths = {o: self.oid_to_path(o) for o in oids} 111 | for o, cache_path in oid_cache_paths.items(): 112 | try: 113 | if verify: 114 | self.check(o, check_hash=True) 115 | self.protect(cache_path) 116 | except (ObjectFormatError, FileNotFoundError): 117 | pass 118 | 119 | self.state.save_many( 120 | ( 121 | (cache_path, HashInfo(name=self.hash_name, value=o), None) 122 | for o, cache_path in oid_cache_paths.items() 123 | ), 124 | self.fs, 125 | ) 126 | return transferred 127 | 128 | def protect(self, path): 129 | pass 130 | 131 | def is_protected(self, path): 132 | return False 133 | 134 | def unprotect(self, path): 135 | pass 136 | 137 | def set_exec(self, path): 138 | pass 139 | 140 | def check( 141 | self, 142 | oid: str, 143 | check_hash: bool = True, 144 | _info: Optional[dict] = None, 145 | ) -> "Meta": 146 | """Compare the given hash with the (corresponding) actual one if 147 | check_hash is specified, or just verify the existence of the cache 148 | files on the filesystem. 149 | 150 | - Use `State` as a cache for computed hashes 151 | + The entries are invalidated by taking into account the following: 152 | * mtime 153 | * inode 154 | * size 155 | * hash 156 | 157 | - Remove the file from cache if it doesn't match the actual hash 158 | """ 159 | from dvc_data.hashfile.hash import hash_file 160 | from dvc_data.hashfile.meta import Meta 161 | 162 | obj = self.get(oid) 163 | if not check_hash: 164 | assert obj.fs 165 | info = _info or obj.fs.info(obj.path) 166 | return Meta.from_info(info) 167 | 168 | meta, actual = hash_file( 169 | obj.path, obj.fs, self.hash_name, self.state, info=_info 170 | ) 171 | 172 | assert actual.name == self.hash_name 173 | assert actual.value 174 | if actual.value.split(".")[0] != oid.split(".")[0]: 175 | logger.debug("corrupted cache file '%s'.", obj.path) 176 | with suppress(FileNotFoundError): 177 | self.fs.remove(obj.path) 178 | 179 | raise ObjectFormatError(f"{obj} is corrupted") 180 | 181 | # making cache file read-only so we don't need to check it 182 | # next time 183 | self.protect(obj.path) 184 | return meta 185 | 186 | def _remove_unpacked_dir(self, hash_): 187 | pass 188 | 189 | 190 | def add_update_tree(odb: HashFileDB, tree: "Tree") -> "Tree": 191 | """Add tree to ODB and update fs/path to use ODB fs/path.""" 192 | assert tree.oid 193 | odb.add(tree.path, tree.fs, tree.oid, hardlink=False) 194 | raw = odb.get(tree.oid) 195 | tree.fs = raw.fs 196 | tree.path = raw.path 197 | return tree 198 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/index.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from abc import ABC, abstractmethod 4 | from collections.abc import Iterable, Iterator 5 | from typing import TYPE_CHECKING 6 | 7 | from dvc_objects.errors import ObjectDBError 8 | 9 | if TYPE_CHECKING: 10 | from dvc_objects.fs.base import AnyFSPath 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ObjectDBIndexBase(ABC): 16 | @abstractmethod 17 | def __init__( 18 | self, 19 | tmp_dir: "AnyFSPath", 20 | name: str, 21 | ) -> None: 22 | pass 23 | 24 | @abstractmethod 25 | def close(self) -> None: 26 | pass 27 | 28 | @abstractmethod 29 | def __iter__(self) -> Iterator[str]: 30 | pass 31 | 32 | @abstractmethod 33 | def __contains__(self, hash_: str) -> bool: 34 | pass 35 | 36 | def hashes(self) -> Iterator[str]: 37 | return iter(self) 38 | 39 | @abstractmethod 40 | def dir_hashes(self) -> Iterator[str]: 
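        # Implementations yield the hashes of .dir (tree) objects tracked by
        # this index.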
41 | pass 42 | 43 | @abstractmethod 44 | def clear(self) -> None: 45 | pass 46 | 47 | @abstractmethod 48 | def update(self, dir_hashes: Iterable[str], file_hashes: Iterable[str]) -> None: 49 | pass 50 | 51 | @abstractmethod 52 | def intersection(self, hashes: set[str]) -> Iterator[str]: 53 | pass 54 | 55 | 56 | class ObjectDBIndexNoop(ObjectDBIndexBase): 57 | """No-op class for ODBs which are not indexed.""" 58 | 59 | def __init__( 60 | self, 61 | tmp_dir: "AnyFSPath", 62 | name: str, 63 | ) -> None: 64 | pass 65 | 66 | def close(self) -> None: 67 | pass 68 | 69 | def __iter__(self) -> Iterator[str]: 70 | return iter([]) 71 | 72 | def __contains__(self, hash_: str) -> bool: 73 | return False 74 | 75 | def dir_hashes(self) -> Iterator[str]: 76 | yield from [] 77 | 78 | def clear(self) -> None: 79 | pass 80 | 81 | def update(self, dir_hashes: Iterable[str], file_hashes: Iterable[str]) -> None: 82 | pass 83 | 84 | def intersection(self, hashes: set[str]) -> Iterator[str]: 85 | yield from [] 86 | 87 | 88 | class ObjectDBIndex(ObjectDBIndexBase): 89 | """Class for indexing hashes in an ODB.""" 90 | 91 | INDEX_SUFFIX = ".idx" 92 | INDEX_DIR = "index" 93 | 94 | def __init__( 95 | self, 96 | tmp_dir: "AnyFSPath", 97 | name: str, 98 | ) -> None: 99 | from dvc_objects.fs import LocalFileSystem 100 | 101 | from dvc_data.hashfile.cache import Cache, Index 102 | 103 | self.index_dir = os.path.join(tmp_dir, self.INDEX_DIR, name) 104 | self.fs = LocalFileSystem() 105 | self.fs.makedirs(self.index_dir, exist_ok=True) 106 | self._cache = Cache(self.index_dir, eviction_policy="none", type="index") 107 | self.index = Index.fromcache(self._cache) 108 | 109 | def close(self) -> None: 110 | return self._cache.close() 111 | 112 | def __iter__(self) -> Iterator[str]: 113 | return iter(self.index) 114 | 115 | def __contains__(self, hash_: str) -> bool: 116 | return hash_ in self.index 117 | 118 | def dir_hashes(self) -> Iterator[str]: 119 | """Iterate over .dir hashes stored in the index.""" 120 | yield from (hash_ for hash_, is_dir in self.index.items() if is_dir) 121 | 122 | def clear(self) -> None: 123 | """Clear this index (to force re-indexing later).""" 124 | from dvc_data.hashfile.cache import Timeout 125 | 126 | try: 127 | self.index.clear() 128 | except Timeout as exc: 129 | raise ObjectDBError("Failed to clear ODB index") from exc 130 | 131 | def update(self, dir_hashes: Iterable[str], file_hashes: Iterable[str]) -> None: 132 | """Update this index, adding the specified hashes.""" 133 | from dvc_data.hashfile.cache import Timeout 134 | 135 | try: 136 | with self.index.transact(): 137 | for hash_ in dir_hashes: 138 | self.index[hash_] = True 139 | with self.index.transact(): 140 | for hash_ in file_hashes: 141 | self.index[hash_] = False 142 | except Timeout as exc: 143 | raise ObjectDBError("Failed to update ODB index") from exc 144 | 145 | def intersection(self, hashes: set[str]) -> Iterator[str]: 146 | """Iterate over values from `hashes` which exist in the index.""" 147 | yield from hashes.intersection(self.index.keys()) 148 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/local.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import stat 4 | from functools import partial 5 | from typing import ClassVar, Optional 6 | 7 | from dvc_objects.db import noop, wrap_iter 8 | from dvc_objects.errors import ObjectDBError, ObjectFormatError 9 | from dvc_objects.fs.utils import 
copyfile, remove, tmp_fname 10 | from fsspec.callbacks import DEFAULT_CALLBACK 11 | 12 | from dvc_data.fsutils import _localfs_info 13 | 14 | from . import HashFileDB 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | umask = os.umask(0) 19 | os.umask(umask) 20 | 21 | 22 | class LocalHashFileDB(HashFileDB): 23 | DEFAULT_CACHE_TYPES: ClassVar[list[str]] = ["reflink", "copy"] 24 | CACHE_MODE: ClassVar[int] = 0o444 25 | UNPACKED_DIR_SUFFIX = ".unpacked" 26 | 27 | def __init__(self, fs, path, **config): 28 | super().__init__(fs, path, **config) 29 | 30 | shared = config.get("shared") 31 | if shared: 32 | self._file_mode = 0o664 33 | self._dir_mode = 0o2775 34 | else: 35 | self._file_mode = 0o666 & ~umask 36 | self._dir_mode = 0o777 & ~umask 37 | 38 | def move(self, from_info, to_info): 39 | super().move(from_info, to_info) 40 | os.chmod(to_info, self._file_mode) 41 | 42 | def makedirs(self, path): 43 | from dvc_objects.fs.utils import makedirs 44 | 45 | makedirs(path, exist_ok=True, mode=self._dir_mode) 46 | 47 | def oid_to_path(self, oid): 48 | # NOTE: `self.path` is already normalized so we can simply use 49 | # `os.sep` instead of `os.path.join`. This results in this helper 50 | # being ~5.5 times faster. 51 | return f"{self.path}{os.sep}{oid[0:2]}{os.sep}{oid[2:]}" 52 | 53 | def oids_exist(self, oids, jobs=None, progress=noop): 54 | ret = [] 55 | progress = partial(progress, "querying", len(oids)) 56 | 57 | for oid in wrap_iter(oids, progress): 58 | try: 59 | self.check(oid) 60 | ret.append(oid) 61 | except (FileNotFoundError, ObjectFormatError): 62 | pass 63 | 64 | return ret 65 | 66 | def _list_paths(self, prefix=None): 67 | assert self.path is not None 68 | if prefix: 69 | path = self.fs.join(self.path, prefix[:2]) 70 | if not self.fs.exists(path): 71 | return 72 | else: 73 | path = self.path 74 | yield from self.fs.find(path) 75 | 76 | def _remove_unpacked_dir(self, hash_): 77 | hash_path = self.oid_to_path(hash_) 78 | path = self.fs.with_name( 79 | hash_path, 80 | self.fs.name(hash_path) + self.UNPACKED_DIR_SUFFIX, 81 | ) 82 | self.fs.remove(path) 83 | 84 | def _unprotect_file(self, path, callback=DEFAULT_CALLBACK): 85 | if self.fs.is_symlink(path) or self.fs.is_hardlink(path): 86 | logger.debug("Unprotecting '%s'", path) 87 | tmp = os.path.join(os.path.dirname(path), tmp_fname()) 88 | 89 | # The operations order is important here - if some application 90 | # would access the file during the process of copyfile then it 91 | # would get only the part of file. So, at first, the file should be 92 | # copied with the temporary name, and then original file should be 93 | # replaced by new. 94 | copyfile(path, tmp, callback=callback) 95 | remove(path) 96 | os.rename(tmp, path) 97 | 98 | else: 99 | logger.debug( 100 | "Skipping copying for '%s', since it is not a symlink or a hardlink.", 101 | path, 102 | ) 103 | 104 | os.chmod(path, self._file_mode) 105 | 106 | def unprotect(self, path, callback=DEFAULT_CALLBACK): 107 | if not os.path.exists(path): 108 | raise ObjectDBError(f"can't unprotect non-existing data '{path}'") 109 | 110 | files = self.fs.find(path) if os.path.isdir(path) else [path] 111 | for fname in callback.wrap(files): 112 | with callback.branched(fname, fname) as cb: 113 | self._unprotect_file(fname, callback=cb) 114 | 115 | def protect(self, path): 116 | try: 117 | os.chmod(path, self.CACHE_MODE) 118 | except OSError: 119 | # NOTE: not being able to protect cache file is not fatal, it 120 | # might happen on funky filesystems (e.g. 
Samba, see #5255), 121 | # read-only filesystems or in a shared cache scenario. 122 | logger.debug("failed to protect '%s'", path, exc_info=True) 123 | 124 | def check(self, oid: str, check_hash: bool = True, _info: Optional[dict] = None): 125 | from dvc_data.hashfile.meta import Meta 126 | 127 | path = self.oid_to_path(oid) 128 | info = _info or _localfs_info(path) 129 | if stat.S_IMODE(info["mode"]) == self.CACHE_MODE: 130 | return Meta.from_info(info) 131 | return super().check(oid, check_hash, info) 132 | 133 | def is_protected(self, path): 134 | try: 135 | mode = os.stat(path).st_mode 136 | except FileNotFoundError: 137 | return False 138 | 139 | return stat.S_IMODE(mode) == self.CACHE_MODE 140 | 141 | def set_exec(self, path): 142 | mode = os.stat(path).st_mode | stat.S_IEXEC 143 | try: 144 | os.chmod(path, mode) 145 | except OSError: 146 | logger.debug( 147 | "failed to chmod '%s' '%s'", 148 | oct(mode), 149 | path, 150 | exc_info=True, 151 | ) 152 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/migrate.py: -------------------------------------------------------------------------------- 1 | from functools import partial, wraps 2 | from typing import TYPE_CHECKING, Any, Callable, NamedTuple 3 | 4 | from dvc_objects.executors import ThreadPoolExecutor 5 | from fsspec.callbacks import DEFAULT_CALLBACK 6 | 7 | if TYPE_CHECKING: 8 | from dvc_objects.fs.base import FileSystem 9 | from fsspec import Callback 10 | 11 | from . import HashFileDB 12 | 13 | 14 | class PreparedMigration(NamedTuple): 15 | src: "HashFileDB" 16 | dest: "HashFileDB" 17 | paths: list[str] 18 | oids: list[str] 19 | 20 | 21 | def migrate( 22 | migration: "PreparedMigration", callback: "Callback" = DEFAULT_CALLBACK 23 | ) -> int: 24 | """Migrate objects from one HashFileDB to another. 25 | 26 | Files from src will be re-hashed and transferred to dest with hardlinking 27 | enabled. 28 | """ 29 | src, dest, paths, oids = migration 30 | return dest.add(paths, src.fs, oids, hardlink=True, callback=callback) 31 | 32 | 33 | def prepare( 34 | src: "HashFileDB", 35 | dest: "HashFileDB", 36 | callback: "Callback" = DEFAULT_CALLBACK, 37 | ) -> PreparedMigration: 38 | """Prepare to migrate objects from one HashFileDB to another. 39 | 40 | Objects from src will be rehashed for addition to dest. 
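
    The returned PreparedMigration can then be passed to migrate(). A minimal
    sketch (src_odb and dest_odb are assumed to be existing HashFileDB
    instances, e.g. built with get_odb()):

        migration = prepare(src_odb, dest_odb)
        transferred = migrate(migration)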
41 | """ 42 | src_paths = [src.oid_to_path(oid) for oid in src._list_oids()] 43 | callback.set_size(len(src_paths)) 44 | with ThreadPoolExecutor( 45 | max_workers=src.fs.hash_jobs, cancel_on_error=True 46 | ) as executor: 47 | func = partial( 48 | _hash_task, 49 | dest.hash_name, 50 | src.fs, 51 | state=dest.state, 52 | callback=callback, 53 | ) 54 | results = list(executor.imap_unordered(func, src_paths)) 55 | if results: 56 | paths, oids = zip(*results) 57 | else: 58 | paths, oids = (), () 59 | return PreparedMigration(src, dest, list(paths), list(oids)) 60 | 61 | 62 | def _hash_task( 63 | hash_name: str, 64 | fs: "FileSystem", 65 | path: str, 66 | callback: "Callback" = DEFAULT_CALLBACK, 67 | **kwargs, 68 | ) -> tuple[str, str]: 69 | from dvc_data.hashfile.hash import hash_file 70 | 71 | func = _wrap_hash_file(callback, hash_file) 72 | _meta, hash_info = func(path, fs, hash_name, **kwargs) 73 | assert hash_info.value 74 | if path.endswith(".dir"): 75 | hash_info.value += ".dir" 76 | return path, hash_info.value 77 | 78 | 79 | def _wrap_hash_file(callback: "Callback", fn: Callable): 80 | @wraps(fn) 81 | def func(path: str, *args, **kwargs): 82 | kw: dict[str, Any] = dict(kwargs) 83 | with callback.branched(path, path) as child: 84 | res = fn(path, *args, callback=child, **kw) 85 | callback.relative_update() 86 | return res 87 | 88 | return func 89 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/reference.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import TYPE_CHECKING, Callable, Optional, Union 3 | 4 | from dvc_data.hashfile.obj import HashFile 5 | 6 | from . import HashFileDB, HashInfo 7 | 8 | if TYPE_CHECKING: 9 | from dvc_objects.fs.base import AnyFSPath, FileSystem 10 | from fsspec import Callback 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ReferenceHashFileDB(HashFileDB): 16 | def __init__(self, fs: "FileSystem", path: str, **config): 17 | super().__init__(fs, path, **config) 18 | self._obj_cache: dict[str, HashFile] = {} 19 | 20 | def __hash__(self): 21 | return hash((self.fs.protocol, self.path, *self._obj_cache.keys())) 22 | 23 | def exists(self, oid: str) -> bool: 24 | return oid in self._obj_cache 25 | 26 | def get(self, oid: str): 27 | try: 28 | return self._obj_cache[oid] 29 | except KeyError: 30 | return super().get(oid) 31 | 32 | def add( 33 | self, 34 | path: Union["AnyFSPath", list["AnyFSPath"]], 35 | fs: "FileSystem", 36 | oid: Union[str, list[str]], 37 | hardlink: bool = False, 38 | callback: Optional["Callback"] = None, 39 | check_exists: bool = True, 40 | on_error: Optional[Callable[[str, BaseException], None]] = None, 41 | **kwargs, 42 | ): 43 | paths = [path] if isinstance(path, str) else path 44 | oids = [oid] if isinstance(oid, str) else oid 45 | assert len(paths) == len(oids) 46 | 47 | for i in range(len(paths)): 48 | hash_info = HashInfo(self.hash_name, oids[i]) 49 | self._obj_cache[oids[i]] = HashFile(paths[i], fs, hash_info) 50 | 51 | def check( 52 | self, 53 | oid: str, 54 | check_hash: bool = True, 55 | _info: Optional[dict] = None, 56 | ): 57 | return 58 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/diff.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import reprlib 3 | from typing import TYPE_CHECKING, Optional 4 | 5 | from attrs import asdict, define, field 6 | 7 | if 
TYPE_CHECKING: 8 | from .db import HashFileDB 9 | from .hash_info import HashInfo 10 | from .meta import Meta 11 | from .obj import HashFile 12 | 13 | 14 | ADD = "add" 15 | MODIFY = "modify" 16 | DELETE = "delete" 17 | UNCHANGED = "unchanged" 18 | 19 | 20 | @define(unsafe_hash=True, order=True) 21 | class TreeEntry: 22 | cache_meta: Optional["Meta"] = field(default=None, eq=False) 23 | key: tuple[str, ...] = () 24 | meta: Optional["Meta"] = field(default=None, eq=False) 25 | oid: Optional["HashInfo"] = None 26 | 27 | def __bool__(self): 28 | return bool(self.oid) 29 | 30 | @property 31 | def in_cache(self) -> bool: 32 | return self.cache_meta is not None 33 | 34 | 35 | @define(unsafe_hash=True, order=True) 36 | class Change: 37 | old: TreeEntry = field(factory=TreeEntry) 38 | new: TreeEntry = field(factory=TreeEntry) 39 | typ: str = field(init=False) 40 | 41 | @typ.default 42 | def _(self): 43 | if not self.old and not self.new: 44 | return UNCHANGED 45 | 46 | if self.old and not self.new: 47 | return DELETE 48 | 49 | if not self.old and self.new: 50 | return ADD 51 | 52 | if self.old != self.new: 53 | return MODIFY 54 | 55 | return UNCHANGED 56 | 57 | def __bool__(self): 58 | return self.typ != UNCHANGED 59 | 60 | 61 | @define 62 | class DiffResult: 63 | added: list[Change] = field(factory=list, repr=reprlib.repr) 64 | modified: list[Change] = field(factory=list, repr=reprlib.repr) 65 | deleted: list[Change] = field(factory=list, repr=reprlib.repr) 66 | unchanged: list[Change] = field(factory=list, repr=reprlib.repr) 67 | 68 | def __bool__(self): 69 | return bool(self.added or self.modified or self.deleted) 70 | 71 | @property 72 | def stats(self) -> dict[str, int]: 73 | return { 74 | k: len(v) 75 | for k, v in asdict(self, recurse=False).items() 76 | if k != "unchanged" 77 | } 78 | 79 | 80 | ROOT = ("",) 81 | 82 | 83 | def diff( # noqa: C901 84 | old: Optional["HashFile"], 85 | new: Optional["HashFile"], 86 | cache: "HashFileDB", 87 | ) -> DiffResult: 88 | from .tree import Tree 89 | 90 | if old is None and new is None: 91 | return DiffResult() 92 | 93 | def _get_keys(obj): 94 | if not obj: 95 | return [] 96 | return [ROOT] + ([key for key, _, _ in obj] if isinstance(obj, Tree) else []) 97 | 98 | old_keys = set(_get_keys(old)) 99 | new_keys = set(_get_keys(new)) 100 | 101 | def _get(obj, key): 102 | if not obj or key == ROOT: 103 | return None, (obj.hash_info if obj else None) 104 | if not isinstance(obj, Tree): 105 | # obj is not a Tree and key is not a ROOT 106 | # hence object does not exist for a given key 107 | return None, None 108 | return obj.get(key, (None, None)) 109 | 110 | @functools.cache 111 | def _cache_check(oid: Optional["str"], cache: "HashFileDB") -> Optional["Meta"]: 112 | from dvc_objects.errors import ObjectFormatError 113 | 114 | if not oid: 115 | return None 116 | 117 | try: 118 | return cache.check(oid) 119 | except (FileNotFoundError, ObjectFormatError): 120 | return None 121 | 122 | ret = DiffResult() 123 | for key in old_keys | new_keys: 124 | old_meta, old_oid = _get(old, key) 125 | new_meta, new_oid = _get(new, key) 126 | 127 | old_cache_meta = _cache_check(old_oid.value, cache) if old_oid else None 128 | new_cache_meta = _cache_check(new_oid.value, cache) if new_oid else None 129 | change = Change( 130 | old=TreeEntry(old_cache_meta, key, old_meta, old_oid), 131 | new=TreeEntry(new_cache_meta, key, new_meta, new_oid), 132 | ) 133 | 134 | if change.typ == ADD: 135 | ret.added.append(change) 136 | elif change.typ == MODIFY: 137 | ret.modified.append(change) 
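        # deletions are collected next; anything else must be UNCHANGED
        # (asserted below) and goes into ret.unchanged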
138 | elif change.typ == DELETE: 139 | ret.deleted.append(change) 140 | else: 141 | assert change.typ == UNCHANGED 142 | ret.unchanged.append(change) 143 | return ret 144 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/gc.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from typing import TYPE_CHECKING, Optional 3 | 4 | if TYPE_CHECKING: 5 | from .db import HashFileDB 6 | from .hash_info import HashInfo 7 | 8 | 9 | def gc( # noqa: C901 10 | odb: "HashFileDB", 11 | used: Iterable["HashInfo"], 12 | jobs: Optional[int] = None, 13 | cache_odb: Optional["HashFileDB"] = None, 14 | shallow: bool = True, 15 | dry: bool = False, 16 | ): 17 | from dvc_objects.errors import ObjectDBPermissionError 18 | 19 | from ._progress import QueryingProgress 20 | from .tree import Tree 21 | 22 | if odb.read_only: 23 | raise ObjectDBPermissionError("Cannot gc read-only ODB") 24 | if not cache_odb: 25 | cache_odb = odb 26 | used_hashes = set() 27 | for hash_info in used: 28 | if hash_info.name != odb.hash_name: 29 | continue 30 | used_hashes.add(hash_info.value) 31 | if hash_info.isdir and not shallow: 32 | tree = Tree.load(cache_odb, hash_info) 33 | used_hashes.update(entry_obj.hash_info.value for _, entry_obj in tree) 34 | 35 | def _is_dir_hash(_hash): 36 | from .hash_info import HASH_DIR_SUFFIX 37 | 38 | return _hash.endswith(HASH_DIR_SUFFIX) 39 | 40 | num_removed = 0 41 | 42 | dir_paths = [] 43 | file_paths = [] 44 | for hash_ in QueryingProgress(odb.all(jobs), name=odb.path): 45 | if hash_ in used_hashes: 46 | continue 47 | path = odb.oid_to_path(hash_) 48 | if _is_dir_hash(hash_): 49 | # backward compatibility 50 | odb._remove_unpacked_dir(hash_) 51 | dir_paths.append(path) 52 | else: 53 | file_paths.append(path) 54 | 55 | for paths in (dir_paths, file_paths): 56 | if paths: 57 | num_removed += len(paths) 58 | if not dry: 59 | odb.fs.remove(paths) 60 | 61 | return num_removed 62 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/hash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import io 3 | import logging 4 | from typing import TYPE_CHECKING, BinaryIO, Optional, cast 5 | 6 | from dvc_objects.fs import localfs 7 | from fsspec.callbacks import Callback 8 | from fsspec.utils import nullcontext 9 | from tqdm.utils import CallbackIOWrapper 10 | 11 | from dvc_data.callbacks import TqdmCallback 12 | 13 | from .hash_info import HashInfo 14 | from .istextfile import DEFAULT_CHUNK_SIZE, istextblock 15 | from .meta import Meta 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | if TYPE_CHECKING: 21 | from dvc_objects.fs.base import AnyFSPath, FileSystem 22 | 23 | from .state import StateBase 24 | 25 | 26 | def dos2unix(data: bytes) -> bytes: 27 | return data.replace(b"\r\n", b"\n") 28 | 29 | 30 | algorithms_available = hashlib.algorithms_available | { 31 | "blake3", 32 | "md5-dos2unix", 33 | } 34 | DEFAULT_ALGORITHM = "md5" 35 | 36 | 37 | def get_hasher(name: str) -> "hashlib._Hash": 38 | if name == "blake3": 39 | from blake3 import blake3 # type: ignore[import-not-found] 40 | 41 | return blake3(max_threads=blake3.AUTO) # type: ignore[return-value] 42 | if name == "md5-dos2unix": 43 | name = "md5" 44 | 45 | try: 46 | return getattr(hashlib, name)() 47 | except AttributeError: 48 | return hashlib.new(name) 49 | 50 | 51 | class HashStreamFile(io.IOBase): 52 
| __slots__ = ("fobj", "hasher", "total_read") 53 | 54 | def __init__( 55 | self, 56 | fobj: BinaryIO, 57 | hash_name: str = DEFAULT_ALGORITHM, 58 | ) -> None: 59 | self.fobj = fobj 60 | self.total_read = 0 61 | hash_name = hash_name.lower() 62 | self.hasher = get_hasher(hash_name) 63 | super().__init__() 64 | 65 | def readable(self) -> bool: 66 | return True 67 | 68 | def tell(self) -> int: 69 | return self.fobj.tell() 70 | 71 | def read(self, n=-1) -> bytes: 72 | chunk = self.fobj.read(n) 73 | self.hasher.update(chunk) 74 | self.total_read += len(chunk) 75 | return chunk 76 | 77 | @property 78 | def hash_value(self) -> str: 79 | return self.hasher.hexdigest() 80 | 81 | @property 82 | def hash_name(self) -> str: 83 | return self.hasher.name 84 | 85 | 86 | class Dos2UnixHashStreamFile(HashStreamFile): 87 | __slots__ = () 88 | 89 | def read(self, n=-1) -> bytes: 90 | # ideally, we want the heuristics to be applied in a similar way, 91 | # regardless of the size of the first chunk, 92 | # for which we may need to buffer till DEFAULT_CHUNK_SIZE. 93 | assert n >= DEFAULT_CHUNK_SIZE 94 | chunk = self.fobj.read(n) 95 | is_text = istextblock(chunk[:DEFAULT_CHUNK_SIZE]) if chunk else False 96 | 97 | data = dos2unix(chunk) if is_text else chunk 98 | self.hasher.update(data) 99 | self.total_read += len(data) 100 | return chunk 101 | 102 | 103 | def get_hash_stream(fobj: BinaryIO, name: str = DEFAULT_ALGORITHM) -> HashStreamFile: 104 | cls = Dos2UnixHashStreamFile if name == "md5-dos2unix" else HashStreamFile 105 | return cls(fobj, hash_name=name) 106 | 107 | 108 | def fobj_md5( 109 | fobj: BinaryIO, 110 | chunk_size: int = 2**20, 111 | name: str = DEFAULT_ALGORITHM, 112 | ) -> str: 113 | stream = get_hash_stream(fobj, name=name) 114 | while True: 115 | data = stream.read(chunk_size) 116 | if not data: 117 | break 118 | return stream.hash_value 119 | 120 | 121 | def file_md5( 122 | fname: "AnyFSPath", 123 | fs: "FileSystem" = localfs, 124 | callback: Optional["Callback"] = None, 125 | name: str = DEFAULT_ALGORITHM, 126 | size: Optional[int] = None, 127 | ) -> str: 128 | if size is None and callback is not None: 129 | size = fs.size(fname) or 0 130 | callback.set_size(size) 131 | 132 | with fs.open(fname, "rb") as fobj: 133 | if callback is not None: 134 | fobj = cast("BinaryIO", CallbackIOWrapper(callback.relative_update, fobj)) 135 | return fobj_md5(fobj, name=name) 136 | 137 | 138 | def _hash_file( 139 | path: "AnyFSPath", 140 | fs: "FileSystem", 141 | name: str, 142 | callback: Optional["Callback"] = None, 143 | info: Optional[dict] = None, 144 | ) -> tuple["str", Meta]: 145 | info = info or fs.info(path) 146 | meta = Meta.from_info(info, fs.protocol) 147 | 148 | value = getattr(meta, name, None) 149 | if value: 150 | assert not value.endswith(".dir") 151 | return value, meta 152 | 153 | if hasattr(fs, name): 154 | func = getattr(fs, name) 155 | return str(func(path)), meta 156 | 157 | if name in algorithms_available: 158 | return ( 159 | file_md5(path, fs, callback=callback, size=meta.size, name=name), 160 | meta, 161 | ) 162 | raise NotImplementedError 163 | 164 | 165 | class LargeFileHashingCallback(TqdmCallback): 166 | """Callback that only shows progress bar if self.size > LARGE_FILE_SIZE.""" 167 | 168 | LARGE_FILE_SIZE = 2**30 169 | 170 | def __init__(self, *args, **kwargs): 171 | kwargs.setdefault("bytes", True) 172 | super().__init__(*args, **kwargs) 173 | self._logged = False 174 | self.fname = kwargs.get("desc", "") 175 | 176 | # TqdmCallback force renders progress bar on `set_size`. 
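    # Reassigning the base Callback.set_size here avoids that, so no bar is
    # drawn for small files; call() below only renders once the reported size
    # exceeds LARGE_FILE_SIZE.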
177 | set_size = Callback.set_size 178 | 179 | def call(self, hook_name=None, **kwargs): 180 | if self.size and self.size > self.LARGE_FILE_SIZE: 181 | if not self._logged: 182 | logger.info( 183 | "Computing md5 for a large file %r. This is only done once.", 184 | self.fname, 185 | ) 186 | self._logged = True 187 | super().call() 188 | 189 | 190 | def hash_file( 191 | path: "AnyFSPath", 192 | fs: "FileSystem", 193 | name: str, 194 | state: Optional["StateBase"] = None, 195 | callback: Optional["Callback"] = None, 196 | info: Optional[dict] = None, 197 | ) -> tuple["Meta", "HashInfo"]: 198 | if state: 199 | meta, hash_info = state.get(path, fs, info=info) 200 | if meta is not None and hash_info is not None and hash_info.name == name: 201 | return meta, hash_info 202 | 203 | size = info.get("size") if info else None 204 | _callback = callback 205 | # never initialize callback if it's never going to be used 206 | if size and size < LargeFileHashingCallback.LARGE_FILE_SIZE: 207 | _callback = nullcontext(None) 208 | else: 209 | _callback = LargeFileHashingCallback(desc=path) 210 | 211 | with _callback as cb: 212 | oid, meta = _hash_file(path, fs, name, callback=cb, info=info) 213 | 214 | hash_info = HashInfo(name, oid) 215 | if state: 216 | assert ".dir" not in oid 217 | state.save(path, fs, hash_info, info=info) 218 | return meta, hash_info 219 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/hash_info.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from attrs import define, field 4 | 5 | HASH_DIR_SUFFIX = ".dir" 6 | 7 | 8 | @define(unsafe_hash=True) 9 | class HashInfo: 10 | name: Optional[str] = None 11 | value: Optional[str] = None 12 | obj_name: Optional[str] = field(default=None, eq=False, hash=False) 13 | 14 | def __bool__(self) -> bool: 15 | return bool(self.value) 16 | 17 | def __str__(self) -> str: 18 | return f"{self.name}: {self.value}" 19 | 20 | @classmethod 21 | def from_dict(cls, d: dict[str, str]) -> "HashInfo": 22 | if not d: 23 | return cls() 24 | 25 | ((name, value),) = d.items() 26 | return cls(name, value) 27 | 28 | def to_dict(self) -> dict[str, str]: 29 | if not self.value or not self.name: 30 | return {} 31 | return {self.name: self.value} 32 | 33 | @property 34 | def isdir(self) -> bool: 35 | if not self.value: 36 | return False 37 | return self.value.endswith(HASH_DIR_SUFFIX) 38 | 39 | def as_raw(self) -> "HashInfo": 40 | assert self.value 41 | value, *_ = self.value.rsplit(HASH_DIR_SUFFIX, 1) 42 | return HashInfo(self.name, value, self.obj_name) 43 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/istextfile.py: -------------------------------------------------------------------------------- 1 | """Use heuristics to guess if it is a text file or a binary file.""" 2 | 3 | # Based on https://eli.thegreenplace.net/2011/10/19/ 4 | # perls-guess-if-file-is-text-or-binary-implemented-in-python 5 | from typing import TYPE_CHECKING 6 | 7 | if TYPE_CHECKING: 8 | from dvc_objects.fs.base import AnyFSPath, FileSystem 9 | 10 | TEXT_CHARS = bytes(range(32, 127)) + b"\n\r\t\f\b" 11 | DEFAULT_CHUNK_SIZE = 512 12 | 13 | 14 | def istextblock(block: bytes) -> bool: 15 | if not block: 16 | # An empty file is considered a valid text file 17 | return True 18 | 19 | if b"\x00" in block: 20 | # Files with null bytes are binary 21 | return False 22 | 23 | # Use translate's 'deletechars' argument to 
efficiently remove all 24 | # occurrences of TEXT_CHARS from the block 25 | nontext = block.translate(None, TEXT_CHARS) 26 | return float(len(nontext)) / len(block) <= 0.30 27 | 28 | 29 | def istextfile( 30 | fname: "AnyFSPath", fs: "FileSystem", blocksize: int = DEFAULT_CHUNK_SIZE 31 | ) -> bool: 32 | """Uses heuristics to guess whether the given file is text or binary, 33 | by reading a single block of bytes from the file. 34 | If more than 30% of the chars in the block are non-text, or there 35 | are NUL ('\x00') bytes in the block, assume this is a binary file. 36 | """ 37 | with fs.open(fname, "rb") as fobj: 38 | block = fobj.read(blocksize) 39 | return istextblock(block) 40 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/meta.py: -------------------------------------------------------------------------------- 1 | from typing import Any, ClassVar, Optional 2 | 3 | from attrs import define, field, fields_dict 4 | from dvc_objects.fs.utils import is_exec 5 | 6 | 7 | @define(unsafe_hash=True) 8 | class Meta: 9 | PARAM_ISDIR: ClassVar[str] = "isdir" 10 | PARAM_SIZE: ClassVar[str] = "size" 11 | PARAM_NFILES: ClassVar[str] = "nfiles" 12 | PARAM_ISEXEC: ClassVar[str] = "isexec" 13 | PARAM_VERSION_ID: ClassVar[str] = "version_id" 14 | PARAM_ETAG: ClassVar[str] = "etag" 15 | PARAM_CHECKSUM: ClassVar[str] = "checksum" 16 | PARAM_MD5: ClassVar[str] = "md5" 17 | PARAM_INODE: ClassVar[str] = "inode" 18 | PARAM_MTIME: ClassVar[str] = "mtime" 19 | PARAM_REMOTE: ClassVar[str] = "remote" 20 | 21 | fields: ClassVar[list[str]] 22 | 23 | isdir: bool = False 24 | size: Optional[int] = None 25 | nfiles: Optional[int] = None 26 | isexec: bool = False 27 | version_id: Optional[str] = None 28 | etag: Optional[str] = None 29 | checksum: Optional[str] = None 30 | md5: Optional[str] = None 31 | inode: Optional[int] = None 32 | mtime: Optional[float] = None 33 | 34 | remote: Optional[str] = field(default=None, eq=False) 35 | 36 | is_link: bool = field(default=False, eq=False) 37 | destination: Optional[str] = field(default=None, eq=False) 38 | nlink: int = field(default=1, eq=False) 39 | 40 | @classmethod 41 | def from_info(cls, info: dict[str, Any], protocol: Optional[str] = None) -> "Meta": 42 | etag = info.get("etag") 43 | checksum = info.get("checksum") 44 | 45 | if protocol == "azure" and etag and not etag.startswith('"'): 46 | etag = f'"{etag}"' 47 | if protocol == "s3" and "ETag" in info: 48 | etag = info["ETag"].strip('"') 49 | elif protocol == "gs" and "etag" in info: 50 | import base64 51 | 52 | etag = base64.b64decode(info["etag"]).hex() 53 | elif ( 54 | protocol 55 | and protocol.startswith("http") 56 | and ("ETag" in info or "Content-MD5" in info) 57 | ): 58 | checksum = info.get("ETag") or info.get("Content-MD5") 59 | 60 | version_id = info.get("version_id") 61 | if protocol == "s3" and "VersionId" in info: 62 | version_id = info.get("VersionId") 63 | elif protocol == "gs" and "generation" in info: 64 | version_id = info.get("generation") 65 | 66 | return Meta( 67 | info["type"] == "directory", 68 | info.get("size"), 69 | None, 70 | is_exec(info.get("mode", 0)), 71 | version_id, 72 | etag, 73 | checksum, 74 | info.get("md5"), 75 | info.get("ino"), 76 | info.get("mtime"), 77 | info.get("remote"), 78 | info.get("islink", False), 79 | info.get("destination"), 80 | info.get("nlink", 1), 81 | ) 82 | 83 | @classmethod 84 | def from_dict(cls, d: dict[str, Any]) -> "Meta": 85 | kwargs = {} 86 | for field_ in cls.fields: 87 | if field_ in d: 88 | 
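                # copy only the fields that are actually present in the
                # serialized dict; missing ones keep their attrs defaults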
kwargs[field_] = d[field_] 89 | return cls(**kwargs) 90 | 91 | def to_dict(self) -> dict[str, Any]: 92 | ret: dict[str, Any] = {} 93 | 94 | if self.isdir: 95 | ret[self.PARAM_ISDIR] = self.isdir 96 | 97 | if self.size is not None: 98 | ret[self.PARAM_SIZE] = self.size 99 | 100 | if self.nfiles is not None: 101 | ret[self.PARAM_NFILES] = self.nfiles 102 | 103 | if self.isexec: 104 | ret[self.PARAM_ISEXEC] = self.isexec 105 | 106 | if self.version_id: 107 | ret[self.PARAM_VERSION_ID] = self.version_id 108 | 109 | if self.etag: 110 | ret[self.PARAM_ETAG] = self.etag 111 | 112 | if self.checksum: 113 | ret[self.PARAM_CHECKSUM] = self.checksum 114 | 115 | if self.md5: 116 | ret[self.PARAM_MD5] = self.md5 117 | 118 | if self.remote: 119 | ret[self.PARAM_REMOTE] = self.remote 120 | 121 | return ret 122 | 123 | 124 | Meta.fields = list(fields_dict(Meta)) 125 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/obj.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from dvc_objects.obj import Object 4 | 5 | if TYPE_CHECKING: 6 | from dvc_objects.fs.base import AnyFSPath, FileSystem 7 | 8 | from .hash_info import HashInfo 9 | 10 | 11 | class HashFile(Object): 12 | __slots__ = ("hash_info",) 13 | 14 | def __init__(self, path: "AnyFSPath", fs: "FileSystem", hash_info: "HashInfo"): 15 | assert hash_info.value 16 | oid = hash_info.value 17 | super().__init__(path, fs, oid) 18 | self.hash_info = hash_info 19 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/status.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Iterable 3 | from typing import TYPE_CHECKING, NamedTuple, Optional 4 | 5 | from dvc_objects.fs import Schemes 6 | 7 | from .hash_info import HashInfo 8 | from .tree import Tree 9 | 10 | if TYPE_CHECKING: 11 | from dvc_objects.db import ObjectDB 12 | 13 | from .db import HashFileDB 14 | from .db.index import ObjectDBIndexBase 15 | from .obj import HashFile 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class StatusResult(NamedTuple): 21 | exists: set["HashInfo"] 22 | missing: set["HashInfo"] 23 | 24 | 25 | class CompareStatusResult(NamedTuple): 26 | ok: set["HashInfo"] 27 | missing: set["HashInfo"] 28 | new: set["HashInfo"] 29 | deleted: set["HashInfo"] 30 | 31 | 32 | def _indexed_dir_hashes( 33 | odb: "ObjectDB", index: "ObjectDBIndexBase", dir_objs, name, cache_odb, jobs=None 34 | ): 35 | # Validate our index by verifying all indexed .dir hashes 36 | # still exist on the remote 37 | from ._progress import QueryingProgress 38 | 39 | dir_hashes = set(dir_objs.keys()) 40 | indexed_dirs = set(index.dir_hashes()) 41 | indexed_dir_exists: set[str] = set() 42 | if indexed_dirs: 43 | hashes = QueryingProgress( 44 | odb.list_oids_exists(indexed_dirs, jobs=jobs), 45 | total=len(indexed_dirs), 46 | ) 47 | indexed_dir_exists.update(hashes) 48 | missing_dirs = indexed_dirs.difference(indexed_dir_exists) 49 | if missing_dirs: 50 | logger.debug( 51 | "Remote cache missing indexed .dir hashes '%s', clearing remote index", 52 | ", ".join(missing_dirs), 53 | ) 54 | index.clear() 55 | 56 | # Check if non-indexed (new) dir hashes exist on remote 57 | dir_exists = dir_hashes.intersection(indexed_dir_exists) 58 | dir_missing = dir_hashes - dir_exists 59 | dir_exists.update( 60 | QueryingProgress( 61 | 
odb.list_oids_exists(dir_missing, jobs=jobs), 62 | total=len(dir_missing), 63 | ) 64 | ) 65 | 66 | # If .dir hash exists in the ODB, assume directory contents 67 | # also exists 68 | for dir_hash in dir_exists: 69 | tree = dir_objs.get(dir_hash) 70 | if not tree: 71 | try: 72 | tree = Tree.load(cache_odb, HashInfo(name, dir_hash)) 73 | except FileNotFoundError: 74 | continue 75 | file_hashes = [hi.value for _, _, hi in tree] 76 | if dir_hash not in index: 77 | logger.debug( 78 | "Indexing new .dir '%s' with '%s' nested files", 79 | dir_hash, 80 | len(file_hashes), 81 | ) 82 | index.update([dir_hash], file_hashes) 83 | yield from file_hashes 84 | yield tree.hash_info.value 85 | 86 | 87 | def status( # noqa: C901, PLR0912 88 | odb: "HashFileDB", 89 | obj_ids: Iterable["HashInfo"], 90 | name: Optional[str] = None, 91 | index: Optional["ObjectDBIndexBase"] = None, 92 | cache_odb: Optional["HashFileDB"] = None, 93 | shallow: bool = True, 94 | jobs: Optional[int] = None, 95 | ) -> "StatusResult": 96 | """Return status of whether or not the specified objects exist odb. 97 | 98 | If cache_odb is set, trees will be loaded from cache_odb instead of odb 99 | when needed. 100 | 101 | Status is returned as a tuple of: 102 | exists: objs that exist in odb 103 | missing: objs that do not exist in ODB 104 | """ 105 | logger.debug("Preparing to collect status from '%s'", odb.path) 106 | if not name: 107 | name = odb.hash_name 108 | 109 | if cache_odb is None: 110 | cache_odb = odb 111 | 112 | hash_infos: dict[str, HashInfo] = {} 113 | dir_objs: dict[str, Optional[HashFile]] = {} 114 | for hash_info in obj_ids: 115 | assert hash_info.value 116 | if hash_info.isdir: 117 | if shallow: 118 | tree = None 119 | else: 120 | tree = Tree.load(cache_odb, hash_info) 121 | for _, _, oid in tree: 122 | assert oid 123 | assert oid.value 124 | hash_infos[oid.value] = oid 125 | if index: 126 | dir_objs[hash_info.value] = tree 127 | hash_infos[hash_info.value] = hash_info 128 | 129 | if odb.fs.protocol == Schemes.MEMORY: 130 | # assume memfs staged objects already exist 131 | return StatusResult(set(hash_infos.values()), set()) 132 | 133 | hashes: set[str] = set(hash_infos.keys()) 134 | exists: set[str] = set() 135 | 136 | logger.debug("Collecting status from '%s'", odb.path) 137 | if index and hashes: 138 | if dir_objs: 139 | exists = hashes.intersection( 140 | _indexed_dir_hashes(odb, index, dir_objs, name, cache_odb, jobs=jobs) 141 | ) 142 | hashes.difference_update(exists) 143 | if hashes: 144 | exists.update(index.intersection(hashes)) 145 | hashes.difference_update(exists) 146 | 147 | if hashes: 148 | from ._progress import QueryingProgress 149 | 150 | with QueryingProgress(phase="Checking", name=odb.path) as pbar: 151 | exists.update(odb.oids_exist(hashes, jobs=jobs, progress=pbar.callback)) 152 | return StatusResult( 153 | {hash_infos[hash_] for hash_ in exists}, 154 | {hash_infos[hash_] for hash_ in (hashes - exists)}, 155 | ) 156 | 157 | 158 | def compare_status( 159 | src: "HashFileDB", 160 | dest: "HashFileDB", 161 | obj_ids: Iterable["HashInfo"], 162 | check_deleted: bool = True, 163 | src_index: Optional["ObjectDBIndexBase"] = None, 164 | dest_index: Optional["ObjectDBIndexBase"] = None, 165 | cache_odb: Optional["HashFileDB"] = None, 166 | jobs: Optional[int] = None, 167 | **kwargs, 168 | ) -> "CompareStatusResult": 169 | """Compare status for the specified objects between two ODBs. 
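    transfer() uses this (with check_deleted=False) to decide which objects
    still need to be copied from src to dest.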
170 | 171 |     Status is returned as a tuple of: 172 |         ok: hashes that exist in both src and dest 173 |         missing: hashes that do not exist in either src or dest 174 |         new: hashes that only exist in src 175 |         deleted: hashes that only exist in dest 176 |     """ 177 |     if cache_odb is None: 178 |         cache_odb = src 179 |     dest_exists, dest_missing = status( 180 |         dest, 181 |         obj_ids, 182 |         index=dest_index, 183 |         jobs=jobs, 184 |         cache_odb=cache_odb, 185 |         **kwargs, 186 |     ) 187 |     # for transfer operations we can skip src status check when all objects 188 |     # already exist in dest 189 |     if dest_missing or check_deleted: 190 |         src_exists, src_missing = status( 191 |             src, obj_ids, index=src_index, jobs=jobs, **kwargs 192 |         ) 193 |     else: 194 |         src_exists = dest_exists 195 |         src_missing = set() 196 |     return CompareStatusResult( 197 |         src_exists & dest_exists, 198 |         src_missing & dest_missing, 199 |         src_exists - dest_exists, 200 |         dest_exists - src_exists, 201 |     ) 202 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/transfer.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import logging 3 | from collections import defaultdict 4 | from collections.abc import Iterable 5 | from typing import ( 6 |     TYPE_CHECKING, 7 |     Any, 8 |     Callable, 9 |     NamedTuple, 10 |     Optional, 11 | ) 12 | 13 | from fsspec.callbacks import DEFAULT_CALLBACK 14 | 15 | from .hash_info import HashInfo 16 | 17 | if TYPE_CHECKING: 18 |     from dvc_objects.fs.base import FileSystem 19 |     from fsspec import Callback 20 | 21 |     from .db import HashFileDB 22 |     from .db.index import ObjectDBIndexBase 23 |     from .status import CompareStatusResult 24 |     from .tree import Tree 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | class TransferResult(NamedTuple): 30 |     transferred: set["HashInfo"] 31 |     failed: set["HashInfo"] 32 | 33 | 34 | def _log_exception(oid: str, exc: BaseException): 35 |     # NOTE: this means we ran out of file descriptors and there is no 36 |     # reason to try to proceed, as we will hit this error anyways. 37 |     if isinstance(exc, OSError) and exc.errno == errno.EMFILE: 38 |         raise exc 39 |     logger.error("failed to transfer '%s'", oid, exc_info=exc) 40 | 41 | 42 | def find_tree_by_obj_id( 43 |     odbs: Iterable[Optional["HashFileDB"]], obj_id: "HashInfo" 44 | ) -> Optional["Tree"]: 45 |     from dvc_objects.errors import ObjectFormatError 46 | 47 |     from .tree import Tree 48 | 49 |     for odb in odbs: 50 |         if odb is not None: 51 |             try: 52 |                 return Tree.load(odb, obj_id) 53 |             except (FileNotFoundError, ObjectFormatError): 54 |                 pass 55 |     return None 56 | 57 | 58 | def _do_transfer(  # noqa: C901 59 |     src: "HashFileDB", 60 |     dest: "HashFileDB", 61 |     obj_ids: Iterable["HashInfo"], 62 |     missing_ids: Iterable["HashInfo"], 63 |     src_index: Optional["ObjectDBIndexBase"] = None, 64 |     dest_index: Optional["ObjectDBIndexBase"] = None, 65 |     cache_odb: Optional["HashFileDB"] = None, 66 |     **kwargs: Any, 67 | ) -> set["HashInfo"]: 68 |     """Do object transfer. 69 | 70 |     Returns: 71 |         Set containing any hash_infos which failed to transfer.
72 | """ 73 | dir_ids, file_ids = set(), set() 74 | for hash_info in obj_ids: 75 | if hash_info.isdir: 76 | dir_ids.add(hash_info) 77 | else: 78 | file_ids.add(hash_info) 79 | 80 | failed_ids: set[HashInfo] = set() 81 | succeeded_dir_objs = [] 82 | 83 | for dir_hash in dir_ids: 84 | dir_obj = find_tree_by_obj_id([cache_odb, src], dir_hash) 85 | assert dir_obj 86 | 87 | entry_ids = {oid for _, _, oid in dir_obj} 88 | bound_file_ids = file_ids & entry_ids 89 | file_ids -= entry_ids 90 | 91 | logger.debug("transfer dir: %s with %d files", dir_hash, len(bound_file_ids)) 92 | 93 | dir_fails = _add(src, dest, bound_file_ids, **kwargs) 94 | if dir_fails: 95 | logger.debug( 96 | "failed to upload full contents of '%s', aborting .dir file upload", 97 | dir_hash, 98 | ) 99 | logger.debug( 100 | "failed to upload '%s' to '%s'", 101 | src.get(dir_obj.oid).path, 102 | dest.get(dir_obj.oid).path, 103 | ) 104 | failed_ids.update(dir_fails) 105 | failed_ids.add(dir_obj.hash_info) 106 | elif entry_ids.intersection(missing_ids): 107 | # if for some reason a file contained in this dir is 108 | # missing both locally and in the remote, we want to 109 | # push whatever file content we have, but should not 110 | # push .dir file 111 | logger.debug( 112 | "directory '%s' contains missing files, skipping .dir file upload", 113 | dir_hash, 114 | ) 115 | elif _add(src, dest, [dir_obj.hash_info], **kwargs): 116 | failed_ids.add(dir_obj.hash_info) 117 | else: 118 | succeeded_dir_objs.append(dir_obj) 119 | 120 | # insert the rest 121 | failed_ids.update(_add(src, dest, file_ids, **kwargs)) 122 | if failed_ids: 123 | if src_index: 124 | src_index.clear() 125 | return failed_ids 126 | 127 | # index successfully pushed dirs 128 | if dest_index: 129 | for dir_obj in succeeded_dir_objs: 130 | file_hashes = {oid.value for _, _, oid in dir_obj} 131 | logger.debug( 132 | "Indexing pushed dir '%s' with '%s' nested files", 133 | dir_obj.hash_info, 134 | len(file_hashes), 135 | ) 136 | assert dir_obj.hash_info 137 | assert dir_obj.hash_info.value 138 | dest_index.update([dir_obj.hash_info.value], file_hashes) 139 | 140 | return set() 141 | 142 | 143 | def _add( 144 | src: "HashFileDB", 145 | dest: "HashFileDB", 146 | hash_infos: Iterable["HashInfo"], 147 | **kwargs, 148 | ) -> set["HashInfo"]: 149 | failed: set[HashInfo] = set() 150 | if not hash_infos: 151 | return failed 152 | 153 | def _error(oid: str, exc: BaseException): 154 | _log_exception(oid, exc) 155 | failed.add(HashInfo(src.hash_name, oid)) 156 | 157 | fs_map: dict[FileSystem, list[tuple[str, str]]] = defaultdict(list) 158 | for hash_info in hash_infos: 159 | assert hash_info.value 160 | obj = src.get(hash_info.value) 161 | fs_map[obj.fs].append((obj.path, obj.oid)) 162 | 163 | for fs, args in fs_map.items(): 164 | paths, oids = zip(*args) 165 | dest.add( 166 | list(paths), 167 | fs, 168 | list(oids), 169 | on_error=_error, 170 | **kwargs, 171 | ) 172 | return failed 173 | 174 | 175 | def transfer( # noqa: PLR0913 176 | src: "HashFileDB", 177 | dest: "HashFileDB", 178 | obj_ids: Iterable["HashInfo"], 179 | jobs: Optional[int] = None, 180 | verify: bool = False, 181 | hardlink: bool = False, 182 | validate_status: Optional[Callable[["CompareStatusResult"], None]] = None, 183 | src_index: Optional["ObjectDBIndexBase"] = None, 184 | dest_index: Optional["ObjectDBIndexBase"] = None, 185 | cache_odb: Optional["HashFileDB"] = None, 186 | shallow: bool = True, 187 | callback: "Callback" = DEFAULT_CALLBACK, 188 | ) -> "TransferResult": 189 | """Transfer (copy) the specified 
objects from one ODB to another. 190 | 191 | Returns the number of successfully transferred objects 192 | """ 193 | from .status import compare_status 194 | 195 | logger.debug( 196 | "Preparing to transfer data from '%s' to '%s'", 197 | src.fs.unstrip_protocol(src.path), 198 | dest.fs.unstrip_protocol(dest.path), 199 | ) 200 | if src == dest: 201 | return TransferResult(set(), set()) 202 | 203 | status = compare_status( 204 | src, 205 | dest, 206 | obj_ids, 207 | check_deleted=False, 208 | jobs=jobs, 209 | src_index=src_index, 210 | dest_index=dest_index, 211 | cache_odb=cache_odb, 212 | shallow=shallow, 213 | ) 214 | 215 | if validate_status: 216 | validate_status(status) 217 | 218 | if not status.new: 219 | return TransferResult(set(), set()) 220 | 221 | callback.set_size(len(status.new)) 222 | jobs = jobs or dest.fs.jobs 223 | 224 | failed = _do_transfer( 225 | src, 226 | dest, 227 | status.new, 228 | status.missing, 229 | verify=verify, 230 | hardlink=hardlink, 231 | callback=callback, 232 | batch_size=jobs, 233 | check_exists=False, 234 | src_index=src_index, 235 | dest_index=dest_index, 236 | cache_odb=cache_odb, 237 | ) 238 | return TransferResult(status.new - failed, failed) 239 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/utils.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import hashlib 3 | import json 4 | from typing import TYPE_CHECKING, Optional 5 | 6 | from dvc_data.fsutils import _localfs_info 7 | 8 | if TYPE_CHECKING: 9 | from dvc_objects.fs.base import AnyFSPath, FileSystem 10 | 11 | from ._ignore import Ignore 12 | from .diff import DiffResult 13 | 14 | 15 | def to_nanoseconds(ts: float) -> int: 16 | return round(ts * 1_000_000_000) 17 | 18 | 19 | def _tokenize_mtimes(files_mtimes: dict[str, float]) -> str: 20 | data = json.dumps(files_mtimes, sort_keys=True).encode("utf-8") 21 | digest = hashlib.md5(data) # noqa: S324 22 | return digest.hexdigest() 23 | 24 | 25 | def get_mtime_and_size( 26 | path: "AnyFSPath", fs: "FileSystem", ignore: Optional["Ignore"] = None 27 | ) -> tuple[str, int]: 28 | if not fs.isdir(path): 29 | base_stat = fs.info(path) 30 | size = base_stat["size"] 31 | mtime = str(to_nanoseconds(base_stat["mtime"])) 32 | return mtime, size 33 | 34 | size = 0 35 | files_mtimes = {} 36 | if ignore: 37 | walk_iterator = ignore.find(fs, path) 38 | else: 39 | walk_iterator = fs.find(path) 40 | for file_path in walk_iterator: 41 | try: 42 | stats = _localfs_info(file_path) 43 | except OSError as exc: 44 | # NOTE: broken symlink case. 
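            # fs.find() / ignore.find() can still yield symlinks whose
            # targets are gone; the info lookup then raises ENOENT and the
            # entry is skipped rather than failing the whole computation.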
45 | if exc.errno != errno.ENOENT: 46 | raise 47 | continue 48 | size += stats["size"] 49 | files_mtimes[file_path] = stats["mtime"] 50 | 51 | # We track file changes and moves, which cannot be detected with simply 52 | # max(mtime(f) for f in non_ignored_files) 53 | mtime = _tokenize_mtimes(files_mtimes) 54 | return mtime, size 55 | 56 | 57 | def _get_mtime_from_changes( 58 | path: str, 59 | fs: "FileSystem", 60 | diff: "DiffResult", 61 | updated_mtimes: dict[str, float], 62 | ) -> str: 63 | from .diff import ROOT 64 | 65 | fs_info = _localfs_info(path) 66 | if fs_info["type"] == "file": 67 | return str(to_nanoseconds(fs_info["mtime"])) 68 | 69 | mtimes: dict[str, float] = {} 70 | mtimes.update(updated_mtimes) 71 | 72 | sep = fs.sep 73 | 74 | for change in diff.unchanged: 75 | key = change.old.key 76 | if key == ROOT: 77 | continue 78 | 79 | entry_path = sep.join((path, *key)) 80 | if entry_path in mtimes: 81 | continue 82 | meta = change.old.meta 83 | mtime = meta.mtime if meta is not None else None 84 | if mtime is None: 85 | try: 86 | stats = _localfs_info(entry_path) 87 | except OSError as exc: 88 | # NOTE: broken symlink case. 89 | if exc.errno != errno.ENOENT: 90 | raise 91 | continue 92 | mtime = stats["mtime"] 93 | assert mtime is not None 94 | mtimes[entry_path] = mtime 95 | 96 | return _tokenize_mtimes(mtimes) 97 | -------------------------------------------------------------------------------- /src/dvc_data/index/__init__.py: -------------------------------------------------------------------------------- 1 | from .add import add # noqa: F401 2 | from .build import build # noqa: F401 3 | from .diff import diff # noqa: F401 4 | from .index import * # noqa: F403 5 | from .save import md5, save # noqa: F401 6 | from .serialize import ( 7 | read_db, # noqa: F401 8 | read_json, # noqa: F401 9 | write_db, # noqa: F401 10 | write_json, # noqa: F401 11 | ) 12 | from .update import update # noqa: F401 13 | from .view import DataIndexView, view # noqa: F401 14 | -------------------------------------------------------------------------------- /src/dvc_data/index/add.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Optional 2 | 3 | from .build import build_entries, build_entry 4 | from .index import FileStorage 5 | 6 | if TYPE_CHECKING: 7 | from dvc_objects.fs import FileSystem 8 | 9 | from dvc_data.hashfile._ignore import Ignore 10 | 11 | from .index import DataIndex, DataIndexKey 12 | 13 | 14 | def add( 15 | index: "DataIndex", 16 | path: str, 17 | fs: "FileSystem", 18 | key: "DataIndexKey", 19 | ignore: Optional["Ignore"] = None, 20 | ): 21 | entry = build_entry(path, fs) 22 | entry.key = key 23 | index.add(entry) 24 | 25 | index.storage_map.add_data(FileStorage(key=key, fs=fs, path=path)) 26 | 27 | if not fs.isdir(path): 28 | return 29 | 30 | for entry in build_entries(path, fs, ignore=ignore): 31 | assert entry.key is not None 32 | entry.key = (*key, *entry.key) 33 | index.add(entry) 34 | -------------------------------------------------------------------------------- /src/dvc_data/index/build.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Iterator 2 | from itertools import chain, repeat 3 | from typing import TYPE_CHECKING, Any, Optional 4 | 5 | from dvc_objects.fs.local import LocalFileSystem 6 | 7 | from dvc_data.hashfile.hash import DEFAULT_ALGORITHM, hash_file 8 | from dvc_data.hashfile.meta import Meta 9 | 10 | from .index 
import DataIndex, DataIndexEntry, FileStorage 11 | 12 | if TYPE_CHECKING: 13 | from dvc_objects.fs.base import FileSystem 14 | 15 | from dvc_data.hashfile._ignore import Ignore 16 | from dvc_data.hashfile.hash_info import HashInfo 17 | from dvc_data.hashfile.state import StateBase 18 | 19 | 20 | def build_entry( 21 | path: str, 22 | fs: "FileSystem", 23 | info: Optional[dict[str, Any]] = None, 24 | compute_hash: Optional[bool] = False, 25 | state: Optional["StateBase"] = None, 26 | hash_name: str = DEFAULT_ALGORITHM, 27 | ): 28 | if info is None: 29 | info = fs.info(path) 30 | 31 | if compute_hash and info["type"] != "directory": 32 | meta, hash_info = hash_file(path, fs, hash_name, state=state, info=info) 33 | else: 34 | meta, hash_info = Meta.from_info(info, fs.protocol), None 35 | 36 | return DataIndexEntry( 37 | meta=meta, 38 | hash_info=hash_info, 39 | loaded=meta.isdir or None, 40 | ) 41 | 42 | 43 | def safe_walk( 44 | path: str, 45 | fs: "FileSystem", 46 | ignore: Optional["Ignore"] = None, 47 | ) -> Iterator[tuple[str, dict[str, dict], dict[str, dict], set[str]]]: 48 | if not isinstance(fs, LocalFileSystem): 49 | for root, dirs, files in fs.walk(path, detail=True): 50 | yield root, dirs, files, set() 51 | 52 | return 53 | 54 | # NOTE: can't use detail=True with walk, because that will make it error 55 | # out on broken symlinks. 56 | sep = fs.sep 57 | walk_iter = ignore.walk(fs, path, detail=False) if ignore else fs.walk(path) 58 | for root, dirs, files in walk_iter: 59 | _dirs: dict[str, dict] = {} 60 | _files: dict[str, dict] = {} 61 | broken = set() 62 | 63 | for name, d in chain(zip(dirs, repeat(_dirs)), zip(files, repeat(_files))): 64 | p = f"{root}{sep}{name}" 65 | try: 66 | d[name] = fs.info(p) 67 | except FileNotFoundError: 68 | d[name] = {} 69 | broken.add(name) 70 | yield root, _dirs, _files, broken 71 | dirs[:] = list(_dirs) 72 | 73 | 74 | def build_entries( 75 | path: str, 76 | fs: "FileSystem", 77 | ignore: Optional["Ignore"] = None, 78 | compute_hash: Optional[bool] = False, 79 | state: Optional["StateBase"] = None, 80 | hash_name: str = DEFAULT_ALGORITHM, 81 | checksum_jobs: Optional[int] = None, 82 | ) -> Iterable[DataIndexEntry]: 83 | from dvc_data.hashfile.build import _get_hashes 84 | 85 | sep = fs.sep 86 | jobs = checksum_jobs or fs.hash_jobs 87 | for root, dirs, files, broken in safe_walk(path, fs, ignore=ignore): 88 | if root == path: 89 | root_key: tuple[str, ...] 
= () 90 | else: 91 | root_key = fs.relparts(root, path) 92 | 93 | hashes: dict[str, tuple[Meta, HashInfo, dict]] = {} 94 | if compute_hash: 95 | file_infos = { 96 | f"{root}{sep}{name}": info for name, info in files.items() if info 97 | } 98 | file_paths = list(file_infos) 99 | hashes = _get_hashes( 100 | file_paths, fs, hash_name, file_infos, state=state, jobs=jobs 101 | ) 102 | 103 | for name, info in chain(dirs.items(), files.items()): 104 | key = (*root_key, name) 105 | if name in broken: 106 | yield DataIndexEntry(key=key) 107 | continue 108 | 109 | p = f"{root}{sep}{name}" 110 | if p in hashes: 111 | meta, hash_info, _ = hashes[p] 112 | else: 113 | meta, hash_info = Meta.from_info(info, fs.protocol), None 114 | loaded = meta.isdir or None 115 | yield DataIndexEntry(key=key, meta=meta, hash_info=hash_info, loaded=loaded) 116 | 117 | 118 | def build(path: str, fs: "FileSystem", ignore: Optional["Ignore"] = None) -> DataIndex: 119 | index = DataIndex() 120 | 121 | index.storage_map.add_data(FileStorage(key=(), fs=fs, path=path)) 122 | 123 | for entry in build_entries(path, fs, ignore=ignore): 124 | index.add(entry) 125 | 126 | return index 127 | -------------------------------------------------------------------------------- /src/dvc_data/index/collect.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import TYPE_CHECKING, Optional 3 | 4 | from fsspec.callbacks import DEFAULT_CALLBACK 5 | 6 | from .index import DataIndex, DataIndexEntry, FileStorage, ObjectStorage, StorageInfo 7 | 8 | if TYPE_CHECKING: 9 | from fsspec import Callback 10 | 11 | from .index import Storage 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _collect_from_index( 17 | cache, 18 | cache_prefix, 19 | index, 20 | prefix, 21 | storage, 22 | callback: "Callback" = DEFAULT_CALLBACK, 23 | push: bool = False, 24 | ): 25 | entries = {} 26 | 27 | dir_keys = set() 28 | try: 29 | for _, entry in index.iteritems(prefix): 30 | callback.relative_update() 31 | try: 32 | storage_key = storage.get_key(entry) 33 | except ValueError: 34 | continue 35 | 36 | if entry.meta and entry.meta.isdir and entry.loaded is None: 37 | # NOTE: at this point it might not be loaded yet, so we can't 38 | # rely on entry.loaded 39 | dir_keys.add((entry.key, storage_key)) 40 | 41 | meta = entry.meta 42 | hash_info = entry.hash_info 43 | if ( 44 | not push 45 | and isinstance(storage, FileStorage) 46 | and storage.fs.version_aware 47 | and entry.meta 48 | and not entry.meta.isdir 49 | and entry.meta.version_id is None 50 | ): 51 | meta.md5 = None 52 | hash_info = None 53 | 54 | # NOTE: avoiding modifying cache right away, because you might 55 | # run into a locked database if idx and cache are using the same 56 | # table. 
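            # Entries are therefore buffered in the local `entries` dict and
            # flushed into `cache` in a single pass once iteration finishes.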
57 | entries[storage_key] = DataIndexEntry( 58 | key=storage_key, 59 | meta=meta, 60 | hash_info=hash_info, 61 | loaded=entry.loaded, 62 | ) 63 | 64 | except KeyError: 65 | return 66 | 67 | for key, storage_key in dir_keys: 68 | entries[storage_key].loaded = index[key].loaded 69 | 70 | for key, entry in entries.items(): 71 | cache[(*cache_prefix, *key)] = entry 72 | 73 | 74 | def collect( # noqa: C901, PLR0912, PLR0915 75 | idxs, 76 | storage, 77 | callback: "Callback" = DEFAULT_CALLBACK, 78 | cache_index=None, 79 | cache_key=None, 80 | push: bool = False, 81 | ) -> list["DataIndex"]: 82 | from fsspec.utils import tokenize 83 | 84 | storage_by_fs: dict[tuple[str, str], StorageInfo] = {} 85 | skip = set() 86 | 87 | if cache_index is None: 88 | cache_index = DataIndex() 89 | cache_key = () 90 | 91 | for idx in idxs: 92 | for prefix, storage_info in idx.storage_map.items(): 93 | data = getattr(storage_info, storage) 94 | cache = storage_info.cache if storage != "cache" else None 95 | remote = storage_info.remote if storage != "remote" else None 96 | 97 | if not data or (push and data.read_only): 98 | continue 99 | 100 | try: 101 | fsid = data.fs.fsid 102 | except (NotImplementedError, AttributeError): 103 | fsid = data.fs.protocol 104 | except BaseException: # noqa: BLE001 105 | logger.debug("skipping index collection for data with invalid fsid") 106 | continue 107 | 108 | key = (fsid, tokenize(data.path)) 109 | 110 | if key not in storage_by_fs and cache_index.has_node((*cache_key, *key)): 111 | skip.add(key) 112 | 113 | if key not in skip: 114 | _collect_from_index( 115 | cache_index, 116 | (*cache_key, *key), 117 | idx, 118 | prefix, 119 | data, 120 | callback=callback, 121 | push=push, 122 | ) 123 | cache_index.commit() 124 | 125 | if key not in storage_by_fs: 126 | fs_data: Storage 127 | fs_cache: Optional[Storage] 128 | fs_remote: Optional[Storage] 129 | 130 | if isinstance(data, ObjectStorage): 131 | fs_data = ObjectStorage(key=(), odb=data.odb) 132 | else: 133 | fs_data = FileStorage(key=(), fs=data.fs, path=data.path) 134 | 135 | if not cache: 136 | fs_cache = None 137 | elif isinstance(cache, ObjectStorage): 138 | fs_cache = ObjectStorage(key=(), odb=cache.odb) 139 | else: 140 | fs_cache = FileStorage(key=(), fs=cache.fs, path=cache.path) 141 | 142 | if not remote: 143 | fs_remote = None 144 | elif isinstance(remote, ObjectStorage): 145 | fs_remote = ObjectStorage(key=(), odb=remote.odb) 146 | else: 147 | fs_remote = FileStorage( 148 | key=(), 149 | fs=remote.fs, 150 | path=remote.path, 151 | ) 152 | 153 | storage_by_fs[key] = StorageInfo( 154 | data=fs_data, cache=fs_cache, remote=fs_remote 155 | ) 156 | 157 | storage_indexes = [] 158 | for key, storage_info in storage_by_fs.items(): 159 | idx = cache_index.view((*cache_key, *key)) 160 | idx.storage_map[()] = storage_info 161 | 162 | def _onerror(*args): 163 | pass 164 | 165 | idx.onerror = _onerror 166 | storage_indexes.append(idx) 167 | 168 | return storage_indexes 169 | -------------------------------------------------------------------------------- /src/dvc_data/index/diff.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import defaultdict, deque 3 | from collections.abc import Iterable 4 | from typing import TYPE_CHECKING, Any, Callable, Optional 5 | 6 | from attrs import define 7 | from fsspec.callbacks import DEFAULT_CALLBACK, Callback 8 | 9 | if TYPE_CHECKING: 10 | from dvc_data.hashfile.hash_info import HashInfo 11 | from dvc_data.hashfile.meta 
import Meta 12 | 13 | from .index import BaseDataIndex, DataIndexKey 14 | 15 | from .index import DataIndexDirError, DataIndexEntry 16 | 17 | ADD = "add" 18 | MODIFY = "modify" 19 | RENAME = "rename" 20 | DELETE = "delete" 21 | UNCHANGED = "unchanged" 22 | UNKNOWN = "unknown" 23 | 24 | 25 | @define(frozen=True, unsafe_hash=True, order=True) 26 | class Change: 27 | typ: str 28 | old: Optional[DataIndexEntry] 29 | new: Optional[DataIndexEntry] 30 | 31 | @property 32 | def key(self) -> "DataIndexKey": 33 | if self.typ == RENAME: 34 | raise ValueError 35 | 36 | if self.typ == ADD: 37 | entry = self.new 38 | elif self.typ == DELETE: 39 | entry = self.old 40 | else: 41 | entry = self.old or self.new 42 | 43 | assert entry 44 | assert entry.key is not None 45 | return entry.key 46 | 47 | def __bool__(self): 48 | return self.typ != UNCHANGED 49 | 50 | 51 | def _diff_meta( 52 | old: Optional["Meta"], 53 | new: Optional["Meta"], 54 | *, 55 | cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 56 | ): 57 | if old is None and new is not None: 58 | return ADD 59 | 60 | if old is not None and new is None: 61 | return DELETE 62 | 63 | if cmp_key is None and old != new: 64 | return MODIFY 65 | 66 | if cmp_key is not None and cmp_key(old) != cmp_key(new): 67 | return MODIFY 68 | 69 | return UNCHANGED 70 | 71 | 72 | def _diff_hash_info( 73 | old: Optional["HashInfo"], 74 | new: Optional["HashInfo"], 75 | ): 76 | if not old and new: 77 | return ADD 78 | 79 | if old and not new: 80 | return DELETE 81 | 82 | if old and new and old != new: 83 | return MODIFY 84 | 85 | return UNCHANGED 86 | 87 | 88 | def _diff_entry( # noqa: PLR0911 89 | old: Optional["DataIndexEntry"], 90 | new: Optional["DataIndexEntry"], 91 | *, 92 | hash_only: Optional[bool] = False, 93 | meta_only: Optional[bool] = False, 94 | meta_cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 95 | unknown: Optional[bool] = False, 96 | ): 97 | if unknown: 98 | return UNKNOWN 99 | 100 | old_hi = old.hash_info if old else None 101 | new_hi = new.hash_info if new else None 102 | old_meta = old.meta if old else None 103 | new_meta = new.meta if new else None 104 | 105 | meta_diff = _diff_meta(old_meta, new_meta, cmp_key=meta_cmp_key) 106 | hi_diff = _diff_hash_info(old_hi, new_hi) 107 | 108 | if old is None and new is not None: 109 | entry_diff = ADD 110 | elif old is not None and new is None: 111 | entry_diff = DELETE 112 | else: 113 | entry_diff = UNCHANGED 114 | 115 | if meta_only: 116 | return meta_diff 117 | 118 | if hash_only: 119 | return hi_diff 120 | 121 | if entry_diff != UNCHANGED: 122 | return entry_diff 123 | 124 | # If both meta's are None, return hi_diff 125 | if meta_diff == UNCHANGED and old_meta is None: 126 | return hi_diff 127 | 128 | # If both hi's are falsey, return meta_diff 129 | if hi_diff == UNCHANGED and not old_hi: 130 | return meta_diff 131 | 132 | # Only return UNCHANGED/ADD/DELETE when hi_diff and meta_diff match, 133 | # otherwise return MODIFY 134 | if meta_diff == hi_diff == entry_diff: 135 | return meta_diff 136 | 137 | return MODIFY 138 | 139 | 140 | def _get_items( 141 | index: Optional["BaseDataIndex"], 142 | key, 143 | entry, 144 | *, 145 | shallow=False, 146 | with_unknown=False, 147 | ): 148 | items = {} 149 | unknown = False 150 | 151 | try: 152 | if index is not None and not (shallow and entry and entry.hash_info): 153 | items = dict(index.ls(key, detail=True)) 154 | except KeyError: 155 | pass 156 | except DataIndexDirError: 157 | unknown = with_unknown 158 | 159 | return items, unknown 160 | 
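# A minimal usage sketch (illustrative only, not part of this module): the
# public `diff()` generator defined below yields `Change` objects, so a caller
# that already has two built indexes -- assumed here to be named `old_index`
# and `new_index` -- could summarize a comparison roughly like this:
#
#     from collections import Counter
#
#     from dvc_data.index.diff import ADD, DELETE, MODIFY, diff
#
#     counts = Counter(change.typ for change in diff(old_index, new_index))
#     print(counts[ADD], counts[MODIFY], counts[DELETE])
#
# Passing `with_renames=True` additionally folds matching ADD/DELETE pairs
# (same `hash_info`) into RENAME changes via `_detect_renames()` below.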
161 | 162 | def _diff( # noqa: C901 163 | old: Optional["BaseDataIndex"], 164 | new: Optional["BaseDataIndex"], 165 | *, 166 | with_unchanged: Optional[bool] = False, 167 | with_unknown: Optional[bool] = False, 168 | hash_only: Optional[bool] = False, 169 | meta_only: Optional[bool] = False, 170 | meta_cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 171 | shallow: Optional[bool] = False, 172 | callback: Callback = DEFAULT_CALLBACK, 173 | roots: Optional[Iterable["DataIndexKey"]] = None, 174 | ): 175 | roots = roots or [()] 176 | todo: deque[tuple[dict, dict, bool]] = deque() 177 | 178 | for root in roots: 179 | old_root_items = {} 180 | new_root_items = {} 181 | 182 | if old is not None: 183 | try: 184 | old_root_items[root] = old.info(root) 185 | except KeyError: 186 | pass 187 | 188 | if new is not None: 189 | try: 190 | new_root_items[root] = new.info(root) 191 | except KeyError: 192 | pass 193 | 194 | todo.append((old_root_items, new_root_items, False)) 195 | 196 | while todo: 197 | old_items, new_items, unknown = todo.popleft() 198 | for key in callback.wrap(old_items.keys() | new_items.keys()): 199 | old_info = old_items.get(key) or {} 200 | new_info = new_items.get(key) or {} 201 | 202 | old_entry = old_info.get("entry") 203 | new_entry = new_info.get("entry") 204 | 205 | typ = _diff_entry( 206 | old_entry, 207 | new_entry, 208 | hash_only=hash_only, 209 | meta_only=meta_only, 210 | meta_cmp_key=meta_cmp_key, 211 | unknown=unknown, 212 | ) 213 | 214 | if ( 215 | hash_only 216 | and not with_unchanged 217 | and not unknown 218 | and typ == UNCHANGED 219 | and old_entry 220 | and old_entry.hash_info 221 | and old_entry.hash_info.isdir 222 | ): 223 | # NOTE: skipping the whole branch since we know it is unchanged 224 | pass 225 | elif ( 226 | old_info.get("type") == "directory" 227 | or new_info.get("type") == "directory" 228 | ): 229 | kwargs = {"shallow": shallow, "with_unknown": with_unknown} 230 | old_dir_items, old_unknown = _get_items(old, key, old_entry, **kwargs) 231 | new_dir_items, new_unknown = _get_items(new, key, new_entry, **kwargs) 232 | dir_unknown = old_unknown or new_unknown 233 | todo.append((old_dir_items, new_dir_items, dir_unknown)) 234 | 235 | if old_entry is None and new_entry is None: 236 | continue 237 | 238 | if typ == UNCHANGED and not with_unchanged: 239 | continue 240 | 241 | yield Change(typ, old_entry, new_entry) 242 | 243 | 244 | def _detect_renames(changes: Iterable[Change]): 245 | added: list[Change] = [] 246 | deleted: list[Change] = [] 247 | 248 | for change in changes: 249 | if change.typ == ADD: 250 | added.append(change) 251 | elif change.typ == DELETE: 252 | deleted.append(change) 253 | else: 254 | yield change 255 | 256 | def _get_key(change): 257 | return change.key 258 | 259 | added.sort(key=_get_key) 260 | deleted.sort(key=_get_key) 261 | 262 | # Create a dictionary for fast lookup of deletions by hash_info 263 | deleted_dict: dict[Optional[HashInfo], deque[Change]] = defaultdict(deque) 264 | for deletion in deleted: 265 | change_hash = deletion.old.hash_info if deletion.old else None 266 | # appendleft to get queue behaviour (we pop off right) 267 | deleted_dict[change_hash].appendleft(deletion) 268 | 269 | for addition in added: 270 | new_hash_info = addition.new.hash_info if addition.new else None 271 | 272 | # If the new entry is the same as a deleted change, 273 | # it is in fact a rename. 274 | # Note: get instead of __getitem__, to avoid creating 275 | # unnecessary entries. 
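# (`deleted_dict` is a defaultdict, so plain indexing would insert an
# empty deque as a side effect even when there is no match.)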
276 | if new_hash_info and (queue := deleted_dict.get(new_hash_info)): 277 | deletion = queue.pop() 278 | 279 | yield Change( 280 | RENAME, 281 | deletion.old, 282 | addition.new, 283 | ) 284 | else: 285 | yield addition 286 | 287 | # Yield the remaining unmatched deletions 288 | if deleted_dict: 289 | yield from itertools.chain.from_iterable(deleted_dict.values()) 290 | 291 | 292 | def diff( # noqa: PLR0913 293 | old: Optional["BaseDataIndex"], 294 | new: Optional["BaseDataIndex"], 295 | *, 296 | with_renames: Optional[bool] = False, 297 | with_unchanged: Optional[bool] = False, 298 | with_unknown: Optional[bool] = False, 299 | hash_only: Optional[bool] = False, 300 | meta_only: Optional[bool] = False, 301 | meta_cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 302 | shallow: Optional[bool] = False, 303 | callback: Callback = DEFAULT_CALLBACK, 304 | roots: Optional[Iterable["DataIndexKey"]] = None, 305 | ): 306 | changes = _diff( 307 | old, 308 | new, 309 | with_unchanged=with_unchanged, 310 | with_unknown=with_unknown, 311 | hash_only=hash_only, 312 | meta_only=meta_only, 313 | meta_cmp_key=meta_cmp_key, 314 | shallow=shallow, 315 | callback=callback, 316 | roots=roots, 317 | ) 318 | 319 | if with_renames and old is not None and new is not None: 320 | assert not meta_only 321 | yield from _detect_renames(changes) 322 | else: 323 | yield from changes 324 | -------------------------------------------------------------------------------- /src/dvc_data/index/fetch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import closing 3 | from functools import partial 4 | from typing import TYPE_CHECKING, Optional 5 | 6 | from dvc_objects.fs.local import LocalFileSystem 7 | from fsspec.callbacks import DEFAULT_CALLBACK 8 | 9 | from dvc_data.callbacks import TqdmCallback 10 | from dvc_data.hashfile.db import get_index 11 | from dvc_data.hashfile.meta import Meta 12 | from dvc_data.hashfile.transfer import transfer 13 | 14 | from .build import build 15 | from .checkout import apply, compare 16 | from .collect import collect # noqa: F401 17 | from .index import DataIndex, ObjectStorage 18 | from .save import md5, save 19 | 20 | if TYPE_CHECKING: 21 | from fsspec import Callback 22 | 23 | from dvc_data.hashfile.status import CompareStatusResult 24 | 25 | from .index import DataIndexKey 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def _log_missing(status: "CompareStatusResult"): 31 | if status.missing: 32 | missing_desc = "\n".join(f"{hash_info}" for hash_info in status.missing) 33 | logger.warning( 34 | "Some of the cache files do not exist neither locally " 35 | "nor on remote. 
Missing cache files:\n%s", 36 | missing_desc, 37 | ) 38 | 39 | 40 | def _onerror(data, cache, failed_keys, src_path, dest_path, exc): 41 | if not isinstance(exc, FileNotFoundError) or data.fs.exists(src_path): 42 | failed_keys.add(cache.fs.relparts(dest_path, cache.path)) 43 | 44 | logger.debug( 45 | "failed to create '%s' from '%s'", 46 | src_path, 47 | dest_path, 48 | exc_info=True, 49 | ) 50 | 51 | 52 | def _filter_changed(index): 53 | ret = DataIndex() 54 | ret.storage_map = index.storage_map 55 | 56 | for _, entry in index.items(): 57 | if entry.meta and entry.meta.isdir: 58 | ret.add(entry) 59 | continue 60 | 61 | if not entry.meta or entry.meta.version_id: 62 | ret.add(entry) 63 | continue 64 | 65 | try: 66 | data_fs, data_path = index.storage_map.get_data(entry) 67 | except ValueError: 68 | continue 69 | 70 | try: 71 | info = data_fs.info(data_path) 72 | except FileNotFoundError: 73 | continue 74 | 75 | if getattr(data_fs, "immutable", None): 76 | ret.add(entry) 77 | continue 78 | 79 | meta = Meta.from_info(info) 80 | old = getattr(entry.meta, data_fs.PARAM_CHECKSUM, None) if entry.meta else None 81 | new = getattr(meta, data_fs.PARAM_CHECKSUM, None) 82 | 83 | if old and new is None and isinstance(data_fs, LocalFileSystem): 84 | # NOTE: temporary ugly hack to handle local sources where 85 | # the only thing we currently have is md5. 86 | from dvc_data.hashfile.hash import hash_file 87 | 88 | _, hi = hash_file(data_path, data_fs, "md5") 89 | new = hi.value 90 | 91 | if old and new and old == new: 92 | ret.add(entry) 93 | 94 | return ret 95 | 96 | 97 | def fetch( 98 | idxs, 99 | callback: "Callback" = DEFAULT_CALLBACK, 100 | jobs: Optional[int] = None, 101 | ): 102 | fetched, failed = 0, 0 103 | for fs_index in idxs: 104 | data = fs_index.storage_map[()].data 105 | cache = fs_index.storage_map[()].cache 106 | 107 | if callback != DEFAULT_CALLBACK: 108 | cb = TqdmCallback( 109 | unit="file", 110 | total=len(fs_index), 111 | desc=f"Fetching from {data.fs.protocol}", 112 | ) 113 | else: 114 | cb = callback 115 | 116 | try: 117 | # NOTE: make sure there are no auth errors 118 | data.fs.exists(data.path) 119 | except Exception: 120 | failed += len(fs_index) 121 | logger.exception( 122 | "failed to connect to %s (%s)", data.fs.protocol, data.path 123 | ) 124 | continue 125 | 126 | with cb: 127 | if isinstance(cache, ObjectStorage) and isinstance(data, ObjectStorage): 128 | with closing(get_index(data.odb)) as src_index: 129 | result = transfer( 130 | data.odb, 131 | cache.odb, 132 | [ 133 | entry.hash_info 134 | for _, entry in fs_index.iteritems() 135 | if entry.hash_info 136 | ], 137 | jobs=jobs, 138 | src_index=src_index, 139 | cache_odb=cache.odb, 140 | verify=data.odb.verify, 141 | validate_status=_log_missing, 142 | callback=cb, 143 | ) 144 | fetched += len(result.transferred) 145 | failed += len(result.failed) 146 | elif isinstance(cache, ObjectStorage): 147 | updated = md5(fs_index) 148 | 149 | def _on_error(failed, oid, exc): 150 | if isinstance(exc, FileNotFoundError): 151 | return 152 | failed += 1 153 | logger.debug( 154 | "failed to transfer '%s'", 155 | oid, 156 | exc_info=True, 157 | ) 158 | 159 | fetched += save( 160 | updated, 161 | jobs=jobs, 162 | callback=cb, 163 | on_error=partial(_on_error, failed), 164 | ) 165 | else: 166 | old = build(cache.path, cache.fs) 167 | filtered = _filter_changed(fs_index) 168 | diff = compare(old, filtered) 169 | cache.fs.makedirs(cache.fs.parent(cache.path), exist_ok=True) 170 | 171 | failed_keys: set[DataIndexKey] = set() 172 | apply( 
173 | diff, 174 | cache.path, 175 | cache.fs, 176 | update_meta=False, 177 | storage="data", 178 | jobs=jobs, 179 | callback=cb, 180 | onerror=partial(_onerror, data, cache, failed_keys), 181 | ) 182 | 183 | added_keys = {entry.key for entry in diff.files_create} 184 | fetched += len(added_keys - failed_keys) 185 | failed += len(failed_keys) 186 | 187 | return fetched, failed 188 | -------------------------------------------------------------------------------- /src/dvc_data/index/push.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import closing 3 | from functools import partial 4 | from typing import TYPE_CHECKING, Any, Optional 5 | 6 | from fsspec.callbacks import DEFAULT_CALLBACK 7 | 8 | from dvc_data.callbacks import TqdmCallback 9 | from dvc_data.hashfile.db import get_index 10 | from dvc_data.hashfile.transfer import transfer 11 | 12 | from .build import build 13 | from .checkout import _prune_existing_versions, apply, compare 14 | from .fetch import _log_missing 15 | from .index import DataIndex, ObjectStorage 16 | 17 | if TYPE_CHECKING: 18 | from dvc_objects.fs import FileSystem 19 | from fsspec import Callback 20 | 21 | from dvc_data.hashfile.meta import Meta 22 | 23 | from .index import DataIndexKey 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | # for files, if our version's checksum (etag) matches the latest remote 29 | # checksum, we do not need to push, even if the version IDs don't match 30 | def _meta_checksum(fs: "FileSystem", meta: "Meta") -> Any: 31 | if not meta or meta.isdir: 32 | return meta 33 | assert fs.PARAM_CHECKSUM 34 | return getattr(meta, fs.PARAM_CHECKSUM) 35 | 36 | 37 | def _onerror(cache, data, failed_keys, src_path, dest_path, exc): 38 | if not isinstance(exc, FileNotFoundError) or cache.fs.exists(src_path): 39 | failed_keys.add(data.fs.relparts(dest_path, data.path)) 40 | 41 | logger.debug( 42 | "failed to create '%s' from '%s'", 43 | src_path, 44 | dest_path, 45 | exc_info=True, 46 | ) 47 | 48 | 49 | def _filter_missing(index): 50 | ret = DataIndex() 51 | ret.storage_map = index.storage_map 52 | 53 | for _, entry in index.items(): 54 | try: 55 | cache_fs, cache_path = index.storage_map.get_cache(entry) 56 | except ValueError: 57 | continue 58 | 59 | if cache_fs.exists(cache_path): 60 | ret.add(entry) 61 | 62 | return ret 63 | 64 | 65 | def push( 66 | idxs, 67 | callback: "Callback" = DEFAULT_CALLBACK, 68 | jobs: Optional[int] = None, 69 | ): 70 | pushed, failed = 0, 0 71 | for fs_index in idxs: 72 | data = fs_index.storage_map[()].data 73 | cache = fs_index.storage_map[()].cache 74 | 75 | if isinstance(cache, ObjectStorage) and isinstance(data, ObjectStorage): 76 | with TqdmCallback(unit="file", desc=f"Pushing to {data.fs.protocol}") as cb: 77 | with closing(get_index(data.odb)) as dest_index: 78 | result = transfer( 79 | cache.odb, 80 | data.odb, 81 | [ 82 | entry.hash_info 83 | for _, entry in fs_index.iteritems() 84 | if entry.hash_info 85 | ], 86 | jobs=jobs, 87 | dest_index=dest_index, 88 | cache_odb=data.odb, 89 | validate_status=_log_missing, 90 | callback=cb, 91 | ) 92 | pushed += len(result.transferred) 93 | failed += len(result.failed) 94 | else: 95 | old = build(data.path, data.fs) 96 | 97 | existing_fs_index = _filter_missing(fs_index) 98 | diff = compare( 99 | old, 100 | existing_fs_index, 101 | meta_only=True, 102 | meta_cmp_key=partial(_meta_checksum, data.fs), 103 | ) 104 | data.fs.makedirs(data.fs.parent(data.path), exist_ok=True) 105 | 106 | 
failed_keys: set[DataIndexKey] = set() 107 | 108 | if data.fs.version_aware: 109 | desc = f"Checking status of existing versions in {data.path!r}" 110 | with TqdmCallback(desc=desc, unit="file") as cb: 111 | diff.files_create = list( 112 | _prune_existing_versions( 113 | diff.files_create, data.fs, data.path, callback=cb 114 | ) 115 | ) 116 | 117 | with TqdmCallback(unit="file", desc=f"Pushing to {data.fs.protocol}") as cb: 118 | cb.set_size(len(diff.files_create)) 119 | apply( 120 | diff, 121 | data.path, 122 | data.fs, 123 | update_meta=False, 124 | storage="cache", 125 | jobs=jobs, 126 | callback=cb, 127 | links=["reflink", "copy"], 128 | onerror=partial(_onerror, cache, data, failed_keys), 129 | ) 130 | 131 | added_keys = {entry.key for entry in diff.files_create} 132 | pushed += len(added_keys - failed_keys) 133 | failed += len(failed_keys) 134 | 135 | return pushed, failed 136 | -------------------------------------------------------------------------------- /src/dvc_data/index/save.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import TYPE_CHECKING, Optional 3 | 4 | from fsspec.callbacks import DEFAULT_CALLBACK 5 | 6 | from dvc_data.hashfile.hash import DEFAULT_ALGORITHM, hash_file 7 | from dvc_data.hashfile.meta import Meta 8 | from dvc_data.hashfile.tree import Tree 9 | 10 | if TYPE_CHECKING: 11 | from dvc_objects.fs.base import FileSystem 12 | from fsspec import Callback 13 | 14 | from dvc_data.hashfile.db import HashFileDB 15 | from dvc_data.hashfile.state import StateBase 16 | 17 | from .index import BaseDataIndex, DataIndex, DataIndexKey 18 | 19 | 20 | def _meta_matches(fs, path, old_meta): 21 | try: 22 | info = fs.info(path) 23 | except FileNotFoundError: 24 | return False 25 | 26 | if getattr(fs, "immutable", False): 27 | return True 28 | 29 | new_meta = Meta.from_info(info, fs.protocol) 30 | old = getattr(old_meta, fs.PARAM_CHECKSUM, None) if old_meta else None 31 | new = getattr(new_meta, fs.PARAM_CHECKSUM, None) 32 | if not old or not new: 33 | return None 34 | 35 | return old == new 36 | 37 | 38 | def md5( 39 | index: "BaseDataIndex", 40 | state: Optional["StateBase"] = None, 41 | storage: str = "data", 42 | name: str = DEFAULT_ALGORITHM, 43 | ) -> "DataIndex": 44 | from .index import DataIndex, DataIndexEntry 45 | 46 | ret = DataIndex() 47 | 48 | for _, entry in index.iteritems(): 49 | if entry.meta and entry.meta.isdir: 50 | ret.add(entry) 51 | continue 52 | 53 | hash_info = None 54 | if entry.hash_info and entry.hash_info.name in ("md5", "md5-dos2unix"): 55 | hash_info = entry.hash_info 56 | 57 | try: 58 | fs, path = index.storage_map.get_storage(entry, storage) 59 | except ValueError: 60 | continue 61 | 62 | matches = _meta_matches(fs, path, entry.meta) 63 | if matches: 64 | ret.add(entry) 65 | elif matches is not None: 66 | continue 67 | 68 | try: 69 | _, hi = hash_file(path, fs, name, state=state) 70 | except FileNotFoundError: 71 | continue 72 | 73 | if hash_info and hi != hash_info: 74 | continue 75 | 76 | ret.add( 77 | DataIndexEntry( 78 | key=entry.key, 79 | meta=entry.meta, 80 | hash_info=hi, 81 | ) 82 | ) 83 | 84 | ret.storage_map = index.storage_map 85 | return ret 86 | 87 | 88 | def build_tree( 89 | index: "BaseDataIndex", 90 | prefix: "DataIndexKey", 91 | name: str = DEFAULT_ALGORITHM, 92 | ) -> tuple["Meta", Tree]: 93 | tree_meta = Meta(size=0, nfiles=0, isdir=True) 94 | assert tree_meta.size is not None 95 | assert tree_meta.nfiles is not None 96 | tree = Tree() 
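# Aggregate every non-directory entry under `prefix` into the tree,
# accumulating the directory's total size and file count as we go.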
97 | for key, entry in index.iteritems(prefix=prefix): 98 | if key == prefix or (entry.meta and entry.meta.isdir): 99 | continue 100 | tree_key = key[len(prefix) :] 101 | tree.add(tree_key, entry.meta, entry.hash_info) 102 | tree_meta.size += (entry.meta.size if entry.meta else 0) or 0 103 | tree_meta.nfiles += 1 104 | tree.digest(name=name) 105 | return tree_meta, tree 106 | 107 | 108 | def _save_dir_entry( 109 | index: "BaseDataIndex", 110 | key: "DataIndexKey", 111 | odb: Optional["HashFileDB"] = None, 112 | ) -> None: 113 | from dvc_data.hashfile.db import add_update_tree 114 | 115 | from .index import StorageKeyError 116 | 117 | entry = index[key] 118 | 119 | try: 120 | cache = odb or index.storage_map.get_cache_odb(entry) 121 | except StorageKeyError: 122 | return 123 | 124 | assert cache 125 | meta, tree = build_tree(index, key) 126 | tree = add_update_tree(cache, tree) 127 | entry.meta = meta 128 | entry.hash_info = tree.hash_info 129 | assert tree.hash_info.name 130 | assert tree.hash_info.value 131 | setattr(entry.meta, tree.hash_info.name, tree.hash_info.value) 132 | 133 | 134 | if TYPE_CHECKING: 135 | _ODBMap = dict["HashFileDB", "_FSMap"] 136 | _FSMap = dict["FileSystem", list[tuple[str, str]]] 137 | 138 | 139 | def save( 140 | index: "BaseDataIndex", 141 | odb: Optional["HashFileDB"] = None, 142 | callback: "Callback" = DEFAULT_CALLBACK, 143 | jobs: Optional[int] = None, 144 | storage: str = "data", 145 | **kwargs, 146 | ) -> int: 147 | dir_entries: list[DataIndexKey] = [] 148 | transferred = 0 149 | 150 | odb_map: _ODBMap = {} 151 | for key, entry in index.iteritems(): 152 | if entry.meta and entry.meta.isdir: 153 | dir_entries.append(key) 154 | continue 155 | 156 | try: 157 | fs, path = index.storage_map.get_storage(entry, storage) 158 | except ValueError: 159 | continue 160 | 161 | if entry.hash_info: 162 | cache = odb or index.storage_map.get_cache_odb(entry) 163 | assert cache 164 | assert entry.hash_info.value 165 | oid = entry.hash_info.value 166 | if cache not in odb_map: 167 | odb_map[cache] = defaultdict(list) 168 | odb_map[cache][fs].append((path, oid)) 169 | for cache, fs_map in odb_map.items(): 170 | for fs, args in fs_map.items(): 171 | paths, oids = zip(*args) 172 | transferred += cache.add( 173 | list(paths), 174 | fs, 175 | list(oids), 176 | callback=callback, 177 | batch_size=jobs, 178 | **kwargs, 179 | ) 180 | 181 | for key in dir_entries: 182 | _save_dir_entry(index, key, odb=odb) 183 | 184 | return transferred 185 | -------------------------------------------------------------------------------- /src/dvc_data/index/serialize.py: -------------------------------------------------------------------------------- 1 | import json 2 | from contextlib import closing 3 | 4 | from dvc_data.hashfile.cache import Cache 5 | 6 | from .index import DataIndex, DataIndexEntry 7 | 8 | 9 | def write_db(index: DataIndex, path: str) -> None: 10 | cache = Cache(path) 11 | with closing(cache), cache.transact(): 12 | for key, entry in index.iteritems(): 13 | cache["/".join(key)] = entry.to_dict() 14 | 15 | 16 | def read_db(path: str) -> DataIndex: 17 | index = DataIndex() 18 | cache = Cache(path) 19 | 20 | with closing(cache), cache.transact(): 21 | for key in cache: 22 | value = cache.get(key) 23 | entry = DataIndexEntry.from_dict(value) 24 | entry.key = tuple(key.split("/")) 25 | index.add(entry) 26 | 27 | return index 28 | 29 | 30 | def write_json(index: DataIndex, path: str) -> None: 31 | with open(path, "w", encoding="utf-8") as fobj: 32 | json.dump( 33 | {"/".join(key): 
entry.to_dict() for key, entry in index.iteritems()}, 34 | fobj, 35 | ) 36 | 37 | 38 | def read_json(path: str) -> DataIndex: 39 | index = DataIndex() 40 | 41 | with open(path, encoding="utf-8") as fobj: 42 | for key, value in json.load(fobj).items(): 43 | entry = DataIndexEntry.from_dict(value) 44 | entry.key = tuple(key.split("/")) 45 | index.add(entry) 46 | 47 | return index 48 | -------------------------------------------------------------------------------- /src/dvc_data/index/update.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from .diff import UNCHANGED, diff 4 | 5 | if TYPE_CHECKING: 6 | from .index import BaseDataIndex, DataIndex 7 | 8 | 9 | def update(new: "DataIndex", old: "BaseDataIndex") -> None: 10 | for change in diff(old, new, with_unchanged=True, meta_only=True): 11 | if change.typ == UNCHANGED: 12 | change.new.hash_info = change.old.hash_info 13 | -------------------------------------------------------------------------------- /src/dvc_data/index/view.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from collections.abc import Iterator 3 | from typing import TYPE_CHECKING, Any, Callable, Optional 4 | 5 | from .index import BaseDataIndex, DataIndex, DataIndexEntry, DataIndexKey 6 | 7 | if TYPE_CHECKING: 8 | from .index import StorageMapping 9 | 10 | 11 | class DataIndexView(BaseDataIndex): 12 | def __init__( 13 | self, 14 | index: DataIndex, 15 | filter_fn: Callable[[DataIndexKey], bool], 16 | ): 17 | self._index = index 18 | self.filter_fn = filter_fn 19 | 20 | @property 21 | def onerror(self): 22 | return self._index.onerror 23 | 24 | @onerror.setter 25 | def onerror(self, onerror): 26 | self._index.onerror = onerror 27 | 28 | @property 29 | def storage_map(self) -> "StorageMapping": # type: ignore[override] 30 | return self._index.storage_map 31 | 32 | def __setitem__(self, key, value): 33 | if self.filter_fn(key): 34 | self._index[key] = value 35 | else: 36 | raise KeyError 37 | 38 | def __getitem__(self, key: DataIndexKey) -> DataIndexEntry: 39 | if key == () or self.filter_fn(key): 40 | return self._index[key] 41 | raise KeyError 42 | 43 | def __delitem__(self, key: DataIndexKey): 44 | if self.filter_fn(key): 45 | del self._index[key] 46 | else: 47 | raise KeyError 48 | 49 | def __iter__(self) -> Iterator[DataIndexKey]: 50 | return (key for key, _ in self._iteritems()) 51 | 52 | def __len__(self): 53 | return len(list(iter(self))) 54 | 55 | def _iteritems( 56 | self, 57 | prefix: Optional[DataIndexKey] = None, 58 | shallow: bool = False, 59 | ensure_loaded: bool = False, 60 | ) -> Iterator[tuple[DataIndexKey, DataIndexEntry]]: 61 | # NOTE: iteration is implemented using traverse and not iter/iteritems 62 | # since it supports skipping subtrie traversal for prefixes that are 63 | # not in the view. 
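# For example, with a view built as
# `view(index, lambda key: key[:1] == ("data",))` (an illustrative filter,
# not taken from this repo), traverse() can prune every subtrie outside the
# ("data",) prefix instead of first yielding its keys and then discarding them.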
64 | 65 | class _FilterNode: 66 | def __init__(self, key, children, *args): 67 | self.key = key 68 | self.children = children 69 | self.value = args[0] if args else None 70 | 71 | def build(self, stack): 72 | if not self.key or not shallow: 73 | for child in self.children: 74 | stack.append(child) 75 | return self.key, self.value 76 | 77 | def _node_factory(_, key, children, *args) -> Optional[_FilterNode]: 78 | return _FilterNode(key, children, *args) 79 | 80 | kwargs = {"prefix": prefix} if prefix is not None else {} 81 | stack = deque([self.traverse(_node_factory, **kwargs)]) 82 | while stack: 83 | node = stack.popleft() 84 | if node is not None: 85 | key, value = node.build(stack) 86 | if key and value: 87 | yield key, value 88 | if ensure_loaded: 89 | yield from self._load_dir_keys(key, value, shallow=shallow) 90 | 91 | def _load_dir_keys( 92 | self, 93 | prefix: DataIndexKey, 94 | entry: Optional[DataIndexEntry], 95 | shallow: Optional[bool] = False, 96 | ) -> Iterator[tuple[DataIndexKey, DataIndexEntry]]: 97 | # NOTE: traverse() will not enter subtries that have been added 98 | # in-place during traversal. So for dirs which we load in-place, we 99 | # need to iterate over the new keys ourselves. 100 | if ( 101 | entry is not None 102 | and entry.hash_info 103 | and entry.hash_info.isdir 104 | and not entry.loaded 105 | ): 106 | self._index._load(prefix, entry) 107 | if not shallow: 108 | for key, val in self._index.iteritems(entry.key): 109 | if key != prefix and self.filter_fn(key): 110 | yield key, val 111 | 112 | def iteritems( 113 | self, 114 | prefix: Optional[DataIndexKey] = None, 115 | shallow: bool = False, 116 | ) -> Iterator[tuple[DataIndexKey, DataIndexEntry]]: 117 | return self._iteritems(prefix=prefix, shallow=shallow, ensure_loaded=True) 118 | 119 | def traverse(self, node_factory: Callable, **kwargs) -> Any: 120 | def _node_factory(path_conv, key, children, *args): 121 | if not key or self.filter_fn(key): 122 | return node_factory(path_conv, key, children, *args) 123 | return None 124 | 125 | return self._index.traverse(_node_factory, **kwargs) 126 | 127 | def ls(self, root_key: DataIndexKey, detail=True): 128 | self._index._ensure_loaded(root_key) 129 | 130 | if detail: 131 | yield from ( 132 | (key, self._index._info_from_entry(key, entry)) 133 | for key, entry in self._index._trie.ls(root_key, with_values=True) 134 | if self.filter_fn(key) 135 | ) 136 | else: 137 | yield from filter(self.filter_fn, self._index.ls(root_key, detail=False)) 138 | 139 | def has_node(self, key: DataIndexKey) -> bool: 140 | return self.filter_fn(key) and self._index.has_node(key) 141 | 142 | def delete_node(self, key: DataIndexKey) -> None: 143 | if not self.filter_fn(key): 144 | raise KeyError 145 | self._index.delete_node(key) 146 | 147 | def longest_prefix( 148 | self, key: DataIndexKey 149 | ) -> tuple[Optional[DataIndexKey], Optional[DataIndexEntry]]: 150 | if self.filter_fn(key): 151 | return self._index.longest_prefix(key) 152 | return (None, None) 153 | 154 | 155 | def view(index: DataIndex, filter_fn: Callable[[DataIndexKey], bool]) -> DataIndexView: 156 | """Return read-only filtered view of an index.""" 157 | return DataIndexView(index, filter_fn=filter_fn) 158 | -------------------------------------------------------------------------------- /src/dvc_data/json_compat.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | try: 5 | import orjson # type: ignore[import-not-found] 6 | except 
ImportError: 7 | 8 | def loads(data: str) -> Any: 9 | return json.loads(data) 10 | 11 | def dumps(data: Any) -> str: 12 | return json.dumps(data) 13 | else: 14 | 15 | def loads(data: str) -> Any: 16 | return orjson.loads(data) 17 | 18 | def dumps(data: Any) -> str: 19 | return orjson.dumps(data).decode("utf8") 20 | 21 | 22 | __all__ = ["dumps", "loads"] 23 | -------------------------------------------------------------------------------- /src/dvc_data/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/src/dvc_data/py.typed -------------------------------------------------------------------------------- /src/dvc_data/repo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | from dvc_objects.fs import localfs 5 | from dvc_objects.fs.base import FileSystem 6 | 7 | from .index import DataIndex 8 | 9 | 10 | class NotARepoError(Exception): 11 | pass 12 | 13 | 14 | class Repo: 15 | def __init__(self, root: str = "", fs: Optional[FileSystem] = None) -> None: 16 | fs = fs or localfs 17 | root = root or fs.getcwd() 18 | control_dir: str = os.getenv("DVC_DIR") or fs.join(root, ".dvc") 19 | 20 | if not fs.isdir(control_dir): 21 | raise NotARepoError(f"{root} is not a data repo.") 22 | 23 | self.fs = fs or localfs 24 | self.root = root 25 | self._control_dir = control_dir 26 | self._tmp_dir: str = fs.join(self._control_dir, "tmp") 27 | self._cache_dir = fs.join(self._control_dir, "cache") 28 | self._object_dir = fs.join(self._cache_dir, "files", "md5") 29 | 30 | self.index = DataIndex() 31 | 32 | @classmethod 33 | def discover( 34 | cls, 35 | start: str = ".", 36 | fs: Optional[FileSystem] = None, 37 | ) -> "Repo": 38 | remaining = start 39 | fs = fs or localfs 40 | path = start = fs.abspath(start) 41 | while remaining: 42 | try: 43 | return cls(path, fs) 44 | except NotARepoError: 45 | path, remaining = fs.split(path) 46 | raise NotARepoError(f"No data repository was found at {start}") 47 | 48 | @property 49 | def control_dir(self): 50 | return self._control_dir 51 | 52 | @property 53 | def tmp_dir(self): 54 | return self._tmp_dir 55 | 56 | @property 57 | def object_dir(self): 58 | return self._object_dir 59 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test suite for the dvc_data package.""" 2 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/tests/benchmarks/__init__.py -------------------------------------------------------------------------------- /tests/benchmarks/test_checkout.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from os import fspath 3 | from pathlib import Path 4 | from tempfile import TemporaryDirectory 5 | 6 | import pytest 7 | from dvc_objects.fs import localfs 8 | from dvc_objects.fs.generic import test_links as _test_links 9 | 10 | from dvc_data.cli import build, gentree, get_odb 11 | from dvc_data.hashfile.checkout import checkout 12 | from dvc_data.hashfile.state import State 13 | 14 | 15 | @pytest.fixture 16 | def 
repo(request, monkeypatch): 17 | """Create a dvc data repo within pytest'scache directory. 18 | The cache directory by default, is in the root of the repo, where reflink 19 | may be supported. 20 | """ 21 | cache = request.config.cache 22 | path = cache.mkdir("dvc_data_repo") 23 | with TemporaryDirectory(dir=path) as tmp_dir: 24 | monkeypatch.chdir(tmp_dir) 25 | path = Path(tmp_dir) 26 | (path / ".dvc").mkdir() 27 | yield path 28 | 29 | 30 | @pytest.mark.parametrize("link", ["reflink", "copy", "symlink", "hardlink"]) 31 | def test_checkout(repo, benchmark, link): 32 | fs_path = fspath(repo / "dataset") 33 | odb = get_odb(type=[link]) 34 | 35 | if not _test_links([link], localfs, odb.path, localfs, fs_path): 36 | pytest.skip(f"unsupported link type: {link}") 37 | 38 | gentree(repo / "dataset", 1000, "50Mb") 39 | obj = build(repo / "dataset", write=True) 40 | state = odb.state 41 | 42 | def setup(): 43 | for path in (state.tmp_dir, fs_path): 44 | try: 45 | shutil.rmtree(path) 46 | except FileNotFoundError: 47 | pass 48 | State(state.root_dir, state.tmp_dir, state.ignore) # recreate db 49 | 50 | assert benchmark.pedantic( 51 | checkout, 52 | setup=setup, 53 | args=(fs_path, localfs, obj, odb), 54 | kwargs={"state": state}, 55 | rounds=10, 56 | warmup_rounds=2, 57 | ) 58 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import dvc_objects 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def as_filesystem(): 7 | return dvc_objects.fs.as_filesystem 8 | -------------------------------------------------------------------------------- /tests/hashfile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/tests/hashfile/__init__.py -------------------------------------------------------------------------------- /tests/hashfile/test_build.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dvc_objects.fs.local import LocalFileSystem 4 | 5 | from dvc_data.hashfile.build import build 6 | from dvc_data.hashfile.db import HashFileDB 7 | from dvc_data.hashfile.hash_info import HashInfo 8 | from dvc_data.hashfile.meta import Meta 9 | from dvc_data.hashfile.tree import Tree 10 | 11 | 12 | def test_build_file(tmp_path): 13 | fs = LocalFileSystem() 14 | file = tmp_path / "foo" 15 | 16 | odb = HashFileDB(fs, os.fspath(tmp_path / ".dvc" / ".cache" / "files" / "md5")) 17 | 18 | fs.pipe({file: b"foo"}) 19 | 20 | _, meta, obj = build(odb, str(file), fs, "md5") 21 | assert meta.isdir is False 22 | assert meta.size == 3 23 | assert obj.hash_info == HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8") 24 | 25 | 26 | def test_build_directory(tmp_path): 27 | fs = LocalFileSystem() 28 | directory = tmp_path / "dir" 29 | directory.mkdir() 30 | 31 | odb = HashFileDB(fs, os.fspath(tmp_path / ".dvc" / ".cache" / "files" / "md5")) 32 | 33 | fs.pipe({directory / "foo": b"foo", directory / "bar": b"bar"}) 34 | 35 | _, meta, tree = build(odb, str(directory), fs, "md5") 36 | assert meta == Meta(isdir=True, size=6, nfiles=2) 37 | assert isinstance(tree, Tree) 38 | assert tree.hash_info == HashInfo("md5", "5ea40360f5b4ec688df672a4db9c17d1.dir") 39 | assert tree.as_list() == [ 40 | {"md5": "37b51d194a7513e45b56f6524f2d51f2", "relpath": "bar"}, 41 | {"md5": "acbd18db4cc2f85cedef654fccc4a4d8", "relpath": 
"foo"}, 42 | ] 43 | -------------------------------------------------------------------------------- /tests/hashfile/test_cache.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from os import fspath 3 | from typing import Any 4 | 5 | import pytest 6 | 7 | from dvc_data.hashfile.cache import Cache, DiskError, HashesCache 8 | 9 | 10 | def set_value(cache: Cache, key: str, value: Any) -> Any: 11 | cache[key] = value 12 | return cache[key] 13 | 14 | 15 | @pytest.mark.parametrize("disk_type", [None, "test"]) 16 | def test_pickle_protocol_error(tmp_path, disk_type): 17 | directory = tmp_path / "test" 18 | cache = Cache( 19 | fspath(directory), 20 | disk_pickle_protocol=pickle.HIGHEST_PROTOCOL + 1, 21 | type=disk_type, 22 | ) 23 | with pytest.raises(DiskError) as exc, cache as cache: 24 | set_value(cache, "key", ("value1", "value2")) 25 | assert exc.value.directory == fspath(directory) 26 | assert exc.value.type == "test" 27 | assert f"Could not open disk 'test' in {directory}" == str(exc.value) 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "proto_a, proto_b", 32 | [ 33 | (pickle.HIGHEST_PROTOCOL - 1, pickle.HIGHEST_PROTOCOL), 34 | (pickle.HIGHEST_PROTOCOL, pickle.HIGHEST_PROTOCOL - 1), 35 | ], 36 | ) 37 | def test_pickle_backwards_compat(tmp_path, proto_a, proto_b): 38 | with Cache( 39 | directory=fspath(tmp_path / "test"), 40 | disk_pickle_protocol=proto_a, 41 | ) as cache: 42 | set_value(cache, "key", ("value1", "value2")) 43 | with Cache( 44 | directory=fspath(tmp_path / "test"), 45 | disk_pickle_protocol=proto_b, 46 | ) as cache: 47 | assert cache["key"] == ("value1", "value2") 48 | set_value(cache, "key", ("value3", "value4")) 49 | assert cache["key"] == ("value3", "value4") 50 | 51 | 52 | def test_hashes_cache(tmp_path): 53 | with HashesCache(tmp_path / "test") as cache: 54 | assert cache.is_empty() 55 | assert cache.set("key", "value") 56 | assert not cache.is_empty() 57 | assert cache.get("key") == "value" 58 | assert cache.get("not-existing-key") is None 59 | 60 | 61 | def test_hashes_cache_many(tmp_path): 62 | with HashesCache(tmp_path / "test") as cache: 63 | assert cache.is_empty() 64 | assert list(cache.get_many(("key1",))) == [("key1", None)] 65 | 66 | cache.set_many((("key1", "value1"), ("key2", "value2"))) 67 | assert not cache.is_empty() 68 | assert list(cache.get_many(("key1", "key2"))) == [ 69 | ("key1", "value1"), 70 | ("key2", "value2"), 71 | ] 72 | assert list(cache.get_many(("key1", "key2", "not-existing-key"))) == [ 73 | ("key1", "value1"), 74 | ("key2", "value2"), 75 | ("not-existing-key", None), 76 | ] 77 | 78 | 79 | @pytest.mark.parametrize("upsert", [True, False]) 80 | def test_hashes_cache_update(tmp_path, upsert): 81 | with HashesCache(tmp_path / "test") as cache: 82 | cache.SUPPORTS_UPSERT = upsert 83 | 84 | assert cache.is_empty() 85 | cache.set("key1", "value") 86 | cache.set_many((("key1", "value1"), ("key2", "value2"))) 87 | assert list(cache.get_many(("key1", "key2"))) == [ 88 | ("key1", "value1"), 89 | ("key2", "value2"), 90 | ] 91 | -------------------------------------------------------------------------------- /tests/hashfile/test_db.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dvc_objects.errors import ObjectFormatError 3 | 4 | from dvc_data.hashfile.db import HashFile, HashFileDB 5 | from dvc_data.hashfile.db.local import LocalHashFileDB 6 | from dvc_data.hashfile.meta import Meta 7 | 8 | 9 | def test_db(tmp_upath, as_filesystem): 
10 | odb = HashFileDB(as_filesystem(tmp_upath.fs), str(tmp_upath)) 11 | 12 | assert not odb.exists("123456") 13 | assert list(odb.all()) == [] 14 | 15 | obj = odb.get("123456") 16 | assert isinstance(obj, HashFile) 17 | 18 | 19 | @pytest.mark.parametrize("tmp_upath", ["local", "memory"], indirect=True) 20 | def test_db_check(tmp_upath, as_filesystem): 21 | fs = as_filesystem(tmp_upath.fs) 22 | db_cls = LocalHashFileDB if fs.protocol == "local" else HashFileDB 23 | odb = db_cls(as_filesystem(tmp_upath.fs), str(tmp_upath)) 24 | 25 | oid = "acbd18db4cc2f85cedef654fccc4a4d8" 26 | path = odb.oid_to_path(oid) 27 | 28 | with pytest.raises(FileNotFoundError): 29 | odb.check(oid) 30 | 31 | odb.add_bytes(oid, b"foo") 32 | assert odb.check(oid) == Meta.from_info(odb.fs.info(path)) 33 | 34 | odb.protect(oid) 35 | assert odb.check(oid) == Meta.from_info(odb.fs.info(path)) 36 | 37 | odb.delete(oid) 38 | 39 | odb.add_bytes(oid, b"bar") 40 | with pytest.raises(ObjectFormatError): 41 | odb.check(oid) 42 | -------------------------------------------------------------------------------- /tests/hashfile/test_db_index.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | 3 | import pytest 4 | 5 | from dvc_data.hashfile.db.index import ObjectDBIndex 6 | 7 | 8 | @pytest.fixture 9 | def index(tmp_upath): 10 | with closing(ObjectDBIndex(tmp_upath, "foo")) as _index: 11 | yield _index 12 | 13 | 14 | def test_roundtrip(request, tmp_upath, index): 15 | expected_dir = {"1234.dir"} 16 | expected_file = {"5678"} 17 | index.update(expected_dir, expected_file) 18 | 19 | new_index = ObjectDBIndex(tmp_upath, "foo") 20 | request.addfinalizer(new_index.close) 21 | 22 | assert set(new_index.dir_hashes()) == expected_dir 23 | assert set(new_index.hashes()) == expected_dir | expected_file 24 | 25 | 26 | def test_clear(index): 27 | index.update(["1234.dir"], ["5678"]) 28 | index.clear() 29 | assert not list(index.hashes()) 30 | 31 | 32 | def test_update(index): 33 | expected_dir = {"1234.dir"} 34 | expected_file = {"5678"} 35 | index.update(expected_dir, expected_file) 36 | assert set(index.dir_hashes()) == expected_dir 37 | assert set(index.hashes()) == expected_dir | expected_file 38 | 39 | 40 | def test_intersection(index): 41 | hashes = (str(i) for i in range(2000)) 42 | expected = {str(i) for i in range(1000)} 43 | index.update([], hashes) 44 | assert set(index.intersection(expected)) == expected 45 | -------------------------------------------------------------------------------- /tests/hashfile/test_diff.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.hashfile.diff import ROOT, Change, TreeEntry, diff 4 | from dvc_data.hashfile.meta import Meta 5 | from dvc_data.hashfile.obj import HashFile 6 | from dvc_data.hashfile.tree import Tree 7 | 8 | 9 | @pytest.fixture 10 | def tree(): 11 | tree = Tree.from_list( 12 | [ 13 | {"md5": "37b51d194a7513e45b56f6524f2d51f2", "relpath": "bar"}, 14 | {"md5": "acbd18db4cc2f85cedef654fccc4a4d8", "relpath": "foo"}, 15 | ] 16 | ) 17 | tree.digest() 18 | return tree 19 | 20 | 21 | def test_diff_unchanged(mocker, tree): 22 | meta = Meta() 23 | mocked_cache = mocker.MagicMock(check=mocker.MagicMock(return_value=meta)) 24 | _, bar_oid = tree.get(("bar",)) 25 | obj = HashFile("data", mocker.MagicMock(), bar_oid) 26 | 27 | assert not diff(obj, obj, mocked_cache) 28 | assert not diff(tree, tree, mocked_cache) 29 | 30 | 31 | def 
test_different_object_type_tree_to_hashfile(mocker, tree): 32 | meta = Meta() 33 | mocked_cache = mocker.MagicMock(check=mocker.MagicMock(return_value=meta)) 34 | 35 | (_, bar_oid), (_, foo_oid) = tree.get(("bar",)), tree.get(("foo",)) 36 | obj = HashFile("data", mocker.MagicMock(), bar_oid) 37 | d = diff(tree, obj, mocked_cache) 38 | 39 | assert d.stats == {"modified": 1, "deleted": 2, "added": 0} 40 | assert not d.unchanged 41 | assert d.modified == [ 42 | Change( 43 | old=TreeEntry(cache_meta=meta, key=ROOT, oid=tree.hash_info), 44 | new=TreeEntry(cache_meta=meta, key=ROOT, oid=bar_oid), 45 | ) 46 | ] 47 | assert sorted(d.deleted) == [ 48 | Change( 49 | old=TreeEntry(cache_meta=meta, key=("bar",), oid=bar_oid), 50 | new=TreeEntry(key=("bar",)), 51 | ), 52 | Change( 53 | old=TreeEntry(cache_meta=meta, key=("foo",), oid=foo_oid), 54 | new=TreeEntry(key=("foo",)), 55 | ), 56 | ] 57 | 58 | 59 | def test_different_object_type_hashfile_to_tree(mocker, tree): 60 | meta = Meta() 61 | mocked_cache = mocker.MagicMock(check=mocker.MagicMock(return_value=meta)) 62 | (_, bar_oid), (_, foo_oid) = tree.get(("bar",)), tree.get(("foo",)) 63 | obj = HashFile("data", mocker.MagicMock(), bar_oid) 64 | d = diff(obj, tree, mocked_cache) 65 | 66 | assert d.stats == {"modified": 1, "deleted": 0, "added": 2} 67 | assert not d.unchanged 68 | assert d.modified == [ 69 | Change( 70 | old=TreeEntry(cache_meta=meta, key=ROOT, oid=bar_oid), 71 | new=TreeEntry(cache_meta=meta, key=ROOT, oid=tree.hash_info), 72 | ) 73 | ] 74 | assert sorted(d.added) == [ 75 | Change( 76 | old=TreeEntry(cache_meta=meta, key=("bar",)), 77 | new=TreeEntry(key=("bar",), oid=bar_oid), 78 | ), 79 | Change( 80 | old=TreeEntry(cache_meta=meta, key=("foo",)), 81 | new=TreeEntry(key=("foo",), oid=foo_oid), 82 | ), 83 | ] 84 | -------------------------------------------------------------------------------- /tests/hashfile/test_hash.py: -------------------------------------------------------------------------------- 1 | from os import fspath 2 | 3 | from dvc_objects.fs import LocalFileSystem 4 | 5 | from dvc_data.hashfile.hash import file_md5 6 | 7 | 8 | def test_file_md5(tmp_path): 9 | foo = tmp_path / "foo" 10 | foo.write_text("foo content", encoding="utf8") 11 | 12 | fs = LocalFileSystem() 13 | assert file_md5(fspath(foo), fs) == file_md5(fspath(foo), fs) 14 | 15 | 16 | def test_file_md5_dos2unix(tmp_path): 17 | fs = LocalFileSystem() 18 | cr = tmp_path / "cr" 19 | crlf = tmp_path / "crlf" 20 | cr.write_bytes(b"a\nb\nc") 21 | crlf.write_bytes(b"a\r\nb\r\nc") 22 | assert file_md5(fspath(cr), fs, name="md5-dos2unix") == file_md5( 23 | fspath(crlf), fs, name="md5-dos2unix" 24 | ) 25 | -------------------------------------------------------------------------------- /tests/hashfile/test_hash_stream.py: -------------------------------------------------------------------------------- 1 | from os import fspath 2 | 3 | import pytest 4 | from dvc_objects.fs import LocalFileSystem 5 | 6 | from dvc_data.hashfile.hash import HashStreamFile, file_md5 7 | from dvc_data.hashfile.istextfile import DEFAULT_CHUNK_SIZE 8 | 9 | 10 | def test_hashed_stream_reader(tmp_path): 11 | foo = tmp_path / "foo" 12 | foo.write_bytes(b"foo") 13 | 14 | with open(foo, "rb") as fobj: 15 | stream_reader = HashStreamFile(fobj) 16 | 17 | assert stream_reader.readable() 18 | assert not stream_reader.seekable() 19 | 20 | assert stream_reader.read(2) == b"fo" 21 | assert stream_reader.tell() == 2 22 | 23 | assert stream_reader.read(1) == b"o" 24 | assert stream_reader.tell() == 3 25 | 
26 | hex_digest = file_md5(fspath(foo), LocalFileSystem()) 27 | assert hex_digest == stream_reader.hash_value 28 | 29 | 30 | def test_hashed_stream_reader_as_chunks(tmp_path): 31 | foo = tmp_path / "foo" 32 | foo.write_bytes(b"foo \x00" * 16) 33 | 34 | actual_size = len(foo.read_bytes()) 35 | with open(foo, "rb") as fobj: 36 | stream_reader = HashStreamFile(fobj) 37 | 38 | total_read = 0 39 | while True: 40 | chunk = stream_reader.read(16) 41 | total_read += len(chunk) 42 | assert stream_reader.tell() == total_read 43 | if not chunk: 44 | break 45 | 46 | assert stream_reader.tell() == actual_size == total_read 47 | 48 | hex_digest = file_md5(fspath(foo), LocalFileSystem()) 49 | assert hex_digest == stream_reader.hash_value 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "contents", 54 | [b"x" * DEFAULT_CHUNK_SIZE + b"\x00", b"clean", b"not clean \x00"], 55 | ) 56 | def test_hashed_stream_reader_compatibility(tmp_path, contents): 57 | # Always read more than the DEFAULT_CHUNK_SIZE (512 bytes). 58 | # This imitates the read actions performed by upload_fobj. 59 | chunk_size = DEFAULT_CHUNK_SIZE * 2 60 | 61 | data = tmp_path / "data" 62 | data.write_bytes(contents) 63 | 64 | with open(data, "rb") as fobj: 65 | stream_reader = HashStreamFile(fobj) 66 | stream_reader.read(chunk_size) 67 | 68 | local_fs = LocalFileSystem() 69 | hex_digest = file_md5(fspath(data), local_fs) 70 | 71 | assert stream_reader.hash_value == hex_digest 72 | -------------------------------------------------------------------------------- /tests/hashfile/test_istextfile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dvc_objects.fs.memory import MemoryFileSystem 3 | 4 | from dvc_data.hashfile.istextfile import istextblock, istextfile 5 | 6 | pytestmark = pytest.mark.parametrize( 7 | "block, expected", 8 | [ 9 | (b"", True), 10 | (b"text", True), 11 | (b"\x00\x001", False), 12 | ( 13 | ( 14 | b"True\x80\x04\x95\x1a\x00\x00\x00\x00\x00\x00\x00\x8c\x08\r\n" 15 | b"__main__\x94\x8c\x06Animal\x94\x93\x94)\x81\x94." 
16 | ), 17 | False, 18 | ), 19 | ], 20 | ids=["empty", "text", "binary", "long_binary"], 21 | ) 22 | 23 | 24 | def test_istextblock(block, expected): 25 | assert istextblock(block) is expected 26 | 27 | 28 | def test_istextfile(block, expected): 29 | fs = MemoryFileSystem(global_store=False) 30 | fs.pipe_file("/file", block) 31 | 32 | assert istextfile("/file", fs) is expected 33 | -------------------------------------------------------------------------------- /tests/hashfile/test_obj.py: -------------------------------------------------------------------------------- 1 | from dvc_data.hashfile.hash_info import HashInfo 2 | from dvc_data.hashfile.obj import HashFile 3 | 4 | 5 | def test_obj(tmp_upath): 6 | hash_info = HashInfo("md5", "123456") 7 | obj = HashFile(tmp_upath, tmp_upath.fs, hash_info) 8 | assert obj.path == tmp_upath 9 | assert obj.fs == tmp_upath.fs 10 | assert obj.oid == "123456" 11 | assert obj.hash_info == hash_info 12 | -------------------------------------------------------------------------------- /tests/hashfile/test_state.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import closing 3 | 4 | import pytest 5 | from dvc_objects.fs import MemoryFileSystem 6 | from dvc_objects.fs.local import LocalFileSystem 7 | from dvc_objects.fs.system import inode 8 | 9 | from dvc_data.hashfile.hash import file_md5 10 | from dvc_data.hashfile.hash_info import HashInfo 11 | from dvc_data.hashfile.meta import Meta 12 | from dvc_data.hashfile.state import State, StateNoop, _checksum 13 | from dvc_data.hashfile.utils import get_mtime_and_size 14 | from dvc_data.json_compat import dumps as json_dumps 15 | 16 | 17 | @pytest.fixture 18 | def state(tmp_path): 19 | with closing(State(tmp_path, tmp_path / "tmp")) as _state: 20 | yield _state 21 | 22 | 23 | def test_hashes(tmp_path, state: State): 24 | path = tmp_path / "foo" 25 | path.write_text("foo content", encoding="utf-8") 26 | 27 | fs = LocalFileSystem() 28 | hash_info = HashInfo(name="md5", value="6dbda444875c24ec1bbdb433456be11f") 29 | 30 | state.save(str(path), fs, hash_info) 31 | info = fs.info(str(path)) 32 | meta = Meta.from_info(info) 33 | assert state.hashes[str(path)] == json_dumps( 34 | { 35 | "version": 1, 36 | "checksum": _checksum(info), 37 | "size": 11, 38 | "hash_info": {"md5": hash_info.value}, 39 | } 40 | ) 41 | assert state.get(str(path), fs) == (meta, hash_info) 42 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), meta, hash_info)] 43 | 44 | path.write_text("foo content 1", encoding="utf-8") 45 | info = fs.info(str(path)) 46 | meta = Meta.from_info(info) 47 | hash_info = HashInfo(name="md5", value="8efcb74434c93f295375a9118292fd0c") 48 | path.unlink() 49 | 50 | state.save(str(path), fs, hash_info, info) 51 | assert state.hashes[str(path)] == json_dumps( 52 | { 53 | "version": 1, 54 | "checksum": _checksum(info), 55 | "size": 13, 56 | "hash_info": {"md5": hash_info.value}, 57 | } 58 | ) 59 | assert state.get(str(path), fs, info) == (meta, hash_info) 60 | assert list(state.get_many((str(path),), fs, {str(path): info})) == [ 61 | (str(path), meta, hash_info) 62 | ] 63 | 64 | assert state.get(str(path), fs) == (None, None) 65 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 66 | 67 | 68 | def test_hashes_get_not_a_local_fs(tmp_path, state: State): 69 | fs = MemoryFileSystem() 70 | 71 | assert state.get("not-existing-file", fs) == (None, None) 72 | assert list(state.get_many(("not-existing-file",), fs, 
{})) == [ 73 | ("not-existing-file", None, None) 74 | ] 75 | 76 | 77 | def test_hashes_get_invalid_data(tmp_path, state: State): 78 | path = tmp_path / "foo" 79 | path.write_text("foo content", encoding="utf-8") 80 | 81 | fs = LocalFileSystem() 82 | 83 | # invalid json 84 | state.hashes[str(path)] = "" 85 | assert state.get(str(path), fs) == (None, None) 86 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 87 | 88 | # invalid json 89 | state.hashes[str(path)] = '{"x"}' 90 | assert state.get(str(path), fs) == (None, None) 91 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 92 | 93 | # invalid checksum 94 | state.hashes[str(path)] = json_dumps( 95 | { 96 | "version": 1, 97 | "checksum": 1, 98 | "size": 13, 99 | "hash_info": {"md5": "value"}, 100 | } 101 | ) 102 | assert state.get(str(path), fs) == (None, None) 103 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 104 | 105 | # invalid version 106 | state.hashes[str(path)] = json_dumps( 107 | { 108 | "version": state.HASH_VERSION + 1, 109 | "checksum": _checksum(fs.info(str(path))), 110 | "size": 13, 111 | "hash_info": {"md5": "value"}, 112 | } 113 | ) 114 | assert state.get(str(path), fs) == (None, None) 115 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 116 | 117 | 118 | def test_hashes_without_version(tmp_path, state: State): 119 | # If there is no version, it is considered as old md5-dos2unix hashes. 120 | # dvc-data does not write this format anymore, but it should be able to read it 121 | fs = LocalFileSystem() 122 | 123 | path = tmp_path / "foo" 124 | path.write_text("foo content", encoding="utf-8") 125 | 126 | info = fs.info(str(path)) 127 | meta = Meta.from_info(info) 128 | 129 | state.hashes[str(path)] = json_dumps( 130 | { 131 | "checksum": _checksum(info), 132 | "size": 11, 133 | "hash_info": {"md5": "value"}, 134 | } 135 | ) 136 | assert state.get(str(path), fs) == ( 137 | meta, 138 | HashInfo("md5-dos2unix", "value"), 139 | ) 140 | assert list(state.get_many((str(path),), fs, {})) == [ 141 | (str(path), meta, HashInfo("md5-dos2unix", "value")) 142 | ] 143 | 144 | 145 | def test_hashes_save_not_existing(tmp_path, state: State): 146 | fs = LocalFileSystem() 147 | 148 | with pytest.raises(FileNotFoundError): 149 | state.save("not-existing-file", fs, HashInfo("md5", "value")) 150 | 151 | state.save_many((("not-existing-file", HashInfo("md5", "value"), None),), fs) 152 | assert len(state.hashes) == 0 153 | 154 | 155 | def test_hashes_save_when_fs_is_not_a_local_fs(tmp_path, state: State): 156 | fs = MemoryFileSystem() 157 | 158 | state.save("not-existing-file", fs, HashInfo("md5", "value")) 159 | assert len(state.hashes) == 0 160 | 161 | state.save_many((("not-existing-file", HashInfo("md5", "value"), None),), fs) 162 | assert len(state.hashes) == 0 163 | 164 | 165 | def test_state_many(tmp_path, state: State): 166 | foo = tmp_path / "foo" 167 | foo.write_text("foo content", encoding="utf-8") 168 | 169 | bar = tmp_path / "bar" 170 | bar.write_text("bar content", encoding="utf-8") 171 | 172 | fs = LocalFileSystem() 173 | 174 | hash_info_foo = HashInfo("md5", file_md5(foo, fs)) 175 | foo_info = fs.info(str(foo)) 176 | bar_info = fs.info(str(bar)) 177 | hash_info_bar = HashInfo("md5", file_md5(bar, fs)) 178 | 179 | state.save_many( 180 | [(str(foo), hash_info_foo, None), (str(bar), hash_info_bar, None)], fs 181 | ) 182 | assert list(state.get_many([str(foo), str(bar)], fs, {})) == [ 183 | (str(foo), 
Meta.from_info(foo_info), hash_info_foo), 184 | (str(bar), Meta.from_info(bar_info), hash_info_bar), 185 | ] 186 | 187 | foo.write_text("foo content 1", encoding="utf-8") 188 | foo_info = fs.info(str(foo)) 189 | hash_info_foo = HashInfo("md5", file_md5(foo, fs)) 190 | foo.unlink() 191 | bar.write_text("bar content 1", encoding="utf-8") 192 | bar_info = fs.info(str(bar)) 193 | hash_info_bar = HashInfo("md5", file_md5(bar, fs)) 194 | bar.unlink() 195 | 196 | state.save_many( 197 | [(str(foo), hash_info_foo, foo_info), (str(bar), hash_info_bar, bar_info)], fs 198 | ) 199 | assert list( 200 | state.get_many( 201 | [str(foo), str(bar)], fs, {str(foo): foo_info, str(bar): bar_info} 202 | ) 203 | ) == [ 204 | (str(foo), Meta.from_info(foo_info), hash_info_foo), 205 | (str(bar), Meta.from_info(bar_info), hash_info_bar), 206 | ] 207 | 208 | 209 | def test_set_link(tmp_path, state): 210 | state.set_link(tmp_path / "foo", 42, "mtime") 211 | assert state.links["foo"] == (42, "mtime") 212 | 213 | 214 | def test_state_noop(tmp_path): 215 | state = StateNoop() 216 | fs = LocalFileSystem() 217 | 218 | state.save_many([("foo", HashInfo("md5", "value"), None)], fs) 219 | assert state.get("foo", fs) == (None, None) 220 | assert list(state.get_many(("foo", "bar"), fs, {})) == [ 221 | ("foo", None, None), 222 | ("bar", None, None), 223 | ] 224 | 225 | state.set_link(tmp_path / "foo", 42, "mtime") 226 | assert state.get_unused_links([], fs) == [] 227 | 228 | state.save_link(tmp_path / "foo", fs) 229 | assert state.get_unused_links([], fs) == [] 230 | 231 | 232 | def test_links(tmp_path, state: State): 233 | foo, bar = tmp_path / "foo", tmp_path / "bar" 234 | dataset = tmp_path / "dataset" 235 | dataset.mkdir() 236 | file = dataset / "file" 237 | 238 | for path in [foo, bar, file]: 239 | path.write_text(f"{path.name} content", encoding="utf-8") 240 | 241 | fs = LocalFileSystem() 242 | 243 | state.save_link(os.fspath(foo), fs) 244 | state.save_link(os.fspath(bar), fs) 245 | state.save_link(os.fspath(dataset), fs) 246 | 247 | def _get_inode_mtime(path): 248 | path = os.fspath(path) 249 | return inode(path), get_mtime_and_size(path, fs)[0] 250 | 251 | assert len(state.links) == 3 252 | assert {k: state.links[k] for k in state.links} == { 253 | "foo": _get_inode_mtime(foo), 254 | "bar": _get_inode_mtime(bar), 255 | "dataset": _get_inode_mtime(dataset), 256 | } 257 | 258 | links = [os.fspath(tmp_path / link) for link in ["foo", "bar", "dataset"]] 259 | assert set(state.get_unused_links([], fs)) == {"foo", "bar", "dataset"} 260 | assert set(state.get_unused_links(links[:1], fs)) == {"bar", "dataset"} 261 | assert set(state.get_unused_links(links[:2], fs)) == {"dataset"} 262 | assert set(state.get_unused_links(links, fs)) == set() 263 | assert set( 264 | state.get_unused_links( 265 | ([*links[:1], os.path.join(tmp_path, "not-existing-file")]), 266 | fs, 267 | ) 268 | ) == {"bar", "dataset"} 269 | 270 | state.remove_links(["foo", "bar", "dataset"], fs) 271 | assert len(state.links) == 0 272 | assert not foo.exists() 273 | assert not bar.exists() 274 | assert not dataset.exists() 275 | -------------------------------------------------------------------------------- /tests/hashfile/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dvc_objects.fs import LocalFileSystem 4 | 5 | from dvc_data.hashfile.utils import get_mtime_and_size 6 | 7 | 8 | def test_mtime_and_size(tmp_path): 9 | directory = tmp_path / "dir" 10 | directory.mkdir(parents=True) 11 | 
dir_file = directory / "file" 12 | dir_file.write_text("dir_file", encoding="utf8") 13 | 14 | sub = directory / "sub" 15 | sub.mkdir(parents=True) 16 | subfile = sub / "file" 17 | subfile.write_text("sub_file", encoding="utf8") 18 | 19 | fs = LocalFileSystem(url=tmp_path) 20 | file_time, file_size = get_mtime_and_size(str(dir_file), fs) 21 | dir_time, dir_size = get_mtime_and_size(str(directory), fs) 22 | 23 | actual_file_size = os.path.getsize(dir_file) 24 | actual_dir_size = os.path.getsize(dir_file) + os.path.getsize(subfile) 25 | 26 | assert isinstance(file_time, str) 27 | assert isinstance(file_size, int) 28 | assert file_size == actual_file_size 29 | assert isinstance(dir_time, str) 30 | assert isinstance(dir_size, int) 31 | assert dir_size == actual_dir_size 32 | 33 | 34 | def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_path): 35 | directory = tmp_path / "dir" 36 | directory.mkdir() 37 | (directory / "file").write_text("dir_file_content") 38 | file = directory / "file" 39 | file.write_text("file_content", encoding="utf8") 40 | 41 | fs = LocalFileSystem(url=tmp_path) 42 | 43 | time, size = get_mtime_and_size(str(directory), fs) 44 | object_time, object_size = get_mtime_and_size(directory, fs) 45 | assert time == object_time 46 | assert size == object_size 47 | 48 | time, size = get_mtime_and_size(str(file), fs) 49 | object_time, object_size = get_mtime_and_size(file, fs) 50 | assert time == object_time 51 | assert size == object_size 52 | -------------------------------------------------------------------------------- /tests/index/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/tests/index/__init__.py -------------------------------------------------------------------------------- /tests/index/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from dvc_data.hashfile.db import HashFileDB 6 | 7 | 8 | @pytest.fixture 9 | def make_odb(tmp_upath_factory, as_filesystem): 10 | def _make_odb(): 11 | path = tmp_upath_factory.mktemp() 12 | fs = as_filesystem(path.fs) 13 | return HashFileDB(fs, os.fspath(path)) 14 | 15 | return _make_odb 16 | 17 | 18 | @pytest.fixture 19 | def odb(make_odb): 20 | odb = make_odb() 21 | 22 | odb.add_bytes("d3b07384d113edec49eaa6238ad5ff00", b"foo\n") 23 | odb.add_bytes("c157a79031e1c40f85931829bc5fc552", b"bar\n") 24 | odb.add_bytes("258622b1688250cb619f3c9ccaefb7eb", b"baz\n") 25 | odb.add_bytes( 26 | "1f69c66028c35037e8bf67e5bc4ceb6a.dir", 27 | ( 28 | b'[{"md5": "c157a79031e1c40f85931829bc5fc552", "relpath": "bar"}, ' 29 | b'{"md5": "258622b1688250cb619f3c9ccaefb7eb", "relpath": "baz"}]' 30 | ), 31 | ) 32 | return odb 33 | -------------------------------------------------------------------------------- /tests/index/test_build.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.index.build import DataIndexEntry, build, build_entry 4 | 5 | 6 | def test_build_entry(tmp_upath, as_filesystem): 7 | (tmp_upath / "foo").write_bytes(b"foo\n") 8 | 9 | fs = as_filesystem(tmp_upath.fs) 10 | 11 | entry = build_entry(str(tmp_upath / "foo"), fs) 12 | assert isinstance(entry, DataIndexEntry) 13 | 14 | assert entry.meta 15 | assert entry.meta.size == 4 16 | assert entry.key is None 17 | assert entry.hash_info is None 18 | 19 | with 
pytest.raises(FileNotFoundError): 20 | build_entry(str(tmp_upath / "missing"), fs) 21 | 22 | 23 | def test_build(tmp_upath, as_filesystem): 24 | (tmp_upath / "foo").write_bytes(b"foo\n") 25 | (tmp_upath / "data").mkdir() 26 | (tmp_upath / "data" / "bar").write_bytes(b"bar\n") 27 | (tmp_upath / "data" / "baz").write_bytes(b"baz\n") 28 | 29 | fs = as_filesystem(tmp_upath.fs) 30 | index = build(str(tmp_upath), fs) 31 | assert index[("foo",)].meta.size == 4 32 | assert index.storage_map.get_data(index[("foo",)]) == ( 33 | fs, 34 | str(tmp_upath / "foo"), 35 | ) 36 | assert index[("data",)].meta.isdir 37 | assert index[("data", "bar")].meta.size == 4 38 | assert index.storage_map.get_data(index[("data", "bar")]) == ( 39 | fs, 40 | str(tmp_upath / "data" / "bar"), 41 | ) 42 | assert index[("data", "baz")].meta.size == 4 43 | assert index.storage_map.get_data(index[("data", "baz")]) == ( 44 | fs, 45 | str(tmp_upath / "data" / "baz"), 46 | ) 47 | -------------------------------------------------------------------------------- /tests/index/test_checkout.py: -------------------------------------------------------------------------------- 1 | from dvc_data.hashfile.hash_info import HashInfo 2 | from dvc_data.hashfile.meta import Meta 3 | from dvc_data.index import DataIndex, DataIndexEntry, ObjectStorage 4 | from dvc_data.index.checkout import apply, compare 5 | 6 | 7 | def test_checkout(tmp_upath, odb, as_filesystem): 8 | index = DataIndex( 9 | { 10 | ("foo",): DataIndexEntry( 11 | key=("foo",), 12 | meta=Meta(), 13 | hash_info=HashInfo( 14 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 15 | ), 16 | ), 17 | ("data",): DataIndexEntry( 18 | key=("data",), 19 | meta=Meta(isdir=True), 20 | hash_info=HashInfo( 21 | name="md5", 22 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 23 | ), 24 | ), 25 | } 26 | ) 27 | index.storage_map.add_cache(ObjectStorage((), odb)) 28 | diff = compare(None, index) 29 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 30 | assert (tmp_upath / "foo").read_text() == "foo\n" 31 | assert (tmp_upath / "data").is_dir() 32 | assert (tmp_upath / "data" / "bar").read_text() == "bar\n" 33 | assert (tmp_upath / "data" / "baz").read_text() == "baz\n" 34 | assert set(tmp_upath.iterdir()) == { 35 | (tmp_upath / "foo"), 36 | (tmp_upath / "data"), 37 | } 38 | assert set((tmp_upath / "data").iterdir()) == { 39 | (tmp_upath / "data" / "bar"), 40 | (tmp_upath / "data" / "baz"), 41 | } 42 | 43 | 44 | def test_checkout_file(tmp_upath, odb, as_filesystem): 45 | index = DataIndex( 46 | { 47 | (): DataIndexEntry( 48 | key=(), 49 | meta=Meta(), 50 | hash_info=HashInfo( 51 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 52 | ), 53 | ), 54 | } 55 | ) 56 | index.storage_map.add_cache(ObjectStorage((), odb)) 57 | diff = compare(None, index) 58 | apply(diff, str(tmp_upath / "foo"), as_filesystem(tmp_upath.fs)) 59 | assert (tmp_upath / "foo").read_text() == "foo\n" 60 | 61 | 62 | def test_checkout_broken_dir(tmp_upath, odb, as_filesystem): 63 | index = DataIndex( 64 | { 65 | ("foo",): DataIndexEntry( 66 | key=("foo",), 67 | meta=Meta(), 68 | hash_info=HashInfo( 69 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 70 | ), 71 | ), 72 | ("data",): DataIndexEntry( 73 | key=("data",), 74 | meta=Meta(isdir=True), 75 | hash_info=HashInfo( 76 | name="md5", 77 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 78 | ), 79 | ), 80 | ("broken",): DataIndexEntry( 81 | key=("broken",), 82 | meta=Meta(isdir=True), 83 | hash_info=HashInfo( 84 | name="md5", 85 | 
value="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.dir", 86 | ), 87 | ), 88 | } 89 | ) 90 | index.storage_map.add_cache(ObjectStorage((), odb)) 91 | diff = compare(None, index) 92 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 93 | assert (tmp_upath / "foo").read_text() == "foo\n" 94 | assert (tmp_upath / "data").is_dir() 95 | assert (tmp_upath / "data" / "bar").read_text() == "bar\n" 96 | assert (tmp_upath / "data" / "baz").read_text() == "baz\n" 97 | assert set(tmp_upath.iterdir()) == { 98 | (tmp_upath / "foo"), 99 | (tmp_upath / "data"), 100 | } 101 | assert set((tmp_upath / "data").iterdir()) == { 102 | (tmp_upath / "data" / "bar"), 103 | (tmp_upath / "data" / "baz"), 104 | } 105 | assert not (tmp_upath / "broken").exists() 106 | 107 | 108 | def test_checkout_delete_nested_dir(tmp_upath, odb, as_filesystem): 109 | old = DataIndex( 110 | { 111 | ("dir1",): DataIndexEntry( 112 | key=("dir1",), 113 | meta=Meta(isdir=True), 114 | ), 115 | ("dir1", "subdir1"): DataIndexEntry( 116 | key=("dir1", "subdir1"), 117 | meta=Meta(isdir=True), 118 | ), 119 | } 120 | ) 121 | diff = compare(None, old) 122 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 123 | 124 | assert (tmp_upath / "dir1").exists() 125 | assert (tmp_upath / "dir1").is_dir() 126 | assert (tmp_upath / "dir1" / "subdir1").exists() 127 | assert (tmp_upath / "dir1" / "subdir1").is_dir() 128 | 129 | new = DataIndex({}) 130 | diff = compare(old, new, delete=True) 131 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 132 | 133 | assert not (tmp_upath / "dir1" / "subdir1").exists() 134 | assert not (tmp_upath / "dir1").exists() 135 | -------------------------------------------------------------------------------- /tests/index/test_diff.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.hashfile.hash_info import HashInfo 4 | from dvc_data.hashfile.meta import Meta 5 | from dvc_data.index import DataIndex, DataIndexEntry 6 | from dvc_data.index.diff import ADD, DELETE, MODIFY, RENAME, UNCHANGED, Change, diff 7 | 8 | 9 | def test_diff(): 10 | old_foo_key = ("foo",) 11 | old_foo_entry = DataIndexEntry( 12 | key=old_foo_key, 13 | meta=Meta(), 14 | hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"), 15 | ) 16 | old_bar_key = ("dir", "subdir", "bar") 17 | old_bar_entry = DataIndexEntry( 18 | key=old_bar_key, 19 | meta=Meta(isdir=True), 20 | hash_info=HashInfo( 21 | name="md5", 22 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 23 | ), 24 | ) 25 | old = DataIndex({old_foo_key: old_foo_entry, old_bar_key: old_bar_entry}) 26 | 27 | assert set(diff(old, old, with_unchanged=True)) == { 28 | Change(UNCHANGED, old_foo_entry, old_foo_entry), 29 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 30 | } 31 | assert set(diff(old, old, with_renames=True, with_unchanged=True)) == { 32 | Change(UNCHANGED, old_foo_entry, old_foo_entry), 33 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 34 | } 35 | 36 | new_foo_key = ("data", "FOO") 37 | new_foo_entry = DataIndexEntry( 38 | key=new_foo_key, 39 | meta=Meta(), 40 | hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"), 41 | ) 42 | new = DataIndex( 43 | { 44 | ( 45 | "data", 46 | "FOO", 47 | ): new_foo_entry, 48 | old_bar_key: old_bar_entry, 49 | } 50 | ) 51 | 52 | assert set(diff(old, new, with_unchanged=True)) == { 53 | Change(ADD, None, new_foo_entry), 54 | Change(DELETE, old_foo_entry, None), 55 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 56 | } 57 | assert 
set(diff(old, new, with_renames=True, with_unchanged=True)) == { 58 | Change(RENAME, old_foo_entry, new_foo_entry), 59 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 60 | } 61 | 62 | 63 | def test_diff_no_hashes(): 64 | index = DataIndex( 65 | { 66 | ("foo",): DataIndexEntry(key=("foo",)), 67 | } 68 | ) 69 | assert not set(diff(index, None, hash_only=True)) 70 | 71 | 72 | def test_diff_meta_only(): 73 | key = ("foo",) 74 | old_entry = DataIndexEntry( 75 | key=key, 76 | meta=Meta(etag="abc"), 77 | hash_info=HashInfo(name="md5", value="123"), 78 | ) 79 | new_entry = DataIndexEntry( 80 | key=key, 81 | meta=Meta(etag="abc"), 82 | hash_info=HashInfo(name="md5", value="456"), 83 | ) 84 | old = DataIndex({key: old_entry}) 85 | new = DataIndex({key: new_entry}) 86 | 87 | assert list(diff(old, new, meta_only=True, with_unchanged=True)) == [ 88 | Change(UNCHANGED, old_entry, new_entry), 89 | ] 90 | 91 | new_entry.meta = Meta(etag="def") 92 | assert list(diff(old, new, meta_only=True, with_unchanged=True)) == [ 93 | Change(MODIFY, old_entry, new_entry), 94 | ] 95 | 96 | 97 | @pytest.mark.parametrize( 98 | "typ, left_meta, left_hi, right_meta, right_hi", 99 | [ 100 | ( 101 | UNCHANGED, 102 | Meta(etag="123"), 103 | HashInfo(name="md5", value="123"), 104 | Meta(etag="123"), 105 | HashInfo(name="md5", value="123"), 106 | ), 107 | ( 108 | ADD, 109 | None, 110 | None, 111 | Meta(etag="123"), 112 | HashInfo(name="md5", value="123"), 113 | ), 114 | ( 115 | DELETE, 116 | Meta(etag="123"), 117 | HashInfo(name="md5", value="123"), 118 | None, 119 | None, 120 | ), 121 | ], 122 | ) 123 | def test_diff_combined(typ, left_meta, left_hi, right_meta, right_hi): 124 | key = ("foo",) 125 | old_entry = DataIndexEntry( 126 | key=key, 127 | meta=left_meta, 128 | hash_info=left_hi, 129 | ) 130 | new_entry = DataIndexEntry( 131 | key=key, 132 | meta=right_meta, 133 | hash_info=right_hi, 134 | ) 135 | old = DataIndex({key: old_entry}) 136 | new = DataIndex({key: new_entry}) 137 | 138 | # diff should return UNCHANGED if both meta and hash info match, 139 | # but MODIFY if they don't since entries still exist 140 | assert list(diff(old, new, with_unchanged=True)) == [ 141 | Change(UNCHANGED if typ == UNCHANGED else MODIFY, old_entry, new_entry), 142 | ] 143 | 144 | # diff should return UNCHANGED if both meta and hash info match, 145 | # but MODIFY if they don't since entries still exist 146 | old_entry.meta = None 147 | new_entry.meta = None 148 | assert list(diff(old, new, with_unchanged=True)) == [ 149 | Change(UNCHANGED if typ == UNCHANGED else MODIFY, old_entry, new_entry), 150 | ] 151 | 152 | # diff should return meta diff when both hash infos are None 153 | old_entry.meta = left_meta 154 | new_entry.meta = right_meta 155 | old_entry.hash_info = None 156 | new_entry.hash_info = None 157 | assert list(diff(old, new, with_unchanged=True)) == [ 158 | Change(typ, old_entry, new_entry), 159 | ] 160 | 161 | # diff should return modify when meta and hash info diff do not match 162 | old_entry.meta = Meta(etag="abc") 163 | new_entry.meta = Meta(etag="def") 164 | old_entry.hash_info = left_hi 165 | new_entry.hash_info = right_hi 166 | assert list(diff(old, new, with_unchanged=True)) == [ 167 | Change(MODIFY, old_entry, new_entry), 168 | ] 169 | old_entry.meta = left_meta 170 | new_entry.meta = right_meta 171 | old_entry.hash_info = HashInfo(name="md5", value="abc") 172 | new_entry.hash_info = HashInfo(name="md5", value="def") 173 | assert list(diff(old, new, with_unchanged=True)) == [ 174 | Change(MODIFY, old_entry, 
new_entry), 175 | ] 176 | -------------------------------------------------------------------------------- /tests/index/test_fs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.fs import DataFileSystem 4 | from dvc_data.hashfile.hash_info import HashInfo 5 | from dvc_data.hashfile.meta import Meta 6 | from dvc_data.index import ( 7 | DataIndex, 8 | DataIndexDirError, 9 | DataIndexEntry, 10 | FileStorage, 11 | ObjectStorage, 12 | ) 13 | 14 | 15 | def test_fs(tmp_upath, odb, as_filesystem): 16 | index = DataIndex( 17 | { 18 | ("foo",): DataIndexEntry( 19 | key=("foo",), 20 | hash_info=HashInfo( 21 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 22 | ), 23 | ), 24 | ("data",): DataIndexEntry( 25 | key=("data",), 26 | meta=Meta(isdir=True), 27 | hash_info=HashInfo( 28 | name="md5", 29 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 30 | ), 31 | ), 32 | } 33 | ) 34 | index.storage_map.add_cache(ObjectStorage((), odb)) 35 | fs = DataFileSystem(index) 36 | assert fs.exists("foo") 37 | assert fs.cat("foo") == b"foo\n" 38 | assert fs.ls("foo") == [fs.info("foo")] 39 | assert fs.ls("/", detail=False) == ["/foo", "/data"] 40 | assert fs.ls("/", detail=True) == [fs.info("/foo"), fs.info("/data")] 41 | assert fs.cat("/data/bar") == b"bar\n" 42 | assert fs.cat("/data/baz") == b"baz\n" 43 | assert fs.ls("/data/bar") == [fs.info("data/bar")] 44 | assert fs.ls("/data", detail=False) == ["/data/bar", "/data/baz"] 45 | assert fs.ls("/data", detail=True) == [ 46 | fs.info("/data/bar"), 47 | fs.info("/data/baz"), 48 | ] 49 | 50 | 51 | def test_fs_file_storage(tmp_upath, as_filesystem): 52 | (tmp_upath / "foo").write_bytes(b"foo\n") 53 | (tmp_upath / "data").mkdir() 54 | (tmp_upath / "data" / "bar").write_bytes(b"bar\n") 55 | (tmp_upath / "data" / "baz").write_bytes(b"baz\n") 56 | 57 | index = DataIndex( 58 | { 59 | ("foo",): DataIndexEntry( 60 | key=("foo",), 61 | ), 62 | ("data",): DataIndexEntry( 63 | key=("data",), 64 | ), 65 | } 66 | ) 67 | index.storage_map.add_cache( 68 | FileStorage((), as_filesystem(tmp_upath.fs), str(tmp_upath)) 69 | ) 70 | fs = DataFileSystem(index) 71 | assert fs.exists("foo") 72 | assert fs.cat("foo") == b"foo\n" 73 | assert sorted(fs.ls("/", detail=False)) == sorted(["/foo", "/data"]) 74 | assert sorted(fs.ls("/", detail=True), key=lambda entry: entry["name"]) == sorted( 75 | [fs.info("/foo"), fs.info("/data")], 76 | key=lambda entry: entry["name"], 77 | ) 78 | assert fs.cat("/data/bar") == b"bar\n" 79 | assert fs.cat("/data/baz") == b"baz\n" 80 | assert sorted(fs.ls("/data", detail=False)) == sorted(["/data/bar", "/data/baz"]) 81 | assert sorted( 82 | fs.ls("/data", detail=True), key=lambda entry: entry["name"] 83 | ) == sorted( 84 | [ 85 | fs.info("/data/bar"), 86 | fs.info("/data/baz"), 87 | ], 88 | key=lambda entry: entry["name"], 89 | ) 90 | 91 | 92 | def test_fs_broken(tmp_upath, odb, as_filesystem): 93 | index = DataIndex( 94 | { 95 | ("foo",): DataIndexEntry( 96 | key=("foo",), 97 | hash_info=HashInfo( 98 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 99 | ), 100 | ), 101 | ("data",): DataIndexEntry( 102 | key=("data",), 103 | meta=Meta(isdir=True), 104 | hash_info=HashInfo( 105 | name="md5", 106 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 107 | ), 108 | ), 109 | ("broken",): DataIndexEntry( 110 | key=("broken",), 111 | meta=Meta(isdir=True), 112 | hash_info=HashInfo( 113 | name="md5", 114 | value="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.dir", 115 | ), 116 | ), 117 | } 118 | ) 119 | 
index.storage_map.add_cache(ObjectStorage((), odb)) 120 | fs = DataFileSystem(index) 121 | assert fs.exists("foo") 122 | assert fs.cat("foo") == b"foo\n" 123 | assert fs.ls("foo") == [fs.info("foo")] 124 | 125 | assert fs.ls("/", detail=False) == ["/foo", "/data", "/broken"] 126 | assert fs.ls("/", detail=True) == [ 127 | fs.info("/foo"), 128 | fs.info("/data"), 129 | fs.info("/broken"), 130 | ] 131 | 132 | assert fs.cat("/data/bar") == b"bar\n" 133 | assert fs.cat("/data/baz") == b"baz\n" 134 | assert fs.ls("/data/bar") == [fs.info("data/bar")] 135 | assert fs.ls("/data", detail=False) == ["/data/bar", "/data/baz"] 136 | assert fs.ls("/data", detail=True) == [ 137 | fs.info("/data/bar"), 138 | fs.info("/data/baz"), 139 | ] 140 | 141 | assert fs.exists("/broken") 142 | assert fs.isdir("/broken") 143 | with pytest.raises(DataIndexDirError): 144 | fs.ls("/broken", detail=False) 145 | 146 | with pytest.raises(DataIndexDirError): 147 | fs.ls("/broken", detail=True) 148 | 149 | def onerror(_entry, _exc): 150 | pass 151 | 152 | fs.index.onerror = onerror 153 | assert fs.ls("/broken", detail=False) == [] 154 | assert fs.ls("/broken", detail=True) == [] 155 | 156 | 157 | def test_fs_du(tmp_upath, odb, as_filesystem): 158 | index = DataIndex( 159 | { 160 | ("file_no_meta",): DataIndexEntry( 161 | key=("file_no_meta",), 162 | ), 163 | ("file_meta_size",): DataIndexEntry( 164 | key=("file_meta_size",), 165 | meta=Meta(size=4), 166 | ), 167 | ("file_meta_no_size",): DataIndexEntry( 168 | key=("file_meta_no_size",), 169 | meta=Meta(), 170 | ), 171 | ("prefix",): DataIndexEntry( 172 | key=("prefix",), 173 | meta=Meta(isdir=True), 174 | ), 175 | ("prefix", "dir"): DataIndexEntry( 176 | key=("prefix", "dir"), 177 | meta=Meta(isdir=True), 178 | ), 179 | ("prefix", "dir", "dir_size"): DataIndexEntry( 180 | key=("prefix", "dir", "dir_size"), 181 | meta=Meta(isdir=True, size=123), 182 | ), 183 | } 184 | ) 185 | 186 | fs = DataFileSystem(index) 187 | 188 | assert fs.du("file_no_meta") == 0 189 | assert fs.du("file_meta_size") == 4 190 | assert fs.du("file_meta_no_size") == 0 191 | assert fs.du("prefix/dir/dir_size") == 123 192 | assert fs.du("prefix/dir") == 123 193 | assert fs.du("prefix") == 123 194 | assert fs.du("/") == 127 195 | 196 | assert fs.du("file_meta_size", total=False) == { 197 | "file_meta_size": 4, 198 | } 199 | assert fs.du("prefix", total=False) == { 200 | "prefix": 0, 201 | "prefix/dir": 0, 202 | "prefix/dir/dir_size": 123, 203 | } 204 | assert fs.du("prefix/dir", total=False) == { 205 | "prefix/dir": 0, 206 | "prefix/dir/dir_size": 123, 207 | } 208 | assert fs.du("/", total=False) == { 209 | "/": 0, 210 | "/file_meta_no_size": 0, 211 | "/file_meta_size": 4, 212 | "/file_no_meta": 0, 213 | "/prefix": 0, 214 | "/prefix/dir": 0, 215 | "/prefix/dir/dir_size": 123, 216 | } 217 | -------------------------------------------------------------------------------- /tests/index/test_storage.py: -------------------------------------------------------------------------------- 1 | from dvc_data.index import FileStorage, ObjectStorage, StorageInfo, StorageMapping 2 | 3 | 4 | def test_map_get(tmp_upath, as_filesystem, odb): 5 | smap = StorageMapping() 6 | 7 | data = FileStorage(key=(), fs=as_filesystem(tmp_upath.fs), path=str(tmp_upath)) 8 | cache = FileStorage( 9 | key=("dir",), fs=as_filesystem(tmp_upath.fs), path=str(tmp_upath) 10 | ) 11 | remote = FileStorage( 12 | key=("dir", "subdir"), fs=as_filesystem(tmp_upath.fs), path=str(tmp_upath) 13 | ) 14 | foo_cache = ObjectStorage(key=("dir", "foo"), 
odb=odb) 15 | 16 | smap[()] = StorageInfo(data=data) 17 | smap[("dir",)] = StorageInfo(cache=cache) 18 | smap[("dir", "subdir")] = StorageInfo(remote=remote) 19 | smap[("dir", "foo")] = StorageInfo(cache=foo_cache) 20 | 21 | sinfo = smap[()] 22 | assert sinfo.data == data 23 | assert sinfo.cache is None 24 | assert sinfo.remote is None 25 | 26 | sinfo = smap[("dir",)] 27 | assert sinfo.data == data 28 | assert sinfo.cache == cache 29 | assert sinfo.remote is None 30 | 31 | sinfo = smap[("dir", "foo")] 32 | assert sinfo.data == data 33 | assert sinfo.cache == foo_cache 34 | assert sinfo.remote is None 35 | 36 | sinfo = smap[("dir", "subdir")] 37 | assert sinfo.data == data 38 | assert sinfo.cache == cache 39 | assert sinfo.remote == remote 40 | 41 | sinfo = smap[("dir", "subdir", "file")] 42 | assert sinfo.data == data 43 | assert sinfo.cache == cache 44 | assert sinfo.remote == remote 45 | 46 | sinfo = smap[("dir", "subdir", "subsubdir", "otherfile")] 47 | assert sinfo.data == data 48 | assert sinfo.cache == cache 49 | assert sinfo.remote == remote 50 | --------------------------------------------------------------------------------
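Note: the index tests above revolve around one recurring pattern: populate a content-addressed HashFileDB, describe the desired workspace in a DataIndex, attach the odb as cache storage, then compare/apply to materialize files. The following standalone sketch (not part of the repository) condenses that flow under a few stated assumptions: it uses a local temporary directory and dvc-objects' LocalFileSystem in place of the tmp_upath/as_filesystem fixtures, and it reuses the "foo" object id from tests/index/conftest.py.

import os
import tempfile

from dvc_objects.fs.local import LocalFileSystem

from dvc_data.hashfile.db import HashFileDB
from dvc_data.hashfile.hash_info import HashInfo
from dvc_data.hashfile.meta import Meta
from dvc_data.index import DataIndex, DataIndexEntry, ObjectStorage
from dvc_data.index.checkout import apply, compare

fs = LocalFileSystem()
root = tempfile.mkdtemp()  # assumption: any writable local directory works here

# Content-addressed store keyed by md5, mirroring the `odb` fixture above.
odb = HashFileDB(fs, os.path.join(root, "cache"))
odb.add_bytes("d3b07384d113edec49eaa6238ad5ff00", b"foo\n")  # md5 of b"foo\n"

# Index describing a single file "foo" whose content lives in the odb.
index = DataIndex(
    {
        ("foo",): DataIndexEntry(
            key=("foo",),
            meta=Meta(),
            hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"),
        ),
    }
)
index.storage_map.add_cache(ObjectStorage((), odb))

# Diffing against an empty (None) index yields additions; apply() writes them out.
workspace = os.path.join(root, "workspace")
os.makedirs(workspace, exist_ok=True)
diff = compare(None, index)
apply(diff, workspace, fs)

with open(os.path.join(workspace, "foo"), "rb") as fobj:
    assert fobj.read() == b"foo\n"

On top of the same index, DataFileSystem(index) (exercised in tests/index/test_fs.py) gives read-only, filesystem-style access (exists/cat/ls) to the indexed data without checking anything out.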