├── .cruft.json ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── benchmark.yml │ ├── release.yml │ ├── tests.yml │ └── update-template.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.rst ├── CONTRIBUTING.rst ├── LICENSE ├── README.rst ├── noxfile.py ├── pyproject.toml ├── src └── dvc_data │ ├── __init__.py │ ├── __main__.py │ ├── callbacks.py │ ├── cli.py │ ├── compat.py │ ├── fs.py │ ├── fsutils.py │ ├── hashfile │ ├── __init__.py │ ├── _ignore.py │ ├── _progress.py │ ├── build.py │ ├── cache.py │ ├── checkout.py │ ├── db │ │ ├── __init__.py │ │ ├── index.py │ │ ├── local.py │ │ ├── migrate.py │ │ └── reference.py │ ├── diff.py │ ├── gc.py │ ├── hash.py │ ├── hash_info.py │ ├── istextfile.py │ ├── meta.py │ ├── obj.py │ ├── state.py │ ├── status.py │ ├── transfer.py │ ├── tree.py │ └── utils.py │ ├── index │ ├── __init__.py │ ├── add.py │ ├── build.py │ ├── checkout.py │ ├── collect.py │ ├── diff.py │ ├── fetch.py │ ├── index.py │ ├── push.py │ ├── save.py │ ├── serialize.py │ ├── update.py │ └── view.py │ ├── json_compat.py │ ├── py.typed │ └── repo.py └── tests ├── __init__.py ├── benchmarks ├── __init__.py └── test_checkout.py ├── conftest.py ├── hashfile ├── __init__.py ├── test_build.py ├── test_cache.py ├── test_checkout.py ├── test_db.py ├── test_db_index.py ├── test_diff.py ├── test_hash.py ├── test_hash_stream.py ├── test_istextfile.py ├── test_obj.py ├── test_state.py ├── test_tree.py └── test_utils.py └── index ├── __init__.py ├── conftest.py ├── test_build.py ├── test_checkout.py ├── test_diff.py ├── test_fs.py ├── test_index.py └── test_storage.py /.cruft.json: -------------------------------------------------------------------------------- 1 | { 2 | "template": "https://github.com/iterative/py-template", 3 | "commit": "15ee26df315020399731c6291d61bef81a3fc5d3", 4 | "checkout": null, 5 | "context": { 6 | "cookiecutter": { 7 | "project_name": "dvc-data", 8 | "package_name": "dvc_data", 9 | "friendly_name": "DVC data", 10 | "author": "Iterative", 11 | "email": "support@dvc.org", 12 | "github_user": "iterative", 13 | "version": "0.0.0", 14 | "copyright_year": "2022", 15 | "license": "Apache-2.0", 16 | "docs": "False", 17 | "short_description": "dvc data", 18 | "development_status": "Development Status :: 4 - Beta", 19 | "_template": "https://github.com/iterative/py-template" 20 | } 21 | }, 22 | "directory": null, 23 | "skip": [ 24 | ".git" 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - directory: "/" 5 | package-ecosystem: "pip" 6 | schedule: 7 | interval: "weekly" 8 | labels: 9 | - "maintenance" 10 | 11 | - directory: "/" 12 | package-ecosystem: "github-actions" 13 | schedule: 14 | interval: "weekly" 15 | labels: 16 | - "maintenance" 17 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Benchmark 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | workflow_dispatch: 7 | 8 | env: 9 | FORCE_COLOR: "1" 10 | PY_COLORS: "1" 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ 
github.head_ref || github.run_id }} 14 | cancel-in-progress: true 15 | 16 | permissions: {} 17 | 18 | jobs: 19 | benchmark: 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | os: [ubuntu-latest, macos-latest] 25 | steps: 26 | - name: Set up Python 3.12 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: '3.12' 30 | 31 | - uses: actions/checkout@v4 32 | with: 33 | ref: ${{ github.event.pull_request.base.sha }} 34 | fetch-depth: 0 35 | 36 | - uses: astral-sh/setup-uv@v6 37 | - name: Install nox 38 | run: uv pip install --system nox --upgrade 39 | 40 | - name: Benchmark on base branch 41 | run: nox -s bench -- --benchmark-save=base 42 | 43 | - uses: actions/checkout@v4 44 | with: 45 | fetch-depth: 0 46 | clean: false 47 | 48 | - name: Benchmark on pull request 49 | run: nox -s bench -- --benchmark-save=${GITHUB_SHA::7} --benchmark-compare=0001 --benchmark-compare-fail=mean:10% 50 | 51 | - name: Compare benchmark 52 | if: always() 53 | run: uvx pytest-benchmark compare --group-by name 54 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | env: 9 | FORCE_COLOR: "1" 10 | 11 | jobs: 12 | release: 13 | environment: pypi 14 | permissions: 15 | contents: read 16 | id-token: write 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Check out the repository 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Set up Python 3.12 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: '3.12' 28 | 29 | - uses: astral-sh/setup-uv@v6 30 | - name: Install nox 31 | run: uv pip install --system nox --upgrade 32 | 33 | - name: Build package 34 | run: nox -s build 35 | 36 | - name: Upload package 37 | if: github.event_name == 'release' 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | workflow_dispatch: 8 | 9 | env: 10 | FORCE_COLOR: "1" 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | tests: 18 | timeout-minutes: 30 19 | runs-on: ${{ matrix.os }} 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [ubuntu-latest, windows-latest, macos-latest] 24 | pyv: ['3.9', '3.10', '3.11', '3.12', '3.13'] 25 | include: 26 | - {os: ubuntu-latest, pyv: 'pypy3.9'} 27 | 28 | steps: 29 | - name: Check out the repository 30 | uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 0 33 | 34 | - name: Set up Python ${{ matrix.pyv }} 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.pyv }} 38 | allow-prereleases: true 39 | 40 | - uses: astral-sh/setup-uv@v6 41 | with: 42 | enable-cache: true 43 | cache-suffix: ${{ matrix.pyv }} 44 | cache-dependency-glob: pyproject.toml 45 | - name: Install nox 46 | run: uv pip install --system nox --upgrade 47 | 48 | - name: Cache pre-commit hooks 49 | uses: actions/cache@v4 50 | with: 51 | path: ~/.cache/pre-commit 52 | key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }} 53 | 54 | - name: Lint code 55 | run: nox -s lint 56 | 57 | - 
name: Run tests 58 | run: nox -s tests-${{ matrix.nox_pyv || matrix.pyv }} -- --cov-report=xml 59 | 60 | - name: Upload coverage report 61 | uses: codecov/codecov-action@v5 62 | 63 | - name: Build package 64 | run: nox -s build 65 | -------------------------------------------------------------------------------- /.github/workflows/update-template.yaml: -------------------------------------------------------------------------------- 1 | name: Update template 2 | 3 | on: 4 | schedule: 5 | - cron: '5 1 * * *' # every day at 01:05 6 | workflow_dispatch: 7 | 8 | jobs: 9 | update: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out the repository 13 | uses: actions/checkout@v4 14 | 15 | - name: Update template 16 | uses: iterative/py-template@main 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | .benchmarks/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # vim 142 | *.swp 143 | .dvc/ 144 | 145 | .DS_Store 146 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v5.0.0 6 | hooks: 7 | - id: check-added-large-files 8 | - id: check-case-conflict 9 | - id: check-docstring-first 10 | - id: check-executables-have-shebangs 11 | - id: check-json 12 | - id: check-merge-conflict 13 | args: ['--assume-in-merge'] 14 | - id: check-toml 15 | - id: check-yaml 16 | - id: debug-statements 17 | - id: end-of-file-fixer 18 | - id: mixed-line-ending 19 | args: ['--fix=lf'] 20 | - id: sort-simple-yaml 21 | - id: trailing-whitespace 22 | - repo: https://github.com/astral-sh/ruff-pre-commit 23 | rev: 'v0.11.13' 24 | hooks: 25 | - id: ruff 26 | args: [--fix, --exit-non-zero-on-fix] 27 | - id: ruff-format 28 | - repo: https://github.com/codespell-project/codespell 29 | rev: v2.4.1 30 | hooks: 31 | - id: codespell 32 | additional_dependencies: ["tomli"] 33 | - repo: https://github.com/asottile/pyupgrade 34 | rev: v3.20.0 35 | hooks: 36 | - id: pyupgrade 37 | args: [--py38-plus] 38 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.rst: -------------------------------------------------------------------------------- 1 | Contributor Covenant Code of Conduct 2 | ==================================== 3 | 4 | Our Pledge 5 | ---------- 6 | 7 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 8 | 9 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
10 | 11 | 12 | Our Standards 13 | ------------- 14 | 15 | Examples of behavior that contributes to a positive environment for our community include: 16 | 17 | - Demonstrating empathy and kindness toward other people 18 | - Being respectful of differing opinions, viewpoints, and experiences 19 | - Giving and gracefully accepting constructive feedback 20 | - Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 21 | - Focusing on what is best not just for us as individuals, but for the overall community 22 | 23 | Examples of unacceptable behavior include: 24 | 25 | - The use of sexualized language or imagery, and sexual attention or 26 | advances of any kind 27 | - Trolling, insulting or derogatory comments, and personal or political attacks 28 | - Public or private harassment 29 | - Publishing others' private information, such as a physical or email 30 | address, without their explicit permission 31 | - Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | Enforcement Responsibilities 35 | ---------------------------- 36 | 37 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 38 | 39 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 40 | 41 | 42 | Scope 43 | ----- 44 | 45 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 46 | 47 | 48 | Enforcement 49 | ----------- 50 | 51 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at support@dvc.org. All complaints will be reviewed and investigated promptly and fairly. 52 | 53 | All community leaders are obligated to respect the privacy and security of the reporter of any incident. 54 | 55 | 56 | Enforcement Guidelines 57 | ---------------------- 58 | 59 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: 60 | 61 | 62 | 1. Correction 63 | ~~~~~~~~~~~~~ 64 | 65 | **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. 66 | 67 | **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. 68 | 69 | 70 | 2. Warning 71 | ~~~~~~~~~~ 72 | 73 | **Community Impact**: A violation through a single incident or series of actions. 74 | 75 | **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. 
This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. 76 | 77 | 78 | 3. Temporary Ban 79 | ~~~~~~~~~~~~~~~~ 80 | 81 | **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. 82 | 83 | **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. 84 | 85 | 86 | 4. Permanent Ban 87 | ~~~~~~~~~~~~~~~~ 88 | 89 | **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 90 | 91 | **Consequence**: A permanent ban from any sort of public interaction within the community. 92 | 93 | 94 | Attribution 95 | ----------- 96 | 97 | This Code of Conduct is adapted from the `Contributor Covenant `__, version 2.0, 98 | available at https://www.contributor-covenant.org/version/2/0/code_of_conduct/. 99 | 100 | Community Impact Guidelines were inspired by `Mozilla’s code of conduct enforcement ladder `__. 101 | 102 | .. _homepage: https://www.contributor-covenant.org 103 | 104 | For answers to common questions about this code of conduct, see the FAQ at 105 | https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributor Guide 2 | ================= 3 | 4 | Thank you for your interest in improving this project. 5 | This project is open-source under the `Apache 2.0 license`_ and 6 | welcomes contributions in the form of bug reports, feature requests, and pull requests. 7 | 8 | Here is a list of important resources for contributors: 9 | 10 | - `Source Code`_ 11 | - `Issue Tracker`_ 12 | - `Code of Conduct`_ 13 | 14 | .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0 15 | .. _Source Code: https://github.com/iterative/dvc-data 16 | .. _Issue Tracker: https://github.com/iterative/dvc-data/issues 17 | 18 | How to report a bug 19 | ------------------- 20 | 21 | Report bugs on the `Issue Tracker`_. 22 | 23 | When filing an issue, make sure to answer these questions: 24 | 25 | - Which operating system and Python version are you using? 26 | - Which version of this project are you using? 27 | - What did you do? 28 | - What did you expect to see? 29 | - What did you see instead? 30 | 31 | The best way to get your bug fixed is to provide a test case, 32 | and/or steps to reproduce the issue. 33 | 34 | 35 | How to request a feature 36 | ------------------------ 37 | 38 | Request features on the `Issue Tracker`_. 39 | 40 | 41 | How to set up your development environment 42 | ------------------------------------------ 43 | 44 | You need Python 3.8+ and the following tools: 45 | 46 | - Nox_ 47 | 48 | Install the package with development requirements: 49 | 50 | .. code:: console 51 | 52 | $ pip install nox 53 | 54 | .. 
_Nox: https://nox.thea.codes/ 55 | 56 | 57 | How to test the project 58 | ----------------------- 59 | 60 | Run the full test suite: 61 | 62 | .. code:: console 63 | 64 | $ nox 65 | 66 | List the available Nox sessions: 67 | 68 | .. code:: console 69 | 70 | $ nox --list-sessions 71 | 72 | You can also run a specific Nox session. 73 | For example, invoke the unit test suite like this: 74 | 75 | .. code:: console 76 | 77 | $ nox --session=tests 78 | 79 | Unit tests are located in the ``tests`` directory, 80 | and are written using the pytest_ testing framework. 81 | 82 | .. _pytest: https://pytest.readthedocs.io/ 83 | 84 | 85 | How to submit changes 86 | --------------------- 87 | 88 | Open a `pull request`_ to submit changes to this project. 89 | 90 | Your pull request needs to meet the following guidelines for acceptance: 91 | 92 | - The Nox test suite must pass without errors and warnings. 93 | - Include unit tests. This project maintains 100% code coverage. 94 | - If your changes add functionality, update the documentation accordingly. 95 | 96 | Feel free to submit early, though—we can always iterate on this. 97 | 98 | To run linting and code formatting checks, you can invoke a `lint` session in nox: 99 | 100 | .. code:: console 101 | 102 | $ nox -s lint 103 | 104 | It is recommended to open an issue before starting work on anything. 105 | This will allow a chance to talk it over with the owners and validate your approach. 106 | 107 | .. _pull request: https://github.com/iterative/dvc-data/pulls 108 | .. github-only 109 | .. _Code of Conduct: CODE_OF_CONDUCT.rst 110 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DVC data 2 | ======== 3 | 4 | |PyPI| |Status| |Python Version| |License| 5 | 6 | |Tests| |Codecov| |pre-commit| |Black| 7 | 8 | .. |PyPI| image:: https://img.shields.io/pypi/v/dvc-data.svg 9 | :target: https://pypi.org/project/dvc-data/ 10 | :alt: PyPI 11 | .. |Status| image:: https://img.shields.io/pypi/status/dvc-data.svg 12 | :target: https://pypi.org/project/dvc-data/ 13 | :alt: Status 14 | .. |Python Version| image:: https://img.shields.io/pypi/pyversions/dvc-data 15 | :target: https://pypi.org/project/dvc-data 16 | :alt: Python Version 17 | .. |License| image:: https://img.shields.io/pypi/l/dvc-data 18 | :target: https://opensource.org/licenses/Apache-2.0 19 | :alt: License 20 | .. |Tests| image:: https://github.com/iterative/dvc-data/workflows/Tests/badge.svg 21 | :target: https://github.com/iterative/dvc-data/actions?workflow=Tests 22 | :alt: Tests 23 | .. |Codecov| image:: https://codecov.io/gh/iterative/dvc-data/branch/main/graph/badge.svg 24 | :target: https://app.codecov.io/gh/iterative/dvc-data 25 | :alt: Codecov 26 | .. |pre-commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white 27 | :target: https://github.com/pre-commit/pre-commit 28 | :alt: pre-commit 29 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 30 | :target: https://github.com/psf/black 31 | :alt: Black 32 | 33 | 34 | Features 35 | -------- 36 | 37 | * TODO 38 | 39 | 40 | Requirements 41 | ------------ 42 | 43 | * TODO 44 | 45 | 46 | Installation 47 | ------------ 48 | 49 | You can install *DVC data* via pip_ from PyPI_: 50 | 51 | .. 
code:: console
52 | 
53 |    $ pip install dvc-data
54 | 
55 | 
56 | Usage
57 | -----
58 | 
59 | HashFile
60 | ^^^^^^^^
61 | 
62 | HashFile
63 | """"""""
64 | 
65 | Based on dvc-objects' `Object`, this is an object that has a particular hash that can be used to verify its contents. Similar to git's `ShaFile`.
66 | 
67 | .. code:: python
68 | 
69 |    from dvc_data.hashfile.obj import HashFile
70 | 
71 |    obj = HashFile("/path/to/file", fs, HashInfo("md5", "36eba1e1e343279857ea7f69a597324e"))
72 | 
73 | HashFileDB
74 | """"""""""
75 | 
76 | Based on dvc-objects' `ObjectDB`, but stores `HashFile` objects and so is able to verify their contents by their `hash_info`. Similar to git's `ObjectStore`. A short end-to-end example is included at the bottom of this README.
77 | 
78 | .. code:: python
79 | 
80 |    from dvc_data.hashfile.db import HashFileDB
81 | 
82 |    odb = HashFileDB(fs, "/path/to/odb")
83 | 
84 | Index
85 | ^^^^^
86 | 
87 | Index
88 | """""
89 | 
90 | A trie-like structure that represents data files and directories.
91 | 
92 | .. code:: python
93 | 
94 |    from dvc_data.index import DataIndex, DataIndexEntry
95 | 
96 |    index = DataIndex()
97 |    index[("foo",)] = DataIndexEntry(hash_info=hash_info, meta=meta)
98 | 
99 | 
100 | Storage
101 | """""""
102 | 
103 | A mapping that describes where to find data contents for index entries. Can be either `ObjectStorage` for `HashFileDB`-based storage or `FileStorage` for backup-like plain file storage.
104 | 
105 | .. code:: python
106 | 
107 |    index.storage_map[("foo",)] = ObjectStorage(...)
108 | 
109 | Contributing
110 | ------------
111 | 
112 | Contributions are very welcome.
113 | To learn more, see the `Contributor Guide`_.
114 | 
115 | 
116 | License
117 | -------
118 | 
119 | Distributed under the terms of the `Apache 2.0 license`_,
120 | *DVC data* is free and open source software.
121 | 
122 | 
123 | Issues
124 | ------
125 | 
126 | If you encounter any problems,
127 | please `file an issue`_ along with a detailed description.
128 | 
129 | 
130 | .. _Apache 2.0 license: https://opensource.org/licenses/Apache-2.0
131 | .. _PyPI: https://pypi.org/
132 | .. _file an issue: https://github.com/iterative/dvc-data/issues
133 | .. _pip: https://pip.pypa.io/
134 | .. github-only
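
Putting these APIs together, the following is a minimal sketch (illustrative only, with `/path/to/file` and `/path/to/odb` as placeholders): it hashes a local file, stores it in a `HashFileDB`, then verifies and loads it back. It assumes the optional `state` argument of `hash_file` can be left at its default.

.. code:: python

   from dvc_objects.fs.local import LocalFileSystem

   from dvc_data.hashfile.db import HashFileDB
   from dvc_data.hashfile.hash import hash_file

   fs = LocalFileSystem()
   odb = HashFileDB(fs, "/path/to/odb")

   # hash the file and register the result in the ODB under its oid
   meta, hash_info = hash_file("/path/to/file", fs, "md5")
   odb.add("/path/to/file", fs, hash_info.value)

   # verify the cached copy and load it back as a HashFile
   odb.check(hash_info.value, check_hash=True)
   obj = odb.get(hash_info.value)

135 | ..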
_Contributor Guide: CONTRIBUTING.rst 136 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Automation using nox.""" 2 | 3 | import glob 4 | import os 5 | 6 | import nox 7 | 8 | nox.options.default_venv_backend = "uv|virtualenv" 9 | nox.options.reuse_existing_virtualenvs = True 10 | nox.options.sessions = "lint", "tests" 11 | 12 | 13 | @nox.session( 14 | python=["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.8", "pypy3.9"] 15 | ) 16 | def tests(session: nox.Session) -> None: 17 | session.install(".[tests,cli]") 18 | session.run( 19 | "pytest", 20 | "--cov", 21 | "--cov-config=pyproject.toml", 22 | *session.posargs, 23 | env={"COVERAGE_FILE": f".coverage.{session.python}"}, 24 | ) 25 | 26 | 27 | @nox.session 28 | def bench(session: nox.Session) -> None: 29 | session.install(".[tests,cli]") 30 | storage = os.getenv("PYTEST_BENCHMARK_STORAGE", "file://.benchmarks") 31 | session.run( 32 | "pytest", 33 | "--benchmark-storage", 34 | storage, 35 | "--benchmark-only", 36 | *session.posargs, 37 | ) 38 | 39 | 40 | @nox.session 41 | def lint(session: nox.Session) -> None: 42 | session.install("pre-commit") 43 | session.install("-e", ".[dev]") 44 | 45 | args = *(session.posargs or ("--show-diff-on-failure",)), "--all-files" 46 | session.run("pre-commit", "run", *args) 47 | session.run("python", "-m", "mypy") 48 | 49 | 50 | @nox.session 51 | def build(session: nox.Session) -> None: 52 | session.install("twine", "uv") 53 | session.run("uv", "build") 54 | dists = glob.glob("dist/*") 55 | session.run("twine", "check", *dists, silent=True) 56 | 57 | 58 | @nox.session 59 | def dev(session: nox.Session) -> None: 60 | """Sets up a python development environment for the project.""" 61 | args = session.posargs or ("venv",) 62 | venv_dir = os.fsdecode(os.path.abspath(args[0])) 63 | 64 | session.log(f"Setting up virtual environment in {venv_dir}") 65 | session.install("virtualenv") 66 | session.run("virtualenv", venv_dir, silent=True) 67 | 68 | python = os.path.join(venv_dir, "bin/python") 69 | session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True) 70 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=48", "setuptools_scm[toml]>=6.3.1"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | 7 | [project] 8 | name = "dvc-data" 9 | description = "DVC's data management subsystem" 10 | readme = "README.rst" 11 | license = {text = "Apache-2.0"} 12 | authors = [{ name = "Iterative", email = "support@dvc.org" }] 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Programming Language :: Python :: 3.13", 20 | "Development Status :: 4 - Beta", 21 | ] 22 | requires-python = ">=3.9" 23 | dynamic = ["version"] 24 | dependencies = [ 25 | "attrs>=21.3.0", 26 | "dictdiffer>=0.8.1", 27 | "diskcache>=5.2.1", 28 | "dvc-objects>=4.0.1,<6", 29 | "fsspec>=2024.2.0", 30 | "funcy>=1.14; python_version < '3.12'", 31 | "pygtrie>=2.3.2", 32 | "sqltrie>=0.11.0,<1", 33 | "tqdm>=4.63.1,<5", 34 | "orjson>=3,<4; implementation_name=='cpython'", 35 | ] 36 | 37 | [project.urls] 38 | Issues 
= "https://github.com/iterative/dvc-data/issues" 39 | Source = "https://github.com/iterative/dvc-data" 40 | 41 | [project.optional-dependencies] 42 | cli = [ 43 | "typer-slim>=0.12", 44 | ] 45 | all = [ 46 | "dvc-data[cli]", 47 | ] 48 | tests = [ 49 | "pytest>=7,<9", 50 | "pytest-sugar", 51 | "pytest-cov>=4.1.0", 52 | "pytest-mock", 53 | "pytest-benchmark>=4", 54 | "pytest-servers==0.5.10", 55 | ] 56 | dev = [ 57 | "dvc-data[all]", 58 | "dvc-data[tests]", 59 | "blake3>=0.3.1", 60 | "mypy==1.15.0", 61 | "types-tqdm", 62 | ] 63 | 64 | [project.scripts] 65 | dvc-data = "dvc_data.__main__:main" 66 | 67 | [tool.setuptools.package-data] 68 | dvc_data = ["py.typed"] 69 | 70 | [tool.setuptools.packages.find] 71 | where = ["src"] 72 | namespaces = false 73 | 74 | [tool.pytest.ini_options] 75 | addopts = "-ra --benchmark-skip" 76 | filterwarnings = [ 77 | "error", 78 | "ignore:datetime.datetime.*:DeprecationWarning", 79 | ] 80 | 81 | [tool.coverage.run] 82 | branch = true 83 | source = ["dvc_data", "tests"] 84 | 85 | [tool.coverage.paths] 86 | source = ["src", "*/site-packages"] 87 | 88 | [tool.coverage.report] 89 | show_missing = true 90 | exclude_lines = [ 91 | "pragma: no cover", 92 | "if __name__ == .__main__.:", 93 | "if typing.TYPE_CHECKING:", 94 | "if TYPE_CHECKING:", 95 | "raise NotImplementedError", 96 | "raise AssertionError", 97 | "@overload", 98 | ] 99 | 100 | [tool.mypy] 101 | # Error output 102 | show_column_numbers = true 103 | show_error_codes = true 104 | show_error_context = true 105 | show_traceback = true 106 | pretty = true 107 | check_untyped_defs = true 108 | # Warnings 109 | warn_no_return = true 110 | warn_redundant_casts = true 111 | warn_unreachable = true 112 | strict_equality = true 113 | no_implicit_optional = true 114 | warn_unused_configs = true 115 | files = ["src", "tests"] 116 | 117 | [[tool.mypy.overrides]] 118 | ignore_missing_imports = true 119 | module = [ 120 | "fsspec.*", 121 | "funcy", 122 | "diskcache", 123 | "pygtrie", 124 | "dictdiffer", 125 | "shortuuid.*", 126 | ] 127 | 128 | [tool.codespell] 129 | ignore-words-list = "fo" 130 | skip = "CODE_OF_CONDUCT.rst" 131 | 132 | [tool.ruff] 133 | show-fixes = true 134 | 135 | [tool.ruff.lint] 136 | preview = true 137 | explicit-preview-rules = true 138 | ignore = [ 139 | "A005", # stdlib-module-shadowing 140 | "PLR2004", # magic-value-comparison 141 | "PLW2901", # redefined-loop-name 142 | "RET501", # unnecessary-return-none 143 | "RET502", # implicit-return-value 144 | "RET503", # implicit-return 145 | "S101", # assert 146 | "SIM105", # suppressible-exception 147 | "SIM108", # if-else-block-instead-of-if-exp 148 | "SIM117", # multiple-with-statements 149 | ] 150 | select = [ 151 | "A", # flake8-buitlins 152 | "ASYNC", # flake8-async 153 | "B", # flake8-bugbear 154 | "BLE", # flake8-blind-except 155 | "C4", # flake8-comprehensions 156 | "C90", # mccabe 157 | "DTZ", # flake8-datetimez 158 | "E", # pycodestyle - Error 159 | "EXE", # flake8-executable 160 | "F", # pyflakes 161 | "FLY", # flynt-rules 162 | "G", # flake8-logging-format 163 | "I", # isort 164 | "ICN", # flake8-import-conventions 165 | "INP", # flake8-no-pep420 166 | "ISC", # flake8-implicit-str-concat 167 | "N", # pep8-naming 168 | "PERF101", # perflint 169 | "PGH", # pygrep-hooks 170 | "PIE", # flake8-pie 171 | "PL", # pylint 172 | "PT", # flake8-pytest-style 173 | "PYI", # flake8-pyi 174 | "Q", # flae8-quotes 175 | "RET", # flake8-return 176 | "RSE", # flake8-raise 177 | "RUF", # ruff 178 | "S", # flake8-bandit 179 | "SIM", # flake8-simplify 180 | 
"SLOT", # flake8-slots 181 | "T10", # flake8-debugger 182 | "T20", # flake8-print 183 | "TCH", # flake8-type-checking 184 | "TCH", # flake8-type-checking 185 | "TID", # flake8-tidy-imports 186 | "UP", # pyupgrade 187 | "W", # pycodestyle - Warning 188 | "YTT", # flake8-2020 189 | ] 190 | 191 | [tool.ruff.lint.flake8-pytest-style] 192 | fixture-parentheses = false 193 | mark-parentheses = false 194 | parametrize-names-type = "csv" 195 | 196 | [tool.ruff.lint.flake8-tidy-imports] 197 | [tool.ruff.lint.flake8-tidy-imports.banned-api] 198 | "funcy.cached_property" = {msg = "use `from dvc_data.compat import cached_property` instead."} 199 | "functools.cached_property" = {msg = "use `from dvc_data.compat import cached_property` instead."} 200 | 201 | [tool.ruff.lint.flake8-type-checking] 202 | strict = true 203 | 204 | [tool.ruff.lint.isort] 205 | known-first-party = ["dvc_data"] 206 | 207 | [tool.ruff.lint.pylint] 208 | max-args = 10 209 | 210 | [tool.ruff.lint.per-file-ignores] 211 | "src/dvc_data/cli.py" = ["T201", "B008"] 212 | -------------------------------------------------------------------------------- /src/dvc_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/src/dvc_data/__init__.py -------------------------------------------------------------------------------- /src/dvc_data/__main__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .cli import main 3 | except ImportError: # pragma: no cover 4 | 5 | def main(): # type: ignore[misc] 6 | import sys 7 | 8 | print( # noqa: T201 9 | "dvc-data could not run because the required " 10 | "dependencies are not installed.\n" 11 | "Please install it with: pip install 'dvc-data[cli]'" 12 | ) 13 | sys.exit(1) 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | -------------------------------------------------------------------------------- /src/dvc_data/callbacks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | import sys 5 | from typing import Any, BinaryIO, ClassVar, Optional, Union 6 | 7 | import fsspec 8 | from tqdm import tqdm 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def env2bool(var, undefined=False): 14 | """ 15 | undefined: return value if env var is unset 16 | """ 17 | var = os.getenv(var, None) 18 | if var is None: 19 | return undefined 20 | return bool(re.search("1|y|yes|true", var, flags=re.I)) 21 | 22 | 23 | class Tqdm(tqdm): 24 | """ 25 | maximum-compatibility tqdm-based progressbars 26 | """ 27 | 28 | BAR_FMT_DEFAULT = ( 29 | "{percentage:3.0f}% {desc}|{bar}|" 30 | "{postfix[info]}{n_fmt}/{total_fmt}" 31 | " [{elapsed}<{remaining}, {rate_fmt:>11}]" 32 | ) 33 | # nested bars should have fixed bar widths to align nicely 34 | BAR_FMT_DEFAULT_NESTED = ( 35 | "{percentage:3.0f}%|{bar:10}|{desc:{ncols_desc}.{ncols_desc}}" 36 | "{postfix[info]}{n_fmt}/{total_fmt}" 37 | " [{elapsed}<{remaining}, {rate_fmt:>11}]" 38 | ) 39 | BAR_FMT_NOTOTAL = "{desc}{bar:b}|{postfix[info]}{n_fmt} [{elapsed}, {rate_fmt:>11}]" 40 | BYTES_DEFAULTS: ClassVar[dict[str, Any]] = { 41 | "unit": "B", 42 | "unit_scale": True, 43 | "unit_divisor": 1024, 44 | "miniters": 1, 45 | } 46 | 47 | def __init__( 48 | self, 49 | iterable=None, 50 | disable=None, 51 | level=logging.ERROR, 52 | desc=None, 53 | leave=False, 54 | bar_format=None, 55 | bytes=False, # noqa: A002 
56 |         file=None,
57 |         total=None,
58 |         postfix=None,
59 |         **kwargs,
60 |     ):
61 |         """
62 |         bytes : shortcut for
63 |             `unit='B', unit_scale=True, unit_divisor=1024, miniters=1`
64 |         desc : persists after `close()`
65 |         level : effective logging level for determining `disable`;
66 |             used only if `disable` is unspecified
67 |         disable : If None (default) or False,
68 |             will be determined by logging level.
69 |             May be overridden to `True` due to non-TTY status.
70 |             Skip override by specifying env var `DVC_IGNORE_ISATTY`.
71 |         kwargs : anything accepted by `tqdm.tqdm()`
72 |         """
73 |         kwargs = kwargs.copy()
74 |         if bytes:
75 |             kwargs = {**self.BYTES_DEFAULTS, **kwargs}
76 |         else:
77 |             kwargs.setdefault("unit_scale", total > 999 if total else True)
78 |         if file is None:
79 |             file = sys.stderr
80 |         # auto-disable based on `logger.level`
81 |         if not disable:
82 |             disable = logger.getEffectiveLevel() > level
83 |         # auto-disable based on TTY
84 |         if (
85 |             not disable
86 |             and not env2bool("DVC_IGNORE_ISATTY")
87 |             and hasattr(file, "isatty")
88 |         ):
89 |             disable = not file.isatty()
90 |         super().__init__(
91 |             iterable=iterable,
92 |             disable=disable,
93 |             leave=leave,
94 |             desc=desc,
95 |             bar_format="!",
96 |             lock_args=(False,),
97 |             total=total,
98 |             **kwargs,
99 |         )
100 |         self.postfix = postfix or {"info": ""}
101 |         if bar_format is None:
102 |             if self.__len__():
103 |                 self.bar_format = (
104 |                     self.BAR_FMT_DEFAULT_NESTED if self.pos else self.BAR_FMT_DEFAULT
105 |                 )
106 |             else:
107 |                 self.bar_format = self.BAR_FMT_NOTOTAL
108 |         else:
109 |             self.bar_format = bar_format
110 |         self.refresh()
111 | 
112 |     def close(self):
113 |         self.postfix["info"] = ""
114 |         # remove ETA (either unknown or zero); remove completed bar
115 |         self.bar_format = self.bar_format.replace("<{remaining}", "").replace(
116 |             "|{bar:10}|", " "
117 |         )
118 |         super().close()
119 | 
120 |     @property
121 |     def format_dict(self):
122 |         """inject `ncols_desc` to fill the display width (`ncols`)"""
123 |         d = super().format_dict
124 |         ncols = d["ncols"] or 80
125 |         # assumes `bar_format` has max one of ("ncols_desc" & "ncols_info")
126 | 
127 |         meter = self.format_meter(  # type: ignore[call-arg]
128 |             ncols_desc=1, ncols_info=1, **d
129 |         )
130 |         ncols_left = ncols - len(meter) + 1
131 |         ncols_left = max(ncols_left, 0)
132 |         if ncols_left:
133 |             d["ncols_desc"] = d["ncols_info"] = ncols_left
134 |         else:
135 |             # work-around for zero-width description
136 |             d["ncols_desc"] = d["ncols_info"] = 1
137 |             d["prefix"] = ""
138 |         return d
139 | 
140 | 
141 | class TqdmCallback(fsspec.callbacks.TqdmCallback):
142 |     def __init__(
143 |         self,
144 |         size: Optional[int] = None,
145 |         value: int = 0,
146 |         progress_bar: Optional["tqdm"] = None,
147 |         tqdm_cls: Optional[type["tqdm"]] = None,
148 |         **tqdm_kwargs,
149 |     ):
150 |         tqdm_kwargs.pop("total", None)
151 |         super().__init__(
152 |             tqdm_kwargs=tqdm_kwargs, tqdm_cls=tqdm_cls or Tqdm, size=size, value=value
153 |         )
154 |         if progress_bar is not None:
155 |             self.tqdm = progress_bar
156 | 
157 |     def branched(self, path_1: "Union[str, BinaryIO]", path_2: str, **kwargs):
158 |         desc = path_1 if isinstance(path_1, str) else path_2
159 |         return TqdmCallback(bytes=True, desc=desc)
160 | 
-------------------------------------------------------------------------------- /src/dvc_data/compat.py: --------------------------------------------------------------------------------
1 | import sys
2 | from collections.abc import Iterable, Iterator
3 | from itertools import islice
4 | from typing import TYPE_CHECKING, TypeVar
5 | 
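# `cached_property` is re-exported from here so the rest of the codebase has a
# single import location (see the `banned-api` rules in pyproject.toml): the
# stdlib `functools` version on Python >= 3.12 or while type checking, and
# funcy's implementation otherwise.
# `batched` below mirrors `itertools.batched` from Python 3.12: it yields
# fixed-size tuples with a shorter final chunk, e.g. list(batched("ABCDE", 2))
# gives [("A", "B"), ("C", "D"), ("E",)], and raises ValueError when n < 1.
6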
| if sys.version_info >= (3, 12) or TYPE_CHECKING: 7 | from functools import cached_property # noqa: TID251 8 | else: 9 | from funcy import cached_property # noqa: TID251 10 | 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | def batched(iterable: Iterable[T], n: int) -> Iterator[tuple[T, ...]]: 16 | if n < 1: 17 | raise ValueError("n must be at least one") 18 | it = iter(iterable) 19 | while batch := tuple(islice(it, n)): 20 | yield batch 21 | 22 | 23 | __all__ = ["batched", "cached_property"] 24 | -------------------------------------------------------------------------------- /src/dvc_data/fs.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import errno 3 | import logging 4 | import os 5 | import posixpath 6 | import typing 7 | from collections import deque 8 | from typing import Any, BinaryIO, NamedTuple, Optional 9 | 10 | from fsspec import AbstractFileSystem 11 | from fsspec.callbacks import DEFAULT_CALLBACK, NoOpCallback 12 | 13 | if typing.TYPE_CHECKING: 14 | from dvc_objects.fs.base import AnyFSPath, FileSystem 15 | from fsspec import Callback 16 | 17 | from dvc_data.hashfile.db import HashFileDB 18 | 19 | from .hashfile.hash_info import HashInfo 20 | from .index import DataIndex, DataIndexEntry, ObjectStorage 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class _WrappedCallback(NoOpCallback): 26 | # check `_get_file` for more details 27 | def branched(self, path_1, path_2, **kwargs): 28 | # NOTE: only safe for a single use 29 | return self.kw.get("callback", DEFAULT_CALLBACK) 30 | 31 | 32 | class FileInfo(NamedTuple): 33 | typ: str 34 | storage: "ObjectStorage" 35 | cache_storage: "ObjectStorage" 36 | hash_info: Optional["HashInfo"] 37 | fs: "FileSystem" 38 | fs_path: "AnyFSPath" 39 | 40 | 41 | class DataFileSystem(AbstractFileSystem): 42 | root_marker = "/" 43 | 44 | def __init__(self, index: "DataIndex", **kwargs: Any): 45 | super().__init__(**kwargs) 46 | self.index = index 47 | 48 | @classmethod 49 | def join(cls, *parts: str) -> str: 50 | return posixpath.join(*parts) 51 | 52 | @classmethod 53 | def parts(cls, path: str) -> tuple[str, ...]: 54 | ret = [] 55 | while True: 56 | path, part = posixpath.split(path) 57 | 58 | if part: 59 | ret.append(part) 60 | continue 61 | 62 | if path: 63 | ret.append(path) 64 | 65 | break 66 | 67 | ret.reverse() 68 | 69 | return tuple(ret) 70 | 71 | def getcwd(self) -> str: 72 | return self.root_marker 73 | 74 | def normpath(self, path: str) -> str: 75 | return posixpath.normpath(path) 76 | 77 | def abspath(self, path: str) -> str: 78 | if not posixpath.isabs(path): 79 | path = self.join(self.getcwd(), path) 80 | return self.normpath(path) 81 | 82 | def relpath(self, path: str, start: Optional[str] = None) -> str: 83 | if start is None: 84 | start = "." 
85 | return posixpath.relpath(self.abspath(path), start=self.abspath(start)) 86 | 87 | def relparts(self, path: str, start: Optional[str] = None) -> tuple[str, ...]: 88 | return self.parts(self.relpath(path, start=start)) 89 | 90 | def _get_key(self, path: str) -> tuple[str, ...]: 91 | path = self.abspath(path) 92 | if path == self.root_marker: 93 | return () 94 | 95 | key = self.relparts(path, self.root_marker) 96 | if key in ((".",), ("",)): 97 | key = () 98 | 99 | return key 100 | 101 | def _get_fs_path(self, path: "AnyFSPath", info=None) -> FileInfo: 102 | from .index import StorageKeyError 103 | 104 | info = info or self.info(path) 105 | if info["type"] == "directory": 106 | raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), path) 107 | 108 | entry: Optional[DataIndexEntry] = info["entry"] 109 | 110 | assert entry 111 | hash_info: Optional[HashInfo] = entry.hash_info 112 | 113 | for typ in ["cache", "remote", "data"]: 114 | try: 115 | info = self.index.storage_map[entry.key] 116 | storage = getattr(info, typ) 117 | if not storage: 118 | continue 119 | data = storage.get(entry) 120 | except (ValueError, StorageKeyError): 121 | continue 122 | if data: 123 | fs, fs_path = data 124 | if fs.exists(fs_path): 125 | return FileInfo(typ, storage, info.cache, hash_info, fs, fs_path) 126 | 127 | raise FileNotFoundError(errno.ENOENT, "No storage files available", path) 128 | 129 | def _cache_remote_file( 130 | self, 131 | cache_storage: "ObjectStorage", 132 | fs: "FileSystem", 133 | path: "AnyFSPath", 134 | hash_info: Optional["HashInfo"], 135 | ) -> tuple["FileSystem", "AnyFSPath"]: 136 | from dvc_objects.fs.local import LocalFileSystem 137 | 138 | odb: HashFileDB = cache_storage.odb 139 | oid = hash_info.value if hash_info else None 140 | hash_name = hash_info.name if hash_info else None 141 | assert odb.hash_name 142 | 143 | if isinstance(fs, LocalFileSystem) or not oid or odb.hash_name != hash_name: 144 | return fs, path 145 | 146 | odb.add(path, fs, oid) 147 | return odb.fs, odb.oid_to_path(oid) 148 | 149 | def _open(self, path: "AnyFSPath", **kwargs: Any) -> "BinaryIO": 150 | typ, _, cache_storage, hi, fs, fspath = self._get_fs_path(path) 151 | 152 | if kwargs.get("cache", False) and typ == "remote" and cache_storage: 153 | fs, fspath = self._cache_remote_file(cache_storage, fs, fspath, hi) 154 | 155 | return fs.open(fspath, mode="rb") 156 | 157 | def ls(self, path: "AnyFSPath", detail: bool = True, **kwargs: Any): 158 | root_key = self._get_key(path) 159 | try: 160 | info = self.index.info(root_key) 161 | if info["type"] == "file": 162 | info["name"] = path = self.join(*root_key) 163 | return [info] if detail else [path] 164 | if not detail: 165 | return [ 166 | self.join(path, key[-1]) 167 | for key in self.index.ls(root_key, detail=False) 168 | ] 169 | 170 | entries = [] 171 | for key, info in self.index.ls(root_key, detail=True): 172 | info["name"] = self.join(path, key[-1]) 173 | entries.append(info) 174 | return entries 175 | except KeyError as exc: 176 | raise FileNotFoundError( 177 | errno.ENOENT, os.strerror(errno.ENOENT), path 178 | ) from exc 179 | 180 | def info(self, path: "AnyFSPath", **kwargs: Any): 181 | key = self._get_key(path) 182 | 183 | try: 184 | info = self.index.info(key) 185 | except KeyError as exc: 186 | raise FileNotFoundError( 187 | errno.ENOENT, 188 | os.strerror(errno.ENOENT), 189 | path, 190 | ) from exc 191 | 192 | info["name"] = path 193 | return info 194 | 195 | def get_file( 196 | self, 197 | rpath: "AnyFSPath", 198 | lpath: "AnyFSPath", 199 
| callback: "Callback" = DEFAULT_CALLBACK, 200 | info: Optional[dict[str, Any]] = None, 201 | **kwargs: Any, 202 | ) -> None: 203 | from dvc_objects.fs.generic import transfer 204 | from dvc_objects.fs.local import LocalFileSystem 205 | 206 | from dvc_data.index import ObjectStorage 207 | 208 | try: 209 | typ, storage, cache_storage, hi, fs, path = self._get_fs_path(rpath, info) 210 | except IsADirectoryError: 211 | os.makedirs(lpath, exist_ok=True) 212 | return None 213 | 214 | cache = kwargs.pop("cache", False) 215 | if cache and typ == "remote" and cache_storage: 216 | fs, path = self._cache_remote_file(cache_storage, fs, path, hi) 217 | storage = cache_storage 218 | 219 | if ( 220 | isinstance(storage, ObjectStorage) 221 | and isinstance(fs, LocalFileSystem) 222 | and storage.odb.cache_types 223 | ): 224 | try: 225 | transfer( 226 | fs, 227 | path, 228 | fs, 229 | os.fspath(lpath), 230 | # `transfer` supports uploading multiple files, so it uses the 231 | # passed callback to iterate for no. of files. 232 | # So, we wrap the given callback in a `NoOpCallback` and return it 233 | # in `branch` so that file copy callback gets properly called. 234 | # This is safe for transferring a single file. 235 | callback=_WrappedCallback(callback=callback), 236 | links=copy.copy(storage.odb.cache_types), 237 | ) 238 | return 239 | except OSError: 240 | pass 241 | 242 | fs.get_file(path, lpath, callback=callback, **kwargs) 243 | 244 | def checksum(self, path: str) -> str: 245 | info = self.info(path) 246 | md5 = info.get("md5") 247 | if md5: 248 | assert isinstance(md5, str) 249 | return md5 250 | raise NotImplementedError 251 | 252 | def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs): 253 | if maxdepth is not None: 254 | raise NotImplementedError 255 | 256 | sizes = {} 257 | todo = deque([self.info(path)]) 258 | while todo: 259 | info = todo.popleft() 260 | 261 | sizes[info["name"]] = info["size"] or 0 262 | 263 | if info["type"] != "directory": 264 | continue 265 | 266 | entry = info.get("entry") 267 | if entry is not None and entry.size is not None: 268 | continue 269 | 270 | todo.extend(self.ls(info["name"], detail=True)) 271 | 272 | if total: 273 | return sum(sizes.values()) 274 | 275 | return sizes 276 | -------------------------------------------------------------------------------- /src/dvc_data/fsutils.py: -------------------------------------------------------------------------------- 1 | from os import readlink, stat 2 | from stat import S_ISDIR, S_ISLNK, S_ISREG 3 | from typing import Any 4 | 5 | 6 | def _localfs_info(path: str) -> dict[str, Any]: 7 | out = stat(path, follow_symlinks=False) 8 | if link := S_ISLNK(out.st_mode): 9 | out = stat(path, follow_symlinks=True) 10 | if S_ISDIR(out.st_mode): 11 | t = "directory" 12 | elif S_ISREG(out.st_mode): 13 | t = "file" 14 | else: 15 | t = "other" 16 | 17 | result = { 18 | "name": path, 19 | "size": out.st_size, 20 | "type": t, 21 | "created": out.st_ctime, 22 | "islink": link, 23 | "mode": out.st_mode, 24 | "uid": out.st_uid, 25 | "gid": out.st_gid, 26 | "mtime": out.st_mtime, 27 | "ino": out.st_ino, 28 | "nlink": out.st_nlink, 29 | } 30 | if link: 31 | result["destination"] = readlink(path) 32 | return result 33 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/__init__.py: -------------------------------------------------------------------------------- 1 | """DVC data.""" 2 | 3 | import logging 4 | from collections.abc import Iterator 5 | from typing import 
TYPE_CHECKING, Union, cast 6 | 7 | from .tree import Tree 8 | 9 | if TYPE_CHECKING: 10 | from .db import HashFileDB 11 | from .hash_info import HashInfo 12 | from .obj import HashFile 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def check(odb: "HashFileDB", obj: "HashFile", **kwargs): 18 | if isinstance(obj, Tree): 19 | for _, _, hash_info in obj: 20 | odb.check(hash_info.value, **kwargs) 21 | 22 | odb.check(obj.oid, **kwargs) 23 | 24 | 25 | def load(odb: "HashFileDB", hash_info: "HashInfo") -> "HashFile": 26 | if hash_info.isdir: 27 | return Tree.load(odb, hash_info) 28 | return odb.get(cast("str", hash_info.value)) 29 | 30 | 31 | def iterobjs(obj: Union["Tree", "HashFile"]) -> Iterator[Union["Tree", "HashFile"]]: 32 | if isinstance(obj, Tree): 33 | yield from (entry_obj for _, entry_obj in obj) 34 | yield obj 35 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/_ignore.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from typing import TYPE_CHECKING, Any 3 | 4 | from typing_extensions import Protocol 5 | 6 | if TYPE_CHECKING: 7 | from dvc_objects.fs.base import AnyFSPath, FileSystem 8 | 9 | 10 | class Ignore(Protocol): 11 | def find(self, fs: "FileSystem", path: "AnyFSPath") -> Iterator["AnyFSPath"]: ... 12 | 13 | def walk(self, fs: "FileSystem", path: "AnyFSPath", **kwargs: Any): ... 14 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/_progress.py: -------------------------------------------------------------------------------- 1 | from dvc_data.callbacks import Tqdm 2 | 3 | 4 | class QueryingProgress(Tqdm): 5 | def __init__(self, iterable=None, total=None, name=None, phase="Querying"): 6 | msg_part = "cache in " + f"'{name}'" if name else "remote cache" 7 | msg_fmt = "{phase} " + msg_part 8 | 9 | self._estimating_msg = msg_fmt.format(phase="Estimating size of") 10 | self._listing_msg = msg_fmt.format(phase="Querying") 11 | self.desc = desc = msg_fmt.format(phase=phase) 12 | super().__init__( 13 | iterable=iterable, 14 | desc=desc, 15 | total=total, 16 | unit="files", 17 | unit_scale=False, 18 | bar_format=self.BAR_FMT_DEFAULT, 19 | ) 20 | 21 | def callback(self, phase, *args): 22 | total = args[0] if args else self.total 23 | completed = args[1] if len(args) > 1 else self.n 24 | if phase == "estimating": 25 | self.desc = self._estimating_msg 26 | elif phase == "querying": 27 | self.desc = self._listing_msg 28 | if total: 29 | self.total = total 30 | self.update(completed - self.n) 31 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sqlite3 4 | from collections.abc import Iterable, Iterator, Sequence 5 | from functools import wraps 6 | from itertools import zip_longest 7 | from typing import Any, ClassVar, Literal, Optional 8 | 9 | import diskcache 10 | from diskcache import Disk as _Disk 11 | from diskcache import ( 12 | Index, # noqa: F401 13 | Timeout, # noqa: F401 14 | ) 15 | 16 | from dvc_data.compat import batched 17 | 18 | 19 | class DiskError(Exception): 20 | def __init__(self, directory: str, type: str) -> None: # noqa: A002 21 | self.directory = directory 22 | self.type = type 23 | super().__init__(f"Could not open disk '{type}' in {directory}") 24 | 25 | 26 | def 
translate_pickle_error(fn): 27 | @wraps(fn) 28 | def wrapped(self, *args, **kwargs): 29 | try: 30 | return fn(self, *args, **kwargs) 31 | except (pickle.PickleError, ValueError) as e: 32 | if isinstance(e, ValueError) and "pickle protocol" not in str(e): 33 | raise 34 | 35 | raise DiskError(self._directory, type=self._type) from e 36 | 37 | return wrapped 38 | 39 | 40 | class Disk(_Disk): 41 | """Reraise pickle-related errors as DiskError.""" 42 | 43 | # we need type to differentiate cache for better error messages 44 | _type: str 45 | 46 | put = translate_pickle_error(_Disk.put) 47 | get = translate_pickle_error(_Disk.get) 48 | store = translate_pickle_error(_Disk.store) 49 | fetch = translate_pickle_error(_Disk.fetch) 50 | 51 | 52 | class Cache(diskcache.Cache): 53 | """Extended to handle pickle errors and use a constant pickle protocol.""" 54 | 55 | def __init__( 56 | self, 57 | directory: Optional[str] = None, 58 | timeout: int = 60, 59 | disk: _Disk = Disk, 60 | type: Optional[str] = None, # noqa: A002 61 | **settings: Any, 62 | ) -> None: 63 | settings.setdefault("disk_pickle_protocol", 4) 64 | settings.setdefault("cull_limit", 0) 65 | super().__init__(directory=directory, timeout=timeout, disk=disk, **settings) 66 | self.disk._type = self._type = type or os.path.basename(self.directory) 67 | 68 | def __getstate__(self): 69 | return (*super().__getstate__(), self._type) 70 | 71 | 72 | class HashesCache(Cache): 73 | SUPPORTS_UPSERT = sqlite3.sqlite_version_info >= (3, 24, 0) 74 | SQLITE_MAX_VARIABLE_NUMBER: ClassVar[Literal[999]] = 999 75 | """The maximum number of host parameters is 999 for SQLite versions prior to 3.32.0 76 | (2020-05-22) or 32766 for SQLite versions after 3.32.0. 77 | 78 | Increasing this number does not yield any performance improvement, so we leave it at 79 | the old default. 80 | """ 81 | 82 | def get_many( 83 | self, keys: Iterable[str], default=None 84 | ) -> Iterator[tuple[str, Optional[str]]]: 85 | if self.is_empty(): 86 | yield from zip_longest(keys, []) 87 | return 88 | 89 | for chunk in batched(keys, self.SQLITE_MAX_VARIABLE_NUMBER): 90 | params = ", ".join("?" * len(chunk)) 91 | query = f"SELECT key, value FROM Cache WHERE key IN ({params}) and raw = 1" # noqa: S608 92 | d = dict(self._sql(query, chunk).fetchall()) 93 | for key in chunk: 94 | yield key, d.get(key, default) 95 | 96 | def set_many(self, items: Sequence[tuple[str, str]], retry: bool = False) -> None: 97 | if not items: 98 | return 99 | 100 | if self.SUPPORTS_UPSERT: 101 | query = ( 102 | "INSERT INTO Cache(" 103 | " key, raw, store_time, expire_time, access_time," 104 | " tag, mode, filename, value" 105 | ") VALUES (?, 1, 0, null, 0, null, 1, null, ?)" 106 | " ON CONFLICT(key, raw) DO UPDATE SET value = excluded.value" 107 | ) 108 | else: 109 | query = ( 110 | "INSERT OR REPLACE INTO Cache(" 111 | " key, raw, store_time, expire_time, access_time," 112 | " tag, mode, filename, value" 113 | ") VALUES (?, 1, 0, null, 0, null, 1, null, ?)" 114 | ) 115 | with self.transact(retry): 116 | self._con.executemany(query, items) 117 | 118 | def is_empty(self) -> bool: 119 | res = self._sql("SELECT EXISTS (SELECT 1 FROM Cache)") 120 | ((exists,),) = res 121 | return exists == 0 122 | 123 | def get( 124 | self, key, default=None, read=False, expire_time=False, tag=False, retry=False 125 | ): 126 | cursor = self._sql("SELECT value FROM Cache WHERE key = ? 
and raw = 1", (key,)) 127 | if rows := cursor.fetchall(): 128 | return rows[0][0] 129 | return default 130 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import suppress 3 | from copy import copy 4 | from typing import TYPE_CHECKING, Callable, ClassVar, Optional, Union 5 | 6 | from dvc_objects.db import ObjectDB 7 | from dvc_objects.errors import ObjectFormatError 8 | from fsspec.callbacks import DEFAULT_CALLBACK 9 | 10 | from dvc_data.hashfile.hash_info import HashInfo 11 | from dvc_data.hashfile.obj import HashFile 12 | 13 | if TYPE_CHECKING: 14 | from dvc_objects.fs.base import AnyFSPath, FileSystem 15 | from fsspec import Callback 16 | 17 | from dvc_data.hashfile.meta import Meta 18 | from dvc_data.hashfile.state import StateBase 19 | from dvc_data.hashfile.tree import Tree 20 | 21 | from .index import ObjectDBIndexBase 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def get_odb(fs, path, **config): 28 | from dvc_objects.fs import Schemes 29 | 30 | from .local import LocalHashFileDB 31 | 32 | if fs.protocol == Schemes.LOCAL: 33 | return LocalHashFileDB(fs, path, **config) 34 | 35 | return HashFileDB(fs, path, **config) 36 | 37 | 38 | def get_index(odb) -> "ObjectDBIndexBase": 39 | import hashlib 40 | 41 | from .index import ObjectDBIndex, ObjectDBIndexNoop 42 | 43 | cls = ObjectDBIndex if odb.tmp_dir else ObjectDBIndexNoop 44 | return cls( 45 | odb.tmp_dir, 46 | hashlib.sha256(odb.fs.unstrip_protocol(odb.path).encode("utf-8")).hexdigest(), 47 | ) 48 | 49 | 50 | class HashFileDB(ObjectDB): 51 | DEFAULT_VERIFY = False 52 | DEFAULT_CACHE_TYPES: ClassVar[list[str]] = ["copy"] 53 | CACHE_MODE: ClassVar[Optional[int]] = None 54 | 55 | def __init__(self, fs: "FileSystem", path: str, read_only: bool = False, **config): 56 | from dvc_data.hashfile.state import StateNoop 57 | 58 | super().__init__(fs, path, read_only=read_only) 59 | self.state: StateBase = config.get("state", StateNoop()) 60 | self.verify = config.get("verify", self.DEFAULT_VERIFY) 61 | self.cache_types = config.get("type") or copy(self.DEFAULT_CACHE_TYPES) 62 | self.slow_link_warning = config.get("slow_link_warning", True) 63 | self.tmp_dir = config.get("tmp_dir") 64 | self.hash_name = config.get("hash_name", self.fs.PARAM_CHECKSUM) 65 | 66 | def get(self, oid: str) -> HashFile: 67 | return HashFile( 68 | self.oid_to_path(oid), 69 | self.fs, 70 | HashInfo(self.hash_name, oid), 71 | ) 72 | 73 | def add( 74 | self, 75 | path: Union["AnyFSPath", list["AnyFSPath"]], 76 | fs: "FileSystem", 77 | oid: Union[str, list[str]], 78 | hardlink: bool = False, 79 | callback: "Callback" = DEFAULT_CALLBACK, 80 | check_exists: bool = True, 81 | on_error: Optional[Callable[[str, BaseException], None]] = None, 82 | **kwargs, 83 | ) -> int: 84 | verify = kwargs.get("verify") 85 | if verify is None: 86 | verify = self.verify 87 | 88 | paths = [path] if isinstance(path, str) else path 89 | oids = [oid] if isinstance(oid, str) else oid 90 | assert len(paths) == len(oids) 91 | 92 | if verify: 93 | for o in oids: 94 | try: 95 | self.check(o, check_hash=True) 96 | except (ObjectFormatError, FileNotFoundError): 97 | pass 98 | 99 | transferred = super().add( 100 | paths, 101 | fs, 102 | oids, 103 | hardlink=hardlink, 104 | callback=callback, 105 | check_exists=check_exists, 106 | on_error=on_error, 107 | **kwargs, 108 | ) 109 | 110 | 
oid_cache_paths = {o: self.oid_to_path(o) for o in oids} 111 | for o, cache_path in oid_cache_paths.items(): 112 | try: 113 | if verify: 114 | self.check(o, check_hash=True) 115 | self.protect(cache_path) 116 | except (ObjectFormatError, FileNotFoundError): 117 | pass 118 | 119 | self.state.save_many( 120 | ( 121 | (cache_path, HashInfo(name=self.hash_name, value=o), None) 122 | for o, cache_path in oid_cache_paths.items() 123 | ), 124 | self.fs, 125 | ) 126 | return transferred 127 | 128 | def protect(self, path): 129 | pass 130 | 131 | def is_protected(self, path): 132 | return False 133 | 134 | def unprotect(self, path): 135 | pass 136 | 137 | def set_exec(self, path): 138 | pass 139 | 140 | def check( 141 | self, 142 | oid: str, 143 | check_hash: bool = True, 144 | _info: Optional[dict] = None, 145 | ) -> "Meta": 146 | """Compare the given hash with the (corresponding) actual one if 147 | check_hash is specified, or just verify the existence of the cache 148 | files on the filesystem. 149 | 150 | - Use `State` as a cache for computed hashes 151 | + The entries are invalidated by taking into account the following: 152 | * mtime 153 | * inode 154 | * size 155 | * hash 156 | 157 | - Remove the file from cache if it doesn't match the actual hash 158 | """ 159 | from dvc_data.hashfile.hash import hash_file 160 | from dvc_data.hashfile.meta import Meta 161 | 162 | obj = self.get(oid) 163 | if not check_hash: 164 | assert obj.fs 165 | info = _info or obj.fs.info(obj.path) 166 | return Meta.from_info(info) 167 | 168 | meta, actual = hash_file( 169 | obj.path, obj.fs, self.hash_name, self.state, info=_info 170 | ) 171 | 172 | assert actual.name == self.hash_name 173 | assert actual.value 174 | if actual.value.split(".")[0] != oid.split(".")[0]: 175 | logger.debug("corrupted cache file '%s'.", obj.path) 176 | with suppress(FileNotFoundError): 177 | self.fs.remove(obj.path) 178 | 179 | raise ObjectFormatError(f"{obj} is corrupted") 180 | 181 | # making cache file read-only so we don't need to check it 182 | # next time 183 | self.protect(obj.path) 184 | return meta 185 | 186 | def _remove_unpacked_dir(self, hash_): 187 | pass 188 | 189 | 190 | def add_update_tree(odb: HashFileDB, tree: "Tree") -> "Tree": 191 | """Add tree to ODB and update fs/path to use ODB fs/path.""" 192 | assert tree.oid 193 | odb.add(tree.path, tree.fs, tree.oid, hardlink=False) 194 | raw = odb.get(tree.oid) 195 | tree.fs = raw.fs 196 | tree.path = raw.path 197 | return tree 198 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/index.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from abc import ABC, abstractmethod 4 | from collections.abc import Iterable, Iterator 5 | from typing import TYPE_CHECKING 6 | 7 | from dvc_objects.errors import ObjectDBError 8 | 9 | if TYPE_CHECKING: 10 | from dvc_objects.fs.base import AnyFSPath 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ObjectDBIndexBase(ABC): 16 | @abstractmethod 17 | def __init__( 18 | self, 19 | tmp_dir: "AnyFSPath", 20 | name: str, 21 | ) -> None: 22 | pass 23 | 24 | @abstractmethod 25 | def close(self) -> None: 26 | pass 27 | 28 | @abstractmethod 29 | def __iter__(self) -> Iterator[str]: 30 | pass 31 | 32 | @abstractmethod 33 | def __contains__(self, hash_: str) -> bool: 34 | pass 35 | 36 | def hashes(self) -> Iterator[str]: 37 | return iter(self) 38 | 39 | @abstractmethod 40 | def dir_hashes(self) -> Iterator[str]: 
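        # Implementations yield the hashes of .dir (tree) objects tracked by
        # this index.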
41 | pass 42 | 43 | @abstractmethod 44 | def clear(self) -> None: 45 | pass 46 | 47 | @abstractmethod 48 | def update(self, dir_hashes: Iterable[str], file_hashes: Iterable[str]) -> None: 49 | pass 50 | 51 | @abstractmethod 52 | def intersection(self, hashes: set[str]) -> Iterator[str]: 53 | pass 54 | 55 | 56 | class ObjectDBIndexNoop(ObjectDBIndexBase): 57 | """No-op class for ODBs which are not indexed.""" 58 | 59 | def __init__( 60 | self, 61 | tmp_dir: "AnyFSPath", 62 | name: str, 63 | ) -> None: 64 | pass 65 | 66 | def close(self) -> None: 67 | pass 68 | 69 | def __iter__(self) -> Iterator[str]: 70 | return iter([]) 71 | 72 | def __contains__(self, hash_: str) -> bool: 73 | return False 74 | 75 | def dir_hashes(self) -> Iterator[str]: 76 | yield from [] 77 | 78 | def clear(self) -> None: 79 | pass 80 | 81 | def update(self, dir_hashes: Iterable[str], file_hashes: Iterable[str]) -> None: 82 | pass 83 | 84 | def intersection(self, hashes: set[str]) -> Iterator[str]: 85 | yield from [] 86 | 87 | 88 | class ObjectDBIndex(ObjectDBIndexBase): 89 | """Class for indexing hashes in an ODB.""" 90 | 91 | INDEX_SUFFIX = ".idx" 92 | INDEX_DIR = "index" 93 | 94 | def __init__( 95 | self, 96 | tmp_dir: "AnyFSPath", 97 | name: str, 98 | ) -> None: 99 | from dvc_objects.fs import LocalFileSystem 100 | 101 | from dvc_data.hashfile.cache import Cache, Index 102 | 103 | self.index_dir = os.path.join(tmp_dir, self.INDEX_DIR, name) 104 | self.fs = LocalFileSystem() 105 | self.fs.makedirs(self.index_dir, exist_ok=True) 106 | self._cache = Cache(self.index_dir, eviction_policy="none", type="index") 107 | self.index = Index.fromcache(self._cache) 108 | 109 | def close(self) -> None: 110 | return self._cache.close() 111 | 112 | def __iter__(self) -> Iterator[str]: 113 | return iter(self.index) 114 | 115 | def __contains__(self, hash_: str) -> bool: 116 | return hash_ in self.index 117 | 118 | def dir_hashes(self) -> Iterator[str]: 119 | """Iterate over .dir hashes stored in the index.""" 120 | yield from (hash_ for hash_, is_dir in self.index.items() if is_dir) 121 | 122 | def clear(self) -> None: 123 | """Clear this index (to force re-indexing later).""" 124 | from dvc_data.hashfile.cache import Timeout 125 | 126 | try: 127 | self.index.clear() 128 | except Timeout as exc: 129 | raise ObjectDBError("Failed to clear ODB index") from exc 130 | 131 | def update(self, dir_hashes: Iterable[str], file_hashes: Iterable[str]) -> None: 132 | """Update this index, adding the specified hashes.""" 133 | from dvc_data.hashfile.cache import Timeout 134 | 135 | try: 136 | with self.index.transact(): 137 | for hash_ in dir_hashes: 138 | self.index[hash_] = True 139 | with self.index.transact(): 140 | for hash_ in file_hashes: 141 | self.index[hash_] = False 142 | except Timeout as exc: 143 | raise ObjectDBError("Failed to update ODB index") from exc 144 | 145 | def intersection(self, hashes: set[str]) -> Iterator[str]: 146 | """Iterate over values from `hashes` which exist in the index.""" 147 | yield from hashes.intersection(self.index.keys()) 148 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/local.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import stat 4 | from functools import partial 5 | from typing import ClassVar, Optional 6 | 7 | from dvc_objects.db import noop, wrap_iter 8 | from dvc_objects.errors import ObjectDBError, ObjectFormatError 9 | from dvc_objects.fs.utils import 
copyfile, remove, tmp_fname 10 | from fsspec.callbacks import DEFAULT_CALLBACK 11 | 12 | from dvc_data.fsutils import _localfs_info 13 | 14 | from . import HashFileDB 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | umask = os.umask(0) 19 | os.umask(umask) 20 | 21 | 22 | class LocalHashFileDB(HashFileDB): 23 | DEFAULT_CACHE_TYPES: ClassVar[list[str]] = ["reflink", "copy"] 24 | CACHE_MODE: ClassVar[int] = 0o444 25 | UNPACKED_DIR_SUFFIX = ".unpacked" 26 | 27 | def __init__(self, fs, path, **config): 28 | super().__init__(fs, path, **config) 29 | 30 | shared = config.get("shared") 31 | if shared: 32 | self._file_mode = 0o664 33 | self._dir_mode = 0o2775 34 | else: 35 | self._file_mode = 0o666 & ~umask 36 | self._dir_mode = 0o777 & ~umask 37 | 38 | def move(self, from_info, to_info): 39 | super().move(from_info, to_info) 40 | os.chmod(to_info, self._file_mode) 41 | 42 | def makedirs(self, path): 43 | from dvc_objects.fs.utils import makedirs 44 | 45 | makedirs(path, exist_ok=True, mode=self._dir_mode) 46 | 47 | def oid_to_path(self, oid): 48 | # NOTE: `self.path` is already normalized so we can simply use 49 | # `os.sep` instead of `os.path.join`. This results in this helper 50 | # being ~5.5 times faster. 51 | return f"{self.path}{os.sep}{oid[0:2]}{os.sep}{oid[2:]}" 52 | 53 | def oids_exist(self, oids, jobs=None, progress=noop): 54 | ret = [] 55 | progress = partial(progress, "querying", len(oids)) 56 | 57 | for oid in wrap_iter(oids, progress): 58 | try: 59 | self.check(oid) 60 | ret.append(oid) 61 | except (FileNotFoundError, ObjectFormatError): 62 | pass 63 | 64 | return ret 65 | 66 | def _list_paths(self, prefix=None): 67 | assert self.path is not None 68 | if prefix: 69 | path = self.fs.join(self.path, prefix[:2]) 70 | if not self.fs.exists(path): 71 | return 72 | else: 73 | path = self.path 74 | yield from self.fs.find(path) 75 | 76 | def _remove_unpacked_dir(self, hash_): 77 | hash_path = self.oid_to_path(hash_) 78 | path = self.fs.with_name( 79 | hash_path, 80 | self.fs.name(hash_path) + self.UNPACKED_DIR_SUFFIX, 81 | ) 82 | self.fs.remove(path) 83 | 84 | def _unprotect_file(self, path, callback=DEFAULT_CALLBACK): 85 | if self.fs.is_symlink(path) or self.fs.is_hardlink(path): 86 | logger.debug("Unprotecting '%s'", path) 87 | tmp = os.path.join(os.path.dirname(path), tmp_fname()) 88 | 89 | # The operations order is important here - if some application 90 | # would access the file during the process of copyfile then it 91 | # would get only the part of file. So, at first, the file should be 92 | # copied with the temporary name, and then original file should be 93 | # replaced by new. 94 | copyfile(path, tmp, callback=callback) 95 | remove(path) 96 | os.rename(tmp, path) 97 | 98 | else: 99 | logger.debug( 100 | "Skipping copying for '%s', since it is not a symlink or a hardlink.", 101 | path, 102 | ) 103 | 104 | os.chmod(path, self._file_mode) 105 | 106 | def unprotect(self, path, callback=DEFAULT_CALLBACK): 107 | if not os.path.exists(path): 108 | raise ObjectDBError(f"can't unprotect non-existing data '{path}'") 109 | 110 | files = self.fs.find(path) if os.path.isdir(path) else [path] 111 | for fname in callback.wrap(files): 112 | with callback.branched(fname, fname) as cb: 113 | self._unprotect_file(fname, callback=cb) 114 | 115 | def protect(self, path): 116 | try: 117 | os.chmod(path, self.CACHE_MODE) 118 | except OSError: 119 | # NOTE: not being able to protect cache file is not fatal, it 120 | # might happen on funky filesystems (e.g. 
Samba, see #5255), 121 | # read-only filesystems or in a shared cache scenario. 122 | logger.debug("failed to protect '%s'", path, exc_info=True) 123 | 124 | def check(self, oid: str, check_hash: bool = True, _info: Optional[dict] = None): 125 | from dvc_data.hashfile.meta import Meta 126 | 127 | path = self.oid_to_path(oid) 128 | info = _info or _localfs_info(path) 129 | if stat.S_IMODE(info["mode"]) == self.CACHE_MODE: 130 | return Meta.from_info(info) 131 | return super().check(oid, check_hash, info) 132 | 133 | def is_protected(self, path): 134 | try: 135 | mode = os.stat(path).st_mode 136 | except FileNotFoundError: 137 | return False 138 | 139 | return stat.S_IMODE(mode) == self.CACHE_MODE 140 | 141 | def set_exec(self, path): 142 | mode = os.stat(path).st_mode | stat.S_IEXEC 143 | try: 144 | os.chmod(path, mode) 145 | except OSError: 146 | logger.debug( 147 | "failed to chmod '%s' '%s'", 148 | oct(mode), 149 | path, 150 | exc_info=True, 151 | ) 152 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/migrate.py: -------------------------------------------------------------------------------- 1 | from functools import partial, wraps 2 | from typing import TYPE_CHECKING, Any, Callable, NamedTuple 3 | 4 | from dvc_objects.executors import ThreadPoolExecutor 5 | from fsspec.callbacks import DEFAULT_CALLBACK 6 | 7 | if TYPE_CHECKING: 8 | from dvc_objects.fs.base import FileSystem 9 | from fsspec import Callback 10 | 11 | from . import HashFileDB 12 | 13 | 14 | class PreparedMigration(NamedTuple): 15 | src: "HashFileDB" 16 | dest: "HashFileDB" 17 | paths: list[str] 18 | oids: list[str] 19 | 20 | 21 | def migrate( 22 | migration: "PreparedMigration", callback: "Callback" = DEFAULT_CALLBACK 23 | ) -> int: 24 | """Migrate objects from one HashFileDB to another. 25 | 26 | Files from src will be re-hashed and transferred to dest with hardlinking 27 | enabled. 28 | """ 29 | src, dest, paths, oids = migration 30 | return dest.add(paths, src.fs, oids, hardlink=True, callback=callback) 31 | 32 | 33 | def prepare( 34 | src: "HashFileDB", 35 | dest: "HashFileDB", 36 | callback: "Callback" = DEFAULT_CALLBACK, 37 | ) -> PreparedMigration: 38 | """Prepare to migrate objects from one HashFileDB to another. 39 | 40 | Objects from src will be rehashed for addition to dest. 
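
    The returned PreparedMigration can then be passed to migrate(). A minimal
    sketch (src_odb and dest_odb are assumed to be existing HashFileDB
    instances, e.g. built with get_odb()):

        migration = prepare(src_odb, dest_odb)
        transferred = migrate(migration)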
41 | """ 42 | src_paths = [src.oid_to_path(oid) for oid in src._list_oids()] 43 | callback.set_size(len(src_paths)) 44 | with ThreadPoolExecutor( 45 | max_workers=src.fs.hash_jobs, cancel_on_error=True 46 | ) as executor: 47 | func = partial( 48 | _hash_task, 49 | dest.hash_name, 50 | src.fs, 51 | state=dest.state, 52 | callback=callback, 53 | ) 54 | results = list(executor.imap_unordered(func, src_paths)) 55 | if results: 56 | paths, oids = zip(*results) 57 | else: 58 | paths, oids = (), () 59 | return PreparedMigration(src, dest, list(paths), list(oids)) 60 | 61 | 62 | def _hash_task( 63 | hash_name: str, 64 | fs: "FileSystem", 65 | path: str, 66 | callback: "Callback" = DEFAULT_CALLBACK, 67 | **kwargs, 68 | ) -> tuple[str, str]: 69 | from dvc_data.hashfile.hash import hash_file 70 | 71 | func = _wrap_hash_file(callback, hash_file) 72 | _meta, hash_info = func(path, fs, hash_name, **kwargs) 73 | assert hash_info.value 74 | if path.endswith(".dir"): 75 | hash_info.value += ".dir" 76 | return path, hash_info.value 77 | 78 | 79 | def _wrap_hash_file(callback: "Callback", fn: Callable): 80 | @wraps(fn) 81 | def func(path: str, *args, **kwargs): 82 | kw: dict[str, Any] = dict(kwargs) 83 | with callback.branched(path, path) as child: 84 | res = fn(path, *args, callback=child, **kw) 85 | callback.relative_update() 86 | return res 87 | 88 | return func 89 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/db/reference.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import TYPE_CHECKING, Callable, Optional, Union 3 | 4 | from dvc_data.hashfile.obj import HashFile 5 | 6 | from . import HashFileDB, HashInfo 7 | 8 | if TYPE_CHECKING: 9 | from dvc_objects.fs.base import AnyFSPath, FileSystem 10 | from fsspec import Callback 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ReferenceHashFileDB(HashFileDB): 16 | def __init__(self, fs: "FileSystem", path: str, **config): 17 | super().__init__(fs, path, **config) 18 | self._obj_cache: dict[str, HashFile] = {} 19 | 20 | def __hash__(self): 21 | return hash((self.fs.protocol, self.path, *self._obj_cache.keys())) 22 | 23 | def exists(self, oid: str) -> bool: 24 | return oid in self._obj_cache 25 | 26 | def get(self, oid: str): 27 | try: 28 | return self._obj_cache[oid] 29 | except KeyError: 30 | return super().get(oid) 31 | 32 | def add( 33 | self, 34 | path: Union["AnyFSPath", list["AnyFSPath"]], 35 | fs: "FileSystem", 36 | oid: Union[str, list[str]], 37 | hardlink: bool = False, 38 | callback: Optional["Callback"] = None, 39 | check_exists: bool = True, 40 | on_error: Optional[Callable[[str, BaseException], None]] = None, 41 | **kwargs, 42 | ): 43 | paths = [path] if isinstance(path, str) else path 44 | oids = [oid] if isinstance(oid, str) else oid 45 | assert len(paths) == len(oids) 46 | 47 | for i in range(len(paths)): 48 | hash_info = HashInfo(self.hash_name, oids[i]) 49 | self._obj_cache[oids[i]] = HashFile(paths[i], fs, hash_info) 50 | 51 | def check( 52 | self, 53 | oid: str, 54 | check_hash: bool = True, 55 | _info: Optional[dict] = None, 56 | ): 57 | return 58 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/diff.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import reprlib 3 | from typing import TYPE_CHECKING, Optional 4 | 5 | from attrs import asdict, define, field 6 | 7 | if 
TYPE_CHECKING: 8 | from .db import HashFileDB 9 | from .hash_info import HashInfo 10 | from .meta import Meta 11 | from .obj import HashFile 12 | 13 | 14 | ADD = "add" 15 | MODIFY = "modify" 16 | DELETE = "delete" 17 | UNCHANGED = "unchanged" 18 | 19 | 20 | @define(unsafe_hash=True, order=True) 21 | class TreeEntry: 22 | cache_meta: Optional["Meta"] = field(default=None, eq=False) 23 | key: tuple[str, ...] = () 24 | meta: Optional["Meta"] = field(default=None, eq=False) 25 | oid: Optional["HashInfo"] = None 26 | 27 | def __bool__(self): 28 | return bool(self.oid) 29 | 30 | @property 31 | def in_cache(self) -> bool: 32 | return self.cache_meta is not None 33 | 34 | 35 | @define(unsafe_hash=True, order=True) 36 | class Change: 37 | old: TreeEntry = field(factory=TreeEntry) 38 | new: TreeEntry = field(factory=TreeEntry) 39 | typ: str = field(init=False) 40 | 41 | @typ.default 42 | def _(self): 43 | if not self.old and not self.new: 44 | return UNCHANGED 45 | 46 | if self.old and not self.new: 47 | return DELETE 48 | 49 | if not self.old and self.new: 50 | return ADD 51 | 52 | if self.old != self.new: 53 | return MODIFY 54 | 55 | return UNCHANGED 56 | 57 | def __bool__(self): 58 | return self.typ != UNCHANGED 59 | 60 | 61 | @define 62 | class DiffResult: 63 | added: list[Change] = field(factory=list, repr=reprlib.repr) 64 | modified: list[Change] = field(factory=list, repr=reprlib.repr) 65 | deleted: list[Change] = field(factory=list, repr=reprlib.repr) 66 | unchanged: list[Change] = field(factory=list, repr=reprlib.repr) 67 | 68 | def __bool__(self): 69 | return bool(self.added or self.modified or self.deleted) 70 | 71 | @property 72 | def stats(self) -> dict[str, int]: 73 | return { 74 | k: len(v) 75 | for k, v in asdict(self, recurse=False).items() 76 | if k != "unchanged" 77 | } 78 | 79 | 80 | ROOT = ("",) 81 | 82 | 83 | def diff( # noqa: C901 84 | old: Optional["HashFile"], 85 | new: Optional["HashFile"], 86 | cache: "HashFileDB", 87 | ) -> DiffResult: 88 | from .tree import Tree 89 | 90 | if old is None and new is None: 91 | return DiffResult() 92 | 93 | def _get_keys(obj): 94 | if not obj: 95 | return [] 96 | return [ROOT] + ([key for key, _, _ in obj] if isinstance(obj, Tree) else []) 97 | 98 | old_keys = set(_get_keys(old)) 99 | new_keys = set(_get_keys(new)) 100 | 101 | def _get(obj, key): 102 | if not obj or key == ROOT: 103 | return None, (obj.hash_info if obj else None) 104 | if not isinstance(obj, Tree): 105 | # obj is not a Tree and key is not a ROOT 106 | # hence object does not exist for a given key 107 | return None, None 108 | return obj.get(key, (None, None)) 109 | 110 | @functools.cache 111 | def _cache_check(oid: Optional["str"], cache: "HashFileDB") -> Optional["Meta"]: 112 | from dvc_objects.errors import ObjectFormatError 113 | 114 | if not oid: 115 | return None 116 | 117 | try: 118 | return cache.check(oid) 119 | except (FileNotFoundError, ObjectFormatError): 120 | return None 121 | 122 | ret = DiffResult() 123 | for key in old_keys | new_keys: 124 | old_meta, old_oid = _get(old, key) 125 | new_meta, new_oid = _get(new, key) 126 | 127 | old_cache_meta = _cache_check(old_oid.value, cache) if old_oid else None 128 | new_cache_meta = _cache_check(new_oid.value, cache) if new_oid else None 129 | change = Change( 130 | old=TreeEntry(old_cache_meta, key, old_meta, old_oid), 131 | new=TreeEntry(new_cache_meta, key, new_meta, new_oid), 132 | ) 133 | 134 | if change.typ == ADD: 135 | ret.added.append(change) 136 | elif change.typ == MODIFY: 137 | ret.modified.append(change) 
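        # deletions are collected next; anything else must be UNCHANGED
        # (asserted below) and goes into ret.unchanged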
138 | elif change.typ == DELETE: 139 | ret.deleted.append(change) 140 | else: 141 | assert change.typ == UNCHANGED 142 | ret.unchanged.append(change) 143 | return ret 144 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/gc.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from typing import TYPE_CHECKING, Optional 3 | 4 | if TYPE_CHECKING: 5 | from .db import HashFileDB 6 | from .hash_info import HashInfo 7 | 8 | 9 | def gc( # noqa: C901 10 | odb: "HashFileDB", 11 | used: Iterable["HashInfo"], 12 | jobs: Optional[int] = None, 13 | cache_odb: Optional["HashFileDB"] = None, 14 | shallow: bool = True, 15 | dry: bool = False, 16 | ): 17 | from dvc_objects.errors import ObjectDBPermissionError 18 | 19 | from ._progress import QueryingProgress 20 | from .tree import Tree 21 | 22 | if odb.read_only: 23 | raise ObjectDBPermissionError("Cannot gc read-only ODB") 24 | if not cache_odb: 25 | cache_odb = odb 26 | used_hashes = set() 27 | for hash_info in used: 28 | if hash_info.name != odb.hash_name: 29 | continue 30 | used_hashes.add(hash_info.value) 31 | if hash_info.isdir and not shallow: 32 | tree = Tree.load(cache_odb, hash_info) 33 | used_hashes.update(entry_obj.hash_info.value for _, entry_obj in tree) 34 | 35 | def _is_dir_hash(_hash): 36 | from .hash_info import HASH_DIR_SUFFIX 37 | 38 | return _hash.endswith(HASH_DIR_SUFFIX) 39 | 40 | num_removed = 0 41 | 42 | dir_paths = [] 43 | file_paths = [] 44 | for hash_ in QueryingProgress(odb.all(jobs), name=odb.path): 45 | if hash_ in used_hashes: 46 | continue 47 | path = odb.oid_to_path(hash_) 48 | if _is_dir_hash(hash_): 49 | # backward compatibility 50 | odb._remove_unpacked_dir(hash_) 51 | dir_paths.append(path) 52 | else: 53 | file_paths.append(path) 54 | 55 | for paths in (dir_paths, file_paths): 56 | if paths: 57 | num_removed += len(paths) 58 | if not dry: 59 | odb.fs.remove(paths) 60 | 61 | return num_removed 62 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/hash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import io 3 | import logging 4 | from typing import TYPE_CHECKING, BinaryIO, Optional, cast 5 | 6 | from dvc_objects.fs import localfs 7 | from fsspec.callbacks import Callback 8 | from fsspec.utils import nullcontext 9 | from tqdm.utils import CallbackIOWrapper 10 | 11 | from dvc_data.callbacks import TqdmCallback 12 | 13 | from .hash_info import HashInfo 14 | from .istextfile import DEFAULT_CHUNK_SIZE, istextblock 15 | from .meta import Meta 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | if TYPE_CHECKING: 21 | from dvc_objects.fs.base import AnyFSPath, FileSystem 22 | 23 | from .state import StateBase 24 | 25 | 26 | def dos2unix(data: bytes) -> bytes: 27 | return data.replace(b"\r\n", b"\n") 28 | 29 | 30 | algorithms_available = hashlib.algorithms_available | { 31 | "blake3", 32 | "md5-dos2unix", 33 | } 34 | DEFAULT_ALGORITHM = "md5" 35 | 36 | 37 | def get_hasher(name: str) -> "hashlib._Hash": 38 | if name == "blake3": 39 | from blake3 import blake3 # type: ignore[import-not-found] 40 | 41 | return blake3(max_threads=blake3.AUTO) # type: ignore[return-value] 42 | if name == "md5-dos2unix": 43 | name = "md5" 44 | 45 | try: 46 | return getattr(hashlib, name)() 47 | except AttributeError: 48 | return hashlib.new(name) 49 | 50 | 51 | class HashStreamFile(io.IOBase): 52 
| __slots__ = ("fobj", "hasher", "total_read") 53 | 54 | def __init__( 55 | self, 56 | fobj: BinaryIO, 57 | hash_name: str = DEFAULT_ALGORITHM, 58 | ) -> None: 59 | self.fobj = fobj 60 | self.total_read = 0 61 | hash_name = hash_name.lower() 62 | self.hasher = get_hasher(hash_name) 63 | super().__init__() 64 | 65 | def readable(self) -> bool: 66 | return True 67 | 68 | def tell(self) -> int: 69 | return self.fobj.tell() 70 | 71 | def read(self, n=-1) -> bytes: 72 | chunk = self.fobj.read(n) 73 | self.hasher.update(chunk) 74 | self.total_read += len(chunk) 75 | return chunk 76 | 77 | @property 78 | def hash_value(self) -> str: 79 | return self.hasher.hexdigest() 80 | 81 | @property 82 | def hash_name(self) -> str: 83 | return self.hasher.name 84 | 85 | 86 | class Dos2UnixHashStreamFile(HashStreamFile): 87 | __slots__ = () 88 | 89 | def read(self, n=-1) -> bytes: 90 | # ideally, we want the heuristics to be applied in a similar way, 91 | # regardless of the size of the first chunk, 92 | # for which we may need to buffer till DEFAULT_CHUNK_SIZE. 93 | assert n >= DEFAULT_CHUNK_SIZE 94 | chunk = self.fobj.read(n) 95 | is_text = istextblock(chunk[:DEFAULT_CHUNK_SIZE]) if chunk else False 96 | 97 | data = dos2unix(chunk) if is_text else chunk 98 | self.hasher.update(data) 99 | self.total_read += len(data) 100 | return chunk 101 | 102 | 103 | def get_hash_stream(fobj: BinaryIO, name: str = DEFAULT_ALGORITHM) -> HashStreamFile: 104 | cls = Dos2UnixHashStreamFile if name == "md5-dos2unix" else HashStreamFile 105 | return cls(fobj, hash_name=name) 106 | 107 | 108 | def fobj_md5( 109 | fobj: BinaryIO, 110 | chunk_size: int = 2**20, 111 | name: str = DEFAULT_ALGORITHM, 112 | ) -> str: 113 | stream = get_hash_stream(fobj, name=name) 114 | while True: 115 | data = stream.read(chunk_size) 116 | if not data: 117 | break 118 | return stream.hash_value 119 | 120 | 121 | def file_md5( 122 | fname: "AnyFSPath", 123 | fs: "FileSystem" = localfs, 124 | callback: Optional["Callback"] = None, 125 | name: str = DEFAULT_ALGORITHM, 126 | size: Optional[int] = None, 127 | ) -> str: 128 | if size is None and callback is not None: 129 | size = fs.size(fname) or 0 130 | callback.set_size(size) 131 | 132 | with fs.open(fname, "rb") as fobj: 133 | if callback is not None: 134 | fobj = cast("BinaryIO", CallbackIOWrapper(callback.relative_update, fobj)) 135 | return fobj_md5(fobj, name=name) 136 | 137 | 138 | def _hash_file( 139 | path: "AnyFSPath", 140 | fs: "FileSystem", 141 | name: str, 142 | callback: Optional["Callback"] = None, 143 | info: Optional[dict] = None, 144 | ) -> tuple["str", Meta]: 145 | info = info or fs.info(path) 146 | meta = Meta.from_info(info, fs.protocol) 147 | 148 | value = getattr(meta, name, None) 149 | if value: 150 | assert not value.endswith(".dir") 151 | return value, meta 152 | 153 | if hasattr(fs, name): 154 | func = getattr(fs, name) 155 | return str(func(path)), meta 156 | 157 | if name in algorithms_available: 158 | return ( 159 | file_md5(path, fs, callback=callback, size=meta.size, name=name), 160 | meta, 161 | ) 162 | raise NotImplementedError 163 | 164 | 165 | class LargeFileHashingCallback(TqdmCallback): 166 | """Callback that only shows progress bar if self.size > LARGE_FILE_SIZE.""" 167 | 168 | LARGE_FILE_SIZE = 2**30 169 | 170 | def __init__(self, *args, **kwargs): 171 | kwargs.setdefault("bytes", True) 172 | super().__init__(*args, **kwargs) 173 | self._logged = False 174 | self.fname = kwargs.get("desc", "") 175 | 176 | # TqdmCallback force renders progress bar on `set_size`. 
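    # Reassigning the base Callback.set_size here avoids that, so no bar is
    # drawn for small files; call() below only renders once the reported size
    # exceeds LARGE_FILE_SIZE.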
177 | set_size = Callback.set_size 178 | 179 | def call(self, hook_name=None, **kwargs): 180 | if self.size and self.size > self.LARGE_FILE_SIZE: 181 | if not self._logged: 182 | logger.info( 183 | "Computing md5 for a large file %r. This is only done once.", 184 | self.fname, 185 | ) 186 | self._logged = True 187 | super().call() 188 | 189 | 190 | def hash_file( 191 | path: "AnyFSPath", 192 | fs: "FileSystem", 193 | name: str, 194 | state: Optional["StateBase"] = None, 195 | callback: Optional["Callback"] = None, 196 | info: Optional[dict] = None, 197 | ) -> tuple["Meta", "HashInfo"]: 198 | if state: 199 | meta, hash_info = state.get(path, fs, info=info) 200 | if meta is not None and hash_info is not None and hash_info.name == name: 201 | return meta, hash_info 202 | 203 | size = info.get("size") if info else None 204 | _callback = callback 205 | # never initialize callback if it's never going to be used 206 | if size and size < LargeFileHashingCallback.LARGE_FILE_SIZE: 207 | _callback = nullcontext(None) 208 | else: 209 | _callback = LargeFileHashingCallback(desc=path) 210 | 211 | with _callback as cb: 212 | oid, meta = _hash_file(path, fs, name, callback=cb, info=info) 213 | 214 | hash_info = HashInfo(name, oid) 215 | if state: 216 | assert ".dir" not in oid 217 | state.save(path, fs, hash_info, info=info) 218 | return meta, hash_info 219 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/hash_info.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from attrs import define, field 4 | 5 | HASH_DIR_SUFFIX = ".dir" 6 | 7 | 8 | @define(unsafe_hash=True) 9 | class HashInfo: 10 | name: Optional[str] = None 11 | value: Optional[str] = None 12 | obj_name: Optional[str] = field(default=None, eq=False, hash=False) 13 | 14 | def __bool__(self) -> bool: 15 | return bool(self.value) 16 | 17 | def __str__(self) -> str: 18 | return f"{self.name}: {self.value}" 19 | 20 | @classmethod 21 | def from_dict(cls, d: dict[str, str]) -> "HashInfo": 22 | if not d: 23 | return cls() 24 | 25 | ((name, value),) = d.items() 26 | return cls(name, value) 27 | 28 | def to_dict(self) -> dict[str, str]: 29 | if not self.value or not self.name: 30 | return {} 31 | return {self.name: self.value} 32 | 33 | @property 34 | def isdir(self) -> bool: 35 | if not self.value: 36 | return False 37 | return self.value.endswith(HASH_DIR_SUFFIX) 38 | 39 | def as_raw(self) -> "HashInfo": 40 | assert self.value 41 | value, *_ = self.value.rsplit(HASH_DIR_SUFFIX, 1) 42 | return HashInfo(self.name, value, self.obj_name) 43 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/istextfile.py: -------------------------------------------------------------------------------- 1 | """Use heuristics to guess if it is a text file or a binary file.""" 2 | 3 | # Based on https://eli.thegreenplace.net/2011/10/19/ 4 | # perls-guess-if-file-is-text-or-binary-implemented-in-python 5 | from typing import TYPE_CHECKING 6 | 7 | if TYPE_CHECKING: 8 | from dvc_objects.fs.base import AnyFSPath, FileSystem 9 | 10 | TEXT_CHARS = bytes(range(32, 127)) + b"\n\r\t\f\b" 11 | DEFAULT_CHUNK_SIZE = 512 12 | 13 | 14 | def istextblock(block: bytes) -> bool: 15 | if not block: 16 | # An empty file is considered a valid text file 17 | return True 18 | 19 | if b"\x00" in block: 20 | # Files with null bytes are binary 21 | return False 22 | 23 | # Use translate's 'deletechars' argument to 
efficiently remove all 24 | # occurrences of TEXT_CHARS from the block 25 | nontext = block.translate(None, TEXT_CHARS) 26 | return float(len(nontext)) / len(block) <= 0.30 27 | 28 | 29 | def istextfile( 30 | fname: "AnyFSPath", fs: "FileSystem", blocksize: int = DEFAULT_CHUNK_SIZE 31 | ) -> bool: 32 | """Uses heuristics to guess whether the given file is text or binary, 33 | by reading a single block of bytes from the file. 34 | If more than 30% of the chars in the block are non-text, or there 35 | are NUL ('\x00') bytes in the block, assume this is a binary file. 36 | """ 37 | with fs.open(fname, "rb") as fobj: 38 | block = fobj.read(blocksize) 39 | return istextblock(block) 40 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/meta.py: -------------------------------------------------------------------------------- 1 | from typing import Any, ClassVar, Optional 2 | 3 | from attrs import define, field, fields_dict 4 | from dvc_objects.fs.utils import is_exec 5 | 6 | 7 | @define(unsafe_hash=True) 8 | class Meta: 9 | PARAM_ISDIR: ClassVar[str] = "isdir" 10 | PARAM_SIZE: ClassVar[str] = "size" 11 | PARAM_NFILES: ClassVar[str] = "nfiles" 12 | PARAM_ISEXEC: ClassVar[str] = "isexec" 13 | PARAM_VERSION_ID: ClassVar[str] = "version_id" 14 | PARAM_ETAG: ClassVar[str] = "etag" 15 | PARAM_CHECKSUM: ClassVar[str] = "checksum" 16 | PARAM_MD5: ClassVar[str] = "md5" 17 | PARAM_INODE: ClassVar[str] = "inode" 18 | PARAM_MTIME: ClassVar[str] = "mtime" 19 | PARAM_REMOTE: ClassVar[str] = "remote" 20 | 21 | fields: ClassVar[list[str]] 22 | 23 | isdir: bool = False 24 | size: Optional[int] = None 25 | nfiles: Optional[int] = None 26 | isexec: bool = False 27 | version_id: Optional[str] = None 28 | etag: Optional[str] = None 29 | checksum: Optional[str] = None 30 | md5: Optional[str] = None 31 | inode: Optional[int] = None 32 | mtime: Optional[float] = None 33 | 34 | remote: Optional[str] = field(default=None, eq=False) 35 | 36 | is_link: bool = field(default=False, eq=False) 37 | destination: Optional[str] = field(default=None, eq=False) 38 | nlink: int = field(default=1, eq=False) 39 | 40 | @classmethod 41 | def from_info(cls, info: dict[str, Any], protocol: Optional[str] = None) -> "Meta": 42 | etag = info.get("etag") 43 | checksum = info.get("checksum") 44 | 45 | if protocol == "azure" and etag and not etag.startswith('"'): 46 | etag = f'"{etag}"' 47 | if protocol == "s3" and "ETag" in info: 48 | etag = info["ETag"].strip('"') 49 | elif protocol == "gs" and "etag" in info: 50 | import base64 51 | 52 | etag = base64.b64decode(info["etag"]).hex() 53 | elif ( 54 | protocol 55 | and protocol.startswith("http") 56 | and ("ETag" in info or "Content-MD5" in info) 57 | ): 58 | checksum = info.get("ETag") or info.get("Content-MD5") 59 | 60 | version_id = info.get("version_id") 61 | if protocol == "s3" and "VersionId" in info: 62 | version_id = info.get("VersionId") 63 | elif protocol == "gs" and "generation" in info: 64 | version_id = info.get("generation") 65 | 66 | return Meta( 67 | info["type"] == "directory", 68 | info.get("size"), 69 | None, 70 | is_exec(info.get("mode", 0)), 71 | version_id, 72 | etag, 73 | checksum, 74 | info.get("md5"), 75 | info.get("ino"), 76 | info.get("mtime"), 77 | info.get("remote"), 78 | info.get("islink", False), 79 | info.get("destination"), 80 | info.get("nlink", 1), 81 | ) 82 | 83 | @classmethod 84 | def from_dict(cls, d: dict[str, Any]) -> "Meta": 85 | kwargs = {} 86 | for field_ in cls.fields: 87 | if field_ in d: 88 | 
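                # copy only the fields that are actually present in the
                # serialized dict; missing ones keep their attrs defaults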
kwargs[field_] = d[field_] 89 | return cls(**kwargs) 90 | 91 | def to_dict(self) -> dict[str, Any]: 92 | ret: dict[str, Any] = {} 93 | 94 | if self.isdir: 95 | ret[self.PARAM_ISDIR] = self.isdir 96 | 97 | if self.size is not None: 98 | ret[self.PARAM_SIZE] = self.size 99 | 100 | if self.nfiles is not None: 101 | ret[self.PARAM_NFILES] = self.nfiles 102 | 103 | if self.isexec: 104 | ret[self.PARAM_ISEXEC] = self.isexec 105 | 106 | if self.version_id: 107 | ret[self.PARAM_VERSION_ID] = self.version_id 108 | 109 | if self.etag: 110 | ret[self.PARAM_ETAG] = self.etag 111 | 112 | if self.checksum: 113 | ret[self.PARAM_CHECKSUM] = self.checksum 114 | 115 | if self.md5: 116 | ret[self.PARAM_MD5] = self.md5 117 | 118 | if self.remote: 119 | ret[self.PARAM_REMOTE] = self.remote 120 | 121 | return ret 122 | 123 | 124 | Meta.fields = list(fields_dict(Meta)) 125 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/obj.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from dvc_objects.obj import Object 4 | 5 | if TYPE_CHECKING: 6 | from dvc_objects.fs.base import AnyFSPath, FileSystem 7 | 8 | from .hash_info import HashInfo 9 | 10 | 11 | class HashFile(Object): 12 | __slots__ = ("hash_info",) 13 | 14 | def __init__(self, path: "AnyFSPath", fs: "FileSystem", hash_info: "HashInfo"): 15 | assert hash_info.value 16 | oid = hash_info.value 17 | super().__init__(path, fs, oid) 18 | self.hash_info = hash_info 19 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/status.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections.abc import Iterable 3 | from typing import TYPE_CHECKING, NamedTuple, Optional 4 | 5 | from dvc_objects.fs import Schemes 6 | 7 | from .hash_info import HashInfo 8 | from .tree import Tree 9 | 10 | if TYPE_CHECKING: 11 | from dvc_objects.db import ObjectDB 12 | 13 | from .db import HashFileDB 14 | from .db.index import ObjectDBIndexBase 15 | from .obj import HashFile 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class StatusResult(NamedTuple): 21 | exists: set["HashInfo"] 22 | missing: set["HashInfo"] 23 | 24 | 25 | class CompareStatusResult(NamedTuple): 26 | ok: set["HashInfo"] 27 | missing: set["HashInfo"] 28 | new: set["HashInfo"] 29 | deleted: set["HashInfo"] 30 | 31 | 32 | def _indexed_dir_hashes( 33 | odb: "ObjectDB", index: "ObjectDBIndexBase", dir_objs, name, cache_odb, jobs=None 34 | ): 35 | # Validate our index by verifying all indexed .dir hashes 36 | # still exist on the remote 37 | from ._progress import QueryingProgress 38 | 39 | dir_hashes = set(dir_objs.keys()) 40 | indexed_dirs = set(index.dir_hashes()) 41 | indexed_dir_exists: set[str] = set() 42 | if indexed_dirs: 43 | hashes = QueryingProgress( 44 | odb.list_oids_exists(indexed_dirs, jobs=jobs), 45 | total=len(indexed_dirs), 46 | ) 47 | indexed_dir_exists.update(hashes) 48 | missing_dirs = indexed_dirs.difference(indexed_dir_exists) 49 | if missing_dirs: 50 | logger.debug( 51 | "Remote cache missing indexed .dir hashes '%s', clearing remote index", 52 | ", ".join(missing_dirs), 53 | ) 54 | index.clear() 55 | 56 | # Check if non-indexed (new) dir hashes exist on remote 57 | dir_exists = dir_hashes.intersection(indexed_dir_exists) 58 | dir_missing = dir_hashes - dir_exists 59 | dir_exists.update( 60 | QueryingProgress( 61 | 
odb.list_oids_exists(dir_missing, jobs=jobs), 62 | total=len(dir_missing), 63 | ) 64 | ) 65 | 66 | # If .dir hash exists in the ODB, assume directory contents 67 | # also exists 68 | for dir_hash in dir_exists: 69 | tree = dir_objs.get(dir_hash) 70 | if not tree: 71 | try: 72 | tree = Tree.load(cache_odb, HashInfo(name, dir_hash)) 73 | except FileNotFoundError: 74 | continue 75 | file_hashes = [hi.value for _, _, hi in tree] 76 | if dir_hash not in index: 77 | logger.debug( 78 | "Indexing new .dir '%s' with '%s' nested files", 79 | dir_hash, 80 | len(file_hashes), 81 | ) 82 | index.update([dir_hash], file_hashes) 83 | yield from file_hashes 84 | yield tree.hash_info.value 85 | 86 | 87 | def status( # noqa: C901, PLR0912 88 | odb: "HashFileDB", 89 | obj_ids: Iterable["HashInfo"], 90 | name: Optional[str] = None, 91 | index: Optional["ObjectDBIndexBase"] = None, 92 | cache_odb: Optional["HashFileDB"] = None, 93 | shallow: bool = True, 94 | jobs: Optional[int] = None, 95 | ) -> "StatusResult": 96 | """Return status of whether or not the specified objects exist odb. 97 | 98 | If cache_odb is set, trees will be loaded from cache_odb instead of odb 99 | when needed. 100 | 101 | Status is returned as a tuple of: 102 | exists: objs that exist in odb 103 | missing: objs that do not exist in ODB 104 | """ 105 | logger.debug("Preparing to collect status from '%s'", odb.path) 106 | if not name: 107 | name = odb.hash_name 108 | 109 | if cache_odb is None: 110 | cache_odb = odb 111 | 112 | hash_infos: dict[str, HashInfo] = {} 113 | dir_objs: dict[str, Optional[HashFile]] = {} 114 | for hash_info in obj_ids: 115 | assert hash_info.value 116 | if hash_info.isdir: 117 | if shallow: 118 | tree = None 119 | else: 120 | tree = Tree.load(cache_odb, hash_info) 121 | for _, _, oid in tree: 122 | assert oid 123 | assert oid.value 124 | hash_infos[oid.value] = oid 125 | if index: 126 | dir_objs[hash_info.value] = tree 127 | hash_infos[hash_info.value] = hash_info 128 | 129 | if odb.fs.protocol == Schemes.MEMORY: 130 | # assume memfs staged objects already exist 131 | return StatusResult(set(hash_infos.values()), set()) 132 | 133 | hashes: set[str] = set(hash_infos.keys()) 134 | exists: set[str] = set() 135 | 136 | logger.debug("Collecting status from '%s'", odb.path) 137 | if index and hashes: 138 | if dir_objs: 139 | exists = hashes.intersection( 140 | _indexed_dir_hashes(odb, index, dir_objs, name, cache_odb, jobs=jobs) 141 | ) 142 | hashes.difference_update(exists) 143 | if hashes: 144 | exists.update(index.intersection(hashes)) 145 | hashes.difference_update(exists) 146 | 147 | if hashes: 148 | from ._progress import QueryingProgress 149 | 150 | with QueryingProgress(phase="Checking", name=odb.path) as pbar: 151 | exists.update(odb.oids_exist(hashes, jobs=jobs, progress=pbar.callback)) 152 | return StatusResult( 153 | {hash_infos[hash_] for hash_ in exists}, 154 | {hash_infos[hash_] for hash_ in (hashes - exists)}, 155 | ) 156 | 157 | 158 | def compare_status( 159 | src: "HashFileDB", 160 | dest: "HashFileDB", 161 | obj_ids: Iterable["HashInfo"], 162 | check_deleted: bool = True, 163 | src_index: Optional["ObjectDBIndexBase"] = None, 164 | dest_index: Optional["ObjectDBIndexBase"] = None, 165 | cache_odb: Optional["HashFileDB"] = None, 166 | jobs: Optional[int] = None, 167 | **kwargs, 168 | ) -> "CompareStatusResult": 169 | """Compare status for the specified objects between two ODBs. 
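    transfer() uses this (with check_deleted=False) to decide which objects
    still need to be copied from src to dest.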
170 | 171 |     Status is returned as a tuple of: 172 |         ok: hashes that exist in both src and dest 173 |         missing: hashes that do not exist in either src or dest 174 |         new: hashes that only exist in src 175 |         deleted: hashes that only exist in dest 176 |     """ 177 |     if cache_odb is None: 178 |         cache_odb = src 179 |     dest_exists, dest_missing = status( 180 |         dest, 181 |         obj_ids, 182 |         index=dest_index, 183 |         jobs=jobs, 184 |         cache_odb=cache_odb, 185 |         **kwargs, 186 |     ) 187 |     # for transfer operations we can skip src status check when all objects 188 |     # already exist in dest 189 |     if dest_missing or check_deleted: 190 |         src_exists, src_missing = status( 191 |             src, obj_ids, index=src_index, jobs=jobs, **kwargs 192 |         ) 193 |     else: 194 |         src_exists = dest_exists 195 |         src_missing = set() 196 |     return CompareStatusResult( 197 |         src_exists & dest_exists, 198 |         src_missing & dest_missing, 199 |         src_exists - dest_exists, 200 |         dest_exists - src_exists, 201 |     ) 202 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/transfer.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import logging 3 | from collections import defaultdict 4 | from collections.abc import Iterable 5 | from typing import ( 6 |     TYPE_CHECKING, 7 |     Any, 8 |     Callable, 9 |     NamedTuple, 10 |     Optional, 11 | ) 12 | 13 | from fsspec.callbacks import DEFAULT_CALLBACK 14 | 15 | from .hash_info import HashInfo 16 | 17 | if TYPE_CHECKING: 18 |     from dvc_objects.fs.base import FileSystem 19 |     from fsspec import Callback 20 | 21 |     from .db import HashFileDB 22 |     from .db.index import ObjectDBIndexBase 23 |     from .status import CompareStatusResult 24 |     from .tree import Tree 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | class TransferResult(NamedTuple): 30 |     transferred: set["HashInfo"] 31 |     failed: set["HashInfo"] 32 | 33 | 34 | def _log_exception(oid: str, exc: BaseException): 35 |     # NOTE: this means we ran out of file descriptors and there is no 36 |     # reason to try to proceed, as we will hit this error anyways. 37 |     if isinstance(exc, OSError) and exc.errno == errno.EMFILE: 38 |         raise exc 39 |     logger.error("failed to transfer '%s'", oid, exc_info=exc) 40 | 41 | 42 | def find_tree_by_obj_id( 43 |     odbs: Iterable[Optional["HashFileDB"]], obj_id: "HashInfo" 44 | ) -> Optional["Tree"]: 45 |     from dvc_objects.errors import ObjectFormatError 46 | 47 |     from .tree import Tree 48 | 49 |     for odb in odbs: 50 |         if odb is not None: 51 |             try: 52 |                 return Tree.load(odb, obj_id) 53 |             except (FileNotFoundError, ObjectFormatError): 54 |                 pass 55 |     return None 56 | 57 | 58 | def _do_transfer(  # noqa: C901 59 |     src: "HashFileDB", 60 |     dest: "HashFileDB", 61 |     obj_ids: Iterable["HashInfo"], 62 |     missing_ids: Iterable["HashInfo"], 63 |     src_index: Optional["ObjectDBIndexBase"] = None, 64 |     dest_index: Optional["ObjectDBIndexBase"] = None, 65 |     cache_odb: Optional["HashFileDB"] = None, 66 |     **kwargs: Any, 67 | ) -> set["HashInfo"]: 68 |     """Do object transfer. 69 | 70 |     Returns: 71 |         Set containing any hash_infos which failed to transfer.
72 | """ 73 | dir_ids, file_ids = set(), set() 74 | for hash_info in obj_ids: 75 | if hash_info.isdir: 76 | dir_ids.add(hash_info) 77 | else: 78 | file_ids.add(hash_info) 79 | 80 | failed_ids: set[HashInfo] = set() 81 | succeeded_dir_objs = [] 82 | 83 | for dir_hash in dir_ids: 84 | dir_obj = find_tree_by_obj_id([cache_odb, src], dir_hash) 85 | assert dir_obj 86 | 87 | entry_ids = {oid for _, _, oid in dir_obj} 88 | bound_file_ids = file_ids & entry_ids 89 | file_ids -= entry_ids 90 | 91 | logger.debug("transfer dir: %s with %d files", dir_hash, len(bound_file_ids)) 92 | 93 | dir_fails = _add(src, dest, bound_file_ids, **kwargs) 94 | if dir_fails: 95 | logger.debug( 96 | "failed to upload full contents of '%s', aborting .dir file upload", 97 | dir_hash, 98 | ) 99 | logger.debug( 100 | "failed to upload '%s' to '%s'", 101 | src.get(dir_obj.oid).path, 102 | dest.get(dir_obj.oid).path, 103 | ) 104 | failed_ids.update(dir_fails) 105 | failed_ids.add(dir_obj.hash_info) 106 | elif entry_ids.intersection(missing_ids): 107 | # if for some reason a file contained in this dir is 108 | # missing both locally and in the remote, we want to 109 | # push whatever file content we have, but should not 110 | # push .dir file 111 | logger.debug( 112 | "directory '%s' contains missing files, skipping .dir file upload", 113 | dir_hash, 114 | ) 115 | elif _add(src, dest, [dir_obj.hash_info], **kwargs): 116 | failed_ids.add(dir_obj.hash_info) 117 | else: 118 | succeeded_dir_objs.append(dir_obj) 119 | 120 | # insert the rest 121 | failed_ids.update(_add(src, dest, file_ids, **kwargs)) 122 | if failed_ids: 123 | if src_index: 124 | src_index.clear() 125 | return failed_ids 126 | 127 | # index successfully pushed dirs 128 | if dest_index: 129 | for dir_obj in succeeded_dir_objs: 130 | file_hashes = {oid.value for _, _, oid in dir_obj} 131 | logger.debug( 132 | "Indexing pushed dir '%s' with '%s' nested files", 133 | dir_obj.hash_info, 134 | len(file_hashes), 135 | ) 136 | assert dir_obj.hash_info 137 | assert dir_obj.hash_info.value 138 | dest_index.update([dir_obj.hash_info.value], file_hashes) 139 | 140 | return set() 141 | 142 | 143 | def _add( 144 | src: "HashFileDB", 145 | dest: "HashFileDB", 146 | hash_infos: Iterable["HashInfo"], 147 | **kwargs, 148 | ) -> set["HashInfo"]: 149 | failed: set[HashInfo] = set() 150 | if not hash_infos: 151 | return failed 152 | 153 | def _error(oid: str, exc: BaseException): 154 | _log_exception(oid, exc) 155 | failed.add(HashInfo(src.hash_name, oid)) 156 | 157 | fs_map: dict[FileSystem, list[tuple[str, str]]] = defaultdict(list) 158 | for hash_info in hash_infos: 159 | assert hash_info.value 160 | obj = src.get(hash_info.value) 161 | fs_map[obj.fs].append((obj.path, obj.oid)) 162 | 163 | for fs, args in fs_map.items(): 164 | paths, oids = zip(*args) 165 | dest.add( 166 | list(paths), 167 | fs, 168 | list(oids), 169 | on_error=_error, 170 | **kwargs, 171 | ) 172 | return failed 173 | 174 | 175 | def transfer( # noqa: PLR0913 176 | src: "HashFileDB", 177 | dest: "HashFileDB", 178 | obj_ids: Iterable["HashInfo"], 179 | jobs: Optional[int] = None, 180 | verify: bool = False, 181 | hardlink: bool = False, 182 | validate_status: Optional[Callable[["CompareStatusResult"], None]] = None, 183 | src_index: Optional["ObjectDBIndexBase"] = None, 184 | dest_index: Optional["ObjectDBIndexBase"] = None, 185 | cache_odb: Optional["HashFileDB"] = None, 186 | shallow: bool = True, 187 | callback: "Callback" = DEFAULT_CALLBACK, 188 | ) -> "TransferResult": 189 | """Transfer (copy) the specified 
objects from one ODB to another. 190 | 191 | Returns the number of successfully transferred objects 192 | """ 193 | from .status import compare_status 194 | 195 | logger.debug( 196 | "Preparing to transfer data from '%s' to '%s'", 197 | src.fs.unstrip_protocol(src.path), 198 | dest.fs.unstrip_protocol(dest.path), 199 | ) 200 | if src == dest: 201 | return TransferResult(set(), set()) 202 | 203 | status = compare_status( 204 | src, 205 | dest, 206 | obj_ids, 207 | check_deleted=False, 208 | jobs=jobs, 209 | src_index=src_index, 210 | dest_index=dest_index, 211 | cache_odb=cache_odb, 212 | shallow=shallow, 213 | ) 214 | 215 | if validate_status: 216 | validate_status(status) 217 | 218 | if not status.new: 219 | return TransferResult(set(), set()) 220 | 221 | callback.set_size(len(status.new)) 222 | jobs = jobs or dest.fs.jobs 223 | 224 | failed = _do_transfer( 225 | src, 226 | dest, 227 | status.new, 228 | status.missing, 229 | verify=verify, 230 | hardlink=hardlink, 231 | callback=callback, 232 | batch_size=jobs, 233 | check_exists=False, 234 | src_index=src_index, 235 | dest_index=dest_index, 236 | cache_odb=cache_odb, 237 | ) 238 | return TransferResult(status.new - failed, failed) 239 | -------------------------------------------------------------------------------- /src/dvc_data/hashfile/utils.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import hashlib 3 | import json 4 | from typing import TYPE_CHECKING, Optional 5 | 6 | from dvc_data.fsutils import _localfs_info 7 | 8 | if TYPE_CHECKING: 9 | from dvc_objects.fs.base import AnyFSPath, FileSystem 10 | 11 | from ._ignore import Ignore 12 | from .diff import DiffResult 13 | 14 | 15 | def to_nanoseconds(ts: float) -> int: 16 | return round(ts * 1_000_000_000) 17 | 18 | 19 | def _tokenize_mtimes(files_mtimes: dict[str, float]) -> str: 20 | data = json.dumps(files_mtimes, sort_keys=True).encode("utf-8") 21 | digest = hashlib.md5(data) # noqa: S324 22 | return digest.hexdigest() 23 | 24 | 25 | def get_mtime_and_size( 26 | path: "AnyFSPath", fs: "FileSystem", ignore: Optional["Ignore"] = None 27 | ) -> tuple[str, int]: 28 | if not fs.isdir(path): 29 | base_stat = fs.info(path) 30 | size = base_stat["size"] 31 | mtime = str(to_nanoseconds(base_stat["mtime"])) 32 | return mtime, size 33 | 34 | size = 0 35 | files_mtimes = {} 36 | if ignore: 37 | walk_iterator = ignore.find(fs, path) 38 | else: 39 | walk_iterator = fs.find(path) 40 | for file_path in walk_iterator: 41 | try: 42 | stats = _localfs_info(file_path) 43 | except OSError as exc: 44 | # NOTE: broken symlink case. 
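            # fs.find() / ignore.find() can still yield symlinks whose
            # targets are gone; the info lookup then raises ENOENT and the
            # entry is skipped rather than failing the whole computation.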
45 | if exc.errno != errno.ENOENT: 46 | raise 47 | continue 48 | size += stats["size"] 49 | files_mtimes[file_path] = stats["mtime"] 50 | 51 | # We track file changes and moves, which cannot be detected with simply 52 | # max(mtime(f) for f in non_ignored_files) 53 | mtime = _tokenize_mtimes(files_mtimes) 54 | return mtime, size 55 | 56 | 57 | def _get_mtime_from_changes( 58 | path: str, 59 | fs: "FileSystem", 60 | diff: "DiffResult", 61 | updated_mtimes: dict[str, float], 62 | ) -> str: 63 | from .diff import ROOT 64 | 65 | fs_info = _localfs_info(path) 66 | if fs_info["type"] == "file": 67 | return str(to_nanoseconds(fs_info["mtime"])) 68 | 69 | mtimes: dict[str, float] = {} 70 | mtimes.update(updated_mtimes) 71 | 72 | sep = fs.sep 73 | 74 | for change in diff.unchanged: 75 | key = change.old.key 76 | if key == ROOT: 77 | continue 78 | 79 | entry_path = sep.join((path, *key)) 80 | if entry_path in mtimes: 81 | continue 82 | meta = change.old.meta 83 | mtime = meta.mtime if meta is not None else None 84 | if mtime is None: 85 | try: 86 | stats = _localfs_info(entry_path) 87 | except OSError as exc: 88 | # NOTE: broken symlink case. 89 | if exc.errno != errno.ENOENT: 90 | raise 91 | continue 92 | mtime = stats["mtime"] 93 | assert mtime is not None 94 | mtimes[entry_path] = mtime 95 | 96 | return _tokenize_mtimes(mtimes) 97 | -------------------------------------------------------------------------------- /src/dvc_data/index/__init__.py: -------------------------------------------------------------------------------- 1 | from .add import add # noqa: F401 2 | from .build import build # noqa: F401 3 | from .diff import diff # noqa: F401 4 | from .index import * # noqa: F403 5 | from .save import md5, save # noqa: F401 6 | from .serialize import ( 7 | read_db, # noqa: F401 8 | read_json, # noqa: F401 9 | write_db, # noqa: F401 10 | write_json, # noqa: F401 11 | ) 12 | from .update import update # noqa: F401 13 | from .view import DataIndexView, view # noqa: F401 14 | -------------------------------------------------------------------------------- /src/dvc_data/index/add.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Optional 2 | 3 | from .build import build_entries, build_entry 4 | from .index import FileStorage 5 | 6 | if TYPE_CHECKING: 7 | from dvc_objects.fs import FileSystem 8 | 9 | from dvc_data.hashfile._ignore import Ignore 10 | 11 | from .index import DataIndex, DataIndexKey 12 | 13 | 14 | def add( 15 | index: "DataIndex", 16 | path: str, 17 | fs: "FileSystem", 18 | key: "DataIndexKey", 19 | ignore: Optional["Ignore"] = None, 20 | ): 21 | entry = build_entry(path, fs) 22 | entry.key = key 23 | index.add(entry) 24 | 25 | index.storage_map.add_data(FileStorage(key=key, fs=fs, path=path)) 26 | 27 | if not fs.isdir(path): 28 | return 29 | 30 | for entry in build_entries(path, fs, ignore=ignore): 31 | assert entry.key is not None 32 | entry.key = (*key, *entry.key) 33 | index.add(entry) 34 | -------------------------------------------------------------------------------- /src/dvc_data/index/build.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Iterator 2 | from itertools import chain, repeat 3 | from typing import TYPE_CHECKING, Any, Optional 4 | 5 | from dvc_objects.fs.local import LocalFileSystem 6 | 7 | from dvc_data.hashfile.hash import DEFAULT_ALGORITHM, hash_file 8 | from dvc_data.hashfile.meta import Meta 9 | 10 | from .index 
import DataIndex, DataIndexEntry, FileStorage 11 | 12 | if TYPE_CHECKING: 13 | from dvc_objects.fs.base import FileSystem 14 | 15 | from dvc_data.hashfile._ignore import Ignore 16 | from dvc_data.hashfile.hash_info import HashInfo 17 | from dvc_data.hashfile.state import StateBase 18 | 19 | 20 | def build_entry( 21 | path: str, 22 | fs: "FileSystem", 23 | info: Optional[dict[str, Any]] = None, 24 | compute_hash: Optional[bool] = False, 25 | state: Optional["StateBase"] = None, 26 | hash_name: str = DEFAULT_ALGORITHM, 27 | ): 28 | if info is None: 29 | info = fs.info(path) 30 | 31 | if compute_hash and info["type"] != "directory": 32 | meta, hash_info = hash_file(path, fs, hash_name, state=state, info=info) 33 | else: 34 | meta, hash_info = Meta.from_info(info, fs.protocol), None 35 | 36 | return DataIndexEntry( 37 | meta=meta, 38 | hash_info=hash_info, 39 | loaded=meta.isdir or None, 40 | ) 41 | 42 | 43 | def safe_walk( 44 | path: str, 45 | fs: "FileSystem", 46 | ignore: Optional["Ignore"] = None, 47 | ) -> Iterator[tuple[str, dict[str, dict], dict[str, dict], set[str]]]: 48 | if not isinstance(fs, LocalFileSystem): 49 | for root, dirs, files in fs.walk(path, detail=True): 50 | yield root, dirs, files, set() 51 | 52 | return 53 | 54 | # NOTE: can't use detail=True with walk, because that will make it error 55 | # out on broken symlinks. 56 | sep = fs.sep 57 | walk_iter = ignore.walk(fs, path, detail=False) if ignore else fs.walk(path) 58 | for root, dirs, files in walk_iter: 59 | _dirs: dict[str, dict] = {} 60 | _files: dict[str, dict] = {} 61 | broken = set() 62 | 63 | for name, d in chain(zip(dirs, repeat(_dirs)), zip(files, repeat(_files))): 64 | p = f"{root}{sep}{name}" 65 | try: 66 | d[name] = fs.info(p) 67 | except FileNotFoundError: 68 | d[name] = {} 69 | broken.add(name) 70 | yield root, _dirs, _files, broken 71 | dirs[:] = list(_dirs) 72 | 73 | 74 | def build_entries( 75 | path: str, 76 | fs: "FileSystem", 77 | ignore: Optional["Ignore"] = None, 78 | compute_hash: Optional[bool] = False, 79 | state: Optional["StateBase"] = None, 80 | hash_name: str = DEFAULT_ALGORITHM, 81 | checksum_jobs: Optional[int] = None, 82 | ) -> Iterable[DataIndexEntry]: 83 | from dvc_data.hashfile.build import _get_hashes 84 | 85 | sep = fs.sep 86 | jobs = checksum_jobs or fs.hash_jobs 87 | for root, dirs, files, broken in safe_walk(path, fs, ignore=ignore): 88 | if root == path: 89 | root_key: tuple[str, ...] 
= () 90 | else: 91 | root_key = fs.relparts(root, path) 92 | 93 | hashes: dict[str, tuple[Meta, HashInfo, dict]] = {} 94 | if compute_hash: 95 | file_infos = { 96 | f"{root}{sep}{name}": info for name, info in files.items() if info 97 | } 98 | file_paths = list(file_infos) 99 | hashes = _get_hashes( 100 | file_paths, fs, hash_name, file_infos, state=state, jobs=jobs 101 | ) 102 | 103 | for name, info in chain(dirs.items(), files.items()): 104 | key = (*root_key, name) 105 | if name in broken: 106 | yield DataIndexEntry(key=key) 107 | continue 108 | 109 | p = f"{root}{sep}{name}" 110 | if p in hashes: 111 | meta, hash_info, _ = hashes[p] 112 | else: 113 | meta, hash_info = Meta.from_info(info, fs.protocol), None 114 | loaded = meta.isdir or None 115 | yield DataIndexEntry(key=key, meta=meta, hash_info=hash_info, loaded=loaded) 116 | 117 | 118 | def build(path: str, fs: "FileSystem", ignore: Optional["Ignore"] = None) -> DataIndex: 119 | index = DataIndex() 120 | 121 | index.storage_map.add_data(FileStorage(key=(), fs=fs, path=path)) 122 | 123 | for entry in build_entries(path, fs, ignore=ignore): 124 | index.add(entry) 125 | 126 | return index 127 | -------------------------------------------------------------------------------- /src/dvc_data/index/collect.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import TYPE_CHECKING, Optional 3 | 4 | from fsspec.callbacks import DEFAULT_CALLBACK 5 | 6 | from .index import DataIndex, DataIndexEntry, FileStorage, ObjectStorage, StorageInfo 7 | 8 | if TYPE_CHECKING: 9 | from fsspec import Callback 10 | 11 | from .index import Storage 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _collect_from_index( 17 | cache, 18 | cache_prefix, 19 | index, 20 | prefix, 21 | storage, 22 | callback: "Callback" = DEFAULT_CALLBACK, 23 | push: bool = False, 24 | ): 25 | entries = {} 26 | 27 | dir_keys = set() 28 | try: 29 | for _, entry in index.iteritems(prefix): 30 | callback.relative_update() 31 | try: 32 | storage_key = storage.get_key(entry) 33 | except ValueError: 34 | continue 35 | 36 | if entry.meta and entry.meta.isdir and entry.loaded is None: 37 | # NOTE: at this point it might not be loaded yet, so we can't 38 | # rely on entry.loaded 39 | dir_keys.add((entry.key, storage_key)) 40 | 41 | meta = entry.meta 42 | hash_info = entry.hash_info 43 | if ( 44 | not push 45 | and isinstance(storage, FileStorage) 46 | and storage.fs.version_aware 47 | and entry.meta 48 | and not entry.meta.isdir 49 | and entry.meta.version_id is None 50 | ): 51 | meta.md5 = None 52 | hash_info = None 53 | 54 | # NOTE: avoiding modifying cache right away, because you might 55 | # run into a locked database if idx and cache are using the same 56 | # table. 
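            # Entries are therefore buffered in the local `entries` dict and
            # flushed into `cache` in a single pass once iteration finishes.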
57 | entries[storage_key] = DataIndexEntry( 58 | key=storage_key, 59 | meta=meta, 60 | hash_info=hash_info, 61 | loaded=entry.loaded, 62 | ) 63 | 64 | except KeyError: 65 | return 66 | 67 | for key, storage_key in dir_keys: 68 | entries[storage_key].loaded = index[key].loaded 69 | 70 | for key, entry in entries.items(): 71 | cache[(*cache_prefix, *key)] = entry 72 | 73 | 74 | def collect( # noqa: C901, PLR0912, PLR0915 75 | idxs, 76 | storage, 77 | callback: "Callback" = DEFAULT_CALLBACK, 78 | cache_index=None, 79 | cache_key=None, 80 | push: bool = False, 81 | ) -> list["DataIndex"]: 82 | from fsspec.utils import tokenize 83 | 84 | storage_by_fs: dict[tuple[str, str], StorageInfo] = {} 85 | skip = set() 86 | 87 | if cache_index is None: 88 | cache_index = DataIndex() 89 | cache_key = () 90 | 91 | for idx in idxs: 92 | for prefix, storage_info in idx.storage_map.items(): 93 | data = getattr(storage_info, storage) 94 | cache = storage_info.cache if storage != "cache" else None 95 | remote = storage_info.remote if storage != "remote" else None 96 | 97 | if not data or (push and data.read_only): 98 | continue 99 | 100 | try: 101 | fsid = data.fs.fsid 102 | except (NotImplementedError, AttributeError): 103 | fsid = data.fs.protocol 104 | except BaseException: # noqa: BLE001 105 | logger.debug("skipping index collection for data with invalid fsid") 106 | continue 107 | 108 | key = (fsid, tokenize(data.path)) 109 | 110 | if key not in storage_by_fs and cache_index.has_node((*cache_key, *key)): 111 | skip.add(key) 112 | 113 | if key not in skip: 114 | _collect_from_index( 115 | cache_index, 116 | (*cache_key, *key), 117 | idx, 118 | prefix, 119 | data, 120 | callback=callback, 121 | push=push, 122 | ) 123 | cache_index.commit() 124 | 125 | if key not in storage_by_fs: 126 | fs_data: Storage 127 | fs_cache: Optional[Storage] 128 | fs_remote: Optional[Storage] 129 | 130 | if isinstance(data, ObjectStorage): 131 | fs_data = ObjectStorage(key=(), odb=data.odb) 132 | else: 133 | fs_data = FileStorage(key=(), fs=data.fs, path=data.path) 134 | 135 | if not cache: 136 | fs_cache = None 137 | elif isinstance(cache, ObjectStorage): 138 | fs_cache = ObjectStorage(key=(), odb=cache.odb) 139 | else: 140 | fs_cache = FileStorage(key=(), fs=cache.fs, path=cache.path) 141 | 142 | if not remote: 143 | fs_remote = None 144 | elif isinstance(remote, ObjectStorage): 145 | fs_remote = ObjectStorage(key=(), odb=remote.odb) 146 | else: 147 | fs_remote = FileStorage( 148 | key=(), 149 | fs=remote.fs, 150 | path=remote.path, 151 | ) 152 | 153 | storage_by_fs[key] = StorageInfo( 154 | data=fs_data, cache=fs_cache, remote=fs_remote 155 | ) 156 | 157 | storage_indexes = [] 158 | for key, storage_info in storage_by_fs.items(): 159 | idx = cache_index.view((*cache_key, *key)) 160 | idx.storage_map[()] = storage_info 161 | 162 | def _onerror(*args): 163 | pass 164 | 165 | idx.onerror = _onerror 166 | storage_indexes.append(idx) 167 | 168 | return storage_indexes 169 | -------------------------------------------------------------------------------- /src/dvc_data/index/diff.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import defaultdict, deque 3 | from collections.abc import Iterable 4 | from typing import TYPE_CHECKING, Any, Callable, Optional 5 | 6 | from attrs import define 7 | from fsspec.callbacks import DEFAULT_CALLBACK, Callback 8 | 9 | if TYPE_CHECKING: 10 | from dvc_data.hashfile.hash_info import HashInfo 11 | from dvc_data.hashfile.meta 
import Meta 12 | 13 | from .index import BaseDataIndex, DataIndexKey 14 | 15 | from .index import DataIndexDirError, DataIndexEntry 16 | 17 | ADD = "add" 18 | MODIFY = "modify" 19 | RENAME = "rename" 20 | DELETE = "delete" 21 | UNCHANGED = "unchanged" 22 | UNKNOWN = "unknown" 23 | 24 | 25 | @define(frozen=True, unsafe_hash=True, order=True) 26 | class Change: 27 | typ: str 28 | old: Optional[DataIndexEntry] 29 | new: Optional[DataIndexEntry] 30 | 31 | @property 32 | def key(self) -> "DataIndexKey": 33 | if self.typ == RENAME: 34 | raise ValueError 35 | 36 | if self.typ == ADD: 37 | entry = self.new 38 | elif self.typ == DELETE: 39 | entry = self.old 40 | else: 41 | entry = self.old or self.new 42 | 43 | assert entry 44 | assert entry.key is not None 45 | return entry.key 46 | 47 | def __bool__(self): 48 | return self.typ != UNCHANGED 49 | 50 | 51 | def _diff_meta( 52 | old: Optional["Meta"], 53 | new: Optional["Meta"], 54 | *, 55 | cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 56 | ): 57 | if old is None and new is not None: 58 | return ADD 59 | 60 | if old is not None and new is None: 61 | return DELETE 62 | 63 | if cmp_key is None and old != new: 64 | return MODIFY 65 | 66 | if cmp_key is not None and cmp_key(old) != cmp_key(new): 67 | return MODIFY 68 | 69 | return UNCHANGED 70 | 71 | 72 | def _diff_hash_info( 73 | old: Optional["HashInfo"], 74 | new: Optional["HashInfo"], 75 | ): 76 | if not old and new: 77 | return ADD 78 | 79 | if old and not new: 80 | return DELETE 81 | 82 | if old and new and old != new: 83 | return MODIFY 84 | 85 | return UNCHANGED 86 | 87 | 88 | def _diff_entry( # noqa: PLR0911 89 | old: Optional["DataIndexEntry"], 90 | new: Optional["DataIndexEntry"], 91 | *, 92 | hash_only: Optional[bool] = False, 93 | meta_only: Optional[bool] = False, 94 | meta_cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 95 | unknown: Optional[bool] = False, 96 | ): 97 | if unknown: 98 | return UNKNOWN 99 | 100 | old_hi = old.hash_info if old else None 101 | new_hi = new.hash_info if new else None 102 | old_meta = old.meta if old else None 103 | new_meta = new.meta if new else None 104 | 105 | meta_diff = _diff_meta(old_meta, new_meta, cmp_key=meta_cmp_key) 106 | hi_diff = _diff_hash_info(old_hi, new_hi) 107 | 108 | if old is None and new is not None: 109 | entry_diff = ADD 110 | elif old is not None and new is None: 111 | entry_diff = DELETE 112 | else: 113 | entry_diff = UNCHANGED 114 | 115 | if meta_only: 116 | return meta_diff 117 | 118 | if hash_only: 119 | return hi_diff 120 | 121 | if entry_diff != UNCHANGED: 122 | return entry_diff 123 | 124 | # If both meta's are None, return hi_diff 125 | if meta_diff == UNCHANGED and old_meta is None: 126 | return hi_diff 127 | 128 | # If both hi's are falsey, return meta_diff 129 | if hi_diff == UNCHANGED and not old_hi: 130 | return meta_diff 131 | 132 | # Only return UNCHANGED/ADD/DELETE when hi_diff and meta_diff match, 133 | # otherwise return MODIFY 134 | if meta_diff == hi_diff == entry_diff: 135 | return meta_diff 136 | 137 | return MODIFY 138 | 139 | 140 | def _get_items( 141 | index: Optional["BaseDataIndex"], 142 | key, 143 | entry, 144 | *, 145 | shallow=False, 146 | with_unknown=False, 147 | ): 148 | items = {} 149 | unknown = False 150 | 151 | try: 152 | if index is not None and not (shallow and entry and entry.hash_info): 153 | items = dict(index.ls(key, detail=True)) 154 | except KeyError: 155 | pass 156 | except DataIndexDirError: 157 | unknown = with_unknown 158 | 159 | return items, unknown 160 | 
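# A minimal usage sketch (illustrative only, not part of this module): the
# public `diff()` generator defined below yields `Change` objects, so a caller
# that already has two built indexes -- assumed here to be named `old_index`
# and `new_index` -- could summarize a comparison roughly like this:
#
#     from collections import Counter
#
#     from dvc_data.index.diff import ADD, DELETE, MODIFY, diff
#
#     counts = Counter(change.typ for change in diff(old_index, new_index))
#     print(counts[ADD], counts[MODIFY], counts[DELETE])
#
# Passing `with_renames=True` additionally folds matching ADD/DELETE pairs
# (same `hash_info`) into RENAME changes via `_detect_renames()` below.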
161 | 162 | def _diff( # noqa: C901 163 | old: Optional["BaseDataIndex"], 164 | new: Optional["BaseDataIndex"], 165 | *, 166 | with_unchanged: Optional[bool] = False, 167 | with_unknown: Optional[bool] = False, 168 | hash_only: Optional[bool] = False, 169 | meta_only: Optional[bool] = False, 170 | meta_cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 171 | shallow: Optional[bool] = False, 172 | callback: Callback = DEFAULT_CALLBACK, 173 | roots: Optional[Iterable["DataIndexKey"]] = None, 174 | ): 175 | roots = roots or [()] 176 | todo: deque[tuple[dict, dict, bool]] = deque() 177 | 178 | for root in roots: 179 | old_root_items = {} 180 | new_root_items = {} 181 | 182 | if old is not None: 183 | try: 184 | old_root_items[root] = old.info(root) 185 | except KeyError: 186 | pass 187 | 188 | if new is not None: 189 | try: 190 | new_root_items[root] = new.info(root) 191 | except KeyError: 192 | pass 193 | 194 | todo.append((old_root_items, new_root_items, False)) 195 | 196 | while todo: 197 | old_items, new_items, unknown = todo.popleft() 198 | for key in callback.wrap(old_items.keys() | new_items.keys()): 199 | old_info = old_items.get(key) or {} 200 | new_info = new_items.get(key) or {} 201 | 202 | old_entry = old_info.get("entry") 203 | new_entry = new_info.get("entry") 204 | 205 | typ = _diff_entry( 206 | old_entry, 207 | new_entry, 208 | hash_only=hash_only, 209 | meta_only=meta_only, 210 | meta_cmp_key=meta_cmp_key, 211 | unknown=unknown, 212 | ) 213 | 214 | if ( 215 | hash_only 216 | and not with_unchanged 217 | and not unknown 218 | and typ == UNCHANGED 219 | and old_entry 220 | and old_entry.hash_info 221 | and old_entry.hash_info.isdir 222 | ): 223 | # NOTE: skipping the whole branch since we know it is unchanged 224 | pass 225 | elif ( 226 | old_info.get("type") == "directory" 227 | or new_info.get("type") == "directory" 228 | ): 229 | kwargs = {"shallow": shallow, "with_unknown": with_unknown} 230 | old_dir_items, old_unknown = _get_items(old, key, old_entry, **kwargs) 231 | new_dir_items, new_unknown = _get_items(new, key, new_entry, **kwargs) 232 | dir_unknown = old_unknown or new_unknown 233 | todo.append((old_dir_items, new_dir_items, dir_unknown)) 234 | 235 | if old_entry is None and new_entry is None: 236 | continue 237 | 238 | if typ == UNCHANGED and not with_unchanged: 239 | continue 240 | 241 | yield Change(typ, old_entry, new_entry) 242 | 243 | 244 | def _detect_renames(changes: Iterable[Change]): 245 | added: list[Change] = [] 246 | deleted: list[Change] = [] 247 | 248 | for change in changes: 249 | if change.typ == ADD: 250 | added.append(change) 251 | elif change.typ == DELETE: 252 | deleted.append(change) 253 | else: 254 | yield change 255 | 256 | def _get_key(change): 257 | return change.key 258 | 259 | added.sort(key=_get_key) 260 | deleted.sort(key=_get_key) 261 | 262 | # Create a dictionary for fast lookup of deletions by hash_info 263 | deleted_dict: dict[Optional[HashInfo], deque[Change]] = defaultdict(deque) 264 | for deletion in deleted: 265 | change_hash = deletion.old.hash_info if deletion.old else None 266 | # appendleft to get queue behaviour (we pop off right) 267 | deleted_dict[change_hash].appendleft(deletion) 268 | 269 | for addition in added: 270 | new_hash_info = addition.new.hash_info if addition.new else None 271 | 272 | # If the new entry is the same as a deleted change, 273 | # it is in fact a rename. 274 | # Note: get instead of __getitem__, to avoid creating 275 | # unnecessary entries. 
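# (`deleted_dict` is a defaultdict, so plain indexing would insert an
# empty deque as a side effect even when there is no match.)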
276 | if new_hash_info and (queue := deleted_dict.get(new_hash_info)): 277 | deletion = queue.pop() 278 | 279 | yield Change( 280 | RENAME, 281 | deletion.old, 282 | addition.new, 283 | ) 284 | else: 285 | yield addition 286 | 287 | # Yield the remaining unmatched deletions 288 | if deleted_dict: 289 | yield from itertools.chain.from_iterable(deleted_dict.values()) 290 | 291 | 292 | def diff( # noqa: PLR0913 293 | old: Optional["BaseDataIndex"], 294 | new: Optional["BaseDataIndex"], 295 | *, 296 | with_renames: Optional[bool] = False, 297 | with_unchanged: Optional[bool] = False, 298 | with_unknown: Optional[bool] = False, 299 | hash_only: Optional[bool] = False, 300 | meta_only: Optional[bool] = False, 301 | meta_cmp_key: Optional[Callable[[Optional["Meta"]], Any]] = None, 302 | shallow: Optional[bool] = False, 303 | callback: Callback = DEFAULT_CALLBACK, 304 | roots: Optional[Iterable["DataIndexKey"]] = None, 305 | ): 306 | changes = _diff( 307 | old, 308 | new, 309 | with_unchanged=with_unchanged, 310 | with_unknown=with_unknown, 311 | hash_only=hash_only, 312 | meta_only=meta_only, 313 | meta_cmp_key=meta_cmp_key, 314 | shallow=shallow, 315 | callback=callback, 316 | roots=roots, 317 | ) 318 | 319 | if with_renames and old is not None and new is not None: 320 | assert not meta_only 321 | yield from _detect_renames(changes) 322 | else: 323 | yield from changes 324 | -------------------------------------------------------------------------------- /src/dvc_data/index/fetch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import closing 3 | from functools import partial 4 | from typing import TYPE_CHECKING, Optional 5 | 6 | from dvc_objects.fs.local import LocalFileSystem 7 | from fsspec.callbacks import DEFAULT_CALLBACK 8 | 9 | from dvc_data.callbacks import TqdmCallback 10 | from dvc_data.hashfile.db import get_index 11 | from dvc_data.hashfile.meta import Meta 12 | from dvc_data.hashfile.transfer import transfer 13 | 14 | from .build import build 15 | from .checkout import apply, compare 16 | from .collect import collect # noqa: F401 17 | from .index import DataIndex, ObjectStorage 18 | from .save import md5, save 19 | 20 | if TYPE_CHECKING: 21 | from fsspec import Callback 22 | 23 | from dvc_data.hashfile.status import CompareStatusResult 24 | 25 | from .index import DataIndexKey 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def _log_missing(status: "CompareStatusResult"): 31 | if status.missing: 32 | missing_desc = "\n".join(f"{hash_info}" for hash_info in status.missing) 33 | logger.warning( 34 | "Some of the cache files do not exist neither locally " 35 | "nor on remote. 
Missing cache files:\n%s", 36 | missing_desc, 37 | ) 38 | 39 | 40 | def _onerror(data, cache, failed_keys, src_path, dest_path, exc): 41 | if not isinstance(exc, FileNotFoundError) or data.fs.exists(src_path): 42 | failed_keys.add(cache.fs.relparts(dest_path, cache.path)) 43 | 44 | logger.debug( 45 | "failed to create '%s' from '%s'", 46 | src_path, 47 | dest_path, 48 | exc_info=True, 49 | ) 50 | 51 | 52 | def _filter_changed(index): 53 | ret = DataIndex() 54 | ret.storage_map = index.storage_map 55 | 56 | for _, entry in index.items(): 57 | if entry.meta and entry.meta.isdir: 58 | ret.add(entry) 59 | continue 60 | 61 | if not entry.meta or entry.meta.version_id: 62 | ret.add(entry) 63 | continue 64 | 65 | try: 66 | data_fs, data_path = index.storage_map.get_data(entry) 67 | except ValueError: 68 | continue 69 | 70 | try: 71 | info = data_fs.info(data_path) 72 | except FileNotFoundError: 73 | continue 74 | 75 | if getattr(data_fs, "immutable", None): 76 | ret.add(entry) 77 | continue 78 | 79 | meta = Meta.from_info(info) 80 | old = getattr(entry.meta, data_fs.PARAM_CHECKSUM, None) if entry.meta else None 81 | new = getattr(meta, data_fs.PARAM_CHECKSUM, None) 82 | 83 | if old and new is None and isinstance(data_fs, LocalFileSystem): 84 | # NOTE: temporary ugly hack to handle local sources where 85 | # the only thing we currently have is md5. 86 | from dvc_data.hashfile.hash import hash_file 87 | 88 | _, hi = hash_file(data_path, data_fs, "md5") 89 | new = hi.value 90 | 91 | if old and new and old == new: 92 | ret.add(entry) 93 | 94 | return ret 95 | 96 | 97 | def fetch( 98 | idxs, 99 | callback: "Callback" = DEFAULT_CALLBACK, 100 | jobs: Optional[int] = None, 101 | ): 102 | fetched, failed = 0, 0 103 | for fs_index in idxs: 104 | data = fs_index.storage_map[()].data 105 | cache = fs_index.storage_map[()].cache 106 | 107 | if callback != DEFAULT_CALLBACK: 108 | cb = TqdmCallback( 109 | unit="file", 110 | total=len(fs_index), 111 | desc=f"Fetching from {data.fs.protocol}", 112 | ) 113 | else: 114 | cb = callback 115 | 116 | try: 117 | # NOTE: make sure there are no auth errors 118 | data.fs.exists(data.path) 119 | except Exception: 120 | failed += len(fs_index) 121 | logger.exception( 122 | "failed to connect to %s (%s)", data.fs.protocol, data.path 123 | ) 124 | continue 125 | 126 | with cb: 127 | if isinstance(cache, ObjectStorage) and isinstance(data, ObjectStorage): 128 | with closing(get_index(data.odb)) as src_index: 129 | result = transfer( 130 | data.odb, 131 | cache.odb, 132 | [ 133 | entry.hash_info 134 | for _, entry in fs_index.iteritems() 135 | if entry.hash_info 136 | ], 137 | jobs=jobs, 138 | src_index=src_index, 139 | cache_odb=cache.odb, 140 | verify=data.odb.verify, 141 | validate_status=_log_missing, 142 | callback=cb, 143 | ) 144 | fetched += len(result.transferred) 145 | failed += len(result.failed) 146 | elif isinstance(cache, ObjectStorage): 147 | updated = md5(fs_index) 148 | 149 | def _on_error(failed, oid, exc): 150 | if isinstance(exc, FileNotFoundError): 151 | return 152 | failed += 1 153 | logger.debug( 154 | "failed to transfer '%s'", 155 | oid, 156 | exc_info=True, 157 | ) 158 | 159 | fetched += save( 160 | updated, 161 | jobs=jobs, 162 | callback=cb, 163 | on_error=partial(_on_error, failed), 164 | ) 165 | else: 166 | old = build(cache.path, cache.fs) 167 | filtered = _filter_changed(fs_index) 168 | diff = compare(old, filtered) 169 | cache.fs.makedirs(cache.fs.parent(cache.path), exist_ok=True) 170 | 171 | failed_keys: set[DataIndexKey] = set() 172 | apply( 
173 | diff, 174 | cache.path, 175 | cache.fs, 176 | update_meta=False, 177 | storage="data", 178 | jobs=jobs, 179 | callback=cb, 180 | onerror=partial(_onerror, data, cache, failed_keys), 181 | ) 182 | 183 | added_keys = {entry.key for entry in diff.files_create} 184 | fetched += len(added_keys - failed_keys) 185 | failed += len(failed_keys) 186 | 187 | return fetched, failed 188 | -------------------------------------------------------------------------------- /src/dvc_data/index/push.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import closing 3 | from functools import partial 4 | from typing import TYPE_CHECKING, Any, Optional 5 | 6 | from fsspec.callbacks import DEFAULT_CALLBACK 7 | 8 | from dvc_data.callbacks import TqdmCallback 9 | from dvc_data.hashfile.db import get_index 10 | from dvc_data.hashfile.transfer import transfer 11 | 12 | from .build import build 13 | from .checkout import _prune_existing_versions, apply, compare 14 | from .fetch import _log_missing 15 | from .index import DataIndex, ObjectStorage 16 | 17 | if TYPE_CHECKING: 18 | from dvc_objects.fs import FileSystem 19 | from fsspec import Callback 20 | 21 | from dvc_data.hashfile.meta import Meta 22 | 23 | from .index import DataIndexKey 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | # for files, if our version's checksum (etag) matches the latest remote 29 | # checksum, we do not need to push, even if the version IDs don't match 30 | def _meta_checksum(fs: "FileSystem", meta: "Meta") -> Any: 31 | if not meta or meta.isdir: 32 | return meta 33 | assert fs.PARAM_CHECKSUM 34 | return getattr(meta, fs.PARAM_CHECKSUM) 35 | 36 | 37 | def _onerror(cache, data, failed_keys, src_path, dest_path, exc): 38 | if not isinstance(exc, FileNotFoundError) or cache.fs.exists(src_path): 39 | failed_keys.add(data.fs.relparts(dest_path, data.path)) 40 | 41 | logger.debug( 42 | "failed to create '%s' from '%s'", 43 | src_path, 44 | dest_path, 45 | exc_info=True, 46 | ) 47 | 48 | 49 | def _filter_missing(index): 50 | ret = DataIndex() 51 | ret.storage_map = index.storage_map 52 | 53 | for _, entry in index.items(): 54 | try: 55 | cache_fs, cache_path = index.storage_map.get_cache(entry) 56 | except ValueError: 57 | continue 58 | 59 | if cache_fs.exists(cache_path): 60 | ret.add(entry) 61 | 62 | return ret 63 | 64 | 65 | def push( 66 | idxs, 67 | callback: "Callback" = DEFAULT_CALLBACK, 68 | jobs: Optional[int] = None, 69 | ): 70 | pushed, failed = 0, 0 71 | for fs_index in idxs: 72 | data = fs_index.storage_map[()].data 73 | cache = fs_index.storage_map[()].cache 74 | 75 | if isinstance(cache, ObjectStorage) and isinstance(data, ObjectStorage): 76 | with TqdmCallback(unit="file", desc=f"Pushing to {data.fs.protocol}") as cb: 77 | with closing(get_index(data.odb)) as dest_index: 78 | result = transfer( 79 | cache.odb, 80 | data.odb, 81 | [ 82 | entry.hash_info 83 | for _, entry in fs_index.iteritems() 84 | if entry.hash_info 85 | ], 86 | jobs=jobs, 87 | dest_index=dest_index, 88 | cache_odb=data.odb, 89 | validate_status=_log_missing, 90 | callback=cb, 91 | ) 92 | pushed += len(result.transferred) 93 | failed += len(result.failed) 94 | else: 95 | old = build(data.path, data.fs) 96 | 97 | existing_fs_index = _filter_missing(fs_index) 98 | diff = compare( 99 | old, 100 | existing_fs_index, 101 | meta_only=True, 102 | meta_cmp_key=partial(_meta_checksum, data.fs), 103 | ) 104 | data.fs.makedirs(data.fs.parent(data.path), exist_ok=True) 105 | 106 | 
failed_keys: set[DataIndexKey] = set() 107 | 108 | if data.fs.version_aware: 109 | desc = f"Checking status of existing versions in {data.path!r}" 110 | with TqdmCallback(desc=desc, unit="file") as cb: 111 | diff.files_create = list( 112 | _prune_existing_versions( 113 | diff.files_create, data.fs, data.path, callback=cb 114 | ) 115 | ) 116 | 117 | with TqdmCallback(unit="file", desc=f"Pushing to {data.fs.protocol}") as cb: 118 | cb.set_size(len(diff.files_create)) 119 | apply( 120 | diff, 121 | data.path, 122 | data.fs, 123 | update_meta=False, 124 | storage="cache", 125 | jobs=jobs, 126 | callback=cb, 127 | links=["reflink", "copy"], 128 | onerror=partial(_onerror, cache, data, failed_keys), 129 | ) 130 | 131 | added_keys = {entry.key for entry in diff.files_create} 132 | pushed += len(added_keys - failed_keys) 133 | failed += len(failed_keys) 134 | 135 | return pushed, failed 136 | -------------------------------------------------------------------------------- /src/dvc_data/index/save.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import TYPE_CHECKING, Optional 3 | 4 | from fsspec.callbacks import DEFAULT_CALLBACK 5 | 6 | from dvc_data.hashfile.hash import DEFAULT_ALGORITHM, hash_file 7 | from dvc_data.hashfile.meta import Meta 8 | from dvc_data.hashfile.tree import Tree 9 | 10 | if TYPE_CHECKING: 11 | from dvc_objects.fs.base import FileSystem 12 | from fsspec import Callback 13 | 14 | from dvc_data.hashfile.db import HashFileDB 15 | from dvc_data.hashfile.state import StateBase 16 | 17 | from .index import BaseDataIndex, DataIndex, DataIndexKey 18 | 19 | 20 | def _meta_matches(fs, path, old_meta): 21 | try: 22 | info = fs.info(path) 23 | except FileNotFoundError: 24 | return False 25 | 26 | if getattr(fs, "immutable", False): 27 | return True 28 | 29 | new_meta = Meta.from_info(info, fs.protocol) 30 | old = getattr(old_meta, fs.PARAM_CHECKSUM, None) if old_meta else None 31 | new = getattr(new_meta, fs.PARAM_CHECKSUM, None) 32 | if not old or not new: 33 | return None 34 | 35 | return old == new 36 | 37 | 38 | def md5( 39 | index: "BaseDataIndex", 40 | state: Optional["StateBase"] = None, 41 | storage: str = "data", 42 | name: str = DEFAULT_ALGORITHM, 43 | ) -> "DataIndex": 44 | from .index import DataIndex, DataIndexEntry 45 | 46 | ret = DataIndex() 47 | 48 | for _, entry in index.iteritems(): 49 | if entry.meta and entry.meta.isdir: 50 | ret.add(entry) 51 | continue 52 | 53 | hash_info = None 54 | if entry.hash_info and entry.hash_info.name in ("md5", "md5-dos2unix"): 55 | hash_info = entry.hash_info 56 | 57 | try: 58 | fs, path = index.storage_map.get_storage(entry, storage) 59 | except ValueError: 60 | continue 61 | 62 | matches = _meta_matches(fs, path, entry.meta) 63 | if matches: 64 | ret.add(entry) 65 | elif matches is not None: 66 | continue 67 | 68 | try: 69 | _, hi = hash_file(path, fs, name, state=state) 70 | except FileNotFoundError: 71 | continue 72 | 73 | if hash_info and hi != hash_info: 74 | continue 75 | 76 | ret.add( 77 | DataIndexEntry( 78 | key=entry.key, 79 | meta=entry.meta, 80 | hash_info=hi, 81 | ) 82 | ) 83 | 84 | ret.storage_map = index.storage_map 85 | return ret 86 | 87 | 88 | def build_tree( 89 | index: "BaseDataIndex", 90 | prefix: "DataIndexKey", 91 | name: str = DEFAULT_ALGORITHM, 92 | ) -> tuple["Meta", Tree]: 93 | tree_meta = Meta(size=0, nfiles=0, isdir=True) 94 | assert tree_meta.size is not None 95 | assert tree_meta.nfiles is not None 96 | tree = Tree() 
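# Aggregate every non-directory entry under `prefix` into the tree,
# accumulating the directory's total size and file count as we go.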
97 | for key, entry in index.iteritems(prefix=prefix): 98 | if key == prefix or (entry.meta and entry.meta.isdir): 99 | continue 100 | tree_key = key[len(prefix) :] 101 | tree.add(tree_key, entry.meta, entry.hash_info) 102 | tree_meta.size += (entry.meta.size if entry.meta else 0) or 0 103 | tree_meta.nfiles += 1 104 | tree.digest(name=name) 105 | return tree_meta, tree 106 | 107 | 108 | def _save_dir_entry( 109 | index: "BaseDataIndex", 110 | key: "DataIndexKey", 111 | odb: Optional["HashFileDB"] = None, 112 | ) -> None: 113 | from dvc_data.hashfile.db import add_update_tree 114 | 115 | from .index import StorageKeyError 116 | 117 | entry = index[key] 118 | 119 | try: 120 | cache = odb or index.storage_map.get_cache_odb(entry) 121 | except StorageKeyError: 122 | return 123 | 124 | assert cache 125 | meta, tree = build_tree(index, key) 126 | tree = add_update_tree(cache, tree) 127 | entry.meta = meta 128 | entry.hash_info = tree.hash_info 129 | assert tree.hash_info.name 130 | assert tree.hash_info.value 131 | setattr(entry.meta, tree.hash_info.name, tree.hash_info.value) 132 | 133 | 134 | if TYPE_CHECKING: 135 | _ODBMap = dict["HashFileDB", "_FSMap"] 136 | _FSMap = dict["FileSystem", list[tuple[str, str]]] 137 | 138 | 139 | def save( 140 | index: "BaseDataIndex", 141 | odb: Optional["HashFileDB"] = None, 142 | callback: "Callback" = DEFAULT_CALLBACK, 143 | jobs: Optional[int] = None, 144 | storage: str = "data", 145 | **kwargs, 146 | ) -> int: 147 | dir_entries: list[DataIndexKey] = [] 148 | transferred = 0 149 | 150 | odb_map: _ODBMap = {} 151 | for key, entry in index.iteritems(): 152 | if entry.meta and entry.meta.isdir: 153 | dir_entries.append(key) 154 | continue 155 | 156 | try: 157 | fs, path = index.storage_map.get_storage(entry, storage) 158 | except ValueError: 159 | continue 160 | 161 | if entry.hash_info: 162 | cache = odb or index.storage_map.get_cache_odb(entry) 163 | assert cache 164 | assert entry.hash_info.value 165 | oid = entry.hash_info.value 166 | if cache not in odb_map: 167 | odb_map[cache] = defaultdict(list) 168 | odb_map[cache][fs].append((path, oid)) 169 | for cache, fs_map in odb_map.items(): 170 | for fs, args in fs_map.items(): 171 | paths, oids = zip(*args) 172 | transferred += cache.add( 173 | list(paths), 174 | fs, 175 | list(oids), 176 | callback=callback, 177 | batch_size=jobs, 178 | **kwargs, 179 | ) 180 | 181 | for key in dir_entries: 182 | _save_dir_entry(index, key, odb=odb) 183 | 184 | return transferred 185 | -------------------------------------------------------------------------------- /src/dvc_data/index/serialize.py: -------------------------------------------------------------------------------- 1 | import json 2 | from contextlib import closing 3 | 4 | from dvc_data.hashfile.cache import Cache 5 | 6 | from .index import DataIndex, DataIndexEntry 7 | 8 | 9 | def write_db(index: DataIndex, path: str) -> None: 10 | cache = Cache(path) 11 | with closing(cache), cache.transact(): 12 | for key, entry in index.iteritems(): 13 | cache["/".join(key)] = entry.to_dict() 14 | 15 | 16 | def read_db(path: str) -> DataIndex: 17 | index = DataIndex() 18 | cache = Cache(path) 19 | 20 | with closing(cache), cache.transact(): 21 | for key in cache: 22 | value = cache.get(key) 23 | entry = DataIndexEntry.from_dict(value) 24 | entry.key = tuple(key.split("/")) 25 | index.add(entry) 26 | 27 | return index 28 | 29 | 30 | def write_json(index: DataIndex, path: str) -> None: 31 | with open(path, "w", encoding="utf-8") as fobj: 32 | json.dump( 33 | {"/".join(key): 
entry.to_dict() for key, entry in index.iteritems()}, 34 | fobj, 35 | ) 36 | 37 | 38 | def read_json(path: str) -> DataIndex: 39 | index = DataIndex() 40 | 41 | with open(path, encoding="utf-8") as fobj: 42 | for key, value in json.load(fobj).items(): 43 | entry = DataIndexEntry.from_dict(value) 44 | entry.key = tuple(key.split("/")) 45 | index.add(entry) 46 | 47 | return index 48 | -------------------------------------------------------------------------------- /src/dvc_data/index/update.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from .diff import UNCHANGED, diff 4 | 5 | if TYPE_CHECKING: 6 | from .index import BaseDataIndex, DataIndex 7 | 8 | 9 | def update(new: "DataIndex", old: "BaseDataIndex") -> None: 10 | for change in diff(old, new, with_unchanged=True, meta_only=True): 11 | if change.typ == UNCHANGED: 12 | change.new.hash_info = change.old.hash_info 13 | -------------------------------------------------------------------------------- /src/dvc_data/index/view.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from collections.abc import Iterator 3 | from typing import TYPE_CHECKING, Any, Callable, Optional 4 | 5 | from .index import BaseDataIndex, DataIndex, DataIndexEntry, DataIndexKey 6 | 7 | if TYPE_CHECKING: 8 | from .index import StorageMapping 9 | 10 | 11 | class DataIndexView(BaseDataIndex): 12 | def __init__( 13 | self, 14 | index: DataIndex, 15 | filter_fn: Callable[[DataIndexKey], bool], 16 | ): 17 | self._index = index 18 | self.filter_fn = filter_fn 19 | 20 | @property 21 | def onerror(self): 22 | return self._index.onerror 23 | 24 | @onerror.setter 25 | def onerror(self, onerror): 26 | self._index.onerror = onerror 27 | 28 | @property 29 | def storage_map(self) -> "StorageMapping": # type: ignore[override] 30 | return self._index.storage_map 31 | 32 | def __setitem__(self, key, value): 33 | if self.filter_fn(key): 34 | self._index[key] = value 35 | else: 36 | raise KeyError 37 | 38 | def __getitem__(self, key: DataIndexKey) -> DataIndexEntry: 39 | if key == () or self.filter_fn(key): 40 | return self._index[key] 41 | raise KeyError 42 | 43 | def __delitem__(self, key: DataIndexKey): 44 | if self.filter_fn(key): 45 | del self._index[key] 46 | else: 47 | raise KeyError 48 | 49 | def __iter__(self) -> Iterator[DataIndexKey]: 50 | return (key for key, _ in self._iteritems()) 51 | 52 | def __len__(self): 53 | return len(list(iter(self))) 54 | 55 | def _iteritems( 56 | self, 57 | prefix: Optional[DataIndexKey] = None, 58 | shallow: bool = False, 59 | ensure_loaded: bool = False, 60 | ) -> Iterator[tuple[DataIndexKey, DataIndexEntry]]: 61 | # NOTE: iteration is implemented using traverse and not iter/iteritems 62 | # since it supports skipping subtrie traversal for prefixes that are 63 | # not in the view. 
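# For example, with a view built as
# `view(index, lambda key: key[:1] == ("data",))` (an illustrative filter,
# not taken from this repo), traverse() can prune every subtrie outside the
# ("data",) prefix instead of first yielding its keys and then discarding them.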
64 | 65 | class _FilterNode: 66 | def __init__(self, key, children, *args): 67 | self.key = key 68 | self.children = children 69 | self.value = args[0] if args else None 70 | 71 | def build(self, stack): 72 | if not self.key or not shallow: 73 | for child in self.children: 74 | stack.append(child) 75 | return self.key, self.value 76 | 77 | def _node_factory(_, key, children, *args) -> Optional[_FilterNode]: 78 | return _FilterNode(key, children, *args) 79 | 80 | kwargs = {"prefix": prefix} if prefix is not None else {} 81 | stack = deque([self.traverse(_node_factory, **kwargs)]) 82 | while stack: 83 | node = stack.popleft() 84 | if node is not None: 85 | key, value = node.build(stack) 86 | if key and value: 87 | yield key, value 88 | if ensure_loaded: 89 | yield from self._load_dir_keys(key, value, shallow=shallow) 90 | 91 | def _load_dir_keys( 92 | self, 93 | prefix: DataIndexKey, 94 | entry: Optional[DataIndexEntry], 95 | shallow: Optional[bool] = False, 96 | ) -> Iterator[tuple[DataIndexKey, DataIndexEntry]]: 97 | # NOTE: traverse() will not enter subtries that have been added 98 | # in-place during traversal. So for dirs which we load in-place, we 99 | # need to iterate over the new keys ourselves. 100 | if ( 101 | entry is not None 102 | and entry.hash_info 103 | and entry.hash_info.isdir 104 | and not entry.loaded 105 | ): 106 | self._index._load(prefix, entry) 107 | if not shallow: 108 | for key, val in self._index.iteritems(entry.key): 109 | if key != prefix and self.filter_fn(key): 110 | yield key, val 111 | 112 | def iteritems( 113 | self, 114 | prefix: Optional[DataIndexKey] = None, 115 | shallow: bool = False, 116 | ) -> Iterator[tuple[DataIndexKey, DataIndexEntry]]: 117 | return self._iteritems(prefix=prefix, shallow=shallow, ensure_loaded=True) 118 | 119 | def traverse(self, node_factory: Callable, **kwargs) -> Any: 120 | def _node_factory(path_conv, key, children, *args): 121 | if not key or self.filter_fn(key): 122 | return node_factory(path_conv, key, children, *args) 123 | return None 124 | 125 | return self._index.traverse(_node_factory, **kwargs) 126 | 127 | def ls(self, root_key: DataIndexKey, detail=True): 128 | self._index._ensure_loaded(root_key) 129 | 130 | if detail: 131 | yield from ( 132 | (key, self._index._info_from_entry(key, entry)) 133 | for key, entry in self._index._trie.ls(root_key, with_values=True) 134 | if self.filter_fn(key) 135 | ) 136 | else: 137 | yield from filter(self.filter_fn, self._index.ls(root_key, detail=False)) 138 | 139 | def has_node(self, key: DataIndexKey) -> bool: 140 | return self.filter_fn(key) and self._index.has_node(key) 141 | 142 | def delete_node(self, key: DataIndexKey) -> None: 143 | if not self.filter_fn(key): 144 | raise KeyError 145 | self._index.delete_node(key) 146 | 147 | def longest_prefix( 148 | self, key: DataIndexKey 149 | ) -> tuple[Optional[DataIndexKey], Optional[DataIndexEntry]]: 150 | if self.filter_fn(key): 151 | return self._index.longest_prefix(key) 152 | return (None, None) 153 | 154 | 155 | def view(index: DataIndex, filter_fn: Callable[[DataIndexKey], bool]) -> DataIndexView: 156 | """Return read-only filtered view of an index.""" 157 | return DataIndexView(index, filter_fn=filter_fn) 158 | -------------------------------------------------------------------------------- /src/dvc_data/json_compat.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | try: 5 | import orjson # type: ignore[import-not-found] 6 | except 
ImportError: 7 | 8 | def loads(data: str) -> Any: 9 | return json.loads(data) 10 | 11 | def dumps(data: Any) -> str: 12 | return json.dumps(data) 13 | else: 14 | 15 | def loads(data: str) -> Any: 16 | return orjson.loads(data) 17 | 18 | def dumps(data: Any) -> str: 19 | return orjson.dumps(data).decode("utf8") 20 | 21 | 22 | __all__ = ["dumps", "loads"] 23 | -------------------------------------------------------------------------------- /src/dvc_data/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/src/dvc_data/py.typed -------------------------------------------------------------------------------- /src/dvc_data/repo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | from dvc_objects.fs import localfs 5 | from dvc_objects.fs.base import FileSystem 6 | 7 | from .index import DataIndex 8 | 9 | 10 | class NotARepoError(Exception): 11 | pass 12 | 13 | 14 | class Repo: 15 | def __init__(self, root: str = "", fs: Optional[FileSystem] = None) -> None: 16 | fs = fs or localfs 17 | root = root or fs.getcwd() 18 | control_dir: str = os.getenv("DVC_DIR") or fs.join(root, ".dvc") 19 | 20 | if not fs.isdir(control_dir): 21 | raise NotARepoError(f"{root} is not a data repo.") 22 | 23 | self.fs = fs or localfs 24 | self.root = root 25 | self._control_dir = control_dir 26 | self._tmp_dir: str = fs.join(self._control_dir, "tmp") 27 | self._cache_dir = fs.join(self._control_dir, "cache") 28 | self._object_dir = fs.join(self._cache_dir, "files", "md5") 29 | 30 | self.index = DataIndex() 31 | 32 | @classmethod 33 | def discover( 34 | cls, 35 | start: str = ".", 36 | fs: Optional[FileSystem] = None, 37 | ) -> "Repo": 38 | remaining = start 39 | fs = fs or localfs 40 | path = start = fs.abspath(start) 41 | while remaining: 42 | try: 43 | return cls(path, fs) 44 | except NotARepoError: 45 | path, remaining = fs.split(path) 46 | raise NotARepoError(f"No data repository was found at {start}") 47 | 48 | @property 49 | def control_dir(self): 50 | return self._control_dir 51 | 52 | @property 53 | def tmp_dir(self): 54 | return self._tmp_dir 55 | 56 | @property 57 | def object_dir(self): 58 | return self._object_dir 59 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test suite for the dvc_data package.""" 2 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/tests/benchmarks/__init__.py -------------------------------------------------------------------------------- /tests/benchmarks/test_checkout.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from os import fspath 3 | from pathlib import Path 4 | from tempfile import TemporaryDirectory 5 | 6 | import pytest 7 | from dvc_objects.fs import localfs 8 | from dvc_objects.fs.generic import test_links as _test_links 9 | 10 | from dvc_data.cli import build, gentree, get_odb 11 | from dvc_data.hashfile.checkout import checkout 12 | from dvc_data.hashfile.state import State 13 | 14 | 15 | @pytest.fixture 16 | def 
repo(request, monkeypatch): 17 | """Create a dvc data repo within pytest'scache directory. 18 | The cache directory by default, is in the root of the repo, where reflink 19 | may be supported. 20 | """ 21 | cache = request.config.cache 22 | path = cache.mkdir("dvc_data_repo") 23 | with TemporaryDirectory(dir=path) as tmp_dir: 24 | monkeypatch.chdir(tmp_dir) 25 | path = Path(tmp_dir) 26 | (path / ".dvc").mkdir() 27 | yield path 28 | 29 | 30 | @pytest.mark.parametrize("link", ["reflink", "copy", "symlink", "hardlink"]) 31 | def test_checkout(repo, benchmark, link): 32 | fs_path = fspath(repo / "dataset") 33 | odb = get_odb(type=[link]) 34 | 35 | if not _test_links([link], localfs, odb.path, localfs, fs_path): 36 | pytest.skip(f"unsupported link type: {link}") 37 | 38 | gentree(repo / "dataset", 1000, "50Mb") 39 | obj = build(repo / "dataset", write=True) 40 | state = odb.state 41 | 42 | def setup(): 43 | for path in (state.tmp_dir, fs_path): 44 | try: 45 | shutil.rmtree(path) 46 | except FileNotFoundError: 47 | pass 48 | State(state.root_dir, state.tmp_dir, state.ignore) # recreate db 49 | 50 | assert benchmark.pedantic( 51 | checkout, 52 | setup=setup, 53 | args=(fs_path, localfs, obj, odb), 54 | kwargs={"state": state}, 55 | rounds=10, 56 | warmup_rounds=2, 57 | ) 58 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import dvc_objects 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def as_filesystem(): 7 | return dvc_objects.fs.as_filesystem 8 | -------------------------------------------------------------------------------- /tests/hashfile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/tests/hashfile/__init__.py -------------------------------------------------------------------------------- /tests/hashfile/test_build.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dvc_objects.fs.local import LocalFileSystem 4 | 5 | from dvc_data.hashfile.build import build 6 | from dvc_data.hashfile.db import HashFileDB 7 | from dvc_data.hashfile.hash_info import HashInfo 8 | from dvc_data.hashfile.meta import Meta 9 | from dvc_data.hashfile.tree import Tree 10 | 11 | 12 | def test_build_file(tmp_path): 13 | fs = LocalFileSystem() 14 | file = tmp_path / "foo" 15 | 16 | odb = HashFileDB(fs, os.fspath(tmp_path / ".dvc" / ".cache" / "files" / "md5")) 17 | 18 | fs.pipe({file: b"foo"}) 19 | 20 | _, meta, obj = build(odb, str(file), fs, "md5") 21 | assert meta.isdir is False 22 | assert meta.size == 3 23 | assert obj.hash_info == HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8") 24 | 25 | 26 | def test_build_directory(tmp_path): 27 | fs = LocalFileSystem() 28 | directory = tmp_path / "dir" 29 | directory.mkdir() 30 | 31 | odb = HashFileDB(fs, os.fspath(tmp_path / ".dvc" / ".cache" / "files" / "md5")) 32 | 33 | fs.pipe({directory / "foo": b"foo", directory / "bar": b"bar"}) 34 | 35 | _, meta, tree = build(odb, str(directory), fs, "md5") 36 | assert meta == Meta(isdir=True, size=6, nfiles=2) 37 | assert isinstance(tree, Tree) 38 | assert tree.hash_info == HashInfo("md5", "5ea40360f5b4ec688df672a4db9c17d1.dir") 39 | assert tree.as_list() == [ 40 | {"md5": "37b51d194a7513e45b56f6524f2d51f2", "relpath": "bar"}, 41 | {"md5": "acbd18db4cc2f85cedef654fccc4a4d8", "relpath": 
"foo"}, 42 | ] 43 | -------------------------------------------------------------------------------- /tests/hashfile/test_cache.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from os import fspath 3 | from typing import Any 4 | 5 | import pytest 6 | 7 | from dvc_data.hashfile.cache import Cache, DiskError, HashesCache 8 | 9 | 10 | def set_value(cache: Cache, key: str, value: Any) -> Any: 11 | cache[key] = value 12 | return cache[key] 13 | 14 | 15 | @pytest.mark.parametrize("disk_type", [None, "test"]) 16 | def test_pickle_protocol_error(tmp_path, disk_type): 17 | directory = tmp_path / "test" 18 | cache = Cache( 19 | fspath(directory), 20 | disk_pickle_protocol=pickle.HIGHEST_PROTOCOL + 1, 21 | type=disk_type, 22 | ) 23 | with pytest.raises(DiskError) as exc, cache as cache: 24 | set_value(cache, "key", ("value1", "value2")) 25 | assert exc.value.directory == fspath(directory) 26 | assert exc.value.type == "test" 27 | assert f"Could not open disk 'test' in {directory}" == str(exc.value) 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "proto_a, proto_b", 32 | [ 33 | (pickle.HIGHEST_PROTOCOL - 1, pickle.HIGHEST_PROTOCOL), 34 | (pickle.HIGHEST_PROTOCOL, pickle.HIGHEST_PROTOCOL - 1), 35 | ], 36 | ) 37 | def test_pickle_backwards_compat(tmp_path, proto_a, proto_b): 38 | with Cache( 39 | directory=fspath(tmp_path / "test"), 40 | disk_pickle_protocol=proto_a, 41 | ) as cache: 42 | set_value(cache, "key", ("value1", "value2")) 43 | with Cache( 44 | directory=fspath(tmp_path / "test"), 45 | disk_pickle_protocol=proto_b, 46 | ) as cache: 47 | assert cache["key"] == ("value1", "value2") 48 | set_value(cache, "key", ("value3", "value4")) 49 | assert cache["key"] == ("value3", "value4") 50 | 51 | 52 | def test_hashes_cache(tmp_path): 53 | with HashesCache(tmp_path / "test") as cache: 54 | assert cache.is_empty() 55 | assert cache.set("key", "value") 56 | assert not cache.is_empty() 57 | assert cache.get("key") == "value" 58 | assert cache.get("not-existing-key") is None 59 | 60 | 61 | def test_hashes_cache_many(tmp_path): 62 | with HashesCache(tmp_path / "test") as cache: 63 | assert cache.is_empty() 64 | assert list(cache.get_many(("key1",))) == [("key1", None)] 65 | 66 | cache.set_many((("key1", "value1"), ("key2", "value2"))) 67 | assert not cache.is_empty() 68 | assert list(cache.get_many(("key1", "key2"))) == [ 69 | ("key1", "value1"), 70 | ("key2", "value2"), 71 | ] 72 | assert list(cache.get_many(("key1", "key2", "not-existing-key"))) == [ 73 | ("key1", "value1"), 74 | ("key2", "value2"), 75 | ("not-existing-key", None), 76 | ] 77 | 78 | 79 | @pytest.mark.parametrize("upsert", [True, False]) 80 | def test_hashes_cache_update(tmp_path, upsert): 81 | with HashesCache(tmp_path / "test") as cache: 82 | cache.SUPPORTS_UPSERT = upsert 83 | 84 | assert cache.is_empty() 85 | cache.set("key1", "value") 86 | cache.set_many((("key1", "value1"), ("key2", "value2"))) 87 | assert list(cache.get_many(("key1", "key2"))) == [ 88 | ("key1", "value1"), 89 | ("key2", "value2"), 90 | ] 91 | -------------------------------------------------------------------------------- /tests/hashfile/test_db.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dvc_objects.errors import ObjectFormatError 3 | 4 | from dvc_data.hashfile.db import HashFile, HashFileDB 5 | from dvc_data.hashfile.db.local import LocalHashFileDB 6 | from dvc_data.hashfile.meta import Meta 7 | 8 | 9 | def test_db(tmp_upath, as_filesystem): 
10 | odb = HashFileDB(as_filesystem(tmp_upath.fs), str(tmp_upath)) 11 | 12 | assert not odb.exists("123456") 13 | assert list(odb.all()) == [] 14 | 15 | obj = odb.get("123456") 16 | assert isinstance(obj, HashFile) 17 | 18 | 19 | @pytest.mark.parametrize("tmp_upath", ["local", "memory"], indirect=True) 20 | def test_db_check(tmp_upath, as_filesystem): 21 | fs = as_filesystem(tmp_upath.fs) 22 | db_cls = LocalHashFileDB if fs.protocol == "local" else HashFileDB 23 | odb = db_cls(as_filesystem(tmp_upath.fs), str(tmp_upath)) 24 | 25 | oid = "acbd18db4cc2f85cedef654fccc4a4d8" 26 | path = odb.oid_to_path(oid) 27 | 28 | with pytest.raises(FileNotFoundError): 29 | odb.check(oid) 30 | 31 | odb.add_bytes(oid, b"foo") 32 | assert odb.check(oid) == Meta.from_info(odb.fs.info(path)) 33 | 34 | odb.protect(oid) 35 | assert odb.check(oid) == Meta.from_info(odb.fs.info(path)) 36 | 37 | odb.delete(oid) 38 | 39 | odb.add_bytes(oid, b"bar") 40 | with pytest.raises(ObjectFormatError): 41 | odb.check(oid) 42 | -------------------------------------------------------------------------------- /tests/hashfile/test_db_index.py: -------------------------------------------------------------------------------- 1 | from contextlib import closing 2 | 3 | import pytest 4 | 5 | from dvc_data.hashfile.db.index import ObjectDBIndex 6 | 7 | 8 | @pytest.fixture 9 | def index(tmp_upath): 10 | with closing(ObjectDBIndex(tmp_upath, "foo")) as _index: 11 | yield _index 12 | 13 | 14 | def test_roundtrip(request, tmp_upath, index): 15 | expected_dir = {"1234.dir"} 16 | expected_file = {"5678"} 17 | index.update(expected_dir, expected_file) 18 | 19 | new_index = ObjectDBIndex(tmp_upath, "foo") 20 | request.addfinalizer(new_index.close) 21 | 22 | assert set(new_index.dir_hashes()) == expected_dir 23 | assert set(new_index.hashes()) == expected_dir | expected_file 24 | 25 | 26 | def test_clear(index): 27 | index.update(["1234.dir"], ["5678"]) 28 | index.clear() 29 | assert not list(index.hashes()) 30 | 31 | 32 | def test_update(index): 33 | expected_dir = {"1234.dir"} 34 | expected_file = {"5678"} 35 | index.update(expected_dir, expected_file) 36 | assert set(index.dir_hashes()) == expected_dir 37 | assert set(index.hashes()) == expected_dir | expected_file 38 | 39 | 40 | def test_intersection(index): 41 | hashes = (str(i) for i in range(2000)) 42 | expected = {str(i) for i in range(1000)} 43 | index.update([], hashes) 44 | assert set(index.intersection(expected)) == expected 45 | -------------------------------------------------------------------------------- /tests/hashfile/test_diff.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.hashfile.diff import ROOT, Change, TreeEntry, diff 4 | from dvc_data.hashfile.meta import Meta 5 | from dvc_data.hashfile.obj import HashFile 6 | from dvc_data.hashfile.tree import Tree 7 | 8 | 9 | @pytest.fixture 10 | def tree(): 11 | tree = Tree.from_list( 12 | [ 13 | {"md5": "37b51d194a7513e45b56f6524f2d51f2", "relpath": "bar"}, 14 | {"md5": "acbd18db4cc2f85cedef654fccc4a4d8", "relpath": "foo"}, 15 | ] 16 | ) 17 | tree.digest() 18 | return tree 19 | 20 | 21 | def test_diff_unchanged(mocker, tree): 22 | meta = Meta() 23 | mocked_cache = mocker.MagicMock(check=mocker.MagicMock(return_value=meta)) 24 | _, bar_oid = tree.get(("bar",)) 25 | obj = HashFile("data", mocker.MagicMock(), bar_oid) 26 | 27 | assert not diff(obj, obj, mocked_cache) 28 | assert not diff(tree, tree, mocked_cache) 29 | 30 | 31 | def 
test_different_object_type_tree_to_hashfile(mocker, tree): 32 | meta = Meta() 33 | mocked_cache = mocker.MagicMock(check=mocker.MagicMock(return_value=meta)) 34 | 35 | (_, bar_oid), (_, foo_oid) = tree.get(("bar",)), tree.get(("foo",)) 36 | obj = HashFile("data", mocker.MagicMock(), bar_oid) 37 | d = diff(tree, obj, mocked_cache) 38 | 39 | assert d.stats == {"modified": 1, "deleted": 2, "added": 0} 40 | assert not d.unchanged 41 | assert d.modified == [ 42 | Change( 43 | old=TreeEntry(cache_meta=meta, key=ROOT, oid=tree.hash_info), 44 | new=TreeEntry(cache_meta=meta, key=ROOT, oid=bar_oid), 45 | ) 46 | ] 47 | assert sorted(d.deleted) == [ 48 | Change( 49 | old=TreeEntry(cache_meta=meta, key=("bar",), oid=bar_oid), 50 | new=TreeEntry(key=("bar",)), 51 | ), 52 | Change( 53 | old=TreeEntry(cache_meta=meta, key=("foo",), oid=foo_oid), 54 | new=TreeEntry(key=("foo",)), 55 | ), 56 | ] 57 | 58 | 59 | def test_different_object_type_hashfile_to_tree(mocker, tree): 60 | meta = Meta() 61 | mocked_cache = mocker.MagicMock(check=mocker.MagicMock(return_value=meta)) 62 | (_, bar_oid), (_, foo_oid) = tree.get(("bar",)), tree.get(("foo",)) 63 | obj = HashFile("data", mocker.MagicMock(), bar_oid) 64 | d = diff(obj, tree, mocked_cache) 65 | 66 | assert d.stats == {"modified": 1, "deleted": 0, "added": 2} 67 | assert not d.unchanged 68 | assert d.modified == [ 69 | Change( 70 | old=TreeEntry(cache_meta=meta, key=ROOT, oid=bar_oid), 71 | new=TreeEntry(cache_meta=meta, key=ROOT, oid=tree.hash_info), 72 | ) 73 | ] 74 | assert sorted(d.added) == [ 75 | Change( 76 | old=TreeEntry(cache_meta=meta, key=("bar",)), 77 | new=TreeEntry(key=("bar",), oid=bar_oid), 78 | ), 79 | Change( 80 | old=TreeEntry(cache_meta=meta, key=("foo",)), 81 | new=TreeEntry(key=("foo",), oid=foo_oid), 82 | ), 83 | ] 84 | -------------------------------------------------------------------------------- /tests/hashfile/test_hash.py: -------------------------------------------------------------------------------- 1 | from os import fspath 2 | 3 | from dvc_objects.fs import LocalFileSystem 4 | 5 | from dvc_data.hashfile.hash import file_md5 6 | 7 | 8 | def test_file_md5(tmp_path): 9 | foo = tmp_path / "foo" 10 | foo.write_text("foo content", encoding="utf8") 11 | 12 | fs = LocalFileSystem() 13 | assert file_md5(fspath(foo), fs) == file_md5(fspath(foo), fs) 14 | 15 | 16 | def test_file_md5_dos2unix(tmp_path): 17 | fs = LocalFileSystem() 18 | cr = tmp_path / "cr" 19 | crlf = tmp_path / "crlf" 20 | cr.write_bytes(b"a\nb\nc") 21 | crlf.write_bytes(b"a\r\nb\r\nc") 22 | assert file_md5(fspath(cr), fs, name="md5-dos2unix") == file_md5( 23 | fspath(crlf), fs, name="md5-dos2unix" 24 | ) 25 | -------------------------------------------------------------------------------- /tests/hashfile/test_hash_stream.py: -------------------------------------------------------------------------------- 1 | from os import fspath 2 | 3 | import pytest 4 | from dvc_objects.fs import LocalFileSystem 5 | 6 | from dvc_data.hashfile.hash import HashStreamFile, file_md5 7 | from dvc_data.hashfile.istextfile import DEFAULT_CHUNK_SIZE 8 | 9 | 10 | def test_hashed_stream_reader(tmp_path): 11 | foo = tmp_path / "foo" 12 | foo.write_bytes(b"foo") 13 | 14 | with open(foo, "rb") as fobj: 15 | stream_reader = HashStreamFile(fobj) 16 | 17 | assert stream_reader.readable() 18 | assert not stream_reader.seekable() 19 | 20 | assert stream_reader.read(2) == b"fo" 21 | assert stream_reader.tell() == 2 22 | 23 | assert stream_reader.read(1) == b"o" 24 | assert stream_reader.tell() == 3 25 | 
26 | hex_digest = file_md5(fspath(foo), LocalFileSystem()) 27 | assert hex_digest == stream_reader.hash_value 28 | 29 | 30 | def test_hashed_stream_reader_as_chunks(tmp_path): 31 | foo = tmp_path / "foo" 32 | foo.write_bytes(b"foo \x00" * 16) 33 | 34 | actual_size = len(foo.read_bytes()) 35 | with open(foo, "rb") as fobj: 36 | stream_reader = HashStreamFile(fobj) 37 | 38 | total_read = 0 39 | while True: 40 | chunk = stream_reader.read(16) 41 | total_read += len(chunk) 42 | assert stream_reader.tell() == total_read 43 | if not chunk: 44 | break 45 | 46 | assert stream_reader.tell() == actual_size == total_read 47 | 48 | hex_digest = file_md5(fspath(foo), LocalFileSystem()) 49 | assert hex_digest == stream_reader.hash_value 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "contents", 54 | [b"x" * DEFAULT_CHUNK_SIZE + b"\x00", b"clean", b"not clean \x00"], 55 | ) 56 | def test_hashed_stream_reader_compatibility(tmp_path, contents): 57 | # Always read more than the DEFAULT_CHUNK_SIZE (512 bytes). 58 | # This imitates the read actions performed by upload_fobj. 59 | chunk_size = DEFAULT_CHUNK_SIZE * 2 60 | 61 | data = tmp_path / "data" 62 | data.write_bytes(contents) 63 | 64 | with open(data, "rb") as fobj: 65 | stream_reader = HashStreamFile(fobj) 66 | stream_reader.read(chunk_size) 67 | 68 | local_fs = LocalFileSystem() 69 | hex_digest = file_md5(fspath(data), local_fs) 70 | 71 | assert stream_reader.hash_value == hex_digest 72 | -------------------------------------------------------------------------------- /tests/hashfile/test_istextfile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dvc_objects.fs.memory import MemoryFileSystem 3 | 4 | from dvc_data.hashfile.istextfile import istextblock, istextfile 5 | 6 | pytestmark = pytest.mark.parametrize( 7 | "block, expected", 8 | [ 9 | (b"", True), 10 | (b"text", True), 11 | (b"\x00\x001", False), 12 | ( 13 | ( 14 | b"True\x80\x04\x95\x1a\x00\x00\x00\x00\x00\x00\x00\x8c\x08\r\n" 15 | b"__main__\x94\x8c\x06Animal\x94\x93\x94)\x81\x94." 
16 | ), 17 | False, 18 | ), 19 | ], 20 | ids=["empty", "text", "binary", "long_binary"], 21 | ) 22 | 23 | 24 | def test_istextblock(block, expected): 25 | assert istextblock(block) is expected 26 | 27 | 28 | def test_istextfile(block, expected): 29 | fs = MemoryFileSystem(global_store=False) 30 | fs.pipe_file("/file", block) 31 | 32 | assert istextfile("/file", fs) is expected 33 | -------------------------------------------------------------------------------- /tests/hashfile/test_obj.py: -------------------------------------------------------------------------------- 1 | from dvc_data.hashfile.hash_info import HashInfo 2 | from dvc_data.hashfile.obj import HashFile 3 | 4 | 5 | def test_obj(tmp_upath): 6 | hash_info = HashInfo("md5", "123456") 7 | obj = HashFile(tmp_upath, tmp_upath.fs, hash_info) 8 | assert obj.path == tmp_upath 9 | assert obj.fs == tmp_upath.fs 10 | assert obj.oid == "123456" 11 | assert obj.hash_info == hash_info 12 | -------------------------------------------------------------------------------- /tests/hashfile/test_state.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import closing 3 | 4 | import pytest 5 | from dvc_objects.fs import MemoryFileSystem 6 | from dvc_objects.fs.local import LocalFileSystem 7 | from dvc_objects.fs.system import inode 8 | 9 | from dvc_data.hashfile.hash import file_md5 10 | from dvc_data.hashfile.hash_info import HashInfo 11 | from dvc_data.hashfile.meta import Meta 12 | from dvc_data.hashfile.state import State, StateNoop, _checksum 13 | from dvc_data.hashfile.utils import get_mtime_and_size 14 | from dvc_data.json_compat import dumps as json_dumps 15 | 16 | 17 | @pytest.fixture 18 | def state(tmp_path): 19 | with closing(State(tmp_path, tmp_path / "tmp")) as _state: 20 | yield _state 21 | 22 | 23 | def test_hashes(tmp_path, state: State): 24 | path = tmp_path / "foo" 25 | path.write_text("foo content", encoding="utf-8") 26 | 27 | fs = LocalFileSystem() 28 | hash_info = HashInfo(name="md5", value="6dbda444875c24ec1bbdb433456be11f") 29 | 30 | state.save(str(path), fs, hash_info) 31 | info = fs.info(str(path)) 32 | meta = Meta.from_info(info) 33 | assert state.hashes[str(path)] == json_dumps( 34 | { 35 | "version": 1, 36 | "checksum": _checksum(info), 37 | "size": 11, 38 | "hash_info": {"md5": hash_info.value}, 39 | } 40 | ) 41 | assert state.get(str(path), fs) == (meta, hash_info) 42 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), meta, hash_info)] 43 | 44 | path.write_text("foo content 1", encoding="utf-8") 45 | info = fs.info(str(path)) 46 | meta = Meta.from_info(info) 47 | hash_info = HashInfo(name="md5", value="8efcb74434c93f295375a9118292fd0c") 48 | path.unlink() 49 | 50 | state.save(str(path), fs, hash_info, info) 51 | assert state.hashes[str(path)] == json_dumps( 52 | { 53 | "version": 1, 54 | "checksum": _checksum(info), 55 | "size": 13, 56 | "hash_info": {"md5": hash_info.value}, 57 | } 58 | ) 59 | assert state.get(str(path), fs, info) == (meta, hash_info) 60 | assert list(state.get_many((str(path),), fs, {str(path): info})) == [ 61 | (str(path), meta, hash_info) 62 | ] 63 | 64 | assert state.get(str(path), fs) == (None, None) 65 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 66 | 67 | 68 | def test_hashes_get_not_a_local_fs(tmp_path, state: State): 69 | fs = MemoryFileSystem() 70 | 71 | assert state.get("not-existing-file", fs) == (None, None) 72 | assert list(state.get_many(("not-existing-file",), fs, 
{})) == [ 73 | ("not-existing-file", None, None) 74 | ] 75 | 76 | 77 | def test_hashes_get_invalid_data(tmp_path, state: State): 78 | path = tmp_path / "foo" 79 | path.write_text("foo content", encoding="utf-8") 80 | 81 | fs = LocalFileSystem() 82 | 83 | # invalid json 84 | state.hashes[str(path)] = "" 85 | assert state.get(str(path), fs) == (None, None) 86 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 87 | 88 | # invalid json 89 | state.hashes[str(path)] = '{"x"}' 90 | assert state.get(str(path), fs) == (None, None) 91 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 92 | 93 | # invalid checksum 94 | state.hashes[str(path)] = json_dumps( 95 | { 96 | "version": 1, 97 | "checksum": 1, 98 | "size": 13, 99 | "hash_info": {"md5": "value"}, 100 | } 101 | ) 102 | assert state.get(str(path), fs) == (None, None) 103 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 104 | 105 | # invalid version 106 | state.hashes[str(path)] = json_dumps( 107 | { 108 | "version": state.HASH_VERSION + 1, 109 | "checksum": _checksum(fs.info(str(path))), 110 | "size": 13, 111 | "hash_info": {"md5": "value"}, 112 | } 113 | ) 114 | assert state.get(str(path), fs) == (None, None) 115 | assert list(state.get_many((str(path),), fs, {})) == [(str(path), None, None)] 116 | 117 | 118 | def test_hashes_without_version(tmp_path, state: State): 119 | # If there is no version, it is considered as old md5-dos2unix hashes. 120 | # dvc-data does not write this format anymore, but it should be able to read it 121 | fs = LocalFileSystem() 122 | 123 | path = tmp_path / "foo" 124 | path.write_text("foo content", encoding="utf-8") 125 | 126 | info = fs.info(str(path)) 127 | meta = Meta.from_info(info) 128 | 129 | state.hashes[str(path)] = json_dumps( 130 | { 131 | "checksum": _checksum(info), 132 | "size": 11, 133 | "hash_info": {"md5": "value"}, 134 | } 135 | ) 136 | assert state.get(str(path), fs) == ( 137 | meta, 138 | HashInfo("md5-dos2unix", "value"), 139 | ) 140 | assert list(state.get_many((str(path),), fs, {})) == [ 141 | (str(path), meta, HashInfo("md5-dos2unix", "value")) 142 | ] 143 | 144 | 145 | def test_hashes_save_not_existing(tmp_path, state: State): 146 | fs = LocalFileSystem() 147 | 148 | with pytest.raises(FileNotFoundError): 149 | state.save("not-existing-file", fs, HashInfo("md5", "value")) 150 | 151 | state.save_many((("not-existing-file", HashInfo("md5", "value"), None),), fs) 152 | assert len(state.hashes) == 0 153 | 154 | 155 | def test_hashes_save_when_fs_is_not_a_local_fs(tmp_path, state: State): 156 | fs = MemoryFileSystem() 157 | 158 | state.save("not-existing-file", fs, HashInfo("md5", "value")) 159 | assert len(state.hashes) == 0 160 | 161 | state.save_many((("not-existing-file", HashInfo("md5", "value"), None),), fs) 162 | assert len(state.hashes) == 0 163 | 164 | 165 | def test_state_many(tmp_path, state: State): 166 | foo = tmp_path / "foo" 167 | foo.write_text("foo content", encoding="utf-8") 168 | 169 | bar = tmp_path / "bar" 170 | bar.write_text("bar content", encoding="utf-8") 171 | 172 | fs = LocalFileSystem() 173 | 174 | hash_info_foo = HashInfo("md5", file_md5(foo, fs)) 175 | foo_info = fs.info(str(foo)) 176 | bar_info = fs.info(str(bar)) 177 | hash_info_bar = HashInfo("md5", file_md5(bar, fs)) 178 | 179 | state.save_many( 180 | [(str(foo), hash_info_foo, None), (str(bar), hash_info_bar, None)], fs 181 | ) 182 | assert list(state.get_many([str(foo), str(bar)], fs, {})) == [ 183 | (str(foo), 
Meta.from_info(foo_info), hash_info_foo), 184 | (str(bar), Meta.from_info(bar_info), hash_info_bar), 185 | ] 186 | 187 | foo.write_text("foo content 1", encoding="utf-8") 188 | foo_info = fs.info(str(foo)) 189 | hash_info_foo = HashInfo("md5", file_md5(foo, fs)) 190 | foo.unlink() 191 | bar.write_text("bar content 1", encoding="utf-8") 192 | bar_info = fs.info(str(bar)) 193 | hash_info_bar = HashInfo("md5", file_md5(bar, fs)) 194 | bar.unlink() 195 | 196 | state.save_many( 197 | [(str(foo), hash_info_foo, foo_info), (str(bar), hash_info_bar, bar_info)], fs 198 | ) 199 | assert list( 200 | state.get_many( 201 | [str(foo), str(bar)], fs, {str(foo): foo_info, str(bar): bar_info} 202 | ) 203 | ) == [ 204 | (str(foo), Meta.from_info(foo_info), hash_info_foo), 205 | (str(bar), Meta.from_info(bar_info), hash_info_bar), 206 | ] 207 | 208 | 209 | def test_set_link(tmp_path, state): 210 | state.set_link(tmp_path / "foo", 42, "mtime") 211 | assert state.links["foo"] == (42, "mtime") 212 | 213 | 214 | def test_state_noop(tmp_path): 215 | state = StateNoop() 216 | fs = LocalFileSystem() 217 | 218 | state.save_many([("foo", HashInfo("md5", "value"), None)], fs) 219 | assert state.get("foo", fs) == (None, None) 220 | assert list(state.get_many(("foo", "bar"), fs, {})) == [ 221 | ("foo", None, None), 222 | ("bar", None, None), 223 | ] 224 | 225 | state.set_link(tmp_path / "foo", 42, "mtime") 226 | assert state.get_unused_links([], fs) == [] 227 | 228 | state.save_link(tmp_path / "foo", fs) 229 | assert state.get_unused_links([], fs) == [] 230 | 231 | 232 | def test_links(tmp_path, state: State): 233 | foo, bar = tmp_path / "foo", tmp_path / "bar" 234 | dataset = tmp_path / "dataset" 235 | dataset.mkdir() 236 | file = dataset / "file" 237 | 238 | for path in [foo, bar, file]: 239 | path.write_text(f"{path.name} content", encoding="utf-8") 240 | 241 | fs = LocalFileSystem() 242 | 243 | state.save_link(os.fspath(foo), fs) 244 | state.save_link(os.fspath(bar), fs) 245 | state.save_link(os.fspath(dataset), fs) 246 | 247 | def _get_inode_mtime(path): 248 | path = os.fspath(path) 249 | return inode(path), get_mtime_and_size(path, fs)[0] 250 | 251 | assert len(state.links) == 3 252 | assert {k: state.links[k] for k in state.links} == { 253 | "foo": _get_inode_mtime(foo), 254 | "bar": _get_inode_mtime(bar), 255 | "dataset": _get_inode_mtime(dataset), 256 | } 257 | 258 | links = [os.fspath(tmp_path / link) for link in ["foo", "bar", "dataset"]] 259 | assert set(state.get_unused_links([], fs)) == {"foo", "bar", "dataset"} 260 | assert set(state.get_unused_links(links[:1], fs)) == {"bar", "dataset"} 261 | assert set(state.get_unused_links(links[:2], fs)) == {"dataset"} 262 | assert set(state.get_unused_links(links, fs)) == set() 263 | assert set( 264 | state.get_unused_links( 265 | ([*links[:1], os.path.join(tmp_path, "not-existing-file")]), 266 | fs, 267 | ) 268 | ) == {"bar", "dataset"} 269 | 270 | state.remove_links(["foo", "bar", "dataset"], fs) 271 | assert len(state.links) == 0 272 | assert not foo.exists() 273 | assert not bar.exists() 274 | assert not dataset.exists() 275 | -------------------------------------------------------------------------------- /tests/hashfile/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dvc_objects.fs import LocalFileSystem 4 | 5 | from dvc_data.hashfile.utils import get_mtime_and_size 6 | 7 | 8 | def test_mtime_and_size(tmp_path): 9 | directory = tmp_path / "dir" 10 | directory.mkdir(parents=True) 11 | 
dir_file = directory / "file" 12 | dir_file.write_text("dir_file", encoding="utf8") 13 | 14 | sub = directory / "sub" 15 | sub.mkdir(parents=True) 16 | subfile = sub / "file" 17 | subfile.write_text("sub_file", encoding="utf8") 18 | 19 | fs = LocalFileSystem(url=tmp_path) 20 | file_time, file_size = get_mtime_and_size(str(dir_file), fs) 21 | dir_time, dir_size = get_mtime_and_size(str(directory), fs) 22 | 23 | actual_file_size = os.path.getsize(dir_file) 24 | actual_dir_size = os.path.getsize(dir_file) + os.path.getsize(subfile) 25 | 26 | assert isinstance(file_time, str) 27 | assert isinstance(file_size, int) 28 | assert file_size == actual_file_size 29 | assert isinstance(dir_time, str) 30 | assert isinstance(dir_size, int) 31 | assert dir_size == actual_dir_size 32 | 33 | 34 | def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_path): 35 | directory = tmp_path / "dir" 36 | directory.mkdir() 37 | (directory / "file").write_text("dir_file_content") 38 | file = directory / "file" 39 | file.write_text("file_content", encoding="utf8") 40 | 41 | fs = LocalFileSystem(url=tmp_path) 42 | 43 | time, size = get_mtime_and_size(str(directory), fs) 44 | object_time, object_size = get_mtime_and_size(directory, fs) 45 | assert time == object_time 46 | assert size == object_size 47 | 48 | time, size = get_mtime_and_size(str(file), fs) 49 | object_time, object_size = get_mtime_and_size(file, fs) 50 | assert time == object_time 51 | assert size == object_size 52 | -------------------------------------------------------------------------------- /tests/index/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/dvc-data/4ee77349a9712476cea4ac57154ee25ce79fcc02/tests/index/__init__.py -------------------------------------------------------------------------------- /tests/index/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from dvc_data.hashfile.db import HashFileDB 6 | 7 | 8 | @pytest.fixture 9 | def make_odb(tmp_upath_factory, as_filesystem): 10 | def _make_odb(): 11 | path = tmp_upath_factory.mktemp() 12 | fs = as_filesystem(path.fs) 13 | return HashFileDB(fs, os.fspath(path)) 14 | 15 | return _make_odb 16 | 17 | 18 | @pytest.fixture 19 | def odb(make_odb): 20 | odb = make_odb() 21 | 22 | odb.add_bytes("d3b07384d113edec49eaa6238ad5ff00", b"foo\n") 23 | odb.add_bytes("c157a79031e1c40f85931829bc5fc552", b"bar\n") 24 | odb.add_bytes("258622b1688250cb619f3c9ccaefb7eb", b"baz\n") 25 | odb.add_bytes( 26 | "1f69c66028c35037e8bf67e5bc4ceb6a.dir", 27 | ( 28 | b'[{"md5": "c157a79031e1c40f85931829bc5fc552", "relpath": "bar"}, ' 29 | b'{"md5": "258622b1688250cb619f3c9ccaefb7eb", "relpath": "baz"}]' 30 | ), 31 | ) 32 | return odb 33 | -------------------------------------------------------------------------------- /tests/index/test_build.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.index.build import DataIndexEntry, build, build_entry 4 | 5 | 6 | def test_build_entry(tmp_upath, as_filesystem): 7 | (tmp_upath / "foo").write_bytes(b"foo\n") 8 | 9 | fs = as_filesystem(tmp_upath.fs) 10 | 11 | entry = build_entry(str(tmp_upath / "foo"), fs) 12 | assert isinstance(entry, DataIndexEntry) 13 | 14 | assert entry.meta 15 | assert entry.meta.size == 4 16 | assert entry.key is None 17 | assert entry.hash_info is None 18 | 19 | with 
pytest.raises(FileNotFoundError): 20 | build_entry(str(tmp_upath / "missing"), fs) 21 | 22 | 23 | def test_build(tmp_upath, as_filesystem): 24 | (tmp_upath / "foo").write_bytes(b"foo\n") 25 | (tmp_upath / "data").mkdir() 26 | (tmp_upath / "data" / "bar").write_bytes(b"bar\n") 27 | (tmp_upath / "data" / "baz").write_bytes(b"baz\n") 28 | 29 | fs = as_filesystem(tmp_upath.fs) 30 | index = build(str(tmp_upath), fs) 31 | assert index[("foo",)].meta.size == 4 32 | assert index.storage_map.get_data(index[("foo",)]) == ( 33 | fs, 34 | str(tmp_upath / "foo"), 35 | ) 36 | assert index[("data",)].meta.isdir 37 | assert index[("data", "bar")].meta.size == 4 38 | assert index.storage_map.get_data(index[("data", "bar")]) == ( 39 | fs, 40 | str(tmp_upath / "data" / "bar"), 41 | ) 42 | assert index[("data", "baz")].meta.size == 4 43 | assert index.storage_map.get_data(index[("data", "baz")]) == ( 44 | fs, 45 | str(tmp_upath / "data" / "baz"), 46 | ) 47 | -------------------------------------------------------------------------------- /tests/index/test_checkout.py: -------------------------------------------------------------------------------- 1 | from dvc_data.hashfile.hash_info import HashInfo 2 | from dvc_data.hashfile.meta import Meta 3 | from dvc_data.index import DataIndex, DataIndexEntry, ObjectStorage 4 | from dvc_data.index.checkout import apply, compare 5 | 6 | 7 | def test_checkout(tmp_upath, odb, as_filesystem): 8 | index = DataIndex( 9 | { 10 | ("foo",): DataIndexEntry( 11 | key=("foo",), 12 | meta=Meta(), 13 | hash_info=HashInfo( 14 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 15 | ), 16 | ), 17 | ("data",): DataIndexEntry( 18 | key=("data",), 19 | meta=Meta(isdir=True), 20 | hash_info=HashInfo( 21 | name="md5", 22 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 23 | ), 24 | ), 25 | } 26 | ) 27 | index.storage_map.add_cache(ObjectStorage((), odb)) 28 | diff = compare(None, index) 29 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 30 | assert (tmp_upath / "foo").read_text() == "foo\n" 31 | assert (tmp_upath / "data").is_dir() 32 | assert (tmp_upath / "data" / "bar").read_text() == "bar\n" 33 | assert (tmp_upath / "data" / "baz").read_text() == "baz\n" 34 | assert set(tmp_upath.iterdir()) == { 35 | (tmp_upath / "foo"), 36 | (tmp_upath / "data"), 37 | } 38 | assert set((tmp_upath / "data").iterdir()) == { 39 | (tmp_upath / "data" / "bar"), 40 | (tmp_upath / "data" / "baz"), 41 | } 42 | 43 | 44 | def test_checkout_file(tmp_upath, odb, as_filesystem): 45 | index = DataIndex( 46 | { 47 | (): DataIndexEntry( 48 | key=(), 49 | meta=Meta(), 50 | hash_info=HashInfo( 51 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 52 | ), 53 | ), 54 | } 55 | ) 56 | index.storage_map.add_cache(ObjectStorage((), odb)) 57 | diff = compare(None, index) 58 | apply(diff, str(tmp_upath / "foo"), as_filesystem(tmp_upath.fs)) 59 | assert (tmp_upath / "foo").read_text() == "foo\n" 60 | 61 | 62 | def test_checkout_broken_dir(tmp_upath, odb, as_filesystem): 63 | index = DataIndex( 64 | { 65 | ("foo",): DataIndexEntry( 66 | key=("foo",), 67 | meta=Meta(), 68 | hash_info=HashInfo( 69 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 70 | ), 71 | ), 72 | ("data",): DataIndexEntry( 73 | key=("data",), 74 | meta=Meta(isdir=True), 75 | hash_info=HashInfo( 76 | name="md5", 77 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 78 | ), 79 | ), 80 | ("broken",): DataIndexEntry( 81 | key=("broken",), 82 | meta=Meta(isdir=True), 83 | hash_info=HashInfo( 84 | name="md5", 85 | 
value="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.dir", 86 | ), 87 | ), 88 | } 89 | ) 90 | index.storage_map.add_cache(ObjectStorage((), odb)) 91 | diff = compare(None, index) 92 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 93 | assert (tmp_upath / "foo").read_text() == "foo\n" 94 | assert (tmp_upath / "data").is_dir() 95 | assert (tmp_upath / "data" / "bar").read_text() == "bar\n" 96 | assert (tmp_upath / "data" / "baz").read_text() == "baz\n" 97 | assert set(tmp_upath.iterdir()) == { 98 | (tmp_upath / "foo"), 99 | (tmp_upath / "data"), 100 | } 101 | assert set((tmp_upath / "data").iterdir()) == { 102 | (tmp_upath / "data" / "bar"), 103 | (tmp_upath / "data" / "baz"), 104 | } 105 | assert not (tmp_upath / "broken").exists() 106 | 107 | 108 | def test_checkout_delete_nested_dir(tmp_upath, odb, as_filesystem): 109 | old = DataIndex( 110 | { 111 | ("dir1",): DataIndexEntry( 112 | key=("dir1",), 113 | meta=Meta(isdir=True), 114 | ), 115 | ("dir1", "subdir1"): DataIndexEntry( 116 | key=("dir1", "subdir1"), 117 | meta=Meta(isdir=True), 118 | ), 119 | } 120 | ) 121 | diff = compare(None, old) 122 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 123 | 124 | assert (tmp_upath / "dir1").exists() 125 | assert (tmp_upath / "dir1").is_dir() 126 | assert (tmp_upath / "dir1" / "subdir1").exists() 127 | assert (tmp_upath / "dir1" / "subdir1").is_dir() 128 | 129 | new = DataIndex({}) 130 | diff = compare(old, new, delete=True) 131 | apply(diff, str(tmp_upath), as_filesystem(tmp_upath.fs)) 132 | 133 | assert not (tmp_upath / "dir1" / "subdir1").exists() 134 | assert not (tmp_upath / "dir1").exists() 135 | -------------------------------------------------------------------------------- /tests/index/test_diff.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.hashfile.hash_info import HashInfo 4 | from dvc_data.hashfile.meta import Meta 5 | from dvc_data.index import DataIndex, DataIndexEntry 6 | from dvc_data.index.diff import ADD, DELETE, MODIFY, RENAME, UNCHANGED, Change, diff 7 | 8 | 9 | def test_diff(): 10 | old_foo_key = ("foo",) 11 | old_foo_entry = DataIndexEntry( 12 | key=old_foo_key, 13 | meta=Meta(), 14 | hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"), 15 | ) 16 | old_bar_key = ("dir", "subdir", "bar") 17 | old_bar_entry = DataIndexEntry( 18 | key=old_bar_key, 19 | meta=Meta(isdir=True), 20 | hash_info=HashInfo( 21 | name="md5", 22 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 23 | ), 24 | ) 25 | old = DataIndex({old_foo_key: old_foo_entry, old_bar_key: old_bar_entry}) 26 | 27 | assert set(diff(old, old, with_unchanged=True)) == { 28 | Change(UNCHANGED, old_foo_entry, old_foo_entry), 29 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 30 | } 31 | assert set(diff(old, old, with_renames=True, with_unchanged=True)) == { 32 | Change(UNCHANGED, old_foo_entry, old_foo_entry), 33 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 34 | } 35 | 36 | new_foo_key = ("data", "FOO") 37 | new_foo_entry = DataIndexEntry( 38 | key=new_foo_key, 39 | meta=Meta(), 40 | hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"), 41 | ) 42 | new = DataIndex( 43 | { 44 | ( 45 | "data", 46 | "FOO", 47 | ): new_foo_entry, 48 | old_bar_key: old_bar_entry, 49 | } 50 | ) 51 | 52 | assert set(diff(old, new, with_unchanged=True)) == { 53 | Change(ADD, None, new_foo_entry), 54 | Change(DELETE, old_foo_entry, None), 55 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 56 | } 57 | assert 
set(diff(old, new, with_renames=True, with_unchanged=True)) == { 58 | Change(RENAME, old_foo_entry, new_foo_entry), 59 | Change(UNCHANGED, old_bar_entry, old_bar_entry), 60 | } 61 | 62 | 63 | def test_diff_no_hashes(): 64 | index = DataIndex( 65 | { 66 | ("foo",): DataIndexEntry(key=("foo",)), 67 | } 68 | ) 69 | assert not set(diff(index, None, hash_only=True)) 70 | 71 | 72 | def test_diff_meta_only(): 73 | key = ("foo",) 74 | old_entry = DataIndexEntry( 75 | key=key, 76 | meta=Meta(etag="abc"), 77 | hash_info=HashInfo(name="md5", value="123"), 78 | ) 79 | new_entry = DataIndexEntry( 80 | key=key, 81 | meta=Meta(etag="abc"), 82 | hash_info=HashInfo(name="md5", value="456"), 83 | ) 84 | old = DataIndex({key: old_entry}) 85 | new = DataIndex({key: new_entry}) 86 | 87 | assert list(diff(old, new, meta_only=True, with_unchanged=True)) == [ 88 | Change(UNCHANGED, old_entry, new_entry), 89 | ] 90 | 91 | new_entry.meta = Meta(etag="def") 92 | assert list(diff(old, new, meta_only=True, with_unchanged=True)) == [ 93 | Change(MODIFY, old_entry, new_entry), 94 | ] 95 | 96 | 97 | @pytest.mark.parametrize( 98 | "typ, left_meta, left_hi, right_meta, right_hi", 99 | [ 100 | ( 101 | UNCHANGED, 102 | Meta(etag="123"), 103 | HashInfo(name="md5", value="123"), 104 | Meta(etag="123"), 105 | HashInfo(name="md5", value="123"), 106 | ), 107 | ( 108 | ADD, 109 | None, 110 | None, 111 | Meta(etag="123"), 112 | HashInfo(name="md5", value="123"), 113 | ), 114 | ( 115 | DELETE, 116 | Meta(etag="123"), 117 | HashInfo(name="md5", value="123"), 118 | None, 119 | None, 120 | ), 121 | ], 122 | ) 123 | def test_diff_combined(typ, left_meta, left_hi, right_meta, right_hi): 124 | key = ("foo",) 125 | old_entry = DataIndexEntry( 126 | key=key, 127 | meta=left_meta, 128 | hash_info=left_hi, 129 | ) 130 | new_entry = DataIndexEntry( 131 | key=key, 132 | meta=right_meta, 133 | hash_info=right_hi, 134 | ) 135 | old = DataIndex({key: old_entry}) 136 | new = DataIndex({key: new_entry}) 137 | 138 | # diff should return UNCHANGED if both meta and hash info match, 139 | # but MODIFY if they don't since entries still exist 140 | assert list(diff(old, new, with_unchanged=True)) == [ 141 | Change(UNCHANGED if typ == UNCHANGED else MODIFY, old_entry, new_entry), 142 | ] 143 | 144 | # diff should return UNCHANGED if both meta and hash info match, 145 | # but MODIFY if they don't since entries still exist 146 | old_entry.meta = None 147 | new_entry.meta = None 148 | assert list(diff(old, new, with_unchanged=True)) == [ 149 | Change(UNCHANGED if typ == UNCHANGED else MODIFY, old_entry, new_entry), 150 | ] 151 | 152 | # diff should return meta diff when both hash infos are None 153 | old_entry.meta = left_meta 154 | new_entry.meta = right_meta 155 | old_entry.hash_info = None 156 | new_entry.hash_info = None 157 | assert list(diff(old, new, with_unchanged=True)) == [ 158 | Change(typ, old_entry, new_entry), 159 | ] 160 | 161 | # diff should return modify when meta and hash info diff do not match 162 | old_entry.meta = Meta(etag="abc") 163 | new_entry.meta = Meta(etag="def") 164 | old_entry.hash_info = left_hi 165 | new_entry.hash_info = right_hi 166 | assert list(diff(old, new, with_unchanged=True)) == [ 167 | Change(MODIFY, old_entry, new_entry), 168 | ] 169 | old_entry.meta = left_meta 170 | new_entry.meta = right_meta 171 | old_entry.hash_info = HashInfo(name="md5", value="abc") 172 | new_entry.hash_info = HashInfo(name="md5", value="def") 173 | assert list(diff(old, new, with_unchanged=True)) == [ 174 | Change(MODIFY, old_entry, 
new_entry), 175 | ] 176 | -------------------------------------------------------------------------------- /tests/index/test_fs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dvc_data.fs import DataFileSystem 4 | from dvc_data.hashfile.hash_info import HashInfo 5 | from dvc_data.hashfile.meta import Meta 6 | from dvc_data.index import ( 7 | DataIndex, 8 | DataIndexDirError, 9 | DataIndexEntry, 10 | FileStorage, 11 | ObjectStorage, 12 | ) 13 | 14 | 15 | def test_fs(tmp_upath, odb, as_filesystem): 16 | index = DataIndex( 17 | { 18 | ("foo",): DataIndexEntry( 19 | key=("foo",), 20 | hash_info=HashInfo( 21 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 22 | ), 23 | ), 24 | ("data",): DataIndexEntry( 25 | key=("data",), 26 | meta=Meta(isdir=True), 27 | hash_info=HashInfo( 28 | name="md5", 29 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 30 | ), 31 | ), 32 | } 33 | ) 34 | index.storage_map.add_cache(ObjectStorage((), odb)) 35 | fs = DataFileSystem(index) 36 | assert fs.exists("foo") 37 | assert fs.cat("foo") == b"foo\n" 38 | assert fs.ls("foo") == [fs.info("foo")] 39 | assert fs.ls("/", detail=False) == ["/foo", "/data"] 40 | assert fs.ls("/", detail=True) == [fs.info("/foo"), fs.info("/data")] 41 | assert fs.cat("/data/bar") == b"bar\n" 42 | assert fs.cat("/data/baz") == b"baz\n" 43 | assert fs.ls("/data/bar") == [fs.info("data/bar")] 44 | assert fs.ls("/data", detail=False) == ["/data/bar", "/data/baz"] 45 | assert fs.ls("/data", detail=True) == [ 46 | fs.info("/data/bar"), 47 | fs.info("/data/baz"), 48 | ] 49 | 50 | 51 | def test_fs_file_storage(tmp_upath, as_filesystem): 52 | (tmp_upath / "foo").write_bytes(b"foo\n") 53 | (tmp_upath / "data").mkdir() 54 | (tmp_upath / "data" / "bar").write_bytes(b"bar\n") 55 | (tmp_upath / "data" / "baz").write_bytes(b"baz\n") 56 | 57 | index = DataIndex( 58 | { 59 | ("foo",): DataIndexEntry( 60 | key=("foo",), 61 | ), 62 | ("data",): DataIndexEntry( 63 | key=("data",), 64 | ), 65 | } 66 | ) 67 | index.storage_map.add_cache( 68 | FileStorage((), as_filesystem(tmp_upath.fs), str(tmp_upath)) 69 | ) 70 | fs = DataFileSystem(index) 71 | assert fs.exists("foo") 72 | assert fs.cat("foo") == b"foo\n" 73 | assert sorted(fs.ls("/", detail=False)) == sorted(["/foo", "/data"]) 74 | assert sorted(fs.ls("/", detail=True), key=lambda entry: entry["name"]) == sorted( 75 | [fs.info("/foo"), fs.info("/data")], 76 | key=lambda entry: entry["name"], 77 | ) 78 | assert fs.cat("/data/bar") == b"bar\n" 79 | assert fs.cat("/data/baz") == b"baz\n" 80 | assert sorted(fs.ls("/data", detail=False)) == sorted(["/data/bar", "/data/baz"]) 81 | assert sorted( 82 | fs.ls("/data", detail=True), key=lambda entry: entry["name"] 83 | ) == sorted( 84 | [ 85 | fs.info("/data/bar"), 86 | fs.info("/data/baz"), 87 | ], 88 | key=lambda entry: entry["name"], 89 | ) 90 | 91 | 92 | def test_fs_broken(tmp_upath, odb, as_filesystem): 93 | index = DataIndex( 94 | { 95 | ("foo",): DataIndexEntry( 96 | key=("foo",), 97 | hash_info=HashInfo( 98 | name="md5", value="d3b07384d113edec49eaa6238ad5ff00" 99 | ), 100 | ), 101 | ("data",): DataIndexEntry( 102 | key=("data",), 103 | meta=Meta(isdir=True), 104 | hash_info=HashInfo( 105 | name="md5", 106 | value="1f69c66028c35037e8bf67e5bc4ceb6a.dir", 107 | ), 108 | ), 109 | ("broken",): DataIndexEntry( 110 | key=("broken",), 111 | meta=Meta(isdir=True), 112 | hash_info=HashInfo( 113 | name="md5", 114 | value="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.dir", 115 | ), 116 | ), 117 | } 118 | ) 119 | 
index.storage_map.add_cache(ObjectStorage((), odb)) 120 | fs = DataFileSystem(index) 121 | assert fs.exists("foo") 122 | assert fs.cat("foo") == b"foo\n" 123 | assert fs.ls("foo") == [fs.info("foo")] 124 | 125 | assert fs.ls("/", detail=False) == ["/foo", "/data", "/broken"] 126 | assert fs.ls("/", detail=True) == [ 127 | fs.info("/foo"), 128 | fs.info("/data"), 129 | fs.info("/broken"), 130 | ] 131 | 132 | assert fs.cat("/data/bar") == b"bar\n" 133 | assert fs.cat("/data/baz") == b"baz\n" 134 | assert fs.ls("/data/bar") == [fs.info("data/bar")] 135 | assert fs.ls("/data", detail=False) == ["/data/bar", "/data/baz"] 136 | assert fs.ls("/data", detail=True) == [ 137 | fs.info("/data/bar"), 138 | fs.info("/data/baz"), 139 | ] 140 | 141 | assert fs.exists("/broken") 142 | assert fs.isdir("/broken") 143 | with pytest.raises(DataIndexDirError): 144 | fs.ls("/broken", detail=False) 145 | 146 | with pytest.raises(DataIndexDirError): 147 | fs.ls("/broken", detail=True) 148 | 149 | def onerror(_entry, _exc): 150 | pass 151 | 152 | fs.index.onerror = onerror 153 | assert fs.ls("/broken", detail=False) == [] 154 | assert fs.ls("/broken", detail=True) == [] 155 | 156 | 157 | def test_fs_du(tmp_upath, odb, as_filesystem): 158 | index = DataIndex( 159 | { 160 | ("file_no_meta",): DataIndexEntry( 161 | key=("file_no_meta",), 162 | ), 163 | ("file_meta_size",): DataIndexEntry( 164 | key=("file_meta_size",), 165 | meta=Meta(size=4), 166 | ), 167 | ("file_meta_no_size",): DataIndexEntry( 168 | key=("file_meta_no_size",), 169 | meta=Meta(), 170 | ), 171 | ("prefix",): DataIndexEntry( 172 | key=("prefix",), 173 | meta=Meta(isdir=True), 174 | ), 175 | ("prefix", "dir"): DataIndexEntry( 176 | key=("prefix", "dir"), 177 | meta=Meta(isdir=True), 178 | ), 179 | ("prefix", "dir", "dir_size"): DataIndexEntry( 180 | key=("prefix", "dir", "dir_size"), 181 | meta=Meta(isdir=True, size=123), 182 | ), 183 | } 184 | ) 185 | 186 | fs = DataFileSystem(index) 187 | 188 | assert fs.du("file_no_meta") == 0 189 | assert fs.du("file_meta_size") == 4 190 | assert fs.du("file_meta_no_size") == 0 191 | assert fs.du("prefix/dir/dir_size") == 123 192 | assert fs.du("prefix/dir") == 123 193 | assert fs.du("prefix") == 123 194 | assert fs.du("/") == 127 195 | 196 | assert fs.du("file_meta_size", total=False) == { 197 | "file_meta_size": 4, 198 | } 199 | assert fs.du("prefix", total=False) == { 200 | "prefix": 0, 201 | "prefix/dir": 0, 202 | "prefix/dir/dir_size": 123, 203 | } 204 | assert fs.du("prefix/dir", total=False) == { 205 | "prefix/dir": 0, 206 | "prefix/dir/dir_size": 123, 207 | } 208 | assert fs.du("/", total=False) == { 209 | "/": 0, 210 | "/file_meta_no_size": 0, 211 | "/file_meta_size": 4, 212 | "/file_no_meta": 0, 213 | "/prefix": 0, 214 | "/prefix/dir": 0, 215 | "/prefix/dir/dir_size": 123, 216 | } 217 | -------------------------------------------------------------------------------- /tests/index/test_storage.py: -------------------------------------------------------------------------------- 1 | from dvc_data.index import FileStorage, ObjectStorage, StorageInfo, StorageMapping 2 | 3 | 4 | def test_map_get(tmp_upath, as_filesystem, odb): 5 | smap = StorageMapping() 6 | 7 | data = FileStorage(key=(), fs=as_filesystem(tmp_upath.fs), path=str(tmp_upath)) 8 | cache = FileStorage( 9 | key=("dir",), fs=as_filesystem(tmp_upath.fs), path=str(tmp_upath) 10 | ) 11 | remote = FileStorage( 12 | key=("dir", "subdir"), fs=as_filesystem(tmp_upath.fs), path=str(tmp_upath) 13 | ) 14 | foo_cache = ObjectStorage(key=("dir", "foo"), 
odb=odb) 15 | 16 | smap[()] = StorageInfo(data=data) 17 | smap[("dir",)] = StorageInfo(cache=cache) 18 | smap[("dir", "subdir")] = StorageInfo(remote=remote) 19 | smap[("dir", "foo")] = StorageInfo(cache=foo_cache) 20 | 21 | sinfo = smap[()] 22 | assert sinfo.data == data 23 | assert sinfo.cache is None 24 | assert sinfo.remote is None 25 | 26 | sinfo = smap[("dir",)] 27 | assert sinfo.data == data 28 | assert sinfo.cache == cache 29 | assert sinfo.remote is None 30 | 31 | sinfo = smap[("dir", "foo")] 32 | assert sinfo.data == data 33 | assert sinfo.cache == foo_cache 34 | assert sinfo.remote is None 35 | 36 | sinfo = smap[("dir", "subdir")] 37 | assert sinfo.data == data 38 | assert sinfo.cache == cache 39 | assert sinfo.remote == remote 40 | 41 | sinfo = smap[("dir", "subdir", "file")] 42 | assert sinfo.data == data 43 | assert sinfo.cache == cache 44 | assert sinfo.remote == remote 45 | 46 | sinfo = smap[("dir", "subdir", "subsubdir", "otherfile")] 47 | assert sinfo.data == data 48 | assert sinfo.cache == cache 49 | assert sinfo.remote == remote 50 | --------------------------------------------------------------------------------
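Note: the index tests above revolve around one recurring pattern: populate a content-addressed HashFileDB, describe the desired workspace in a DataIndex, attach the odb as cache storage, then compare/apply to materialize files. The following standalone sketch (not part of the repository) condenses that flow under a few stated assumptions: it uses a local temporary directory and dvc-objects' LocalFileSystem in place of the tmp_upath/as_filesystem fixtures, and it reuses the "foo" object id from tests/index/conftest.py.

import os
import tempfile

from dvc_objects.fs.local import LocalFileSystem

from dvc_data.hashfile.db import HashFileDB
from dvc_data.hashfile.hash_info import HashInfo
from dvc_data.hashfile.meta import Meta
from dvc_data.index import DataIndex, DataIndexEntry, ObjectStorage
from dvc_data.index.checkout import apply, compare

fs = LocalFileSystem()
root = tempfile.mkdtemp()  # assumption: any writable local directory works here

# Content-addressed store keyed by md5, mirroring the `odb` fixture above.
odb = HashFileDB(fs, os.path.join(root, "cache"))
odb.add_bytes("d3b07384d113edec49eaa6238ad5ff00", b"foo\n")  # md5 of b"foo\n"

# Index describing a single file "foo" whose content lives in the odb.
index = DataIndex(
    {
        ("foo",): DataIndexEntry(
            key=("foo",),
            meta=Meta(),
            hash_info=HashInfo(name="md5", value="d3b07384d113edec49eaa6238ad5ff00"),
        ),
    }
)
index.storage_map.add_cache(ObjectStorage((), odb))

# Diffing against an empty (None) index yields additions; apply() writes them out.
workspace = os.path.join(root, "workspace")
os.makedirs(workspace, exist_ok=True)
diff = compare(None, index)
apply(diff, workspace, fs)

with open(os.path.join(workspace, "foo"), "rb") as fobj:
    assert fobj.read() == b"foo\n"

On top of the same index, DataFileSystem(index) (exercised in tests/index/test_fs.py) gives read-only, filesystem-style access (exists/cat/ls) to the indexed data without checking anything out.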