├── .coveragerc ├── .gitattributes ├── .github └── workflows │ ├── ci.yml │ └── publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmark ├── README.md ├── results.csv ├── results.json ├── results_v0.2.0.csv ├── results_v0.2.0.json └── run.py ├── codecov.yml ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src └── dirhash │ ├── __init__.py │ ├── _version.py │ └── cli.py ├── tests ├── test_cli.py └── test_dirhash.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = dirhash 4 | omit = _version.py 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | src/dirhash/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | pull_request: 8 | branches: 9 | - "**" 10 | workflow_dispatch: 11 | release: 12 | types: [published, edited] 13 | 14 | jobs: 15 | pre-commit: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.8" 22 | - uses: pre-commit/action@v3.0.1 23 | 24 | tests: 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 30 | os: [ubuntu-latest, windows-latest] 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | python -m pip install tox tox-gh-actions 42 | - name: Cache tox environments 43 | id: cache-tox 44 | uses: actions/cache@v4 45 | with: 46 | path: .tox 47 | # setup.py and setup.cfg have versioning info that would impact the 48 | # tox environment. hashFiles only takes a single file path or pattern 49 | # at the moment. 50 | key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} 51 | - name: Test with tox 52 | run: tox 53 | - uses: codecov/codecov-action@v4 54 | if: matrix.python-version == '3.8' && matrix.os == 'ubuntu-latest' 55 | with: 56 | token: ${{ secrets.CODECOV_TOKEN }} 57 | verbose: true 58 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # Based on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/# 2 | name: Publish Python Package 3 | 4 | on: 5 | push: 6 | tags: 7 | - "v[0-9]+.[0-9]+.[0-9]*" 8 | 9 | jobs: 10 | build: 11 | name: Build distribution 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | # NOTE: tags are not present unless triggered by tag push 17 | # - name: Get tags 18 | # run: git fetch --tags origin 19 | # - name: List tags 20 | # run: git tag --list 21 | # TODO: somehow versioneer does not pickup the tag when workflow is not triggered by a 22 | # tag push, getting e.g. 
(for sister repo scantree) scantree-0+untagged.1.gd74b1d5, 23 | # see: https://github.com/andhus/scantree/actions/runs/7485873305/job/20375116541#step:7:42) 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.x" 28 | - name: Install pypa/build 29 | run: >- 30 | python3 -m 31 | pip install 32 | build 33 | --user 34 | - name: Build a binary wheel and a source tarball 35 | run: python3 -m build 36 | - name: Store the distribution packages 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: python-package-distributions 40 | path: dist/ 41 | 42 | publish-to-pypi: 43 | name: Publish to PyPI 44 | # TODO we need to make sure the tag matches the version! 45 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 46 | needs: 47 | - build 48 | runs-on: ubuntu-latest 49 | environment: 50 | name: pypi 51 | url: https://pypi.org/p/dirhash 52 | permissions: 53 | id-token: write # IMPORTANT: mandatory for trusted publishing 54 | 55 | steps: 56 | - name: Download all the dists 57 | uses: actions/download-artifact@v4 58 | with: 59 | name: python-package-distributions 60 | path: dist/ 61 | - name: Publish distribution 📦 to PyPI 62 | uses: pypa/gh-action-pypi-publish@release/v1 63 | 64 | github-release: 65 | name: Sign and upload to GitHub Release 66 | needs: 67 | - publish-to-pypi 68 | runs-on: ubuntu-latest 69 | 70 | permissions: 71 | contents: write # IMPORTANT: mandatory for making GitHub Releases 72 | id-token: write # IMPORTANT: mandatory for sigstore 73 | 74 | steps: 75 | - name: Download all the dists 76 | uses: actions/download-artifact@v4 77 | with: 78 | name: python-package-distributions 79 | path: dist/ 80 | - name: Sign the dists with Sigstore 81 | uses: sigstore/gh-action-sigstore-python@v2.1.1 82 | with: 83 | inputs: >- 84 | ./dist/*.tar.gz 85 | ./dist/*.whl 86 | - name: Create GitHub Release 87 | env: 88 | GITHUB_TOKEN: ${{ github.token }} 89 | run: >- 90 | gh release create 91 | '${{ github.ref_name }}' 92 | --repo '${{ github.repository }}' 93 | --notes "" 94 | - name: Upload artifact signatures to GitHub Release 95 | env: 96 | GITHUB_TOKEN: ${{ github.token }} 97 | # Upload to GitHub Release using the `gh` CLI. 98 | # `dist/` contains the built packages, and the 99 | # sigstore-produced signatures and certificates. 
100 | run: >- 101 | gh release upload 102 | '${{ github.ref_name }}' dist/** 103 | --repo '${{ github.repository }}' 104 | 105 | publish-to-testpypi: 106 | name: Publish to TestPyPI 107 | if: startsWith(github.ref, 'refs/tags/') # only publish on tag pushes 108 | needs: 109 | - build 110 | runs-on: ubuntu-latest 111 | 112 | environment: 113 | name: testpypi 114 | url: https://test.pypi.org/p/dirhash 115 | 116 | permissions: 117 | id-token: write # IMPORTANT: mandatory for trusted publishing 118 | 119 | steps: 120 | - name: Download all the dists 121 | uses: actions/download-artifact@v4 122 | with: 123 | name: python-package-distributions 124 | path: dist/ 125 | - name: Publish distribution 📦 to TestPyPI 126 | uses: pypa/gh-action-pypi-publish@release/v1 127 | with: 128 | repository-url: https://test.pypi.org/legacy/ 129 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Pycharm 107 | .idea/ 108 | 109 | # VSC 110 | .vscode/ 111 | 112 | # Project specific 113 | benchmark/test_cases/* 114 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-prettier 3 | rev: v3.1.0 4 | hooks: 5 | - id: prettier 6 | args: [--prose-wrap=preserve, --print-width=88] 7 | - repo: https://github.com/astral-sh/ruff-pre-commit 8 | rev: v0.3.7 9 | hooks: 10 | - id: ruff 11 | args: 12 | - --fix 13 | - id: ruff-format 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All 
notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | NIL 11 | 12 | ## [0.2.0] - 2019-04-20 13 | 14 | Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/v0.1.0) 15 | 16 | ### Added 17 | 18 | - A first implementation based on the formalized [Dirhash Standard](https://github.com/andhus/dirhash). 19 | - This changelog. 20 | - Results from a new benchmark run after the changes. The `benchmark/run.py` script now outputs results files whose names include the `dirhash.__version__`. 21 | 22 | ### Changed 23 | 24 | - **Significant breaking changes** from version 0.1.1 - both regarding the API and the 25 | underlying method/protocol for computing the hash. This means that **hashes 26 | computed with this version will differ from hashes computed with version < 0.2.0 for 27 | the same directory**. 28 | - This dirhash python implementation has moved here, to 29 | [github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python), from 30 | the previous repository 31 | [github.com/andhus/dirhash](https://github.com/andhus/dirhash), 32 | which now contains the formal description of the Dirhash Standard. 33 | 34 | ### Removed 35 | 36 | - All support for the `.dirhashignore` file. This seemed superfluous; please file an 37 | issue if you need this feature. 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Anders Huss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) 2 | 3 | # dirhash 4 | 5 | A lightweight python module and CLI for computing the hash of any 6 | directory based on its files' structure and content. 7 | 8 | - Supports all hashing algorithms of Python's built-in `hashlib` module. 9 | - Glob/wildcard (".gitignore style") path matching for expressive filtering of files to include/exclude. 10 | - Multiprocessing for up to [6x speed-up](#performance). 11 | 12 | The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash), which is designed to allow for consistent and collision resistant generation/verification of directory hashes across implementations. 13 | 14 | ## Installation 15 | 16 | From PyPI: 17 | 18 | ```commandline 19 | pip install dirhash 20 | ``` 21 | 22 | Or directly from source: 23 | 24 | ```commandline 25 | git clone git@github.com:andhus/dirhash-python.git 26 | pip install dirhash/ 27 | ``` 28 | 29 | ## Usage 30 | 31 | Python module: 32 | 33 | ```python 34 | from dirhash import dirhash 35 | 36 | dirpath = "path/to/directory" 37 | dir_md5 = dirhash(dirpath, "md5") 38 | pyfiles_md5 = dirhash(dirpath, "md5", match=["*.py"]) 39 | no_hidden_sha1 = dirhash(dirpath, "sha1", ignore=[".*", ".*/"]) 40 | ``` 41 | 42 | CLI: 43 | 44 | ```commandline 45 | dirhash path/to/directory -a md5 46 | dirhash path/to/directory -a md5 --match "*.py" 47 | dirhash path/to/directory -a sha1 --ignore ".*" ".*/" 48 | ``` 49 | 50 | ## Why? 51 | 52 | If you (or your application) need to verify the integrity of a set of files as well 53 | as their names and locations, you might find this useful. Use-cases range from 54 | verification of your image classification dataset (before spending GPU-$$$ on 55 | training your fancy Deep Learning model) to validation of generated files in 56 | regression-testing. 57 | 58 | There isn't really a standard way of doing this. There are plenty of recipes out 59 | there (see e.g. these SO-questions for [linux](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) 60 | and [python](https://stackoverflow.com/questions/24937495/how-can-i-calculate-a-hash-for-a-filesystem-directory-using-python)) 61 | but I couldn't find one that is properly tested (there are some gotchas to cover!) 62 | and documented with a compelling user interface. `dirhash` was created with this as 63 | the goal. 64 | 65 | [checksumdir](https://github.com/cakepietoast/checksumdir) is another python 66 | module/tool with similar intent (that inspired this project), but it lacks much of the 67 | functionality offered here (most notably including file names/structure in the hash) 68 | and lacks tests. 69 | 70 | ## Performance 71 | 72 | The python `hashlib` implementations of common hashing algorithms are highly 73 | optimised. `dirhash` mainly parses the file tree, pipes data to `hashlib` and 74 | combines the output.
Reasonable measures have been taken to minimize the overhead 75 | and, for common use-cases, the majority of time is spent reading data from disk 76 | and executing `hashlib` code. 77 | 78 | The main effort to boost performance is support for multiprocessing, where the 79 | reading and hashing are parallelized over individual files. 80 | 81 | As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) 82 | with the shell command: 83 | 84 | `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` 85 | 86 | which is the top answer for the SO-question: 87 | [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) 88 | Results for two test cases are shown below. Both have 1 GiB of random data: in 89 | "flat_1k_1MB", split into 1k files (1 MiB each) in a flat structure, and in 90 | "nested_32k_32kB", into 32k files (32 KiB each) spread over the 256 leaf directories 91 | in a binary tree of depth 8. 92 | 93 | | Implementation | Test Case | Time (s) | Speed up | 94 | | -------------------- | --------------- | -------: | -------: | 95 | | shell reference | flat_1k_1MB | 2.29 | -> 1.0 | 96 | | `dirhash` | flat_1k_1MB | 1.67 | 1.36 | 97 | | `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | 98 | | shell reference | nested_32k_32kB | 6.82 | -> 1.0 | 99 | | `dirhash` | nested_32k_32kB | 3.43 | 2.00 | 100 | | `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | 101 | 102 | The benchmark was run on a MacBook Pro (2018); further details and source code can be found [here](https://github.com/andhus/dirhash-python/tree/master/benchmark). 103 | 104 | ## Documentation 105 | 106 | Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). 107 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | As a reference, the performance of `dirhash` is benchmarked against the shell command: 4 | 5 | `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` 6 | 7 | (top answer for the SO-question: 8 | [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents)) 9 | 10 | Each test case contains 1 GiB of random data, split equally into 8, 1k or 32k files, 11 | in a flat or nested (binary tree of depth 8) structure. 12 | 13 | For a fair comparison, _the CLI version_ of `dirhash` was used (including startup 14 | time for loading of python modules etc.). 15 | 16 | For full details and reproducibility, see/run the `run.py` script, whose output is 17 | found in `results.csv`.
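The published numbers are obtained by timing the *CLI* through the shell's `time` builtin (see `time_shell` and `get_dirhash_shell_cmd` in `run.py`). As a rough, self-contained complement, a comparable measurement can be sketched directly against the Python API; the snippet below is only an illustration and is not part of `run.py`, and `path/to/test_case` is a placeholder for one of the generated test case directories:

```python
# Illustrative only: time the dirhash Python API with a varying number of worker
# processes. run.py instead times the CLI via the shell, so numbers will differ.
import time

from dirhash import dirhash

dirpath = "path/to/test_case"  # placeholder for a generated test case directory
for jobs in (1, 2, 4, 8):
    start = time.perf_counter()
    value = dirhash(dirpath, "md5", jobs=jobs)
    elapsed = time.perf_counter() - start
    print(f"jobs={jobs}: {elapsed:.2f} s ({value})")
```

The results referred to below (`results.csv` and the sample table) were, in contrast, measured through the CLI as described above.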
These results were generated on a MacBook Pro (2018): 18 | 19 | - 2,2 GHz Intel Core i7 (`sysctl -n hw.physicalcpu hw.logicalcpu`-> 6, 12) 20 | - 16 GB 2400 MHz DDR4 21 | - APPLE SSD AP0512M 22 | 23 | ## Sample results: 24 | 25 | | Implementation | Test Case | Time (s) | Speed up | 26 | | -------------------- | --------------- | -------: | -------: | 27 | | shell reference | flat_1k_1MB | 2.29 | -> 1.0 | 28 | | `dirhash` | flat_1k_1MB | 1.67 | 1.36 | 29 | | `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | 30 | | shell reference | nested_32k_32kB | 6.82 | -> 1.0 | 31 | | `dirhash` | nested_32k_32kB | 3.43 | 2.00 | 32 | | `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | 33 | -------------------------------------------------------------------------------- /benchmark/results.csv: -------------------------------------------------------------------------------- 1 | ,test_case,implementation,algorithm,workers,t_best,t_median,speed-up (median) 2 | 0,flat_8_128MB,shell reference,md5,1,2.014,2.02,1.0 3 | 1,flat_8_128MB,dirhash,md5,1,1.602,1.604,1.2593516209476308 4 | 2,flat_8_128MB,dirhash,md5,2,0.977,0.98,2.061224489795918 5 | 3,flat_8_128MB,dirhash,md5,4,0.562,0.569,3.5500878734622145 6 | 4,flat_8_128MB,dirhash,md5,8,0.464,0.473,4.2706131078224105 7 | 5,flat_1k_1MB,shell reference,md5,1,2.263,2.268,1.0 8 | 6,flat_1k_1MB,dirhash,md5,1,1.662,1.667,1.3605278944211157 9 | 7,flat_1k_1MB,dirhash,md5,2,0.978,0.983,2.3072227873855544 10 | 8,flat_1k_1MB,dirhash,md5,4,0.57,0.58,3.910344827586207 11 | 9,flat_1k_1MB,dirhash,md5,8,0.476,0.48,4.725 12 | 10,flat_32k_32kB,shell reference,md5,1,6.711,6.721,1.0 13 | 11,flat_32k_32kB,dirhash,md5,1,3.329,3.354,2.003875968992248 14 | 12,flat_32k_32kB,dirhash,md5,2,2.067,2.074,3.240597878495661 15 | 13,flat_32k_32kB,dirhash,md5,4,1.345,1.362,4.934654919236417 16 | 14,flat_32k_32kB,dirhash,md5,8,1.09,1.094,6.143510054844606 17 | 15,nested_1k_1MB,shell reference,md5,1,2.296,2.306,1.0 18 | 16,nested_1k_1MB,dirhash,md5,1,1.713,1.714,1.3453908984830805 19 | 17,nested_1k_1MB,dirhash,md5,2,0.996,1.009,2.285431119920714 20 | 18,nested_1k_1MB,dirhash,md5,4,0.601,0.602,3.8305647840531565 21 | 19,nested_1k_1MB,dirhash,md5,8,0.499,0.505,4.566336633663366 22 | 20,nested_32k_32kB,shell reference,md5,1,6.814,6.818,1.0 23 | 21,nested_32k_32kB,dirhash,md5,1,3.376,3.426,1.9900758902510214 24 | 22,nested_32k_32kB,dirhash,md5,2,2.147,2.153,3.166744078030655 25 | 23,nested_32k_32kB,dirhash,md5,4,1.414,1.416,4.814971751412429 26 | 24,nested_32k_32kB,dirhash,md5,8,1.137,1.138,5.991212653778559 27 | 25,flat_8_128MB,shell reference,sha1,1,2.181,2.196,1.0 28 | 26,flat_8_128MB,dirhash,sha1,1,1.214,1.225,1.7926530612244898 29 | 27,flat_8_128MB,dirhash,sha1,2,0.768,0.774,2.8372093023255816 30 | 28,flat_8_128MB,dirhash,sha1,4,0.467,0.474,4.632911392405064 31 | 29,flat_8_128MB,dirhash,sha1,8,0.47,0.477,4.603773584905661 32 | 30,flat_1k_1MB,shell reference,sha1,1,2.221,2.229,1.0 33 | 31,flat_1k_1MB,dirhash,sha1,1,1.252,1.263,1.7648456057007127 34 | 32,flat_1k_1MB,dirhash,sha1,2,0.774,0.777,2.8687258687258685 35 | 33,flat_1k_1MB,dirhash,sha1,4,0.471,0.477,4.672955974842767 36 | 34,flat_1k_1MB,dirhash,sha1,8,0.378,0.478,4.663179916317992 37 | 35,flat_32k_32kB,shell reference,sha1,1,4.178,4.224,1.0 38 | 36,flat_32k_32kB,dirhash,sha1,1,2.921,3.008,1.4042553191489362 39 | 37,flat_32k_32kB,dirhash,sha1,2,1.888,1.892,2.232558139534884 40 | 38,flat_32k_32kB,dirhash,sha1,4,1.266,1.275,3.3129411764705887 41 | 39,flat_32k_32kB,dirhash,sha1,8,1.072,1.079,3.914735866543096 42 | 
40,nested_1k_1MB,shell reference,sha1,1,2.236,2.26,1.0 43 | 41,nested_1k_1MB,dirhash,sha1,1,1.308,1.314,1.719939117199391 44 | 42,nested_1k_1MB,dirhash,sha1,2,0.797,0.8,2.8249999999999997 45 | 43,nested_1k_1MB,dirhash,sha1,4,0.501,0.509,4.4400785854616895 46 | 44,nested_1k_1MB,dirhash,sha1,8,0.499,0.503,4.493041749502981 47 | 45,nested_32k_32kB,shell reference,sha1,1,4.383,4.406,1.0 48 | 46,nested_32k_32kB,dirhash,sha1,1,3.041,3.05,1.4445901639344263 49 | 47,nested_32k_32kB,dirhash,sha1,2,1.943,1.965,2.242239185750636 50 | 48,nested_32k_32kB,dirhash,sha1,4,1.329,1.334,3.3028485757121433 51 | 49,nested_32k_32kB,dirhash,sha1,8,1.14,1.149,3.8346388163620535 52 | -------------------------------------------------------------------------------- /benchmark/results.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_case": "flat_8_128MB", 4 | "implementation": "shell reference", 5 | "algorithm": "md5", 6 | "workers": 1, 7 | "t_best": 2.014, 8 | "t_median": 2.02 9 | }, 10 | { 11 | "test_case": "flat_8_128MB", 12 | "implementation": "dirhash", 13 | "algorithm": "md5", 14 | "workers": 1, 15 | "t_best": 1.602, 16 | "t_median": 1.604 17 | }, 18 | { 19 | "test_case": "flat_8_128MB", 20 | "implementation": "dirhash", 21 | "algorithm": "md5", 22 | "workers": 2, 23 | "t_best": 0.977, 24 | "t_median": 0.98 25 | }, 26 | { 27 | "test_case": "flat_8_128MB", 28 | "implementation": "dirhash", 29 | "algorithm": "md5", 30 | "workers": 4, 31 | "t_best": 0.562, 32 | "t_median": 0.569 33 | }, 34 | { 35 | "test_case": "flat_8_128MB", 36 | "implementation": "dirhash", 37 | "algorithm": "md5", 38 | "workers": 8, 39 | "t_best": 0.464, 40 | "t_median": 0.473 41 | }, 42 | { 43 | "test_case": "flat_1k_1MB", 44 | "implementation": "shell reference", 45 | "algorithm": "md5", 46 | "workers": 1, 47 | "t_best": 2.263, 48 | "t_median": 2.268 49 | }, 50 | { 51 | "test_case": "flat_1k_1MB", 52 | "implementation": "dirhash", 53 | "algorithm": "md5", 54 | "workers": 1, 55 | "t_best": 1.662, 56 | "t_median": 1.667 57 | }, 58 | { 59 | "test_case": "flat_1k_1MB", 60 | "implementation": "dirhash", 61 | "algorithm": "md5", 62 | "workers": 2, 63 | "t_best": 0.978, 64 | "t_median": 0.983 65 | }, 66 | { 67 | "test_case": "flat_1k_1MB", 68 | "implementation": "dirhash", 69 | "algorithm": "md5", 70 | "workers": 4, 71 | "t_best": 0.57, 72 | "t_median": 0.58 73 | }, 74 | { 75 | "test_case": "flat_1k_1MB", 76 | "implementation": "dirhash", 77 | "algorithm": "md5", 78 | "workers": 8, 79 | "t_best": 0.476, 80 | "t_median": 0.48 81 | }, 82 | { 83 | "test_case": "flat_32k_32kB", 84 | "implementation": "shell reference", 85 | "algorithm": "md5", 86 | "workers": 1, 87 | "t_best": 6.711, 88 | "t_median": 6.721 89 | }, 90 | { 91 | "test_case": "flat_32k_32kB", 92 | "implementation": "dirhash", 93 | "algorithm": "md5", 94 | "workers": 1, 95 | "t_best": 3.329, 96 | "t_median": 3.354 97 | }, 98 | { 99 | "test_case": "flat_32k_32kB", 100 | "implementation": "dirhash", 101 | "algorithm": "md5", 102 | "workers": 2, 103 | "t_best": 2.067, 104 | "t_median": 2.074 105 | }, 106 | { 107 | "test_case": "flat_32k_32kB", 108 | "implementation": "dirhash", 109 | "algorithm": "md5", 110 | "workers": 4, 111 | "t_best": 1.345, 112 | "t_median": 1.362 113 | }, 114 | { 115 | "test_case": "flat_32k_32kB", 116 | "implementation": "dirhash", 117 | "algorithm": "md5", 118 | "workers": 8, 119 | "t_best": 1.09, 120 | "t_median": 1.094 121 | }, 122 | { 123 | "test_case": "nested_1k_1MB", 124 | "implementation": "shell 
reference", 125 | "algorithm": "md5", 126 | "workers": 1, 127 | "t_best": 2.296, 128 | "t_median": 2.306 129 | }, 130 | { 131 | "test_case": "nested_1k_1MB", 132 | "implementation": "dirhash", 133 | "algorithm": "md5", 134 | "workers": 1, 135 | "t_best": 1.713, 136 | "t_median": 1.714 137 | }, 138 | { 139 | "test_case": "nested_1k_1MB", 140 | "implementation": "dirhash", 141 | "algorithm": "md5", 142 | "workers": 2, 143 | "t_best": 0.996, 144 | "t_median": 1.009 145 | }, 146 | { 147 | "test_case": "nested_1k_1MB", 148 | "implementation": "dirhash", 149 | "algorithm": "md5", 150 | "workers": 4, 151 | "t_best": 0.601, 152 | "t_median": 0.602 153 | }, 154 | { 155 | "test_case": "nested_1k_1MB", 156 | "implementation": "dirhash", 157 | "algorithm": "md5", 158 | "workers": 8, 159 | "t_best": 0.499, 160 | "t_median": 0.505 161 | }, 162 | { 163 | "test_case": "nested_32k_32kB", 164 | "implementation": "shell reference", 165 | "algorithm": "md5", 166 | "workers": 1, 167 | "t_best": 6.814, 168 | "t_median": 6.818 169 | }, 170 | { 171 | "test_case": "nested_32k_32kB", 172 | "implementation": "dirhash", 173 | "algorithm": "md5", 174 | "workers": 1, 175 | "t_best": 3.376, 176 | "t_median": 3.426 177 | }, 178 | { 179 | "test_case": "nested_32k_32kB", 180 | "implementation": "dirhash", 181 | "algorithm": "md5", 182 | "workers": 2, 183 | "t_best": 2.147, 184 | "t_median": 2.153 185 | }, 186 | { 187 | "test_case": "nested_32k_32kB", 188 | "implementation": "dirhash", 189 | "algorithm": "md5", 190 | "workers": 4, 191 | "t_best": 1.414, 192 | "t_median": 1.416 193 | }, 194 | { 195 | "test_case": "nested_32k_32kB", 196 | "implementation": "dirhash", 197 | "algorithm": "md5", 198 | "workers": 8, 199 | "t_best": 1.137, 200 | "t_median": 1.138 201 | }, 202 | { 203 | "test_case": "flat_8_128MB", 204 | "implementation": "shell reference", 205 | "algorithm": "sha1", 206 | "workers": 1, 207 | "t_best": 2.181, 208 | "t_median": 2.196 209 | }, 210 | { 211 | "test_case": "flat_8_128MB", 212 | "implementation": "dirhash", 213 | "algorithm": "sha1", 214 | "workers": 1, 215 | "t_best": 1.214, 216 | "t_median": 1.225 217 | }, 218 | { 219 | "test_case": "flat_8_128MB", 220 | "implementation": "dirhash", 221 | "algorithm": "sha1", 222 | "workers": 2, 223 | "t_best": 0.768, 224 | "t_median": 0.774 225 | }, 226 | { 227 | "test_case": "flat_8_128MB", 228 | "implementation": "dirhash", 229 | "algorithm": "sha1", 230 | "workers": 4, 231 | "t_best": 0.467, 232 | "t_median": 0.474 233 | }, 234 | { 235 | "test_case": "flat_8_128MB", 236 | "implementation": "dirhash", 237 | "algorithm": "sha1", 238 | "workers": 8, 239 | "t_best": 0.47, 240 | "t_median": 0.477 241 | }, 242 | { 243 | "test_case": "flat_1k_1MB", 244 | "implementation": "shell reference", 245 | "algorithm": "sha1", 246 | "workers": 1, 247 | "t_best": 2.221, 248 | "t_median": 2.229 249 | }, 250 | { 251 | "test_case": "flat_1k_1MB", 252 | "implementation": "dirhash", 253 | "algorithm": "sha1", 254 | "workers": 1, 255 | "t_best": 1.252, 256 | "t_median": 1.263 257 | }, 258 | { 259 | "test_case": "flat_1k_1MB", 260 | "implementation": "dirhash", 261 | "algorithm": "sha1", 262 | "workers": 2, 263 | "t_best": 0.774, 264 | "t_median": 0.777 265 | }, 266 | { 267 | "test_case": "flat_1k_1MB", 268 | "implementation": "dirhash", 269 | "algorithm": "sha1", 270 | "workers": 4, 271 | "t_best": 0.471, 272 | "t_median": 0.477 273 | }, 274 | { 275 | "test_case": "flat_1k_1MB", 276 | "implementation": "dirhash", 277 | "algorithm": "sha1", 278 | "workers": 8, 279 | "t_best": 0.378, 280 | 
"t_median": 0.478 281 | }, 282 | { 283 | "test_case": "flat_32k_32kB", 284 | "implementation": "shell reference", 285 | "algorithm": "sha1", 286 | "workers": 1, 287 | "t_best": 4.178, 288 | "t_median": 4.224 289 | }, 290 | { 291 | "test_case": "flat_32k_32kB", 292 | "implementation": "dirhash", 293 | "algorithm": "sha1", 294 | "workers": 1, 295 | "t_best": 2.921, 296 | "t_median": 3.008 297 | }, 298 | { 299 | "test_case": "flat_32k_32kB", 300 | "implementation": "dirhash", 301 | "algorithm": "sha1", 302 | "workers": 2, 303 | "t_best": 1.888, 304 | "t_median": 1.892 305 | }, 306 | { 307 | "test_case": "flat_32k_32kB", 308 | "implementation": "dirhash", 309 | "algorithm": "sha1", 310 | "workers": 4, 311 | "t_best": 1.266, 312 | "t_median": 1.275 313 | }, 314 | { 315 | "test_case": "flat_32k_32kB", 316 | "implementation": "dirhash", 317 | "algorithm": "sha1", 318 | "workers": 8, 319 | "t_best": 1.072, 320 | "t_median": 1.079 321 | }, 322 | { 323 | "test_case": "nested_1k_1MB", 324 | "implementation": "shell reference", 325 | "algorithm": "sha1", 326 | "workers": 1, 327 | "t_best": 2.236, 328 | "t_median": 2.26 329 | }, 330 | { 331 | "test_case": "nested_1k_1MB", 332 | "implementation": "dirhash", 333 | "algorithm": "sha1", 334 | "workers": 1, 335 | "t_best": 1.308, 336 | "t_median": 1.314 337 | }, 338 | { 339 | "test_case": "nested_1k_1MB", 340 | "implementation": "dirhash", 341 | "algorithm": "sha1", 342 | "workers": 2, 343 | "t_best": 0.797, 344 | "t_median": 0.8 345 | }, 346 | { 347 | "test_case": "nested_1k_1MB", 348 | "implementation": "dirhash", 349 | "algorithm": "sha1", 350 | "workers": 4, 351 | "t_best": 0.501, 352 | "t_median": 0.509 353 | }, 354 | { 355 | "test_case": "nested_1k_1MB", 356 | "implementation": "dirhash", 357 | "algorithm": "sha1", 358 | "workers": 8, 359 | "t_best": 0.499, 360 | "t_median": 0.503 361 | }, 362 | { 363 | "test_case": "nested_32k_32kB", 364 | "implementation": "shell reference", 365 | "algorithm": "sha1", 366 | "workers": 1, 367 | "t_best": 4.383, 368 | "t_median": 4.406 369 | }, 370 | { 371 | "test_case": "nested_32k_32kB", 372 | "implementation": "dirhash", 373 | "algorithm": "sha1", 374 | "workers": 1, 375 | "t_best": 3.041, 376 | "t_median": 3.05 377 | }, 378 | { 379 | "test_case": "nested_32k_32kB", 380 | "implementation": "dirhash", 381 | "algorithm": "sha1", 382 | "workers": 2, 383 | "t_best": 1.943, 384 | "t_median": 1.965 385 | }, 386 | { 387 | "test_case": "nested_32k_32kB", 388 | "implementation": "dirhash", 389 | "algorithm": "sha1", 390 | "workers": 4, 391 | "t_best": 1.329, 392 | "t_median": 1.334 393 | }, 394 | { 395 | "test_case": "nested_32k_32kB", 396 | "implementation": "dirhash", 397 | "algorithm": "sha1", 398 | "workers": 8, 399 | "t_best": 1.14, 400 | "t_median": 1.149 401 | } 402 | ] 403 | -------------------------------------------------------------------------------- /benchmark/results_v0.2.0.csv: -------------------------------------------------------------------------------- 1 | ,test_case,implementation,algorithm,workers,t_best,t_median,speed-up (median) 2 | 0,flat_8_128MB,shell reference,md5,1,2.079,2.083,1.0 3 | 1,flat_8_128MB,dirhash_impl,md5,1,1.734,1.945,1.0709511568123393 4 | 2,flat_8_128MB,dirhash_impl,md5,2,0.999,1.183,1.760777683854607 5 | 3,flat_8_128MB,dirhash_impl,md5,4,0.711,0.728,2.8612637362637368 6 | 4,flat_8_128MB,dirhash_impl,md5,8,0.504,0.518,4.021235521235521 7 | 5,flat_1k_1MB,shell reference,md5,1,3.383,3.679,1.0 8 | 6,flat_1k_1MB,dirhash_impl,md5,1,1.846,1.921,1.9151483602290473 9 | 
7,flat_1k_1MB,dirhash_impl,md5,2,1.137,1.158,3.1770293609671847 10 | 8,flat_1k_1MB,dirhash_impl,md5,4,0.74,0.749,4.911882510013351 11 | 9,flat_1k_1MB,dirhash_impl,md5,8,0.53,0.534,6.889513108614231 12 | 10,flat_32k_32kB,shell reference,md5,1,13.827,18.213,1.0 13 | 11,flat_32k_32kB,dirhash_impl,md5,1,13.655,13.808,1.3190179606025494 14 | 12,flat_32k_32kB,dirhash_impl,md5,2,3.276,3.33,5.469369369369369 15 | 13,flat_32k_32kB,dirhash_impl,md5,4,2.409,2.421,7.522924411400249 16 | 14,flat_32k_32kB,dirhash_impl,md5,8,2.045,2.086,8.731064237775648 17 | 15,nested_1k_1MB,shell reference,md5,1,3.284,3.332,1.0 18 | 16,nested_1k_1MB,dirhash_impl,md5,1,1.717,1.725,1.9315942028985504 19 | 17,nested_1k_1MB,dirhash_impl,md5,2,1.026,1.034,3.222437137330754 20 | 18,nested_1k_1MB,dirhash_impl,md5,4,0.622,0.633,5.263823064770932 21 | 19,nested_1k_1MB,dirhash_impl,md5,8,0.522,0.529,6.29867674858223 22 | 20,nested_32k_32kB,shell reference,md5,1,11.898,12.125,1.0 23 | 21,nested_32k_32kB,dirhash_impl,md5,1,13.858,14.146,0.8571327583769263 24 | 22,nested_32k_32kB,dirhash_impl,md5,2,2.781,2.987,4.059256779377302 25 | 23,nested_32k_32kB,dirhash_impl,md5,4,1.894,1.92,6.315104166666667 26 | 24,nested_32k_32kB,dirhash_impl,md5,8,1.55,1.568,7.732780612244897 27 | 25,flat_8_128MB,shell reference,sha1,1,2.042,2.05,1.0 28 | 26,flat_8_128MB,dirhash_impl,sha1,1,1.338,1.354,1.5140324963072376 29 | 27,flat_8_128MB,dirhash_impl,sha1,2,0.79,0.794,2.5818639798488663 30 | 28,flat_8_128MB,dirhash_impl,sha1,4,0.583,0.593,3.456998313659359 31 | 29,flat_8_128MB,dirhash_impl,sha1,8,0.483,0.487,4.209445585215605 32 | 30,flat_1k_1MB,shell reference,sha1,1,2.118,2.129,1.0 33 | 31,flat_1k_1MB,dirhash_impl,sha1,1,1.39,1.531,1.3905943827563685 34 | 32,flat_1k_1MB,dirhash_impl,sha1,2,0.925,0.932,2.2843347639484977 35 | 33,flat_1k_1MB,dirhash_impl,sha1,4,0.614,0.629,3.384737678855326 36 | 34,flat_1k_1MB,dirhash_impl,sha1,8,0.511,0.52,4.094230769230769 37 | 35,flat_32k_32kB,shell reference,sha1,1,10.551,10.97,1.0 38 | 36,flat_32k_32kB,dirhash_impl,sha1,1,4.663,4.76,2.304621848739496 39 | 37,flat_32k_32kB,dirhash_impl,sha1,2,3.108,3.235,3.3910355486862445 40 | 38,flat_32k_32kB,dirhash_impl,sha1,4,2.342,2.361,4.6463362981787375 41 | 39,flat_32k_32kB,dirhash_impl,sha1,8,2.071,2.094,5.2387774594078325 42 | 40,nested_1k_1MB,shell reference,sha1,1,2.11,2.159,1.0 43 | 41,nested_1k_1MB,dirhash_impl,sha1,1,1.436,1.47,1.4687074829931972 44 | 42,nested_1k_1MB,dirhash_impl,sha1,2,0.925,0.937,2.3041622198505864 45 | 43,nested_1k_1MB,dirhash_impl,sha1,4,0.627,0.643,3.357698289269051 46 | 44,nested_1k_1MB,dirhash_impl,sha1,8,0.516,0.527,4.096774193548386 47 | 45,nested_32k_32kB,shell reference,sha1,1,3.982,7.147,1.0 48 | 46,nested_32k_32kB,dirhash_impl,sha1,1,4.114,4.156,1.7196823869104911 49 | 47,nested_32k_32kB,dirhash_impl,sha1,2,2.598,2.616,2.7320336391437308 50 | 48,nested_32k_32kB,dirhash_impl,sha1,4,1.809,1.831,3.9033315128345167 51 | 49,nested_32k_32kB,dirhash_impl,sha1,8,1.552,1.58,4.523417721518987 52 | -------------------------------------------------------------------------------- /benchmark/results_v0.2.0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_case": "flat_8_128MB", 4 | "implementation": "shell reference", 5 | "algorithm": "md5", 6 | "workers": 1, 7 | "t_best": 2.079, 8 | "t_median": 2.083 9 | }, 10 | { 11 | "test_case": "flat_8_128MB", 12 | "implementation": "dirhash", 13 | "algorithm": "md5", 14 | "workers": 1, 15 | "t_best": 1.734, 16 | "t_median": 1.945 17 | }, 18 | { 19 | 
"test_case": "flat_8_128MB", 20 | "implementation": "dirhash", 21 | "algorithm": "md5", 22 | "workers": 2, 23 | "t_best": 0.999, 24 | "t_median": 1.183 25 | }, 26 | { 27 | "test_case": "flat_8_128MB", 28 | "implementation": "dirhash", 29 | "algorithm": "md5", 30 | "workers": 4, 31 | "t_best": 0.711, 32 | "t_median": 0.728 33 | }, 34 | { 35 | "test_case": "flat_8_128MB", 36 | "implementation": "dirhash", 37 | "algorithm": "md5", 38 | "workers": 8, 39 | "t_best": 0.504, 40 | "t_median": 0.518 41 | }, 42 | { 43 | "test_case": "flat_1k_1MB", 44 | "implementation": "shell reference", 45 | "algorithm": "md5", 46 | "workers": 1, 47 | "t_best": 3.383, 48 | "t_median": 3.679 49 | }, 50 | { 51 | "test_case": "flat_1k_1MB", 52 | "implementation": "dirhash", 53 | "algorithm": "md5", 54 | "workers": 1, 55 | "t_best": 1.846, 56 | "t_median": 1.921 57 | }, 58 | { 59 | "test_case": "flat_1k_1MB", 60 | "implementation": "dirhash", 61 | "algorithm": "md5", 62 | "workers": 2, 63 | "t_best": 1.137, 64 | "t_median": 1.158 65 | }, 66 | { 67 | "test_case": "flat_1k_1MB", 68 | "implementation": "dirhash", 69 | "algorithm": "md5", 70 | "workers": 4, 71 | "t_best": 0.74, 72 | "t_median": 0.749 73 | }, 74 | { 75 | "test_case": "flat_1k_1MB", 76 | "implementation": "dirhash", 77 | "algorithm": "md5", 78 | "workers": 8, 79 | "t_best": 0.53, 80 | "t_median": 0.534 81 | }, 82 | { 83 | "test_case": "flat_32k_32kB", 84 | "implementation": "shell reference", 85 | "algorithm": "md5", 86 | "workers": 1, 87 | "t_best": 13.827, 88 | "t_median": 18.213 89 | }, 90 | { 91 | "test_case": "flat_32k_32kB", 92 | "implementation": "dirhash", 93 | "algorithm": "md5", 94 | "workers": 1, 95 | "t_best": 13.655, 96 | "t_median": 13.808 97 | }, 98 | { 99 | "test_case": "flat_32k_32kB", 100 | "implementation": "dirhash", 101 | "algorithm": "md5", 102 | "workers": 2, 103 | "t_best": 3.276, 104 | "t_median": 3.33 105 | }, 106 | { 107 | "test_case": "flat_32k_32kB", 108 | "implementation": "dirhash", 109 | "algorithm": "md5", 110 | "workers": 4, 111 | "t_best": 2.409, 112 | "t_median": 2.421 113 | }, 114 | { 115 | "test_case": "flat_32k_32kB", 116 | "implementation": "dirhash", 117 | "algorithm": "md5", 118 | "workers": 8, 119 | "t_best": 2.045, 120 | "t_median": 2.086 121 | }, 122 | { 123 | "test_case": "nested_1k_1MB", 124 | "implementation": "shell reference", 125 | "algorithm": "md5", 126 | "workers": 1, 127 | "t_best": 3.284, 128 | "t_median": 3.332 129 | }, 130 | { 131 | "test_case": "nested_1k_1MB", 132 | "implementation": "dirhash", 133 | "algorithm": "md5", 134 | "workers": 1, 135 | "t_best": 1.717, 136 | "t_median": 1.725 137 | }, 138 | { 139 | "test_case": "nested_1k_1MB", 140 | "implementation": "dirhash", 141 | "algorithm": "md5", 142 | "workers": 2, 143 | "t_best": 1.026, 144 | "t_median": 1.034 145 | }, 146 | { 147 | "test_case": "nested_1k_1MB", 148 | "implementation": "dirhash", 149 | "algorithm": "md5", 150 | "workers": 4, 151 | "t_best": 0.622, 152 | "t_median": 0.633 153 | }, 154 | { 155 | "test_case": "nested_1k_1MB", 156 | "implementation": "dirhash", 157 | "algorithm": "md5", 158 | "workers": 8, 159 | "t_best": 0.522, 160 | "t_median": 0.529 161 | }, 162 | { 163 | "test_case": "nested_32k_32kB", 164 | "implementation": "shell reference", 165 | "algorithm": "md5", 166 | "workers": 1, 167 | "t_best": 11.898, 168 | "t_median": 12.125 169 | }, 170 | { 171 | "test_case": "nested_32k_32kB", 172 | "implementation": "dirhash", 173 | "algorithm": "md5", 174 | "workers": 1, 175 | "t_best": 13.858, 176 | "t_median": 14.146 177 | }, 
178 | { 179 | "test_case": "nested_32k_32kB", 180 | "implementation": "dirhash", 181 | "algorithm": "md5", 182 | "workers": 2, 183 | "t_best": 2.781, 184 | "t_median": 2.987 185 | }, 186 | { 187 | "test_case": "nested_32k_32kB", 188 | "implementation": "dirhash", 189 | "algorithm": "md5", 190 | "workers": 4, 191 | "t_best": 1.894, 192 | "t_median": 1.92 193 | }, 194 | { 195 | "test_case": "nested_32k_32kB", 196 | "implementation": "dirhash", 197 | "algorithm": "md5", 198 | "workers": 8, 199 | "t_best": 1.55, 200 | "t_median": 1.568 201 | }, 202 | { 203 | "test_case": "flat_8_128MB", 204 | "implementation": "shell reference", 205 | "algorithm": "sha1", 206 | "workers": 1, 207 | "t_best": 2.042, 208 | "t_median": 2.05 209 | }, 210 | { 211 | "test_case": "flat_8_128MB", 212 | "implementation": "dirhash", 213 | "algorithm": "sha1", 214 | "workers": 1, 215 | "t_best": 1.338, 216 | "t_median": 1.354 217 | }, 218 | { 219 | "test_case": "flat_8_128MB", 220 | "implementation": "dirhash", 221 | "algorithm": "sha1", 222 | "workers": 2, 223 | "t_best": 0.79, 224 | "t_median": 0.794 225 | }, 226 | { 227 | "test_case": "flat_8_128MB", 228 | "implementation": "dirhash", 229 | "algorithm": "sha1", 230 | "workers": 4, 231 | "t_best": 0.583, 232 | "t_median": 0.593 233 | }, 234 | { 235 | "test_case": "flat_8_128MB", 236 | "implementation": "dirhash", 237 | "algorithm": "sha1", 238 | "workers": 8, 239 | "t_best": 0.483, 240 | "t_median": 0.487 241 | }, 242 | { 243 | "test_case": "flat_1k_1MB", 244 | "implementation": "shell reference", 245 | "algorithm": "sha1", 246 | "workers": 1, 247 | "t_best": 2.118, 248 | "t_median": 2.129 249 | }, 250 | { 251 | "test_case": "flat_1k_1MB", 252 | "implementation": "dirhash", 253 | "algorithm": "sha1", 254 | "workers": 1, 255 | "t_best": 1.39, 256 | "t_median": 1.531 257 | }, 258 | { 259 | "test_case": "flat_1k_1MB", 260 | "implementation": "dirhash", 261 | "algorithm": "sha1", 262 | "workers": 2, 263 | "t_best": 0.925, 264 | "t_median": 0.932 265 | }, 266 | { 267 | "test_case": "flat_1k_1MB", 268 | "implementation": "dirhash", 269 | "algorithm": "sha1", 270 | "workers": 4, 271 | "t_best": 0.614, 272 | "t_median": 0.629 273 | }, 274 | { 275 | "test_case": "flat_1k_1MB", 276 | "implementation": "dirhash", 277 | "algorithm": "sha1", 278 | "workers": 8, 279 | "t_best": 0.511, 280 | "t_median": 0.52 281 | }, 282 | { 283 | "test_case": "flat_32k_32kB", 284 | "implementation": "shell reference", 285 | "algorithm": "sha1", 286 | "workers": 1, 287 | "t_best": 10.551, 288 | "t_median": 10.97 289 | }, 290 | { 291 | "test_case": "flat_32k_32kB", 292 | "implementation": "dirhash", 293 | "algorithm": "sha1", 294 | "workers": 1, 295 | "t_best": 4.663, 296 | "t_median": 4.76 297 | }, 298 | { 299 | "test_case": "flat_32k_32kB", 300 | "implementation": "dirhash", 301 | "algorithm": "sha1", 302 | "workers": 2, 303 | "t_best": 3.108, 304 | "t_median": 3.235 305 | }, 306 | { 307 | "test_case": "flat_32k_32kB", 308 | "implementation": "dirhash", 309 | "algorithm": "sha1", 310 | "workers": 4, 311 | "t_best": 2.342, 312 | "t_median": 2.361 313 | }, 314 | { 315 | "test_case": "flat_32k_32kB", 316 | "implementation": "dirhash", 317 | "algorithm": "sha1", 318 | "workers": 8, 319 | "t_best": 2.071, 320 | "t_median": 2.094 321 | }, 322 | { 323 | "test_case": "nested_1k_1MB", 324 | "implementation": "shell reference", 325 | "algorithm": "sha1", 326 | "workers": 1, 327 | "t_best": 2.11, 328 | "t_median": 2.159 329 | }, 330 | { 331 | "test_case": "nested_1k_1MB", 332 | "implementation": "dirhash", 333 
| "algorithm": "sha1", 334 | "workers": 1, 335 | "t_best": 1.436, 336 | "t_median": 1.47 337 | }, 338 | { 339 | "test_case": "nested_1k_1MB", 340 | "implementation": "dirhash", 341 | "algorithm": "sha1", 342 | "workers": 2, 343 | "t_best": 0.925, 344 | "t_median": 0.937 345 | }, 346 | { 347 | "test_case": "nested_1k_1MB", 348 | "implementation": "dirhash", 349 | "algorithm": "sha1", 350 | "workers": 4, 351 | "t_best": 0.627, 352 | "t_median": 0.643 353 | }, 354 | { 355 | "test_case": "nested_1k_1MB", 356 | "implementation": "dirhash", 357 | "algorithm": "sha1", 358 | "workers": 8, 359 | "t_best": 0.516, 360 | "t_median": 0.527 361 | }, 362 | { 363 | "test_case": "nested_32k_32kB", 364 | "implementation": "shell reference", 365 | "algorithm": "sha1", 366 | "workers": 1, 367 | "t_best": 3.982, 368 | "t_median": 7.147 369 | }, 370 | { 371 | "test_case": "nested_32k_32kB", 372 | "implementation": "dirhash", 373 | "algorithm": "sha1", 374 | "workers": 1, 375 | "t_best": 4.114, 376 | "t_median": 4.156 377 | }, 378 | { 379 | "test_case": "nested_32k_32kB", 380 | "implementation": "dirhash", 381 | "algorithm": "sha1", 382 | "workers": 2, 383 | "t_best": 2.598, 384 | "t_median": 2.616 385 | }, 386 | { 387 | "test_case": "nested_32k_32kB", 388 | "implementation": "dirhash", 389 | "algorithm": "sha1", 390 | "workers": 4, 391 | "t_best": 1.809, 392 | "t_median": 1.831 393 | }, 394 | { 395 | "test_case": "nested_32k_32kB", 396 | "implementation": "dirhash", 397 | "algorithm": "sha1", 398 | "workers": 8, 399 | "t_best": 1.552, 400 | "t_median": 1.58 401 | } 402 | ] 403 | -------------------------------------------------------------------------------- /benchmark/run.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | from statistics import median 5 | 6 | from dirhash import __version__ 7 | 8 | BENCHMARK_ROOT = os.path.abspath(os.path.join(__file__, os.pardir)) 9 | 10 | TEST_CASES = { 11 | "flat_8_128MB": {"depth": 0, "num_files": 2**3, "file_size": 2**27}, 12 | "flat_1k_1MB": {"depth": 0, "num_files": 2**10, "file_size": 2**20}, 13 | "flat_32k_32kB": {"depth": 0, "num_files": 2**15, "file_size": 2**15}, 14 | "nested_1k_1MB": {"depth": 8, "num_files": 2**10, "file_size": 2**20}, 15 | "nested_32k_32kB": {"depth": 8, "num_files": 2**15, "file_size": 2**15}, 16 | } 17 | 18 | 19 | def int_chunks(x, n): 20 | base = x // n 21 | remain = x % n 22 | chunks = [base] * n 23 | for i in range(remain): 24 | chunks[i] += 1 25 | 26 | return chunks 27 | 28 | 29 | def write_file_tree(dirpath, depth, num_files, file_size, branch_factor=2): 30 | assert num_files >= branch_factor**depth 31 | os.mkdir(dirpath) 32 | if depth == 0: 33 | fill = len(str(num_files)) 34 | for i in range(num_files): 35 | filepath = os.path.join(dirpath, "f_" + str(i).rjust(fill, "0")) 36 | with open(filepath, "wb") as f: 37 | f.write(os.urandom(file_size)) 38 | else: 39 | fill = len(str(branch_factor)) 40 | for i, num_files_branch in enumerate(int_chunks(num_files, branch_factor)): 41 | dirpath_branch = os.path.join(dirpath, "d_" + str(i).rjust(fill, "0")) 42 | write_file_tree( 43 | dirpath_branch, depth - 1, num_files_branch, file_size, branch_factor 44 | ) 45 | 46 | 47 | def require_test_cases(): 48 | test_cases_root = os.path.join(BENCHMARK_ROOT, "test_cases") 49 | if not os.path.exists(test_cases_root): 50 | os.mkdir(test_cases_root) 51 | test_case_paths = [] 52 | for name, kwargs in TEST_CASES.items(): 53 | test_case_path = os.path.join(test_cases_root, 
name) 54 | if not os.path.exists(test_case_path): 55 | print(f"creating test case: {name}: {kwargs}") 56 | write_file_tree(test_case_path, **kwargs) 57 | test_case_paths.append(test_case_path) 58 | 59 | return test_case_paths 60 | 61 | 62 | def time_shell(cmd, runs=1, repetitions=1, setup=None): 63 | time_cmd = f"time for i in {{1..{repetitions}}}; do {cmd}; done" 64 | if setup is not None: 65 | time_cmd = f"{setup}; {time_cmd}" 66 | 67 | realtimes = [] 68 | for _run in range(runs): 69 | process = subprocess.run( 70 | time_cmd, capture_output=True, text=True, shell=True, check=True 71 | ) 72 | output_lines = process.stderr.split("\n") 73 | try: 74 | t_real, t_user, t_sys = output_lines[-4:-1] 75 | assert t_real.startswith("real") 76 | t_str = t_real.split("\t")[1] 77 | min_str, sec_str = t_str.split("m") 78 | sec = 60 * int(min_str) + float(sec_str[:-1]) 79 | sec_per_rep = sec / repetitions 80 | except Exception as exc: 81 | raise RuntimeError( 82 | f"Failed to parse `time` stderr output: {process.stderr}" 83 | ) from exc 84 | realtimes.append(sec_per_rep) 85 | 86 | return realtimes 87 | 88 | 89 | def get_reference_shell_cmd(dirpath, algorithm): 90 | if algorithm == "md5": 91 | pass 92 | elif algorithm.startswith("sha"): 93 | version = int(algorithm[3:]) 94 | algorithm = f"shasum -a {version}" 95 | else: 96 | raise ValueError("only md5 and sha supported") 97 | 98 | return ( 99 | f"find {dirpath} -type f -print0 | sort -z | xargs -0 {algorithm} | {algorithm}" 100 | ) 101 | 102 | 103 | def get_dirhash_shell_cmd(dirpath, algorithm, workers=1): 104 | return f"dirhash {dirpath} -a {algorithm} -j {workers}" 105 | 106 | 107 | def benchmark(dirpath, algorithm, **kwargs): 108 | test_case = os.path.basename(dirpath) 109 | result = [] 110 | 111 | cmd = get_reference_shell_cmd(dirpath, algorithm) 112 | realtimes = time_shell(cmd=cmd, **kwargs) 113 | res = { 114 | "test_case": test_case, 115 | "implementation": "shell reference", 116 | "algorithm": algorithm, 117 | "workers": 1, 118 | "t_best": min(realtimes), 119 | "t_median": median(realtimes), 120 | } 121 | print(res) 122 | print(realtimes) 123 | result.append(res) 124 | 125 | for workers in [1, 2, 4, 8]: 126 | cmd = get_dirhash_shell_cmd(dirpath, algorithm, workers) 127 | realtimes = time_shell(cmd=cmd, **kwargs) 128 | res = { 129 | "test_case": test_case, 130 | "implementation": "dirhash", 131 | "algorithm": algorithm, 132 | "workers": workers, 133 | "t_best": min(realtimes), 134 | "t_median": median(realtimes), 135 | } 136 | print(res) 137 | print(realtimes) 138 | result.append(res) 139 | 140 | return result 141 | 142 | 143 | if __name__ == "__main__": 144 | test_cases = require_test_cases() 145 | results = [] 146 | for alg in ["md5", "sha1"]: 147 | for test_case in test_cases: 148 | result = benchmark(test_case, algorithm=alg, runs=5, repetitions=1) 149 | results.extend(result) 150 | 151 | result_fname = f"results_v{__version__}" 152 | 153 | with open(os.path.join(BENCHMARK_ROOT, result_fname + ".json"), "w") as f: 154 | json.dump(results, f, indent=2) 155 | 156 | try: 157 | import pandas as pd 158 | 159 | df = pd.DataFrame(results) 160 | df = df[ 161 | [ 162 | "test_case", 163 | "implementation", 164 | "algorithm", 165 | "workers", 166 | "t_best", 167 | "t_median", 168 | ] 169 | ] 170 | for (_tc, _alg), subdf in df.groupby(["test_case", "algorithm"]): 171 | t_ref = subdf[subdf["implementation"] == "shell reference"][ 172 | "t_median" 173 | ].values[0] 174 | speed_up = t_ref / subdf["t_median"] 175 | df.loc[speed_up.index, "speed-up (median)"] = 
speed_up 176 | print(df) 177 | df_hd = df[df["implementation"] == "dirhash"] 178 | df_hd_1w = df_hd[df_hd["workers"] == 1] 179 | df_hd_8w = df_hd[df_hd["workers"] == 8] 180 | mean_speedup_1w = df_hd_1w.mean()["speed-up (median)"] 181 | mean_speedup_8w = df_hd_8w.mean()["speed-up (median)"] 182 | print(f"\nAverage speedup (single process): {mean_speedup_1w}") 183 | print(df_hd_1w) 184 | print(f"\nAverage speedup multiprocess (8 workers): {mean_speedup_8w}") 185 | print(df_hd_8w) 186 | df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + ".csv")) 187 | except ImportError: 188 | pass 189 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 100% # the required coverage value 6 | threshold: 5% # the leniency in hitting the target 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "versioneer==0.29"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | target-version = "py38" 7 | 8 | [tool.ruff.lint] 9 | select = [ 10 | "E", # pycodestyle errors 11 | "W", # pycodestyle warnings 12 | "F", # pyflakes 13 | "I", # isort 14 | "B", # flake8-bugbear 15 | "C4", # flake8-comprehensions 16 | "UP", # pyupgrade 17 | ] 18 | 19 | [tool.ruff.lint.isort] 20 | known-local-folder = ["dirhash"] 21 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [versioneer] 2 | VCS = git 3 | style = pep440 4 | versionfile_source = src/dirhash/_version.py 5 | versionfile_build = dirhash/_version.py 6 | tag_prefix = v 7 | parentdir_prefix = dirhash- 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import versioneer 4 | from setuptools import find_packages, setup 5 | 6 | PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) 7 | 8 | DESCRIPTION = "Python module and CLI for hashing of file system directories." 
9 | 10 | try: 11 | with open(os.path.join(PROJECT_ROOT, "README.md"), encoding="utf-8") as f: 12 | long_description = "\n" + f.read() 13 | except OSError: 14 | long_description = DESCRIPTION 15 | 16 | setup( 17 | name="dirhash", 18 | version=versioneer.get_version(), 19 | cmdclass=versioneer.get_cmdclass(), 20 | description=DESCRIPTION, 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/andhus/dirhash-python", 24 | author="Anders Huss", 25 | author_email="andhus@kth.se", 26 | license="MIT", 27 | python_requires=">=3.8", 28 | install_requires=["scantree>=0.0.4"], 29 | packages=find_packages("src"), 30 | package_dir={"": "src"}, 31 | include_package_data=True, 32 | entry_points={ 33 | "console_scripts": ["dirhash=dirhash.cli:main"], 34 | }, 35 | tests_require=["pre-commit", "pytest", "pytest-cov"], 36 | ) 37 | -------------------------------------------------------------------------------- /src/dirhash/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """dirhash - a python library (and CLI) for hashing of file system directories.""" 3 | 4 | import hashlib 5 | import os 6 | from functools import partial 7 | from multiprocessing import Pool 8 | 9 | from scantree import CyclicLinkedDir, RecursionFilter, scantree 10 | 11 | from . import _version 12 | 13 | __version__ = _version.get_versions()["version"] 14 | 15 | __all__ = [ 16 | "__version__", 17 | "algorithms_guaranteed", 18 | "algorithms_available", 19 | "dirhash", 20 | "dirhash_impl", 21 | "included_paths", 22 | "Filter", 23 | "get_match_patterns", 24 | "Protocol", 25 | ] 26 | 27 | 28 | algorithms_guaranteed = {"md5", "sha1", "sha224", "sha256", "sha384", "sha512"} 29 | algorithms_available = hashlib.algorithms_available 30 | 31 | 32 | def dirhash( 33 | directory, 34 | algorithm, 35 | match=("*",), 36 | ignore=None, 37 | linked_dirs=True, 38 | linked_files=True, 39 | empty_dirs=False, 40 | entry_properties=("name", "data"), 41 | allow_cyclic_links=False, 42 | chunk_size=2**20, 43 | jobs=1, 44 | ): 45 | """Computes the hash of a directory based on its structure and content. 46 | 47 | # Arguments 48 | directory: Union[str, pathlib.Path] - Path to the directory to hash. 49 | algorithm: str - The name of the hashing algorithm to use. See 50 | `dirhash.algorithms_available` for the available options. 51 | match: Iterable[str] - An iterable of glob/wildcard match-patterns for paths 52 | to include when computing the hash. Default is ["*"] which means that all 53 | files and directories are matched. To e.g. only include python source 54 | files, use: `match=["*.py"]`. See "Path Selection and Filtering" section 55 | below for further details. 56 | ignore: Optional[Iterable[str]] - An iterable of glob/wildcard match-patterns 57 | for paths to ignore when computing the hash. Default `None` (no ignore 58 | patterns). To e.g. exclude hidden files and directories use: 59 | `ignore=[".*/", ".*"]`. See "Path Selection and Filtering" section below 60 | for further details. 61 | linked_dirs: bool - If `True` (default), follow symbolic links to other 62 | *directories* and include these and their content in the hash 63 | computation. 64 | linked_files: bool - If `True` (default), include symbolic linked files in 65 | the hash computation. 66 | empty_dirs: bool - If `True`, include empty directories when computing the 67 | hash. 
A directory is considered empty if it does not contain any files 68 | that *match the provided matching criteria*. Default `False`, i.e. empty 69 | directories are ignored (as is done in git version control). 70 | entry_properties: Iterable[str] - A set (i.e. order does not matter) of the 71 | file/directory properties to consider when computing the hash. Supported 72 | properties are {"name", "data", "is_link"} where at least one of 73 | "name" and "data" must be included. Default is ["name", "data"] which 74 | means that the content (actual data) as well as the path relative to the 75 | root `directory` of files will affect the hash value. See "Entry 76 | Properties Interpretation" section below for further details. 77 | allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is 78 | raised on presence of cyclic symbolic links. If set to `True` the 79 | dirhash value for the directory causing the cyclic link is replaced with the 80 | hash function hexdigest of the relative path from the link to the target. 81 | chunk_size: int - The number of bytes to read in one go from files while 82 | being hashed. A too small size will slow down the processing and a larger 83 | size consumes more working memory. Default 2**20 bytes = 1 MiB. 84 | jobs: int - The number of processes to use when computing the hash. 85 | Default `1`, which means that a single (the main) process is used. NOTE 86 | that using multiprocessing can significantly speed up execution, see 87 | `https://github.com/andhus/dirhash-python/benchmark` for further 88 | details. 89 | 90 | # Returns 91 | str - The hash/checksum as a string of the hexadecimal digits (the result of 92 | `hexdigest` method of the hashlib._hashlib.HASH object corresponding to the 93 | provided `algorithm`). 94 | 95 | # Raises 96 | TypeError/ValueError: For incorrectly provided arguments. 97 | SymlinkRecursionError: In case the `directory` contains symbolic links that 98 | lead to (infinite) recursion and `allow_cyclic_links=False` (default). 99 | 100 | # Path Selection and Filtering 101 | Provided glob/wildcard (".gitignore style") match-patterns determine what 102 | paths within the `directory` to include when computing the hash value. Paths 103 | *relative to the root `directory`* (i.e. excluding the name of the root 104 | directory itself) are matched against the patterns. 105 | The `match` argument represents what should be *included* - as opposed 106 | to the `ignore` argument for which matches are *excluded*. Using `ignore` is 107 | just short for adding the same patterns to the `match` argument with the 108 | prefix "!", i.e. the calls below are equivalent: 109 | `dirhash(..., match=["*", "!<pattern>"])` 110 | `dirhash(..., ignore=["<pattern>"])` 111 | To validate which paths are included, call `dirhash.included_paths` with 112 | the same values for the arguments: `match`, `ignore`, `linked_dirs`, 113 | `linked_files` and `empty_dirs` to get a list of all paths that will be 114 | included when computing the hash by this function. 115 | 116 | # Entry Properties Interpretation 117 | - ["name", "data"] (Default) - The name as well as data is included. Due to 118 | the recursive nature of the dirhash computation, "name" implies that the 119 | path relative to the root `directory` of each file/directory affects the 120 | computed hash value. 121 | - ["data"] - Compute the hash only based on the data of files - 122 | *not* their names or the names of their parent directories.
NOTE that 123 | the tree structure in which files are organized under the `directory` 124 | root still influences the computed hash. As long as all files have 125 | the same content and are organised the same way in relation to all 126 | other files in the Directed Acyclic Graph representing the file-tree, 127 | the hash will remain the same (but the "name of nodes" does not 128 | matter). This option can e.g. be used to verify that data is 129 | unchanged after renaming files (change extensions etc.). 130 | - ["name"] - Compute the hash only based on the name and location of 131 | files in the file tree under the `directory` root. This option can 132 | e.g. be used to check if any files have been added/moved/removed, 133 | ignoring the content of each file. 134 | - "is_link" - if this option is added to any of the cases above the 135 | hash value is also affected by whether a file or directory is a 136 | symbolic link or not. NOTE: with this property added, the hash 137 | will be different than without it even if there are no symbolic links 138 | in the directory. 139 | 140 | # References 141 | See https://github.com/andhus/dirhash/README.md for a formal 142 | description of how the returned hash value is computed. 143 | """ 144 | filter_ = Filter( 145 | match_patterns=get_match_patterns(match=match, ignore=ignore), 146 | linked_dirs=linked_dirs, 147 | linked_files=linked_files, 148 | empty_dirs=empty_dirs, 149 | ) 150 | protocol = Protocol( 151 | entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links 152 | ) 153 | return dirhash_impl( 154 | directory=directory, 155 | algorithm=algorithm, 156 | filter_=filter_, 157 | protocol=protocol, 158 | chunk_size=chunk_size, 159 | jobs=jobs, 160 | ) 161 | 162 | 163 | def dirhash_impl( 164 | directory, algorithm, filter_=None, protocol=None, chunk_size=2**20, jobs=1 165 | ): 166 | """Computes the hash of a directory based on its structure and content. 167 | 168 | In contrast to `dirhash.dirhash`, this function accepts custom implementations of 169 | the `dirhash.Filter` and `dirhash.Protocol` classes. 170 | 171 | # Arguments 172 | directory: Union[str, pathlib.Path] - Path to the directory to hash. 173 | algorithm: str - The name of the hashing algorithm to use. See 174 | `dirhash.algorithms_available` for the available options. 175 | It is also possible to provide a callable object that returns an instance 176 | implementing the `hashlib._hashlib.HASH` interface. 177 | filter_: dirhash.Filter - Determines what files and directories to include 178 | when computing the hash. See docs of `dirhash.Filter` for further 179 | details. 180 | protocol: dirhash.Protocol - Determines (mainly) what properties of files and 181 | directories to consider when computing the hash value. 182 | chunk_size: int - The number of bytes to read in one go from files while 183 | being hashed. A too small size will slow down the processing and a larger 184 | size consumes more working memory. Default 2**20 bytes = 1 MiB. 185 | jobs: int - The number of processes to use when computing the hash. 186 | Default `1`, which means that a single (the main) process is used. NOTE 187 | that using multiprocessing can significantly speed up execution, see 188 | `https://github.com/andhus/dirhash/tree/master/benchmark` for further 189 | details. 190 | 191 | # Returns 192 | str - The hash/checksum as a string of the hexadecimal digits (the result of 193 | `hexdigest` method of the hashlib._hashlib.HASH object corresponding to the 194 | provided `algorithm`). 
195 | 196 | # Raises 197 | TypeError/ValueError: For incorrectly provided arguments. 198 | SymlinkRecursionError: In case the `directory` contains symbolic links that 199 | lead to (infinite) recursion and the protocol option `allow_cyclic_links` 200 | is `False`. 201 | 202 | # References 203 | See https://github.com/andhus/dirhash/README.md for a formal 204 | description of how the returned hash value is computed. 205 | """ 206 | 207 | def get_instance(value, cls_, argname): 208 | if isinstance(value, cls_): 209 | return value 210 | if value is None: 211 | return cls_() 212 | raise TypeError(f"{argname} must be an instance of {cls_} or None") 213 | 214 | filter_ = get_instance(filter_, Filter, "filter_") 215 | protocol = get_instance(protocol, Protocol, "protocol") 216 | hasher_factory = _get_hasher_factory(algorithm) 217 | 218 | def dir_apply(dir_node): 219 | if not filter_.empty_dirs: 220 | if dir_node.path.relative == "" and dir_node.empty: 221 | # only check if root node is empty (other empty dirs are filter 222 | # before `dir_apply` with `filter_.empty_dirs=False`) 223 | raise ValueError(f"{directory}: Nothing to hash") 224 | descriptor = protocol.get_descriptor(dir_node) 225 | _dirhash = hasher_factory(descriptor.encode("utf-8")).hexdigest() 226 | 227 | return dir_node.path, _dirhash 228 | 229 | if jobs == 1: 230 | cache = {} 231 | 232 | def file_apply(path): 233 | return path, _get_filehash( 234 | path.real, hasher_factory, chunk_size=chunk_size, cache=cache 235 | ) 236 | 237 | _, dirhash_ = scantree( 238 | directory, 239 | recursion_filter=filter_, 240 | file_apply=file_apply, 241 | dir_apply=dir_apply, 242 | follow_links=True, 243 | allow_cyclic_links=protocol.allow_cyclic_links, 244 | cache_file_apply=False, 245 | include_empty=filter_.empty_dirs, 246 | jobs=1, 247 | ) 248 | else: # multiprocessing 249 | real_paths = set() 250 | 251 | def extract_real_paths(path): 252 | real_paths.add(path.real) 253 | return path 254 | 255 | root_node = scantree( 256 | directory, 257 | recursion_filter=filter_, 258 | file_apply=extract_real_paths, 259 | follow_links=True, 260 | allow_cyclic_links=protocol.allow_cyclic_links, 261 | cache_file_apply=False, 262 | include_empty=filter_.empty_dirs, 263 | jobs=1, 264 | ) 265 | real_paths = list(real_paths) 266 | # hash files in parallel 267 | file_hashes = _parmap( 268 | partial( 269 | _get_filehash, hasher_factory=hasher_factory, chunk_size=chunk_size 270 | ), 271 | real_paths, 272 | jobs=jobs, 273 | ) 274 | # prepare the mapping with precomputed file hashes 275 | real_path_to_hash = dict(zip(real_paths, file_hashes)) 276 | 277 | def file_apply(path): 278 | return path, real_path_to_hash[path.real] 279 | 280 | _, dirhash_ = root_node.apply(file_apply=file_apply, dir_apply=dir_apply) 281 | 282 | return dirhash_ 283 | 284 | 285 | def included_paths( 286 | directory, 287 | match=("*",), 288 | ignore=None, 289 | linked_dirs=True, 290 | linked_files=True, 291 | empty_dirs=False, 292 | allow_cyclic_links=False, 293 | ): 294 | """Inspect what paths are included for the corresponding arguments to the 295 | `dirhash.dirhash` function. 296 | 297 | # Arguments: 298 | This function accepts the following subset of the function `dirhash.dirhash` 299 | arguments: `directory`, `match`, `ignore`, `linked_dirs`, `linked_files`, 300 | `empty_dirs` and `allow_cyclic_links`, *with the same interpretation*. See 301 | docs of `dirhash.dirhash` for further details. 
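# Example
        An illustrative call (the file tree is hypothetical, not taken from this
        repository): for a `directory` containing only the files `d1/f1` and `f2`,
        calling `included_paths(directory)` with default arguments returns the
        sorted relative paths `['d1/f1', 'f2']` (with OS-native path separators);
        with `empty_dirs=True`, an included empty directory is listed with a
        trailing `/.` (e.g. `'empty/.'`).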
302 | 303 | # Returns 304 | List[str] - A sorted list of the paths that would be included when computing 305 | the hash of the `directory` using `dirhash.dirhash` and the same arguments. 306 | """ 307 | filter_ = Filter( 308 | match_patterns=get_match_patterns(match=match, ignore=ignore), 309 | linked_dirs=linked_dirs, 310 | linked_files=linked_files, 311 | empty_dirs=empty_dirs, 312 | ) 313 | protocol = Protocol(allow_cyclic_links=allow_cyclic_links) 314 | 315 | leafpaths = scantree( 316 | directory, 317 | recursion_filter=filter_, 318 | follow_links=True, 319 | allow_cyclic_links=protocol.allow_cyclic_links, 320 | include_empty=filter_.empty_dirs, 321 | ).leafpaths() 322 | 323 | return [ 324 | path.relative if path.is_file() else os.path.join(path.relative, ".") 325 | for path in leafpaths 326 | ] 327 | 328 | 329 | class Filter(RecursionFilter): 330 | """Specification of what files and directories to include for the `dirhash` 331 | computation. 332 | 333 | # Arguments 334 | match: Iterable[str] - An iterable of glob/wildcard (".gitignore style") 335 | match patterns for selection of which files and directories to include. 336 | Paths *relative to the root `directory`* (i.e. excluding the name of the 337 | root directory itself) are matched against the provided patterns. For 338 | example, to include all files except for hidden ones, use: 339 | `match=['*', '!.*']`. Default `None` which is equivalent to `['*']`, 340 | i.e. everything is included. 341 | linked_dirs: bool - If `True` (default), follow symbolic links to other 342 | *directories* and include these and their content in the hash 343 | computation. 344 | linked_files: bool - If `True` (default), include symbolic linked files in 345 | the hash computation. 346 | empty_dirs: bool - If `True`, include empty directories when computing the 347 | hash. A directory is considered empty if it does not contain any files 348 | that *match the provided matching criteria*. Default `False`, i.e. empty 349 | directories are ignored (as is done in git version control). 350 | """ 351 | 352 | def __init__( 353 | self, match_patterns=None, linked_dirs=True, linked_files=True, empty_dirs=False 354 | ): 355 | super().__init__( 356 | linked_dirs=linked_dirs, linked_files=linked_files, match=match_patterns 357 | ) 358 | self.empty_dirs = empty_dirs 359 | 360 | 361 | def get_match_patterns( 362 | match=None, 363 | ignore=None, 364 | ignore_extensions=None, 365 | ignore_hidden=False, 366 | ): 367 | """Helper to compose a list of glob/wildcard (".gitignore style") match 368 | patterns based on options dedicated to a few standard use cases. 369 | 370 | # Arguments 371 | match: Optional[List[str]] - A list of match-patterns for files to *include*. 372 | Default `None` which is equivalent to `['*']`, i.e. everything is 373 | included (unless excluded by arguments below). 374 | ignore: Optional[List[str]] - A list of match-patterns for files to 375 | *ignore*. Default `None` (no ignore patterns). 376 | ignore_extensions: Optional[List[str]] - A list of file extensions to 377 | ignore. Short for `ignore=['*.<ext>', ...]`. Default `None` (no 378 | extensions ignored). 379 | ignore_hidden: bool - If `True` ignore hidden files and directories. Short 380 | for `ignore=['.*', '.*/']`. Default `False`. 
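# Example
        An illustrative composition (the argument values are made up):
        `get_match_patterns(match=['*.py'], ignore_hidden=True)` returns
        `['*.py', '!.*', '!.*/']` - the ignore-style options are translated into
        negated ('!'-prefixed) match patterns and duplicates are removed.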
381 | """ 382 | match = ["*"] if match is None else list(match) 383 | ignore = [] if ignore is None else list(ignore) 384 | ignore_extensions = [] if ignore_extensions is None else list(ignore_extensions) 385 | 386 | if ignore_hidden: 387 | ignore.extend([".*", ".*/"]) 388 | 389 | for ext in ignore_extensions: 390 | if not ext.startswith("."): 391 | ext = "." + ext 392 | ext = "*" + ext 393 | ignore.append(ext) 394 | 395 | match_spec = match + ["!" + ign for ign in ignore] 396 | 397 | def deduplicate(items): 398 | items_set = set() 399 | dd_items = [] 400 | for item in items: 401 | if item not in items_set: 402 | dd_items.append(item) 403 | items_set.add(item) 404 | 405 | return dd_items 406 | 407 | return deduplicate(match_spec) 408 | 409 | 410 | class Protocol: 411 | """Specification of which file and directory properties to consider when 412 | computing the `dirhash` value. 413 | 414 | # Arguments 415 | entry_properties: Iterable[str] - A combination of the supported properties 416 | {"name", "data", "is_link"} where at least one of "name" and "data" is 417 | included. Interpretation: 418 | - ["name", "data"] (Default) - The name as well as data is included. Due 419 | to the recursive nature of the dirhash computation, "name" implies 420 | that the path relative to the root `directory` of each file/directory 421 | affects the computed hash value. 422 | - ["data"] - Compute the hash only based on the data of files - 423 | *not* their names or the names of their parent directories. NOTE that 424 | the tree structure in which files are organized under the `directory` 425 | root still influences the computed hash. As long as all files have 426 | the same content and are organised the same way in relation to all 427 | other files in the Directed Acyclic Graph representing the file-tree, 428 | the hash will remain the same (but the "name of nodes" does not 429 | matter). This option can e.g. be used to verify that data is 430 | unchanged after renaming files (change extensions etc.). 431 | - ["name"] - Compute the hash only based on the name and location of 432 | files in the file tree under the `directory` root. This option can 433 | e.g. be used to check if any files have been added/moved/removed, 434 | ignoring the content of each file. 435 | - "is_link" - if this option is added to any of the cases above the 436 | hash value is also affected by whether a file or directory is a 437 | symbolic link or not. NOTE: with this property added, the hash 438 | will be different than without it even if there are no symbolic links 439 | in the directory. 440 | allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is 441 | raised on presence of cyclic symbolic links. If set to `True` the 442 | dirhash value for the directory causing the cyclic link is replaced with the 443 | hash function hexdigest of the relative path from the link to the target. 
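# Descriptor format (illustrative)
        As a sketch of how these properties enter the hash (derived from the
        implementation below; the file name and hash value are made up): with the
        default properties ["name", "data"], a directory containing a single file
        `a.txt` whose data hash is `abc` gets the descriptor `data:abc\000name:a.txt` -
        each entry's sorted `property:value` strings are joined by a null character,
        the entries by a double null character, and the resulting descriptor string
        is what gets hashed for the directory.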
444 | """ 445 | 446 | class EntryProperties: 447 | NAME = "name" 448 | DATA = "data" 449 | IS_LINK = "is_link" 450 | options = {NAME, DATA, IS_LINK} 451 | _DIRHASH = "dirhash" 452 | 453 | _entry_property_separator = "\000" 454 | _entry_descriptor_separator = "\000\000" 455 | 456 | def __init__(self, entry_properties=("name", "data"), allow_cyclic_links=False): 457 | entry_properties = set(entry_properties) 458 | if not entry_properties.issubset(self.EntryProperties.options): 459 | raise ValueError( 460 | f"entry properties {entry_properties - self.EntryProperties.options} " 461 | "not supported" 462 | ) 463 | if not ( 464 | self.EntryProperties.NAME in entry_properties 465 | or self.EntryProperties.DATA in entry_properties 466 | ): 467 | raise ValueError( 468 | "at least one of entry properties `name` and `data` must be used" 469 | ) 470 | self.entry_properties = entry_properties 471 | self._include_name = self.EntryProperties.NAME in entry_properties 472 | self._include_data = self.EntryProperties.DATA in entry_properties 473 | self._include_is_link = self.EntryProperties.IS_LINK in entry_properties 474 | 475 | if not isinstance(allow_cyclic_links, bool): 476 | raise ValueError( 477 | f"allow_cyclic_link must be a boolean, got {allow_cyclic_links}" 478 | ) 479 | self.allow_cyclic_links = allow_cyclic_links 480 | 481 | def get_descriptor(self, dir_node): 482 | if isinstance(dir_node, CyclicLinkedDir): 483 | return self._get_cyclic_linked_dir_descriptor(dir_node) 484 | 485 | entries = dir_node.directories + dir_node.files 486 | entry_descriptors = [ 487 | self._get_entry_descriptor(self._get_entry_properties(path, entry_hash)) 488 | for path, entry_hash in entries 489 | ] 490 | return self._entry_descriptor_separator.join(sorted(entry_descriptors)) 491 | 492 | @classmethod 493 | def _get_entry_descriptor(cls, entry_properties): 494 | entry_strings = [f"{name}:{value}" for name, value in entry_properties] 495 | return cls._entry_property_separator.join(sorted(entry_strings)) 496 | 497 | def _get_entry_properties(self, path, entry_hash): 498 | properties = [] 499 | if path.is_dir(): 500 | properties.append((self.EntryProperties._DIRHASH, entry_hash)) 501 | elif self._include_data: # path is file 502 | properties.append((self.EntryProperties.DATA, entry_hash)) 503 | 504 | if self._include_name: 505 | properties.append((self.EntryProperties.NAME, path.name)) 506 | if self._include_is_link: 507 | properties.append((self.EntryProperties.IS_LINK, path.is_symlink)) 508 | 509 | return properties 510 | 511 | def _get_cyclic_linked_dir_descriptor(self, dir_node): 512 | relpath = dir_node.path.relative 513 | target_relpath = dir_node.target_path.relative 514 | path_to_target = os.path.relpath( 515 | # the extra '.' is needed if link back to root, because 516 | # an empty path ('') is not supported by os.path.relpath 517 | os.path.join(".", target_relpath), 518 | os.path.join(".", relpath), 519 | ) 520 | # TODO normalize posix! 521 | return path_to_target 522 | 523 | 524 | def _get_hasher_factory(algorithm): 525 | """Returns a "factory" of hasher instances corresponding to the given algorithm 526 | name. Bypasses input argument `algorithm` if it is already a hasher factory 527 | (verified by attempting calls to required methods). 
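For illustration: besides algorithm names like "md5" or "sha256", passing
    `hashlib.sha256` itself is accepted, as is any custom callable whose returned
    object provides `update()` and `hexdigest()` methods (this is what the bypass
    check below attempts to verify).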
528 | """ 529 | if algorithm in algorithms_guaranteed: 530 | return getattr(hashlib, algorithm) 531 | 532 | if algorithm in algorithms_available: 533 | return partial(hashlib.new, algorithm) 534 | 535 | try: # bypass algorithm if already a hasher factory 536 | hasher = algorithm(b"") 537 | hasher.update(b"") 538 | hasher.hexdigest() 539 | return algorithm 540 | except: # noqa: E722 541 | pass 542 | 543 | raise ValueError(f"`algorithm` must be one of: {algorithms_available}`") 544 | 545 | 546 | def _parmap(func, iterable, jobs=1): 547 | """Map with multiprocessing.Pool""" 548 | if jobs == 1: 549 | return [func(element) for element in iterable] 550 | 551 | pool = Pool(jobs) 552 | try: 553 | results = pool.map(func, iterable) 554 | finally: 555 | pool.close() 556 | 557 | return results 558 | 559 | 560 | def _get_filehash(filepath, hasher_factory, chunk_size, cache=None): 561 | """Compute the hash of the given filepath. 562 | 563 | # Arguments 564 | filepath: str - Path to the file to hash. 565 | hasher_factory: (f: f() -> hashlib._hashlib.HASH): Callable that returns an 566 | instance of the `hashlib._hashlib.HASH` interface. 567 | chunk_size (int): The number of bytes to read in one go from files while 568 | being hashed. 569 | cache ({str: str} | None): A mapping from `filepath` to hash (return value 570 | of this function). If not None, a lookup will be attempted before hashing 571 | the file and the result will be added after completion. 572 | 573 | # Returns 574 | The hash/checksum as a string the of hexadecimal digits. 575 | 576 | # Side-effects 577 | The `cache` is updated if not None. 578 | """ 579 | if cache is not None: 580 | filehash = cache.get(filepath, None) 581 | if filehash is None: 582 | filehash = _get_filehash(filepath, hasher_factory, chunk_size) 583 | cache[filepath] = filehash 584 | return filehash 585 | 586 | hasher = hasher_factory() 587 | with open(filepath, "rb") as f: 588 | for chunk in iter(lambda: f.read(chunk_size), b""): 589 | hasher.update(chunk) 590 | 591 | return hasher.hexdigest() 592 | -------------------------------------------------------------------------------- /src/dirhash/_version.py: -------------------------------------------------------------------------------- 1 | # This file helps to compute a version number in source trees obtained from 2 | # git-archive tarball (such as those provided by githubs download-from-tag 3 | # feature). Distribution tarballs (built by setup.py sdist) and build 4 | # directories (produced by setup.py build) will contain a much shorter file 5 | # that just contains the computed version number. 6 | 7 | # This file is released into the public domain. 8 | # Generated by versioneer-0.29 9 | # https://github.com/python-versioneer/python-versioneer 10 | 11 | # ruff: noqa 12 | 13 | """Git implementation of _version.py.""" 14 | 15 | import errno 16 | import functools 17 | import os 18 | import re 19 | import subprocess 20 | import sys 21 | from typing import Any, Callable, Dict, List, Optional, Tuple 22 | 23 | 24 | def get_keywords() -> Dict[str, str]: 25 | """Get the keywords needed to look up the version information.""" 26 | # these strings will be replaced by git during git-archive. 27 | # setup.py/versioneer.py will grep for the variable names, so they must 28 | # each be defined on a line of their own. _version.py will just call 29 | # get_keywords(). 
30 | git_refnames = " (HEAD -> master, tag: v0.5.0)" 31 | git_full = "1ead28a0ede6c8f039ab8b8107b71b011b3d435d" 32 | git_date = "2024-08-04 00:12:01 +0200" 33 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 34 | return keywords 35 | 36 | 37 | class VersioneerConfig: 38 | """Container for Versioneer configuration parameters.""" 39 | 40 | VCS: str 41 | style: str 42 | tag_prefix: str 43 | parentdir_prefix: str 44 | versionfile_source: str 45 | verbose: bool 46 | 47 | 48 | def get_config() -> VersioneerConfig: 49 | """Create, populate and return the VersioneerConfig() object.""" 50 | # these strings are filled in when 'setup.py versioneer' creates 51 | # _version.py 52 | cfg = VersioneerConfig() 53 | cfg.VCS = "git" 54 | cfg.style = "pep440" 55 | cfg.tag_prefix = "v" 56 | cfg.parentdir_prefix = "dirhash-" 57 | cfg.versionfile_source = "src/dirhash/_version.py" 58 | cfg.verbose = False 59 | return cfg 60 | 61 | 62 | class NotThisMethod(Exception): 63 | """Exception raised if a method is not valid for the current scenario.""" 64 | 65 | 66 | LONG_VERSION_PY: Dict[str, str] = {} 67 | HANDLERS: Dict[str, Dict[str, Callable]] = {} 68 | 69 | 70 | def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator 71 | """Create decorator to mark a method as the handler of a VCS.""" 72 | 73 | def decorate(f: Callable) -> Callable: 74 | """Store f in HANDLERS[vcs][method].""" 75 | if vcs not in HANDLERS: 76 | HANDLERS[vcs] = {} 77 | HANDLERS[vcs][method] = f 78 | return f 79 | 80 | return decorate 81 | 82 | 83 | def run_command( 84 | commands: List[str], 85 | args: List[str], 86 | cwd: Optional[str] = None, 87 | verbose: bool = False, 88 | hide_stderr: bool = False, 89 | env: Optional[Dict[str, str]] = None, 90 | ) -> Tuple[Optional[str], Optional[int]]: 91 | """Call the given command(s).""" 92 | assert isinstance(commands, list) 93 | process = None 94 | 95 | popen_kwargs: Dict[str, Any] = {} 96 | if sys.platform == "win32": 97 | # This hides the console window if pythonw.exe is used 98 | startupinfo = subprocess.STARTUPINFO() 99 | startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW 100 | popen_kwargs["startupinfo"] = startupinfo 101 | 102 | for command in commands: 103 | try: 104 | dispcmd = str([command] + args) 105 | # remember shell=False, so use git.cmd on windows, not just git 106 | process = subprocess.Popen( 107 | [command] + args, 108 | cwd=cwd, 109 | env=env, 110 | stdout=subprocess.PIPE, 111 | stderr=(subprocess.PIPE if hide_stderr else None), 112 | **popen_kwargs, 113 | ) 114 | break 115 | except OSError as e: 116 | if e.errno == errno.ENOENT: 117 | continue 118 | if verbose: 119 | print("unable to run %s" % dispcmd) 120 | print(e) 121 | return None, None 122 | else: 123 | if verbose: 124 | print(f"unable to find command, tried {commands}") 125 | return None, None 126 | stdout = process.communicate()[0].strip().decode() 127 | if process.returncode != 0: 128 | if verbose: 129 | print("unable to run %s (error)" % dispcmd) 130 | print("stdout was %s" % stdout) 131 | return None, process.returncode 132 | return stdout, process.returncode 133 | 134 | 135 | def versions_from_parentdir( 136 | parentdir_prefix: str, 137 | root: str, 138 | verbose: bool, 139 | ) -> Dict[str, Any]: 140 | """Try to determine the version from the parent directory name. 141 | 142 | Source tarballs conventionally unpack into a directory that includes both 143 | the project name and a version string. 
We will also support searching up 144 | two directory levels for an appropriately named parent directory 145 | """ 146 | rootdirs = [] 147 | 148 | for _ in range(3): 149 | dirname = os.path.basename(root) 150 | if dirname.startswith(parentdir_prefix): 151 | return { 152 | "version": dirname[len(parentdir_prefix) :], 153 | "full-revisionid": None, 154 | "dirty": False, 155 | "error": None, 156 | "date": None, 157 | } 158 | rootdirs.append(root) 159 | root = os.path.dirname(root) # up a level 160 | 161 | if verbose: 162 | print( 163 | "Tried directories %s but none started with prefix %s" 164 | % (str(rootdirs), parentdir_prefix) 165 | ) 166 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 167 | 168 | 169 | @register_vcs_handler("git", "get_keywords") 170 | def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: 171 | """Extract version information from the given file.""" 172 | # the code embedded in _version.py can just fetch the value of these 173 | # keywords. When used from setup.py, we don't want to import _version.py, 174 | # so we do it with a regexp instead. This function is not used from 175 | # _version.py. 176 | keywords: Dict[str, str] = {} 177 | try: 178 | with open(versionfile_abs) as fobj: 179 | for line in fobj: 180 | if line.strip().startswith("git_refnames ="): 181 | mo = re.search(r'=\s*"(.*)"', line) 182 | if mo: 183 | keywords["refnames"] = mo.group(1) 184 | if line.strip().startswith("git_full ="): 185 | mo = re.search(r'=\s*"(.*)"', line) 186 | if mo: 187 | keywords["full"] = mo.group(1) 188 | if line.strip().startswith("git_date ="): 189 | mo = re.search(r'=\s*"(.*)"', line) 190 | if mo: 191 | keywords["date"] = mo.group(1) 192 | except OSError: 193 | pass 194 | return keywords 195 | 196 | 197 | @register_vcs_handler("git", "keywords") 198 | def git_versions_from_keywords( 199 | keywords: Dict[str, str], 200 | tag_prefix: str, 201 | verbose: bool, 202 | ) -> Dict[str, Any]: 203 | """Get version information from git keywords.""" 204 | if "refnames" not in keywords: 205 | raise NotThisMethod("Short version file found") 206 | date = keywords.get("date") 207 | if date is not None: 208 | # Use only the last line. Previous lines may contain GPG signature 209 | # information. 210 | date = date.splitlines()[-1] 211 | 212 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 213 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 214 | # -like" string, which we must then edit to make compliant), because 215 | # it's been around since git-1.5.3, and it's too difficult to 216 | # discover which version we're using, or to work around using an 217 | # older one. 218 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 219 | refnames = keywords["refnames"].strip() 220 | if refnames.startswith("$Format"): 221 | if verbose: 222 | print("keywords are unexpanded, not using") 223 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 224 | refs = {r.strip() for r in refnames.strip("()").split(",")} 225 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 226 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 227 | TAG = "tag: " 228 | tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} 229 | if not tags: 230 | # Either we're using git < 1.8.3, or there really are no tags. We use 231 | # a heuristic: assume all version tags have a digit. 
The old git %d 232 | # expansion behaves like git log --decorate=short and strips out the 233 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 234 | # between branches and tags. By ignoring refnames without digits, we 235 | # filter out many common branch names like "release" and 236 | # "stabilization", as well as "HEAD" and "master". 237 | tags = {r for r in refs if re.search(r"\d", r)} 238 | if verbose: 239 | print("discarding '%s', no digits" % ",".join(refs - tags)) 240 | if verbose: 241 | print("likely tags: %s" % ",".join(sorted(tags))) 242 | for ref in sorted(tags): 243 | # sorting will prefer e.g. "2.0" over "2.0rc1" 244 | if ref.startswith(tag_prefix): 245 | r = ref[len(tag_prefix) :] 246 | # Filter out refs that exactly match prefix or that don't start 247 | # with a number once the prefix is stripped (mostly a concern 248 | # when prefix is '') 249 | if not re.match(r"\d", r): 250 | continue 251 | if verbose: 252 | print("picking %s" % r) 253 | return { 254 | "version": r, 255 | "full-revisionid": keywords["full"].strip(), 256 | "dirty": False, 257 | "error": None, 258 | "date": date, 259 | } 260 | # no suitable tags, so version is "0+unknown", but full hex is still there 261 | if verbose: 262 | print("no suitable tags, using unknown + full revision id") 263 | return { 264 | "version": "0+unknown", 265 | "full-revisionid": keywords["full"].strip(), 266 | "dirty": False, 267 | "error": "no suitable tags", 268 | "date": None, 269 | } 270 | 271 | 272 | @register_vcs_handler("git", "pieces_from_vcs") 273 | def git_pieces_from_vcs( 274 | tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command 275 | ) -> Dict[str, Any]: 276 | """Get version from 'git describe' in the root of the source tree. 277 | 278 | This only gets called if the git-archive 'subst' keywords were *not* 279 | expanded, and _version.py hasn't already been rewritten with a short 280 | version string, meaning we're inside a checked out source tree. 281 | """ 282 | GITS = ["git"] 283 | if sys.platform == "win32": 284 | GITS = ["git.cmd", "git.exe"] 285 | 286 | # GIT_DIR can interfere with correct operation of Versioneer. 287 | # It may be intended to be passed to the Versioneer-versioned project, 288 | # but that should not change where we get our version from. 
289 | env = os.environ.copy() 290 | env.pop("GIT_DIR", None) 291 | runner = functools.partial(runner, env=env) 292 | 293 | _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) 294 | if rc != 0: 295 | if verbose: 296 | print("Directory %s not under git control" % root) 297 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 298 | 299 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 300 | # if there isn't one, this yields HEX[-dirty] (no NUM) 301 | describe_out, rc = runner( 302 | GITS, 303 | [ 304 | "describe", 305 | "--tags", 306 | "--dirty", 307 | "--always", 308 | "--long", 309 | "--match", 310 | f"{tag_prefix}[[:digit:]]*", 311 | ], 312 | cwd=root, 313 | ) 314 | # --long was added in git-1.5.5 315 | if describe_out is None: 316 | raise NotThisMethod("'git describe' failed") 317 | describe_out = describe_out.strip() 318 | full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) 319 | if full_out is None: 320 | raise NotThisMethod("'git rev-parse' failed") 321 | full_out = full_out.strip() 322 | 323 | pieces: Dict[str, Any] = {} 324 | pieces["long"] = full_out 325 | pieces["short"] = full_out[:7] # maybe improved later 326 | pieces["error"] = None 327 | 328 | branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) 329 | # --abbrev-ref was added in git-1.6.3 330 | if rc != 0 or branch_name is None: 331 | raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") 332 | branch_name = branch_name.strip() 333 | 334 | if branch_name == "HEAD": 335 | # If we aren't exactly on a branch, pick a branch which represents 336 | # the current commit. If all else fails, we are on a branchless 337 | # commit. 338 | branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) 339 | # --contains was added in git-1.5.4 340 | if rc != 0 or branches is None: 341 | raise NotThisMethod("'git branch --contains' returned error") 342 | branches = branches.split("\n") 343 | 344 | # Remove the first line if we're running detached 345 | if "(" in branches[0]: 346 | branches.pop(0) 347 | 348 | # Strip off the leading "* " from the list of branches. 349 | branches = [branch[2:] for branch in branches] 350 | if "master" in branches: 351 | branch_name = "master" 352 | elif not branches: 353 | branch_name = None 354 | else: 355 | # Pick the first branch that is returned. Good or bad. 356 | branch_name = branches[0] 357 | 358 | pieces["branch"] = branch_name 359 | 360 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 361 | # TAG might have hyphens. 362 | git_describe = describe_out 363 | 364 | # look for -dirty suffix 365 | dirty = git_describe.endswith("-dirty") 366 | pieces["dirty"] = dirty 367 | if dirty: 368 | git_describe = git_describe[: git_describe.rindex("-dirty")] 369 | 370 | # now we have TAG-NUM-gHEX or HEX 371 | 372 | if "-" in git_describe: 373 | # TAG-NUM-gHEX 374 | mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) 375 | if not mo: 376 | # unparsable. Maybe git-describe is misbehaving? 
377 | pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out 378 | return pieces 379 | 380 | # tag 381 | full_tag = mo.group(1) 382 | if not full_tag.startswith(tag_prefix): 383 | if verbose: 384 | fmt = "tag '%s' doesn't start with prefix '%s'" 385 | print(fmt % (full_tag, tag_prefix)) 386 | pieces["error"] = ( 387 | f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" 388 | ) 389 | return pieces 390 | pieces["closest-tag"] = full_tag[len(tag_prefix) :] 391 | 392 | # distance: number of commits since tag 393 | pieces["distance"] = int(mo.group(2)) 394 | 395 | # commit: short hex revision ID 396 | pieces["short"] = mo.group(3) 397 | 398 | else: 399 | # HEX: no tags 400 | pieces["closest-tag"] = None 401 | out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) 402 | pieces["distance"] = len(out.split()) # total number of commits 403 | 404 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 405 | date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() 406 | # Use only the last line. Previous lines may contain GPG signature 407 | # information. 408 | date = date.splitlines()[-1] 409 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 410 | 411 | return pieces 412 | 413 | 414 | def plus_or_dot(pieces: Dict[str, Any]) -> str: 415 | """Return a + if we don't already have one, else return a .""" 416 | if "+" in pieces.get("closest-tag", ""): 417 | return "." 418 | return "+" 419 | 420 | 421 | def render_pep440(pieces: Dict[str, Any]) -> str: 422 | """Build up version string, with post-release "local version identifier". 423 | 424 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 425 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 426 | 427 | Exceptions: 428 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] 429 | """ 430 | if pieces["closest-tag"]: 431 | rendered = pieces["closest-tag"] 432 | if pieces["distance"] or pieces["dirty"]: 433 | rendered += plus_or_dot(pieces) 434 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 435 | if pieces["dirty"]: 436 | rendered += ".dirty" 437 | else: 438 | # exception #1 439 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 440 | if pieces["dirty"]: 441 | rendered += ".dirty" 442 | return rendered 443 | 444 | 445 | def render_pep440_branch(pieces: Dict[str, Any]) -> str: 446 | """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . 447 | 448 | The ".dev0" means not master branch. Note that .dev0 sorts backwards 449 | (a feature branch will appear "older" than the master branch). 450 | 451 | Exceptions: 452 | 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] 453 | """ 454 | if pieces["closest-tag"]: 455 | rendered = pieces["closest-tag"] 456 | if pieces["distance"] or pieces["dirty"]: 457 | if pieces["branch"] != "master": 458 | rendered += ".dev0" 459 | rendered += plus_or_dot(pieces) 460 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 461 | if pieces["dirty"]: 462 | rendered += ".dirty" 463 | else: 464 | # exception #1 465 | rendered = "0" 466 | if pieces["branch"] != "master": 467 | rendered += ".dev0" 468 | rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 469 | if pieces["dirty"]: 470 | rendered += ".dirty" 471 | return rendered 472 | 473 | 474 | def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: 475 | """Split pep440 version string at the post-release segment. 
476 | 477 | Returns the release segments before the post-release and the 478 | post-release version number (or -1 if no post-release segment is present). 479 | """ 480 | vc = str.split(ver, ".post") 481 | return vc[0], int(vc[1] or 0) if len(vc) == 2 else None 482 | 483 | 484 | def render_pep440_pre(pieces: Dict[str, Any]) -> str: 485 | """TAG[.postN.devDISTANCE] -- No -dirty. 486 | 487 | Exceptions: 488 | 1: no tags. 0.post0.devDISTANCE 489 | """ 490 | if pieces["closest-tag"]: 491 | if pieces["distance"]: 492 | # update the post release segment 493 | tag_version, post_version = pep440_split_post(pieces["closest-tag"]) 494 | rendered = tag_version 495 | if post_version is not None: 496 | rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) 497 | else: 498 | rendered += ".post0.dev%d" % (pieces["distance"]) 499 | else: 500 | # no commits, use the tag as the version 501 | rendered = pieces["closest-tag"] 502 | else: 503 | # exception #1 504 | rendered = "0.post0.dev%d" % pieces["distance"] 505 | return rendered 506 | 507 | 508 | def render_pep440_post(pieces: Dict[str, Any]) -> str: 509 | """TAG[.postDISTANCE[.dev0]+gHEX] . 510 | 511 | The ".dev0" means dirty. Note that .dev0 sorts backwards 512 | (a dirty tree will appear "older" than the corresponding clean one), 513 | but you shouldn't be releasing software with -dirty anyways. 514 | 515 | Exceptions: 516 | 1: no tags. 0.postDISTANCE[.dev0] 517 | """ 518 | if pieces["closest-tag"]: 519 | rendered = pieces["closest-tag"] 520 | if pieces["distance"] or pieces["dirty"]: 521 | rendered += ".post%d" % pieces["distance"] 522 | if pieces["dirty"]: 523 | rendered += ".dev0" 524 | rendered += plus_or_dot(pieces) 525 | rendered += "g%s" % pieces["short"] 526 | else: 527 | # exception #1 528 | rendered = "0.post%d" % pieces["distance"] 529 | if pieces["dirty"]: 530 | rendered += ".dev0" 531 | rendered += "+g%s" % pieces["short"] 532 | return rendered 533 | 534 | 535 | def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: 536 | """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . 537 | 538 | The ".dev0" means not master branch. 539 | 540 | Exceptions: 541 | 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] 542 | """ 543 | if pieces["closest-tag"]: 544 | rendered = pieces["closest-tag"] 545 | if pieces["distance"] or pieces["dirty"]: 546 | rendered += ".post%d" % pieces["distance"] 547 | if pieces["branch"] != "master": 548 | rendered += ".dev0" 549 | rendered += plus_or_dot(pieces) 550 | rendered += "g%s" % pieces["short"] 551 | if pieces["dirty"]: 552 | rendered += ".dirty" 553 | else: 554 | # exception #1 555 | rendered = "0.post%d" % pieces["distance"] 556 | if pieces["branch"] != "master": 557 | rendered += ".dev0" 558 | rendered += "+g%s" % pieces["short"] 559 | if pieces["dirty"]: 560 | rendered += ".dirty" 561 | return rendered 562 | 563 | 564 | def render_pep440_old(pieces: Dict[str, Any]) -> str: 565 | """TAG[.postDISTANCE[.dev0]] . 566 | 567 | The ".dev0" means dirty. 568 | 569 | Exceptions: 570 | 1: no tags. 0.postDISTANCE[.dev0] 571 | """ 572 | if pieces["closest-tag"]: 573 | rendered = pieces["closest-tag"] 574 | if pieces["distance"] or pieces["dirty"]: 575 | rendered += ".post%d" % pieces["distance"] 576 | if pieces["dirty"]: 577 | rendered += ".dev0" 578 | else: 579 | # exception #1 580 | rendered = "0.post%d" % pieces["distance"] 581 | if pieces["dirty"]: 582 | rendered += ".dev0" 583 | return rendered 584 | 585 | 586 | def render_git_describe(pieces: Dict[str, Any]) -> str: 587 | """TAG[-DISTANCE-gHEX][-dirty]. 
588 | 589 | Like 'git describe --tags --dirty --always'. 590 | 591 | Exceptions: 592 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 593 | """ 594 | if pieces["closest-tag"]: 595 | rendered = pieces["closest-tag"] 596 | if pieces["distance"]: 597 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 598 | else: 599 | # exception #1 600 | rendered = pieces["short"] 601 | if pieces["dirty"]: 602 | rendered += "-dirty" 603 | return rendered 604 | 605 | 606 | def render_git_describe_long(pieces: Dict[str, Any]) -> str: 607 | """TAG-DISTANCE-gHEX[-dirty]. 608 | 609 | Like 'git describe --tags --dirty --always -long'. 610 | The distance/hash is unconditional. 611 | 612 | Exceptions: 613 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 614 | """ 615 | if pieces["closest-tag"]: 616 | rendered = pieces["closest-tag"] 617 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 618 | else: 619 | # exception #1 620 | rendered = pieces["short"] 621 | if pieces["dirty"]: 622 | rendered += "-dirty" 623 | return rendered 624 | 625 | 626 | def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: 627 | """Render the given version pieces into the requested style.""" 628 | if pieces["error"]: 629 | return { 630 | "version": "unknown", 631 | "full-revisionid": pieces.get("long"), 632 | "dirty": None, 633 | "error": pieces["error"], 634 | "date": None, 635 | } 636 | 637 | if not style or style == "default": 638 | style = "pep440" # the default 639 | 640 | if style == "pep440": 641 | rendered = render_pep440(pieces) 642 | elif style == "pep440-branch": 643 | rendered = render_pep440_branch(pieces) 644 | elif style == "pep440-pre": 645 | rendered = render_pep440_pre(pieces) 646 | elif style == "pep440-post": 647 | rendered = render_pep440_post(pieces) 648 | elif style == "pep440-post-branch": 649 | rendered = render_pep440_post_branch(pieces) 650 | elif style == "pep440-old": 651 | rendered = render_pep440_old(pieces) 652 | elif style == "git-describe": 653 | rendered = render_git_describe(pieces) 654 | elif style == "git-describe-long": 655 | rendered = render_git_describe_long(pieces) 656 | else: 657 | raise ValueError("unknown style '%s'" % style) 658 | 659 | return { 660 | "version": rendered, 661 | "full-revisionid": pieces["long"], 662 | "dirty": pieces["dirty"], 663 | "error": None, 664 | "date": pieces.get("date"), 665 | } 666 | 667 | 668 | def get_versions() -> Dict[str, Any]: 669 | """Get version information or return default if unable to do so.""" 670 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 671 | # __file__, we can work backwards from there to the root. Some 672 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 673 | # case we can only use expanded keywords. 674 | 675 | cfg = get_config() 676 | verbose = cfg.verbose 677 | 678 | try: 679 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) 680 | except NotThisMethod: 681 | pass 682 | 683 | try: 684 | root = os.path.realpath(__file__) 685 | # versionfile_source is the relative path from the top of the source 686 | # tree (where the .git directory might live) to this file. Invert 687 | # this to find the root from __file__. 
688 | for _ in cfg.versionfile_source.split("/"): 689 | root = os.path.dirname(root) 690 | except NameError: 691 | return { 692 | "version": "0+unknown", 693 | "full-revisionid": None, 694 | "dirty": None, 695 | "error": "unable to find root of source tree", 696 | "date": None, 697 | } 698 | 699 | try: 700 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 701 | return render(pieces, cfg.style) 702 | except NotThisMethod: 703 | pass 704 | 705 | try: 706 | if cfg.parentdir_prefix: 707 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 708 | except NotThisMethod: 709 | pass 710 | 711 | return { 712 | "version": "0+unknown", 713 | "full-revisionid": None, 714 | "dirty": None, 715 | "error": "unable to compute version", 716 | "date": None, 717 | } 718 | -------------------------------------------------------------------------------- /src/dirhash/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Get hash for the content and/or structure of a directory.""" 3 | 4 | import argparse 5 | import sys 6 | 7 | import dirhash 8 | 9 | 10 | def main(): 11 | try: 12 | kwargs = get_kwargs(sys.argv[1:]) 13 | if kwargs.pop("list"): 14 | # kwargs below have no effect when listing 15 | for k in ["algorithm", "chunk_size", "jobs", "entry_properties"]: 16 | kwargs.pop(k) 17 | for leafpath in dirhash.included_paths(**kwargs): 18 | print(leafpath) 19 | else: 20 | print(dirhash.dirhash(**kwargs)) 21 | except Exception as e: # pragma: no cover (not picked up by coverage) 22 | sys.stderr.write(f"dirhash: {e}\n") 23 | sys.exit(1) 24 | 25 | 26 | def get_kwargs(args): 27 | parser = argparse.ArgumentParser(description="Determine the hash for a directory.") 28 | parser.add_argument( 29 | "-v", 30 | "--version", 31 | action="version", 32 | version=f"dirhash {dirhash.__version__}", 33 | ) 34 | parser.add_argument("directory", help="Directory to hash.") 35 | parser.add_argument( 36 | "-a", 37 | "--algorithm", 38 | choices=dirhash.algorithms_available, 39 | default="md5", 40 | help=( 41 | "Hashing algorithm to use, by default 'md5'. " 42 | f"Always available: {sorted(dirhash.algorithms_guaranteed)}. " 43 | f"Additionally available on current platform: " 44 | f"{sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed)}. " 45 | "Note that the same algorithm may appear multiple times in this set " 46 | "under different names (thanks to OpenSSL) " 47 | "[https://docs.python.org/2/library/hashlib.html]." 48 | ), 49 | metavar="", 50 | ) 51 | 52 | filter_options = parser.add_argument_group( 53 | title="Filtering options", 54 | description=( 55 | "Specify what files and directories to include. All files and " 56 | "directories (including symbolic links) are included by default. The " 57 | "--match/--ignore arguments allows for selection using glob/wildcard " 58 | '(".gitignore style") path matching. Paths relative to the root ' 59 | "`directory` (i.e. excluding the name of the root directory itself) are " 60 | "matched against the provided patterns. For example, to only include " 61 | 'python source files, use: `dirhash path/to/dir -m "*.py"` or to ' 62 | "exclude hidden files and directories use: " 63 | '`dirhash path/to.dir -i ".*" ".*/"` which is short for ' 64 | '`dirhash path/to.dir -m "*" "!.*" "!.*/"`. By adding the --list ' 65 | "argument, all included paths, for the given filtering arguments, are " 66 | "returned instead of the hash value. 
For further details see " 67 | "https://github.com/andhus/dirhash/README.md#filtering" 68 | ), 69 | ) 70 | filter_options.add_argument( 71 | "-m", 72 | "--match", 73 | nargs="+", 74 | default=["*"], 75 | help=( 76 | "One or several patterns for paths to include. NOTE: patterns " 77 | 'with an asterisk must be in quotes ("*") or the asterisk ' 78 | "preceded by an escape character (`*)." 79 | ), 80 | metavar="", 81 | ) 82 | filter_options.add_argument( 83 | "-i", 84 | "--ignore", 85 | nargs="+", 86 | default=None, 87 | help=( 88 | "One or several patterns for paths to exclude. NOTE: patterns " 89 | 'with an asterisk must be in quotes ("*") or the asterisk ' 90 | "preceded by an escape character (`*)." 91 | ), 92 | metavar="", 93 | ) 94 | filter_options.add_argument( 95 | "--empty-dirs", 96 | action="store_true", 97 | default=False, 98 | help="Include empty directories (containing no files that meet the matching " 99 | "criteria and no non-empty sub directories).", 100 | ) 101 | filter_options.add_argument( 102 | "--no-linked-dirs", 103 | dest="linked_dirs", 104 | action="store_false", 105 | help="Do not include symbolic links to other directories.", 106 | ) 107 | filter_options.add_argument( 108 | "--no-linked-files", 109 | dest="linked_files", 110 | action="store_false", 111 | help="Do not include symbolic links to files.", 112 | ) 113 | parser.set_defaults(linked_dirs=True, linked_files=True) 114 | 115 | protocol_options = parser.add_argument_group( 116 | title="Protocol options", 117 | description=( 118 | "Specify what properties of files and directories to include and " 119 | "whether to allow cyclic links. For further details see " 120 | "https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#protocol" 121 | ), 122 | ) 123 | protocol_options.add_argument( 124 | "-p", 125 | "--properties", 126 | nargs="+", 127 | dest="entry_properties", 128 | default=["data", "name"], 129 | help=( 130 | "List of file/directory properties to include in the hash. Available " 131 | f"properties are: {list(dirhash.Protocol.EntryProperties.options)} and at " 132 | "least one of name and data must be included. Default is [data name] which " 133 | "means that both the name/paths and content (actual data) of files and " 134 | "directories will be included" 135 | ), 136 | metavar="", 137 | ) 138 | protocol_options.add_argument( 139 | "-c", 140 | "--allow-cyclic-links", 141 | default=False, 142 | action="store_true", 143 | help=( 144 | "Allow presence of cyclic links (by hashing the relative path to the " 145 | "target directory)." 146 | ), 147 | ) 148 | 149 | implementation_options = parser.add_argument_group( 150 | title="Implementation options", description="" 151 | ) 152 | implementation_options.add_argument( 153 | "-s", 154 | "--chunk-size", 155 | default=2**20, 156 | type=int, 157 | help="The chunk size (in bytes) for reading of files.", 158 | ) 159 | implementation_options.add_argument( 160 | "-j", 161 | "--jobs", 162 | type=int, 163 | default=1, # TODO make default number of cores? 
164 | help="Number of jobs (parallel processes) to use.", 165 | ) 166 | 167 | special_options = parser.add_argument_group(title="Special options") 168 | special_options.add_argument( 169 | "-l", 170 | "--list", 171 | action="store_true", 172 | default=False, 173 | help="List the file paths that will be taken into account, given the " 174 | "provided filtering options.", 175 | ) 176 | 177 | return vars(parser.parse_args(args)) 178 | 179 | 180 | if __name__ == "__main__": # pragma: no cover 181 | main() 182 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import subprocess 4 | import sys 5 | 6 | import pytest 7 | 8 | import dirhash 9 | 10 | console_script = os.path.join( 11 | os.path.dirname(sys.executable), 12 | "dirhash.exe" if os.name == "nt" else "dirhash", 13 | ) 14 | if not os.path.isfile(console_script): 15 | print(os.listdir(os.path.dirname(sys.executable))) 16 | raise FileNotFoundError(f"Could not find console script at {console_script}.") 17 | if not os.access(console_script, os.X_OK): 18 | raise PermissionError(f"Console script at {console_script} is not executable.") 19 | 20 | 21 | def dirhash_run(argstring, add_env=None): 22 | if add_env: 23 | env = os.environ.copy() 24 | env.update(add_env) 25 | else: 26 | env = None 27 | process = subprocess.Popen( 28 | [console_script] + shlex.split(argstring), 29 | stdout=subprocess.PIPE, 30 | stderr=subprocess.PIPE, 31 | text=True, 32 | env=env, 33 | ) 34 | output, error = process.communicate() 35 | 36 | # in python3 output and error are `bytes` as opposed to `str` in python2 37 | if isinstance(output, bytes): 38 | output = output.decode("utf-8") 39 | if isinstance(error, bytes): 40 | error = error.decode("utf-8") 41 | 42 | return output, error, process.returncode 43 | 44 | 45 | def create_default_tree(tmpdir): 46 | """ 47 | tmpdir/ 48 | |__.dir/ 49 | | |__file 50 | |__.file 51 | |__dir/ 52 | | |__file 53 | |__empty/ 54 | |__file 55 | |__file.ext1 56 | |__file.ext2 57 | """ 58 | dotdir = tmpdir.mkdir(".dir") 59 | dotdir.join("file").write("file in hidden sub-directory") 60 | tmpdir.join(".file").write("hidden file") 61 | dir = tmpdir.mkdir("dir") 62 | dir.join("file").write("file in sub-directory") 63 | tmpdir.mkdir("empty") 64 | tmpdir.join("file").write("file") 65 | tmpdir.join("file.ext1").write("file with extension .ext1") 66 | tmpdir.join("file.ext2").write("file with extension .ext2") 67 | 68 | 69 | def osp(path: str) -> str: 70 | """Normalize path for OS.""" 71 | if os.name == "nt": # pragma: no cover 72 | return path.replace("/", "\\") 73 | return path 74 | 75 | 76 | class TestCLI: 77 | @pytest.mark.parametrize( 78 | "argstring, non_default_kwargs", 79 | [ 80 | (". -a md5", {}), 81 | (".. -a md5", {"directory": ".."}), 82 | ("target-dir -a md5", {"directory": "target-dir"}), 83 | (". -a sha256", {"algorithm": "sha256"}), 84 | # Filtering options 85 | ('. -a md5 -m "*" "!.*"', {"match": ["*", "!.*"]}), 86 | ( 87 | '. -a md5 --match "d1/*" "d2/*" --ignore "*.txt"', 88 | {"match": ["d1/*", "d2/*"], "ignore": ["*.txt"]}, 89 | ), 90 | (". -a md5 --empty-dirs", {"empty_dirs": True}), 91 | (". -a md5 --no-linked-dirs", {"linked_dirs": False}), 92 | (". -a md5 --no-linked-files", {"linked_files": False}), 93 | # Protocol options 94 | (". -a md5 --allow-cyclic-links", {"allow_cyclic_links": True}), 95 | (". -a md5 --properties name", {"entry_properties": ["name"]}), 96 | (". 
-a md5 --properties name data", {"entry_properties": ["name", "data"]}), 97 | # Implementation 98 | (". -a md5 -j 10", {"jobs": 10}), 99 | (". -a md5 -s 32000", {"chunk_size": 32000}), 100 | ], 101 | ) 102 | def test_get_kwargs(self, argstring, non_default_kwargs): 103 | from dirhash.cli import get_kwargs 104 | 105 | kwargs_expected = { 106 | "list": False, 107 | "directory": ".", 108 | "algorithm": "md5", 109 | "match": ["*"], 110 | "ignore": None, 111 | "empty_dirs": False, 112 | "linked_dirs": True, 113 | "linked_files": True, 114 | "entry_properties": ["data", "name"], 115 | "allow_cyclic_links": False, 116 | "chunk_size": 2**20, 117 | "jobs": 1, 118 | } 119 | kwargs_expected.update(non_default_kwargs) 120 | kwargs = get_kwargs(shlex.split(argstring)) 121 | assert kwargs == kwargs_expected 122 | 123 | @pytest.mark.parametrize( 124 | "description, argstrings, output", 125 | [ 126 | ( 127 | "ARGS WITHOUT EFFECT WHEN LISTING", 128 | [ 129 | ". -l", 130 | ". --list", 131 | ". -a md5 --list", 132 | ". -a sha256 --list", 133 | ". --properties name --list", 134 | ". --jobs 2 --list", 135 | ". --chunk-size 2 --list", 136 | ], 137 | ( 138 | ".dir/file\n" 139 | ".file\n" 140 | "dir/file\n" 141 | "file\n" 142 | "file.ext1\n" 143 | "file.ext2\n" 144 | ), 145 | ), 146 | ( 147 | "IGNORE EXTENSION", 148 | [ 149 | '. -i "*.ext1" --list', 150 | '. --ignore "*.ext1" --list', 151 | '. -m "*" "!*.ext1" --list', 152 | '. --match "*" "!*.ext1" --list', 153 | ], 154 | (".dir/file\n" ".file\n" "dir/file\n" "file\n" "file.ext2\n"), 155 | ), 156 | ( 157 | "IGNORE MULTIPLE EXTENSIONS", 158 | ['. -i "*.ext1" "*.ext2" --list', '. -i "*.ext*" --list'], 159 | (".dir/file\n" ".file\n" "dir/file\n" "file\n"), 160 | ), 161 | ( 162 | "IGNORE HIDDEN", 163 | ['. -i ".*" ".*/" --list'], 164 | ("dir/file\n" "file\n" "file.ext1\n" "file.ext2\n"), 165 | ), 166 | ( 167 | "INCLUDE EMPTY", 168 | [". --empty-dirs --list"], 169 | ( 170 | ".dir/file\n" 171 | ".file\n" 172 | "dir/file\n" 173 | "empty/.\n" 174 | "file\n" 175 | "file.ext1\n" 176 | "file.ext2\n" 177 | ), 178 | ), 179 | ], 180 | ) 181 | def test_list(self, description, argstrings, output, tmpdir): 182 | create_default_tree(tmpdir) 183 | with tmpdir.as_cwd(): 184 | for argstring in argstrings: 185 | o, error, returncode = dirhash_run(argstring) 186 | assert returncode == 0 187 | assert error == "" 188 | assert o == osp(output) 189 | 190 | @pytest.mark.parametrize( 191 | "argstring, kwargs, expected_hashes", 192 | [ 193 | ( 194 | ". -a md5", 195 | {"algorithm": "md5"}, 196 | [ 197 | "594c48dde0776b03eddeeb0232190be7", 198 | "d8ab965636d48e407b73b9dbba4cb928", 199 | "050e7bc9ffcb09c15186c04e0f8026df", 200 | ], 201 | ), 202 | ( 203 | ". 
-a sha256", 204 | {"algorithm": "sha256"}, 205 | [ 206 | "23a04964149889e932ba3348fe22442f4f6a3b3fec616a386a70579ee857ab7b", 207 | "7b76bac43e963f9561f37b96b92d7a174094bff230c6efbf1d8bf650e8b40b7a", 208 | "7156da2b2e5a2926eb4b72e65f389343cb6aca0578f0aedcd6f7457abd67d8f5", 209 | ], 210 | ), 211 | ], 212 | ) 213 | def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): 214 | # verify same result from cmdline and library + regression test of actual 215 | # hashes 216 | create_default_tree(tmpdir) 217 | with tmpdir.as_cwd(): 218 | for add_argstring, add_kwargs, expected_hash in zip( 219 | ["", " -p data", " -p name"], 220 | [ 221 | {}, 222 | {"entry_properties": ["data"]}, 223 | {"entry_properties": ["name"]}, 224 | ], 225 | expected_hashes, 226 | ): 227 | # run CLI 228 | full_argstring = argstring + add_argstring 229 | cli_out, error, returncode = dirhash_run(full_argstring) 230 | assert error == "" 231 | assert returncode == 0 232 | assert cli_out[-1] == "\n" 233 | cli_hash = cli_out[:-1] 234 | 235 | # run CLI multiproc 236 | full_argstring_mp = argstring + add_argstring + " --jobs 2" 237 | cli_out_mp, error_mp, returncode_mp = dirhash_run(full_argstring_mp) 238 | assert error_mp == "" 239 | assert returncode_mp == 0 240 | assert cli_out_mp[-1] == "\n" 241 | cli_hash_mp = cli_out_mp[:-1] 242 | 243 | # run lib function 244 | full_kwargs = kwargs.copy() 245 | full_kwargs.update(add_kwargs) 246 | lib_hash = dirhash.dirhash(str(tmpdir), **full_kwargs) 247 | 248 | assert cli_hash == cli_hash_mp == lib_hash == expected_hash 249 | 250 | def test_error_bad_argument(self, tmpdir): 251 | with tmpdir.as_cwd(): 252 | o, error, returncode = dirhash_run(". --chunk-size not_an_int") 253 | assert returncode > 0 254 | assert error != "" 255 | -------------------------------------------------------------------------------- /tests/test_dirhash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import shutil 4 | import tempfile 5 | from time import sleep, time 6 | 7 | import pytest 8 | from scantree import SymlinkRecursionError 9 | 10 | from dirhash import ( 11 | Filter, 12 | Protocol, 13 | _get_hasher_factory, 14 | _parmap, 15 | algorithms_available, 16 | algorithms_guaranteed, 17 | dirhash, 18 | dirhash_impl, 19 | get_match_patterns, 20 | included_paths, 21 | ) 22 | 23 | 24 | def osp(path: str) -> str: 25 | """Normalize path for OS.""" 26 | if os.name == "nt": # pragma: no cover 27 | return path.replace("/", "\\") 28 | return path 29 | 30 | 31 | def map_osp(paths): 32 | return [osp(path) for path in paths] 33 | 34 | 35 | class TestGetHasherFactory: 36 | def test_get_guaranteed(self): 37 | algorithm_and_hasher_factory = [ 38 | ("md5", hashlib.md5), 39 | ("sha1", hashlib.sha1), 40 | ("sha224", hashlib.sha224), 41 | ("sha256", hashlib.sha256), 42 | ("sha384", hashlib.sha384), 43 | ("sha512", hashlib.sha512), 44 | ] 45 | assert algorithms_guaranteed == {a for a, _ in algorithm_and_hasher_factory} 46 | for algorithm, expected_hasher_factory in algorithm_and_hasher_factory: 47 | hasher_factory = _get_hasher_factory(algorithm) 48 | assert hasher_factory == expected_hasher_factory 49 | 50 | def test_get_available(self): 51 | for algorithm in algorithms_available: 52 | hasher_factory = _get_hasher_factory(algorithm) 53 | try: 54 | hasher = hasher_factory() 55 | except ValueError as exc: 56 | # Some "available" algorithms are not necessarily available 57 | # (fails for e.g. 'ripemd160' in github actions for python 3.8). 
58 | # See: https://stackoverflow.com/questions/72409563/unsupported-hash-type-ripemd160-with-hashlib-in-python # noqa: E501 59 | print(f"Failed to create hasher for {algorithm}: {exc}") 60 | assert exc.args[0] == f"unsupported hash type {algorithm}" 61 | hasher = None 62 | 63 | if hasher is not None: 64 | assert hasattr(hasher, "update") 65 | assert hasattr(hasher, "hexdigest") 66 | 67 | def test_not_available(self): 68 | with pytest.raises(ValueError): 69 | _get_hasher_factory("not available") 70 | 71 | def test_bypass_hasher_factory(self): 72 | # test standard hasher 73 | hasher_factory = _get_hasher_factory(hashlib.sha256) 74 | assert hasher_factory is hashlib.sha256 75 | 76 | # test raise on custom hasher with bad interface 77 | class IncompleteMockHasher: 78 | def __init__(self, *args, **kwargs): 79 | pass 80 | 81 | def update(self, *args, **kwargs): 82 | pass 83 | 84 | with pytest.raises(ValueError): 85 | _get_hasher_factory(IncompleteMockHasher) 86 | 87 | # test custom hasher with ok interface 88 | class MockHasher(IncompleteMockHasher): 89 | def hexdigest(self): 90 | return "" 91 | 92 | hasher_factory = _get_hasher_factory(MockHasher) 93 | assert hasher_factory is MockHasher 94 | 95 | 96 | class TestGetMatchPatterns: 97 | def test_default_match_all(self): 98 | ms = get_match_patterns() 99 | assert ms == ["*"] 100 | 101 | def test_only_match(self): 102 | ms = get_match_patterns(match=["a*", "b*"]) 103 | assert ms == ["a*", "b*"] 104 | 105 | def test_only_ignore(self): 106 | ms = get_match_patterns(ignore=["a*", "b*"]) 107 | assert ms == ["*", "!a*", "!b*"] 108 | 109 | def test_match_and_ignore(self): 110 | ms = get_match_patterns(match=["a*"], ignore=["*.ext"]) 111 | assert ms == ["a*", "!*.ext"] 112 | 113 | def test_ignore_hidden(self): 114 | ms = get_match_patterns(ignore_hidden=True) 115 | assert ms == ["*", "!.*", "!.*/"] 116 | 117 | # should not duplicate if present in (general) ignore 118 | ms = get_match_patterns(ignore=[".*"], ignore_hidden=True) 119 | assert ms == ["*", "!.*", "!.*/"] 120 | 121 | ms = get_match_patterns(ignore=[".*/"], ignore_hidden=True) 122 | assert ms == ["*", "!.*/", "!.*"] 123 | 124 | ms = get_match_patterns(ignore=[".*", ".*/"], ignore_hidden=True) 125 | assert ms == ["*", "!.*", "!.*/"] 126 | 127 | def test_ignore_extensions(self): 128 | ms = get_match_patterns(ignore_extensions=[".ext"]) 129 | assert ms == ["*", "!*.ext"] 130 | 131 | # automatically adds '.' 
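        # (i.e. a bare "ext" is normalized to the "*.ext" ignore pattern)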
132 |         ms = get_match_patterns(ignore_extensions=["ext"])
133 |         assert ms == ["*", "!*.ext"]
134 | 
135 |         # mixed also works
136 |         ms = get_match_patterns(ignore_extensions=["ext1", ".ext2"])
137 |         assert ms == ["*", "!*.ext1", "!*.ext2"]
138 | 
139 |         # should not duplicate if present in (general) ignore
140 |         ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=[".ext"])
141 |         assert ms == ["*", "!*.ext"]
142 | 
143 |         ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=["ext"])
144 |         assert ms == ["*", "!*.ext"]
145 | 
146 | 
147 | class TempDirTest:
148 |     def setup_method(self):
149 |         self.dir = tempfile.mkdtemp()
150 | 
151 |     def teardown_method(self):
152 |         if os.path.exists(self.dir):
153 |             shutil.rmtree(self.dir)
154 | 
155 |     def path_to(self, relpath):
156 |         return os.path.join(self.dir, osp(relpath))
157 | 
158 |     def mkdirs(self, dirpath):
159 |         os.makedirs(self.path_to(dirpath))
160 | 
161 |     def mkfile(self, relpath, content=None):
162 |         with open(self.path_to(relpath), "w") as f:
163 |             if content:
164 |                 f.write(content)
165 | 
166 |     def symlink(self, src, dst):
167 |         os.symlink(self.path_to(src), self.path_to(dst))
168 | 
169 |     def remove(self, relpath):
170 |         if os.path.isdir(self.path_to(relpath)):
171 |             return shutil.rmtree(self.path_to(relpath))
172 |         os.remove(self.path_to(relpath))
173 | 
174 | 
175 | class TestGetIncludedPaths(TempDirTest):
176 |     # Integration tests with `pathspec` for basic use cases.
177 | 
178 |     def test_basic(self):
179 |         self.mkdirs("root/d1/d11")
180 |         self.mkdirs("root/d2")
181 | 
182 |         self.mkfile("root/f1")
183 |         self.mkfile("root/d1/f1")
184 |         self.mkfile("root/d1/d11/f1")
185 |         self.mkfile("root/d2/f1")
186 | 
187 |         expected_filepaths = map_osp(["d1/d11/f1", "d1/f1", "d2/f1", "f1"])
188 |         filepaths = included_paths(self.path_to("root"))
189 |         assert filepaths == expected_filepaths
190 | 
191 |         # end with '/' or not should not matter
192 |         filepaths = included_paths(self.path_to("root/"))
193 |         assert filepaths == expected_filepaths
194 | 
195 |     def test_not_a_directory(self):
196 |         self.mkdirs("root")
197 |         self.mkfile("root/f1")
198 |         # does not exist
199 |         with pytest.raises(ValueError):
200 |             included_paths(self.path_to("wrong_root"))
201 |         with pytest.raises(ValueError):
202 |             included_paths(self.path_to("root/f1"))
203 | 
204 |     def test_symlinked_file(self):
205 |         self.mkdirs("root")
206 |         self.mkfile("root/f1")
207 |         self.mkfile("linked_file")
208 |         self.symlink("linked_file", "root/f2")
209 | 
210 |         filepaths = included_paths(self.path_to("root"), linked_files=True)
211 |         assert filepaths == ["f1", "f2"]
212 | 
213 |         filepaths = included_paths(self.path_to("root"), linked_files=False)
214 |         assert filepaths == ["f1"]
215 | 
216 |         # default is 'linked_files': True
217 |         filepaths = included_paths(
218 |             self.path_to("root"),
219 |         )
220 |         assert filepaths == ["f1", "f2"]
221 | 
222 |     def test_symlinked_dir(self):
223 |         self.mkdirs("root")
224 |         self.mkfile("root/f1")
225 |         self.mkdirs("linked_dir")
226 |         self.mkfile("linked_dir/f1")
227 |         self.mkfile("linked_dir/f2")
228 |         self.symlink("linked_dir", "root/d1")
229 | 
230 |         filepaths = included_paths(self.path_to("root"), linked_dirs=False)
231 |         assert filepaths == ["f1"]
232 | 
233 |         filepaths = included_paths(self.path_to("root"), linked_dirs=True)
234 |         assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"])
235 | 
236 |         # default is 'linked_dirs': True
237 |         filepaths = included_paths(self.path_to("root"))
238 |         assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"])
239 | 
240 |     def test_cyclic_link(self):
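        # A symlink pointing back to an ancestor directory would make traversal
        # recurse forever: by default this raises SymlinkRecursionError, while
        # allow_cyclic_links=True includes the link once (as 'd1/link_back/.').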
241 | self.mkdirs("root/d1") 242 | self.symlink("root", "root/d1/link_back") 243 | with pytest.raises(SymlinkRecursionError) as exc_info: 244 | included_paths(self.path_to("root"), allow_cyclic_links=False) 245 | assert exc_info.value.real_path == os.path.realpath(self.path_to("root")) 246 | assert exc_info.value.first_path == self.path_to("root/") 247 | assert exc_info.value.second_path == self.path_to("root/d1/link_back") 248 | assert str(exc_info.value).startswith("Symlink recursion:") 249 | 250 | filepaths = included_paths(self.path_to("root"), allow_cyclic_links=True) 251 | assert filepaths == map_osp(["d1/link_back/."]) 252 | 253 | # default is 'allow_cyclic_links': False 254 | with pytest.raises(SymlinkRecursionError): 255 | filepaths = included_paths(self.path_to("root")) 256 | 257 | def test_ignore_hidden(self): 258 | self.mkdirs("root/d1") 259 | self.mkdirs("root/.d2") 260 | 261 | self.mkfile("root/f1") 262 | self.mkfile("root/.f2") 263 | self.mkfile("root/d1/f1") 264 | self.mkfile("root/d1/.f2") 265 | self.mkfile("root/.d2/f1") 266 | 267 | # no ignore 268 | filepaths = included_paths(self.path_to("root")) 269 | assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) 270 | 271 | # with ignore 272 | filepaths = included_paths(self.path_to("root"), match=["*", "!.*"]) 273 | assert filepaths == map_osp(["d1/f1", "f1"]) 274 | 275 | def test_ignore_hidden_files_only(self): 276 | self.mkdirs("root/d1") 277 | self.mkdirs("root/.d2") 278 | 279 | self.mkfile("root/f1") 280 | self.mkfile("root/.f2") 281 | self.mkfile("root/d1/f1") 282 | self.mkfile("root/d1/.f2") 283 | self.mkfile("root/.d2/f1") 284 | 285 | # no ignore 286 | filepaths = included_paths(self.path_to("root")) 287 | assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) 288 | 289 | # with ignore 290 | filepaths = included_paths( 291 | self.path_to("root"), match=["**/*", "!**/.*", "**/.*/*", "!**/.*/.*"] 292 | ) 293 | assert filepaths == map_osp([".d2/f1", "d1/f1", "f1"]) 294 | 295 | def test_ignore_hidden_explicitly_recursive(self): 296 | self.mkdirs("root/d1") 297 | self.mkdirs("root/.d2") 298 | 299 | self.mkfile("root/f1") 300 | self.mkfile("root/.f2") 301 | self.mkfile("root/d1/f1") 302 | self.mkfile("root/d1/.f2") 303 | self.mkfile("root/.d2/f1") 304 | 305 | # no ignore 306 | filepaths = included_paths(self.path_to("root")) 307 | assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) 308 | 309 | # with ignore 310 | filepaths = included_paths(self.path_to("root"), match=["*", "!**/.*"]) 311 | assert filepaths == map_osp(["d1/f1", "f1"]) 312 | 313 | def test_exclude_hidden_dirs(self): 314 | self.mkdirs("root/d1") 315 | self.mkdirs("root/.d2") 316 | self.mkdirs("root/d1/.d1") 317 | 318 | self.mkfile("root/f1") 319 | self.mkfile("root/.f2") 320 | self.mkfile("root/d1/f1") 321 | self.mkfile("root/d1/.f2") 322 | self.mkfile("root/.d2/f1") 323 | 324 | # no ignore 325 | filepaths = included_paths(self.path_to("root"), empty_dirs=True) 326 | assert filepaths == map_osp( 327 | [".d2/f1", ".f2", "d1/.d1/.", "d1/.f2", "d1/f1", "f1"] 328 | ) 329 | 330 | # with ignore 331 | filepaths = included_paths(self.path_to("root"), match=["*", "!.*/"]) 332 | assert filepaths == map_osp([".f2", "d1/.f2", "d1/f1", "f1"]) 333 | 334 | def test_exclude_hidden_dirs_and_files(self): 335 | self.mkdirs("root/d1") 336 | self.mkdirs("root/.d2") 337 | 338 | self.mkfile("root/f1") 339 | self.mkfile("root/.f2") 340 | self.mkfile("root/d1/f1") 341 | self.mkfile("root/d1/.f2") 342 | self.mkfile("root/.d2/f1") 
343 | 
344 |         # no ignore
345 |         filepaths = included_paths(self.path_to("root"))
346 |         assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"])
347 | 
348 |         # using ignore
349 |         filepaths = included_paths(self.path_to("root"), match=["*", "!.*/", "!.*"])
350 |         assert filepaths == map_osp(["d1/f1", "f1"])
351 | 
352 |     def test_exclude_extensions(self):
353 |         self.mkdirs("root/d1")
354 | 
355 |         self.mkfile("root/f")
356 |         self.mkfile("root/f.txt")
357 |         self.mkfile("root/f.skip1")
358 |         self.mkfile("root/fskip1")
359 |         self.mkfile("root/f.skip2")
360 |         self.mkfile("root/f.skip1.txt")
361 |         self.mkfile("root/f.skip1.skip2")
362 |         self.mkfile("root/f.skip1skip2")
363 |         self.mkfile("root/d1/f.txt")
364 |         self.mkfile("root/d1/f.skip1")
365 | 
366 |         filepaths = included_paths(
367 |             self.path_to("root"), match=["*", "!*.skip1", "!*.skip2"]
368 |         )
369 |         assert filepaths == map_osp(
370 |             [
371 |                 "d1/f.txt",
372 |                 "f",
373 |                 "f.skip1.txt",
374 |                 "f.skip1skip2",
375 |                 "f.txt",
376 |                 "fskip1",
377 |             ]
378 |         )
379 | 
380 |     def test_empty_dirs_include_vs_exclude(self):
381 |         self.mkdirs("root/d1")
382 |         self.mkdirs("root/d2")
383 |         self.mkdirs("root/d3/d31")
384 |         self.mkdirs("root/d4/d41")
385 | 
386 |         self.mkfile("root/d1/f")
387 |         self.mkfile("root/d3/d31/f")
388 | 
389 |         filepaths = included_paths(self.path_to("root"), empty_dirs=False)
390 |         assert filepaths == map_osp(["d1/f", "d3/d31/f"])
391 | 
392 |         # `empty_dirs=False` is the default
393 |         filepaths = included_paths(self.path_to("root"))
394 |         assert filepaths == map_osp(["d1/f", "d3/d31/f"])
395 | 
396 |         filepaths = included_paths(self.path_to("root"), empty_dirs=True)
397 |         assert filepaths == map_osp(["d1/f", "d2/.", "d3/d31/f", "d4/d41/."])
398 | 
399 |     def test_empty_dirs_because_of_filter_include_vs_exclude(self):
400 |         self.mkdirs("root/d1")
401 |         self.mkdirs("root/d2")
402 | 
403 |         self.mkfile("root/d1/f")
404 |         self.mkfile("root/d2/.f")
405 | 
406 |         filepaths = included_paths(
407 |             self.path_to("root"), match=["*", "!.*"], empty_dirs=False
408 |         )
409 |         assert filepaths == map_osp(["d1/f"])
410 | 
411 |         # `empty_dirs=False` is the default
412 |         filepaths = included_paths(
413 |             self.path_to("root"),
414 |             match=["*", "!.*"],
415 |         )
416 |         assert filepaths == map_osp(["d1/f"])
417 | 
418 |         filepaths = included_paths(
419 |             self.path_to("root"), match=["*", "!.*"], empty_dirs=True
420 |         )
421 |         assert filepaths == map_osp(["d1/f", "d2/."])
422 | 
423 |     def test_empty_dir_inclusion_not_affected_by_match(self):
424 |         self.mkdirs("root/d1")
425 |         self.mkdirs("root/.d2")
426 | 
427 |         # NOTE that empty dirs are not excluded by match_patterns:
428 | 
429 |         filepaths = included_paths(
430 |             self.path_to("root"), match=["*", "!.*"], empty_dirs=True
431 |         )
432 |         assert filepaths == map_osp([".d2/.", "d1/."])
433 | 
434 |         filepaths = included_paths(
435 |             self.path_to("root"), match=["*", "!.*/"], empty_dirs=True
436 |         )
437 |         assert filepaths == map_osp([".d2/.", "d1/."])
438 | 
439 |         filepaths = included_paths(
440 |             self.path_to("root"), match=["*", "!d1"], empty_dirs=True
441 |         )
442 |         assert filepaths == map_osp([".d2/.", "d1/."])
443 | 
444 | 
445 | def dirhash_mp_comp(*args, **kwargs):
446 |     res = dirhash(*args, **kwargs)
447 |     res_mp = dirhash(*args, **{**kwargs, "jobs": 2})
448 |     assert res == res_mp
449 |     return res
450 | 
451 | 
452 | class TestDirhash(TempDirTest):
453 |     def test_guaranteed_algorithms(self):
454 |         self.mkdirs("root/d1/d11")
455 |         self.mkdirs("root/d2")
456 |         self.mkfile("root/f1", "a")
457 |         self.mkfile("root/d1/f1", "b")
458
| self.mkfile("root/d1/d11/f1", "c") 459 | self.mkfile("root/d2/f1", "d") 460 | 461 | for algorithm, expected_hash in [ 462 | ("md5", "3c631c7f5771468a2187494f802fad8f"), 463 | ("sha1", "992aa2d00d2ed94f0c19eff7f151f5c6a7e0cc41"), 464 | ("sha224", "18013e1df933d5781b2eddb94aceeb7ab689643f1df24060fb478999"), 465 | ( 466 | "sha256", 467 | "ef7e95269fbc0e3478ad31fddd1c7d08" "907d189c61725332e8a2fd14448fe175", 468 | ), 469 | ( 470 | "sha384", 471 | "64ef4360c172bc68250f9326ea231cd1" 472 | "46a7fa1afe9d386cee0cae0e9f1b4ad2" 473 | "1df050d1df436cff792bbe81d6698026", 474 | ), 475 | ( 476 | "sha512", 477 | "7854226eb0278bc136056998890a8399" 478 | "f85ca383f7c54665026358d28b5dc716" 479 | "0ec654d2bcebf5d60974f82ed820600d" 480 | "8e807ea53d57578d076ec1c82f501208", 481 | ), 482 | ]: 483 | hash_value = dirhash_mp_comp(self.path_to("root"), algorithm) 484 | assert hash_value == expected_hash 485 | 486 | def test_recursive_descriptor(self): 487 | self.mkdirs("root/d1") 488 | self.mkdirs("root/d2") 489 | self.mkfile("root/f1", "a") 490 | self.mkfile("root/d1/f12", "b") 491 | 492 | f1_desc = "data:a\000name:f1" 493 | f12_desc = "data:b\000name:f12" 494 | d1_desc = f"dirhash:{f12_desc}\000name:d1" 495 | d2_desc = "dirhash:\000name:d2" 496 | 497 | empty_dirs_false_expected = "\000\000".join([f1_desc, d1_desc]) 498 | empty_dirs_true_expected = "\000\000".join([f1_desc, d2_desc, d1_desc]) 499 | 500 | empty_dirs_false = dirhash(self.path_to("root"), algorithm=IdentityHasher) 501 | assert empty_dirs_false == empty_dirs_false_expected 502 | 503 | empty_dirs_true = dirhash( 504 | self.path_to("root"), algorithm=IdentityHasher, empty_dirs=True 505 | ) 506 | assert empty_dirs_true == empty_dirs_true_expected 507 | 508 | def test_symlinked_file(self): 509 | self.mkdirs("root1") 510 | self.mkfile("root1/f1", "a") 511 | self.mkfile("linked_file", "b") 512 | self.symlink("linked_file", "root1/f2") 513 | 514 | self.mkdirs("root2") 515 | self.mkfile("root2/f1", "a") 516 | self.mkfile("root2/f2", "b") 517 | 518 | root1_linked_files_true = dirhash_mp_comp( 519 | self.path_to("root1"), algorithm="md5" 520 | ) 521 | root1_linked_files_false = dirhash_mp_comp( 522 | self.path_to("root1"), algorithm="md5", linked_files=False 523 | ) 524 | 525 | root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") 526 | 527 | assert root1_linked_files_false != root1_linked_files_true 528 | assert root1_linked_files_true == root2 529 | 530 | def test_symlinked_dir(self): 531 | self.mkdirs("root1") 532 | self.mkfile("root1/f1", "a") 533 | self.mkdirs("linked_dir") 534 | self.mkfile("linked_dir/f1", "b") 535 | self.mkfile("linked_dir/f2", "c") 536 | self.symlink("linked_dir", "root1/d1") 537 | 538 | self.mkdirs("root2") 539 | self.mkfile("root2/f1", "a") 540 | self.mkdirs("root2/d1") 541 | self.mkfile("root2/d1/f1", "b") 542 | self.mkfile("root2/d1/f2", "c") 543 | 544 | root1_linked_dirs_true = dirhash_mp_comp( 545 | self.path_to("root1"), algorithm="md5", linked_dirs=True 546 | ) 547 | root1_linked_dirs_false = dirhash_mp_comp( 548 | self.path_to("root1"), algorithm="md5", linked_dirs=False 549 | ) 550 | root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") 551 | 552 | assert root1_linked_dirs_false != root1_linked_dirs_true 553 | assert root1_linked_dirs_true == root2 554 | 555 | def test_cache_used_for_symlinks(self): 556 | self.mkdirs("root/dir") 557 | self.mkfile("root/file", "< one chunk content") 558 | for i in range(10): 559 | self.symlink("root/file", f"root/link_{i}") 560 | for i in range(10): 561 | 
self.symlink("root/file", f"root/dir/link_{i}")
562 |         start = time()
563 |         dirhash(self.path_to("root"), algorithm=SlowHasher)
564 |         end = time()
565 |         elapsed = end - start
566 |         assert elapsed < SlowHasher.wait_time * 2
567 | 
568 |     def test_raise_on_empty_root_without_include_empty(self):
569 |         self.mkdirs("root")
570 |         with pytest.raises(ValueError):
571 |             dirhash_mp_comp(self.path_to("root"), "sha256")
572 | 
573 |     def test_empty_root_include_empty(self):
574 |         self.mkdirs("root")
575 |         dirhash_ = dirhash_mp_comp(self.path_to("root"), "sha256", empty_dirs=True)
576 |         expected_dirhash = hashlib.sha256(b"").hexdigest()
577 |         assert dirhash_ == expected_dirhash
578 | 
579 |     def test_include_empty(self):
580 |         self.mkdirs("root/d1")
581 |         self.mkdirs("root/d2")
582 |         self.mkfile("root/d1/f")
583 | 
584 |         args = (self.path_to("root"), "sha256")
585 |         dirhash_ = dirhash_mp_comp(*args, empty_dirs=False)
586 |         dirhash_empty = dirhash_mp_comp(*args, empty_dirs=True)
587 |         assert dirhash_ != dirhash_empty
588 | 
589 |     def test_chunksize(self):
590 |         self.mkdirs("root")
591 |         self.mkfile("root/numbers.txt", str(list(range(1000))))  # spans several chunks
592 | 
593 |         hash_value = dirhash_mp_comp(self.path_to("root"), "sha256")
594 |         for chunk_size in [2**4, 2**8, 2**16]:
595 |             assert (
596 |                 dirhash_mp_comp(self.path_to("root"), "sha256", chunk_size=chunk_size)
597 |                 == hash_value
598 |             )
599 | 
600 |     def test_data_only(self):
601 |         self.mkdirs("root1")
602 |         self.mkfile("root1/a.txt", "abc")
603 |         self.mkfile("root1/b.txt", "def")
604 |         self.mkdirs("root2")
605 |         self.mkfile("root2/a.txt", "abc")
606 |         self.mkfile("root2/c.txt", "def")
607 | 
608 |         hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256")
609 |         hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256")
610 |         assert hash1 != hash2
611 | 
612 |         # with the `data` entry property only, the hash remains the same as
613 |         # long as the file contents (in order) are the same
614 |         [dhash1, dhash2] = [
615 |             dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["data"])
616 |             for root in ["root1", "root2"]
617 |         ]
618 |         assert dhash1 == dhash2
619 | 
620 |     def test_name_only(self):
621 |         self.mkdirs("root1")
622 |         self.mkfile("root1/a.txt", "abc")
623 |         self.mkfile("root1/b.txt", "def")
624 |         self.mkdirs("root2")
625 |         self.mkfile("root2/a.txt", "abc")
626 |         self.mkfile("root2/b.txt", "___")
627 | 
628 |         hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256")
629 |         hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256")
630 |         assert hash1 != hash2
631 | 
632 |         [dhash1, dhash2] = [
633 |             dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["name"])
634 |             for root in ["root1", "root2"]
635 |         ]
636 |         assert dhash1 == dhash2
637 | 
638 |     def test_is_link_property(self):
639 |         self.mkdirs("root1")
640 |         self.mkfile("root1/a.txt", "abc")
641 |         self.mkfile("root1/b.txt", "def")
642 |         self.mkdirs("root2")
643 |         self.mkfile("b_target", "def")
644 |         self.mkfile("root2/a.txt", "abc")
645 |         self.symlink("b_target", "root2/b.txt")
646 | 
647 |         hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256")
648 |         hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256")
649 |         assert hash1 == hash2
650 | 
651 |         for entry_properties in [
652 |             ["name", "data", "is_link"],
653 |             ["name", "is_link"],
654 |             ["data", "is_link"],
655 |         ]:
656 |             [hash1, hash2] = [
657 |                 dirhash_mp_comp(
658 |                     self.path_to(root), "sha256", entry_properties=entry_properties
659 |                 )
660 |                 for root in ["root1", "root2"]
661 |             ]
662 |             assert hash1 != hash2
663 | 
664 |     def test_raise_on_not_at_least_one_of_name_and_data(self):
665 |         self.mkdirs("root1")
666 |         self.mkfile("root1/a.txt", "abc")
667 |         dirhash_mp_comp(self.path_to("root1"), "sha256")  # check ok
668 |         with pytest.raises(ValueError):
669 |             dirhash_mp_comp(self.path_to("root1"), "sha256", entry_properties=[])
670 | 
671 |         with pytest.raises(ValueError):
672 |             dirhash_mp_comp(
673 |                 self.path_to("root1"), "sha256", entry_properties=["is_link"]
674 |             )
675 | 
676 |     @pytest.mark.skipif(
677 |         os.name == "nt",
678 |         reason="TODO: not getting expected speedup on Windows.",
679 |         # TODO: see https://github.com/andhus/scantree/issues/25
680 |     )
681 |     def test_multiproc_speedup(self):
682 |         self.mkdirs("root/dir")
683 |         num_files = 10
684 |         for i in range(num_files):
685 |             self.mkfile(f"root/file_{i}", "< one chunk content")
686 | 
687 |         expected_min_elapsed_sequential = SlowHasher.wait_time * num_files
688 | 
689 |         start = time()
690 |         dirhash(self.path_to("root"), algorithm=SlowHasher)
691 |         end = time()
692 |         elapsed_sequential = end - start
693 |         assert elapsed_sequential > expected_min_elapsed_sequential
694 | 
695 |         start = time()
696 |         dirhash(self.path_to("root"), algorithm=SlowHasher, jobs=num_files)
697 |         end = time()
698 |         elapsed_multiproc = end - start
699 |         assert elapsed_multiproc < 0.9 * expected_min_elapsed_sequential
700 |         # just check for "any" speedup; the overhead varies (and is high on CI runners)
701 | 
702 |     def test_cache_by_real_path_speedup(self, tmpdir):
703 |         num_links = 10
704 | 
705 |         # reference run without links
706 |         root1 = tmpdir.join("root1")
707 |         root1.ensure(dir=True)
708 |         for i in range(num_links):
709 |             file_i = root1.join(f"file_{i}")
710 |             file_i.write("< one chunk content", ensure=True)
711 | 
712 |         wait_time = SlowHasher.wait_time
713 |         expected_min_elapsed_no_links = wait_time * num_links
714 |         start = time()
715 |         dirhash(root1, algorithm=SlowHasher)
716 |         end = time()
717 |         elapsed_no_links = end - start
718 |         assert elapsed_no_links > expected_min_elapsed_no_links
719 |         overhead = elapsed_no_links - expected_min_elapsed_no_links
720 | 
721 |         # all links to same file
722 |         root2 = tmpdir.join("root2")
723 |         root2.ensure(dir=True)
724 |         target_file = tmpdir.join("target_file")
725 |         target_file.ensure()
726 |         for i in range(num_links):
727 |             os.symlink(target_file, root2.join(f"link_{i}"))
728 | 
729 |         overhead_margin_factor = 1.5
730 |         expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time
731 |         assert expected_max_elapsed_with_links < expected_min_elapsed_no_links
732 |         start = time()
733 |         dirhash(root2, algorithm=SlowHasher)
734 |         end = time()
735 |         elapsed_with_links = end - start
736 |         assert elapsed_with_links < expected_max_elapsed_with_links
737 | 
738 |     def test_cache_together_with_multiprocess_speedup(self, tmpdir):
739 |         target_file_names = ["target_file_1", "target_file_2"]
740 |         num_links_per_file = 10
741 |         num_links = num_links_per_file * len(target_file_names)
742 | 
743 |         # reference run without links
744 |         root1 = tmpdir.join("root1")
745 |         root1.ensure(dir=True)
746 |         for i in range(num_links):
747 |             file_i = root1.join(f"file_{i}")
748 |             file_i.write("< one chunk content", ensure=True)
749 | 
750 |         jobs = 2
751 |         wait_time = SlowHasher.wait_time
752 |         expected_min_elapsed_no_links = wait_time * num_links / jobs
753 |         start = time()
754 |         dirhash(root1, algorithm=SlowHasher, jobs=jobs)
755 |         end = time()
756 |         elapsed_no_links = end - start
757 |         assert elapsed_no_links > expected_min_elapsed_no_links
758 |         overhead = elapsed_no_links - expected_min_elapsed_no_links
759 | 
760 |         root2 = tmpdir.join("root2")
761 |
root2.ensure(dir=True) 762 | for i, target_file_name in enumerate(target_file_names): 763 | target_file = tmpdir.join(target_file_name) 764 | target_file.write("< one chunk content", ensure=True) 765 | for j in range(num_links_per_file): 766 | os.symlink(target_file, root2.join(f"link_{i}_{j}")) 767 | 768 | overhead_margin_factor = 1.5 769 | expected_max_elapsed_with_links = ( 770 | overhead * overhead_margin_factor + wait_time * 2 771 | ) 772 | assert expected_max_elapsed_with_links < expected_min_elapsed_no_links 773 | start = time() 774 | dirhash(root2, algorithm=SlowHasher, jobs=jobs) 775 | end = time() 776 | elapsed_mp_with_links = end - start 777 | assert elapsed_mp_with_links < expected_max_elapsed_with_links 778 | 779 | def test_hash_cyclic_link_to_root(self): 780 | self.mkdirs("root/d1") 781 | self.symlink("root", "root/d1/link_back") 782 | dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) 783 | 784 | def test_hash_cyclic_link(self): 785 | self.mkdirs("root/d1/d2") 786 | self.symlink("root/d1", "root/d1/d2/link_back") 787 | dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) 788 | 789 | def test_pass_filtering_instance(self): 790 | self.mkdirs("root") 791 | self.mkfile("root/f1", "") 792 | dirhash_impl(self.path_to("root"), "sha256", filter_=Filter()) 793 | 794 | def test_pass_protocol_instance(self): 795 | self.mkdirs("root") 796 | self.mkfile("root/f1", "") 797 | dirhash_impl(self.path_to("root"), "sha256", protocol=Protocol()) 798 | 799 | def test_raise_on_wrong_type(self): 800 | self.mkdirs("root") 801 | self.mkfile("root/f1", "") 802 | with pytest.raises(TypeError): 803 | dirhash_impl(self.path_to("root"), "sha256", filter_="") 804 | with pytest.raises(TypeError): 805 | dirhash_impl(self.path_to("root"), "sha256", protocol="") 806 | 807 | 808 | class SlowHasher: 809 | wait_time = 0.25 810 | 811 | def __init__(self, *args, **kwargs): 812 | pass 813 | 814 | def update(self, data): 815 | if data != b"": 816 | sleep(self.wait_time) 817 | 818 | def hexdigest(self): 819 | return "" 820 | 821 | 822 | class IdentityHasher: 823 | def __init__(self, initial_data=b""): 824 | self.datas = [initial_data.decode("utf-8")] 825 | 826 | def update(self, data): 827 | self.datas.append(data.decode("utf-8")) 828 | 829 | def hexdigest(self): 830 | return "".join(self.datas) 831 | 832 | 833 | class TestProtocol: 834 | def test_raise_for_invalid_entry_properties(self): 835 | with pytest.raises(ValueError): 836 | Protocol(entry_properties=["not-valid"]) 837 | 838 | def test_raise_for_invalid_allow_cyclic_links(self): 839 | with pytest.raises(ValueError): 840 | Protocol(allow_cyclic_links="not-valid") 841 | 842 | 843 | def mock_func(x): 844 | return x * 2 845 | 846 | 847 | @pytest.mark.parametrize("jobs", [1, 2, 4]) 848 | def test_parmap(jobs): 849 | inputs = [1, 2, 3, 4] 850 | assert _parmap(mock_func, inputs, jobs=jobs) == [2, 4, 6, 8] 851 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = pre-commit,py{38,39,310,311,312} 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-cov 8 | commands = 9 | pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc {posargs:tests} 10 | 11 | [testenv:pre-commit] 12 | skip_install = true 13 | deps = pre-commit 14 | commands = pre-commit run --all-files --show-diff-on-failure 15 | 16 | [gh-actions] 17 | python = 18 | 3.8: py38 19 | 3.9: py39 20 | 3.10: py310 21 | 
3.11: py311 22 | 3.12: py312 23 | --------------------------------------------------------------------------------