├── .dockerignore ├── .editorconfig ├── .github └── workflows │ ├── coveralls.yml │ ├── pypi-release.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── MANIFEST.in ├── Makefile ├── README.rst ├── bagit.py ├── bench.py ├── locale ├── bagit-python.pot ├── bagit.pot └── en │ └── LC_MESSAGES │ └── bagit-python.po ├── pyproject.toml ├── setup.py ├── test-data ├── README ├── loc │ ├── 2478433644_2839c5e8b8_o_d.jpg │ └── 3314493806_6f1db86d66_o_d.jpg └── si │ ├── 2584174182_ffd5c24905_b_d.jpg │ └── 4011399822_65987a4806_b_d.jpg └── test.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .tox 2 | __pycache__ 3 | dist 4 | *.egg-info 5 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | end_of_line = lf 11 | charset = utf-8 12 | -------------------------------------------------------------------------------- /.github/workflows/coveralls.yml: -------------------------------------------------------------------------------- 1 | name: LibraryOfCongress/tests-bagit-python 2 | on: 3 | workflow_dispatch: 4 | jobs: 5 | test: 6 | runs-on: ubuntu-16.04 7 | steps: 8 | - name: checkout 9 | uses: actions/checkout@v3.5.0 10 | - uses: actions/setup-python@v4.6.0 11 | with: 12 | python-version: "${{ matrix.python }}" 13 | - run: apt-get -y install gettext 14 | - run: pip install --upgrade pip 15 | - run: pip install coveralls coverage 16 | - run: coverage run --include=bagit.py setup.py test 17 | - run: coveralls 18 | if: "${{ success() }}" 19 | strategy: 20 | matrix: 21 | python: 22 | - '3.10' 23 | -------------------------------------------------------------------------------- 
/.github/workflows/pypi-release.yml: -------------------------------------------------------------------------------- 1 | name: "PyPI releases" 2 | 3 | on: release 4 | 5 | jobs: 6 | build_sdist: 7 | name: Build Python source distribution 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | 12 | - name: Build sdist 13 | run: pipx run build --sdist 14 | 15 | - uses: actions/upload-artifact@v3 16 | with: 17 | path: dist/*.tar.gz 18 | 19 | pypi-publish: 20 | name: Upload release to PyPI 21 | if: github.event_name == 'release' && github.event.action == 'published' 22 | needs: 23 | - build_sdist 24 | runs-on: ubuntu-latest 25 | environment: 26 | name: pypi 27 | url: https://pypi.org/p/bagit 28 | permissions: 29 | id-token: write 30 | steps: 31 | - uses: actions/download-artifact@v3 32 | with: 33 | # unpacks default artifact into dist/ 34 | # if `name: artifact` is omitted, the action will create extra parent dir 35 | name: artifact 36 | path: dist 37 | - name: Publish package distributions to PyPI 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | 9 | jobs: 10 | ruff: # https://docs.astral.sh/ruff 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - run: pip install --user ruff 15 | - run: ruff check --output-format=github 16 | 17 | test: 18 | needs: ruff 19 | runs-on: ubuntu-latest 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install dependencies 31 | run: 
| 32 | python -m pip install --upgrade pip setuptools wheel 33 | pip install coverage 34 | pip install --editable . 35 | - name: Run test 36 | run: python -m unittest discover 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | bench-data 3 | build 4 | dist 5 | MANIFEST 6 | bagit.egg-info 7 | .idea 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ".*test-data.*" 2 | 3 | repos: 4 | - repo: https://github.com/astral-sh/ruff-pre-commit 5 | rev: v0.6.9 6 | hooks: 7 | - id: ruff 8 | args: [--fix, --exit-non-zero-on-fix] 9 | - id: ruff-format 10 | 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v5.0.0 13 | hooks: 14 | - id: check-added-large-files 15 | args: ["--maxkb=128"] 16 | - id: check-ast 17 | - id: check-byte-order-marker 18 | - id: check-case-conflict 19 | - id: check-docstring-first 20 | - id: check-executables-have-shebangs 21 | - id: check-json 22 | - id: check-merge-conflict 23 | - id: check-symlinks 24 | - id: check-xml 25 | - id: check-yaml 26 | args: ["--unsafe"] 27 | - id: debug-statements 28 | - id: detect-aws-credentials 29 | args: ["--allow-missing-credentials"] 30 | - id: detect-private-key 31 | - id: end-of-file-fixer 32 | - id: mixed-line-ending 33 | args: ["--fix=lf"] 34 | - id: trailing-whitespace 35 | - id: pretty-format-json 36 | args: ["--autofix", "--no-sort-keys", "--indent=4"] 37 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | RUN useradd --user-group bagit-tester 3 | RUN install -d -o bagit-tester /bagit 4 | USER bagit-tester 5 | WORKDIR /bagit 6 | COPY .git/ /bagit/.git/ 7 | 
COPY *.rst *.py /bagit/ 8 | COPY test-data /bagit/test-data/ 9 | CMD [ "python", "setup.py", "test" ] 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune test-data 2 | exclude .* 3 | exclude Dockerfile 4 | exclude MANIFEST.in 5 | exclude test.py 6 | exclude bench.py 7 | recursive-include locale *.po *.mo 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COMPILED_MESSAGES=$(patsubst %.po,%.mo, $(wildcard locale/*/LC_MESSAGES/bagit-python.po)) 2 | 3 | all: messages compile 4 | 5 | clean: 6 | rm -f locale/*/LC_MESSAGES/*.mo 7 | 8 | messages: 9 | xgettext --language=python -d bagit-python --no-location -o locale/bagit-python.pot bagit.py 10 | # Until http://savannah.gnu.org/bugs/?20923 is fixed: 11 | sed -i '' -e 's/CHARSET/UTF-8/g' locale/bagit-python.pot 12 | msgmerge --no-fuzzy-matching --lang=en --output-file=locale/en/LC_MESSAGES/bagit-python.po locale/en/LC_MESSAGES/bagit-python.po locale/bagit-python.pot 13 | 14 | %.mo: %.po 15 | msgfmt -o $@ $< 16 | 17 | compile: $(COMPILED_MESSAGES) 18 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | bagit-python 2 | ============ 3 | 4 | bagit is a Python library and command line utility for working with 5 | `BagIt `__ style packages. 6 | 7 | Installation 8 | ------------ 9 | 10 | bagit.py is a single-file python module that you can drop into your 11 | project as needed or you can install globally with: 12 | 13 | :: 14 | 15 | pip install bagit 16 | 17 | A supported version of Python 3 is required. 
18 | 19 | Command Line Usage 20 | ------------------ 21 | 22 | When you install bagit you should get a command-line program called 23 | bagit.py which you can use to turn an existing directory into a bag: 24 | 25 | :: 26 | 27 | bagit.py --contact-name 'John Kunze' /directory/to/bag 28 | 29 | Finding Bagit on your system 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | The ``bagit.py`` program should be available in your normal command-line 33 | window (Terminal on OS X, Command Prompt or Powershell on Windows, 34 | etc.). If you are unsure where it was installed you can also request 35 | that Python search for ``bagit`` as a Python module: simply replace 36 | ``bagit.py`` with ``python -m bagit``: 37 | 38 | :: 39 | 40 | python -m bagit --help 41 | 42 | On some systems Python may have been installed as ``python3``, ``py``, 43 | etc. – simply use the same name you use to start an interactive Python 44 | shell: 45 | 46 | :: 47 | 48 | py -m bagit --help 49 | python3 -m bagit --help 50 | 51 | Configuring BagIt 52 | ~~~~~~~~~~~~~~~~~ 53 | 54 | You can pass in key/value metadata for the bag using options like 55 | ``--contact-name`` above, which get persisted to the bag-info.txt. For a 56 | complete list of bag-info.txt properties you can use as commmand line 57 | arguments see ``--help``. 58 | 59 | Since calculating checksums can take a while when creating a bag, you 60 | may want to calculate them in parallel if you are on a multicore 61 | machine. You can do that with the ``--processes`` option: 62 | 63 | :: 64 | 65 | bagit.py --processes 4 /directory/to/bag 66 | 67 | To specify which checksum algorithm(s) to use when generating the 68 | manifest, use the --md5, --sha1, --sha256 and/or --sha512 flags (MD5 is 69 | generated by default). 70 | 71 | :: 72 | 73 | bagit.py --sha1 /path/to/bag 74 | bagit.py --sha256 /path/to/bag 75 | bagit.py --sha512 /path/to/bag 76 | 77 | If you would like to validate a bag you can use the --validate flag. 
78 | 79 | :: 80 | 81 | bagit.py --validate /path/to/bag 82 | 83 | If you would like to take a quick look at the bag to see if it seems 84 | valid by just examining the structure of the bag, and comparing its 85 | payload-oxum (byte count and number of files) then use the ``--fast`` 86 | flag. 87 | 88 | :: 89 | 90 | bagit.py --validate --fast /path/to/bag 91 | 92 | And finally, if you'd like to parallelize validation to take advantage 93 | of multiple CPUs you can: 94 | 95 | :: 96 | 97 | bagit.py --validate --processes 4 /path/to/bag 98 | 99 | Using BagIt in your programs 100 | ---------------------------- 101 | 102 | You can also use BagIt programatically in your own Python programs by 103 | importing the ``bagit`` module. 104 | 105 | Create 106 | ~~~~~~ 107 | 108 | To create a bag you would do this: 109 | 110 | .. code:: python 111 | 112 | bag = bagit.make_bag('mydir', {'Contact-Name': 'John Kunze'}) 113 | 114 | ``make_bag`` returns a Bag instance. If you have a bag already on disk 115 | and would like to create a Bag instance for it, simply call the 116 | constructor directly: 117 | 118 | .. code:: python 119 | 120 | bag = bagit.Bag('/path/to/bag') 121 | 122 | Update Bag Metadata 123 | ~~~~~~~~~~~~~~~~~~~ 124 | 125 | You can change the metadata persisted to the bag-info.txt by using the 126 | ``info`` property on a ``Bag``. 127 | 128 | .. code:: python 129 | 130 | # load the bag 131 | bag = bagit.Bag('/path/to/bag') 132 | 133 | # update bag info metadata 134 | bag.info['Internal-Sender-Description'] = 'Updated on 2014-06-28.' 135 | bag.info['Authors'] = ['John Kunze', 'Andy Boyko'] 136 | bag.save() 137 | 138 | Update Bag Manifests 139 | ~~~~~~~~~~~~~~~~~~~~ 140 | 141 | By default ``save`` will not update manifests. This guards against a 142 | situation where a call to ``save`` to persist bag metadata accidentally 143 | regenerates manifests for an invalid bag. 
If you have modified the 144 | payload of a bag by adding, modifying or deleting files in the data 145 | directory, and wish to regenerate the manifests set the ``manifests`` 146 | parameter to True when calling ``save``. 147 | 148 | .. code:: python 149 | 150 | 151 | import shutil, os 152 | 153 | # add a file 154 | shutil.copyfile('newfile', '/path/to/bag/data/newfile') 155 | 156 | # remove a file 157 | os.remove('/path/to/bag/data/file') 158 | 159 | # persist changes 160 | bag.save(manifests=True) 161 | 162 | The save method takes an optional processes parameter which will 163 | determine how many processes are used to regenerate the checksums. This 164 | can be handy on multicore machines. 165 | 166 | Validation 167 | ~~~~~~~~~~ 168 | 169 | If you would like to see if a bag is valid, use its ``is_valid`` method: 170 | 171 | .. code:: python 172 | 173 | bag = bagit.Bag('/path/to/bag') 174 | if bag.is_valid(): 175 | print("yay :)") 176 | else: 177 | print("boo :(") 178 | 179 | If you'd like to get a detailed list of validation errors, execute the 180 | ``validate`` method and catch the ``BagValidationError`` exception. If 181 | the bag's manifest was invalid (and it wasn't caught by the payload 182 | oxum) the exception's ``details`` property will contain a list of 183 | ``ManifestError``\ s that you can introspect on. Each ManifestError, 184 | will be of type ``ChecksumMismatch``, ``FileMissing``, 185 | ``UnexpectedFile``. 186 | 187 | So for example if you want to print out checksums that failed to 188 | validate you can do this: 189 | 190 | .. 
code:: python 191 | 192 | 193 | bag = bagit.Bag("/path/to/bag") 194 | 195 | try: 196 | bag.validate() 197 | 198 | except bagit.BagValidationError as e: 199 | for d in e.details: 200 | if isinstance(d, bagit.ChecksumMismatch): 201 | print("expected %s to have %s checksum of %s but found %s" % 202 | (d.path, d.algorithm, d.expected, d.found)) 203 | 204 | To iterate through a bag's manifest and retrieve checksums for the 205 | payload files use the bag's entries dictionary: 206 | 207 | .. code:: python 208 | 209 | bag = bagit.Bag("/path/to/bag") 210 | 211 | for path, fixity in bag.entries.items(): 212 | print("path:%s md5:%s" % (path, fixity["md5"])) 213 | 214 | Contributing to bagit-python development 215 | ---------------------------------------- 216 | 217 | :: 218 | 219 | % git clone git://github.com/LibraryOfCongress/bagit-python.git 220 | % cd bagit-python 221 | # MAKE CHANGES 222 | % python test.py 223 | 224 | Running the tests 225 | ~~~~~~~~~~~~~~~~~ 226 | 227 | You can quickly run the tests using the built-in unittest framework: 228 | 229 | :: 230 | 231 | python -m unittest discover 232 | 233 | If you have Docker installed, you can run the tests under Linux inside a 234 | container: 235 | 236 | :: 237 | 238 | % docker build -t bagit:latest . && docker run -it bagit:latest 239 | 240 | Benchmarks 241 | ---------- 242 | 243 | If you'd like to see how increasing parallelization of bag creation on 244 | your system effects the time to create a bag try using the included 245 | bench utility: 246 | 247 | :: 248 | 249 | % ./bench.py 250 | 251 | License 252 | ------- 253 | 254 | |cc0| 255 | 256 | Note: By contributing to this project, you agree to license your work 257 | under the same terms as those that govern this project's distribution. 258 | 259 | .. |Coverage Status| image:: https://coveralls.io/repos/github/LibraryOfCongress/bagit-python/badge.svg?branch=master 260 | :target: https://coveralls.io/github/LibraryOfCongress/bagit-python?branch=master 261 | .. 
|cc0| image:: http://i.creativecommons.org/p/zero/1.0/88x31.png 262 | :target: http://creativecommons.org/publicdomain/zero/1.0/ 263 | -------------------------------------------------------------------------------- /bagit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import codecs 6 | import gettext 7 | import hashlib 8 | import logging 9 | import multiprocessing 10 | import os 11 | import re 12 | import signal 13 | import sys 14 | import tempfile 15 | import unicodedata 16 | import warnings 17 | from collections import defaultdict 18 | from datetime import date 19 | from functools import partial 20 | 21 | try: 22 | from importlib.metadata import version 23 | except ImportError: 24 | from importlib_metadata import version 25 | 26 | try: 27 | from urllib.parse import urlparse 28 | except ImportError: 29 | from urlparse import urlparse 30 | 31 | 32 | def find_locale_dir(): 33 | for prefix in (os.path.dirname(__file__), sys.prefix): 34 | locale_dir = os.path.join(prefix, "locale") 35 | if os.path.isdir(locale_dir): 36 | return locale_dir 37 | 38 | 39 | TRANSLATION_CATALOG = gettext.translation( 40 | "bagit-python", localedir=find_locale_dir(), fallback=True 41 | ) 42 | 43 | _ = TRANSLATION_CATALOG.gettext 44 | 45 | MODULE_NAME = "bagit" if __name__ == "__main__" else __name__ 46 | 47 | LOGGER = logging.getLogger(MODULE_NAME) 48 | 49 | VERSION = version(MODULE_NAME) 50 | if not VERSION: 51 | VERSION = "0.0.dev0" 52 | 53 | PROJECT_URL = "https://github.com/LibraryOfCongress/bagit-python" 54 | 55 | __doc__ = ( 56 | _( 57 | """ 58 | BagIt is a directory, filename convention for bundling an arbitrary set of 59 | files with a manifest, checksums, and additional metadata. 
More about BagIt 60 | can be found at: 61 | 62 | http://purl.org/net/bagit 63 | 64 | bagit.py is a pure python drop in library and command line tool for creating, 65 | and working with BagIt directories. 66 | 67 | 68 | Command-Line Usage: 69 | 70 | Basic usage is to give bagit.py a directory to bag up: 71 | 72 | $ bagit.py my_directory 73 | 74 | This does a bag-in-place operation where the current contents will be moved 75 | into the appropriate BagIt structure and the metadata files will be created. 76 | 77 | You can bag multiple directories if you wish: 78 | 79 | $ bagit.py directory1 directory2 80 | 81 | Optionally you can provide metadata which will be stored in bag-info.txt: 82 | 83 | $ bagit.py --source-organization "Library of Congress" directory 84 | 85 | You can also select which manifest algorithms will be used: 86 | 87 | $ bagit.py --sha1 --md5 --sha256 --sha512 directory 88 | 89 | 90 | Using BagIt from your Python code: 91 | 92 | import bagit 93 | bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'}) 94 | print(bag.entries) 95 | 96 | For more information or to contribute to bagit-python's development, please 97 | visit %(PROJECT_URL)s 98 | """ 99 | ) 100 | % globals() 101 | ) 102 | 103 | # standard bag-info.txt metadata 104 | STANDARD_BAG_INFO_HEADERS = [ 105 | "Source-Organization", 106 | "Organization-Address", 107 | "Contact-Name", 108 | "Contact-Phone", 109 | "Contact-Email", 110 | "External-Description", 111 | "External-Identifier", 112 | "Bag-Size", 113 | "Bag-Group-Identifier", 114 | "Bag-Count", 115 | "Internal-Sender-Identifier", 116 | "Internal-Sender-Description", 117 | "BagIt-Profile-Identifier", 118 | # Bagging-Date is autogenerated 119 | # Payload-Oxum is autogenerated 120 | ] 121 | 122 | try: 123 | CHECKSUM_ALGOS = hashlib.algorithms_guaranteed 124 | except AttributeError: 125 | # FIXME: remove when we drop Python 2 (https://github.com/LibraryOfCongress/bagit-python/issues/102) 126 | # Python 2.7.0-2.7.8 127 | 
CHECKSUM_ALGOS = set(hashlib.algorithms) 128 | DEFAULT_CHECKSUMS = ["sha256", "sha512"] 129 | 130 | #: Block size used when reading files for hashing: 131 | HASH_BLOCK_SIZE = 512 * 1024 132 | 133 | #: Convenience function used everywhere we want to open a file to read text 134 | #: rather than undecoded bytes: 135 | open_text_file = partial(codecs.open, encoding="utf-8", errors="strict") 136 | 137 | # This is the same as decoding the byte values in codecs.BOM: 138 | UNICODE_BYTE_ORDER_MARK = "\ufeff" 139 | 140 | 141 | def make_bag( 142 | bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8" 143 | ): 144 | """ 145 | Convert a given directory into a bag. You can pass in arbitrary 146 | key/value pairs to put into the bag-info.txt metadata file as 147 | the bag_info dictionary. 148 | """ 149 | 150 | if checksum is not None: 151 | warnings.warn( 152 | _( 153 | "The `checksum` argument for `make_bag` should be replaced with `checksums`" 154 | ), 155 | DeprecationWarning, 156 | ) 157 | checksums = checksum 158 | 159 | if checksums is None: 160 | checksums = DEFAULT_CHECKSUMS 161 | 162 | bag_dir = os.path.abspath(bag_dir) 163 | cwd = os.path.abspath(os.path.curdir) 164 | 165 | if cwd.startswith(bag_dir) and cwd != bag_dir: 166 | raise RuntimeError( 167 | _("Bagging a parent of the current directory is not supported") 168 | ) 169 | 170 | LOGGER.info(_("Creating bag for directory %s"), bag_dir) 171 | 172 | if not os.path.isdir(bag_dir): 173 | LOGGER.error(_("Bag directory %s does not exist"), bag_dir) 174 | raise RuntimeError(_("Bag directory %s does not exist") % bag_dir) 175 | 176 | # FIXME: we should do the permissions checks before changing directories 177 | old_dir = os.path.abspath(os.path.curdir) 178 | 179 | try: 180 | # TODO: These two checks are currently redundant since an unreadable directory will also 181 | # often be unwritable, and this code will require review when we add the option to 182 | # bag to a destination other than the 
source. It would be nice if we could avoid 183 | # walking the directory tree more than once even if most filesystems will cache it 184 | 185 | unbaggable = _can_bag(bag_dir) 186 | 187 | if unbaggable: 188 | LOGGER.error( 189 | _("Unable to write to the following directories and files:\n%s"), 190 | unbaggable, 191 | ) 192 | raise BagError(_("Missing permissions to move all files and directories")) 193 | 194 | unreadable_dirs, unreadable_files = _can_read(bag_dir) 195 | 196 | if unreadable_dirs or unreadable_files: 197 | if unreadable_dirs: 198 | LOGGER.error( 199 | _("The following directories do not have read permissions:\n%s"), 200 | unreadable_dirs, 201 | ) 202 | if unreadable_files: 203 | LOGGER.error( 204 | _("The following files do not have read permissions:\n%s"), 205 | unreadable_files, 206 | ) 207 | raise BagError( 208 | _("Read permissions are required to calculate file fixities") 209 | ) 210 | else: 211 | LOGGER.info(_("Creating data directory")) 212 | 213 | # FIXME: if we calculate full paths we won't need to deal with changing directories 214 | os.chdir(bag_dir) 215 | cwd = os.getcwd() 216 | temp_data = tempfile.mkdtemp(dir=cwd) 217 | 218 | for f in os.listdir("."): 219 | if os.path.abspath(f) == temp_data: 220 | continue 221 | new_f = os.path.join(temp_data, f) 222 | LOGGER.info( 223 | _("Moving %(source)s to %(destination)s"), 224 | {"source": f, "destination": new_f}, 225 | ) 226 | os.rename(f, new_f) 227 | 228 | LOGGER.info( 229 | _("Moving %(source)s to %(destination)s"), 230 | {"source": temp_data, "destination": "data"}, 231 | ) 232 | os.rename(temp_data, "data") 233 | 234 | # permissions for the payload directory should match those of the 235 | # original directory 236 | os.chmod("data", os.stat(cwd).st_mode) 237 | 238 | total_bytes, total_files = make_manifests( 239 | "data", processes, algorithms=checksums, encoding=encoding 240 | ) 241 | 242 | LOGGER.info(_("Creating bagit.txt")) 243 | txt = """BagIt-Version: 
0.97\nTag-File-Character-Encoding: UTF-8\n""" 244 | with open_text_file("bagit.txt", "w") as bagit_file: 245 | bagit_file.write(txt) 246 | 247 | LOGGER.info(_("Creating bag-info.txt")) 248 | if bag_info is None: 249 | bag_info = {} 250 | 251 | # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overidden 252 | if "Bagging-Date" not in bag_info: 253 | bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") 254 | if "Bag-Software-Agent" not in bag_info: 255 | bag_info["Bag-Software-Agent"] = "bagit.py v%s <%s>" % ( 256 | VERSION, 257 | PROJECT_URL, 258 | ) 259 | 260 | bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files) 261 | _make_tag_file("bag-info.txt", bag_info) 262 | 263 | for c in checksums: 264 | _make_tagmanifest_file(c, bag_dir, encoding="utf-8") 265 | except Exception: 266 | LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir) 267 | raise 268 | finally: 269 | os.chdir(old_dir) 270 | 271 | return Bag(bag_dir) 272 | 273 | 274 | class Bag(object): 275 | """A representation of a bag.""" 276 | 277 | valid_files = ["bagit.txt", "fetch.txt"] 278 | valid_directories = ["data"] 279 | 280 | def __init__(self, path): 281 | super(Bag, self).__init__() 282 | self.tags = {} 283 | self.info = {} 284 | #: Dictionary of manifest entries and the checksum values for each 285 | #: algorithm: 286 | self.entries = {} 287 | 288 | # To reliably handle Unicode normalization differences, we maintain 289 | # lookup dictionaries in both directions for the filenames read from 290 | # the filesystem and the manifests so we can handle cases where the 291 | # normalization form changed between the bag being created and read. 292 | # See https://github.com/LibraryOfCongress/bagit-python/issues/51. 
293 | 294 | #: maps Unicode-normalized values to the raw value from the filesystem 295 | self.normalized_filesystem_names = {} 296 | 297 | #: maps Unicode-normalized values to the raw value in the manifest 298 | self.normalized_manifest_names = {} 299 | 300 | self.algorithms = [] 301 | self.tag_file_name = None 302 | self.path = os.path.abspath(path) 303 | self._open() 304 | 305 | def __str__(self): 306 | # FIXME: develop a more informative string representation for a Bag 307 | return self.path 308 | 309 | @property 310 | def algs(self): 311 | warnings.warn(_("Use Bag.algorithms instead of Bag.algs"), DeprecationWarning) 312 | return self.algorithms 313 | 314 | @property 315 | def version(self): 316 | warnings.warn( 317 | _("Use the Bag.version_info tuple instead of Bag.version"), 318 | DeprecationWarning, 319 | ) 320 | return self._version 321 | 322 | def _open(self): 323 | # Open the bagit.txt file, and load any tags from it, including 324 | # the required version and encoding. 325 | bagit_file_path = os.path.join(self.path, "bagit.txt") 326 | 327 | if not os.path.isfile(bagit_file_path): 328 | raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path) 329 | 330 | self.tags = tags = _load_tag_file(bagit_file_path) 331 | 332 | required_tags = ("BagIt-Version", "Tag-File-Character-Encoding") 333 | missing_tags = [i for i in required_tags if i not in tags] 334 | if missing_tags: 335 | raise BagError( 336 | _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags) 337 | ) 338 | 339 | # To avoid breaking existing code we'll leave self.version as the string 340 | # and parse it into a numeric version_info tuple. In version 2.0 we can 341 | # break that. 
342 | 343 | self._version = tags["BagIt-Version"] 344 | 345 | try: 346 | self.version_info = tuple(int(i) for i in self._version.split(".", 1)) 347 | except ValueError: 348 | raise BagError( 349 | _("Bag version numbers must be MAJOR.MINOR numbers, not %s") 350 | % self._version 351 | ) 352 | 353 | if (0, 93) <= self.version_info <= (0, 95): 354 | self.tag_file_name = "package-info.txt" 355 | elif (0, 96) <= self.version_info < (2,): 356 | self.tag_file_name = "bag-info.txt" 357 | else: 358 | raise BagError(_("Unsupported bag version: %s") % self._version) 359 | 360 | self.encoding = tags["Tag-File-Character-Encoding"] 361 | 362 | try: 363 | codecs.lookup(self.encoding) 364 | except LookupError: 365 | raise BagValidationError(_("Unsupported encoding: %s") % self.encoding) 366 | 367 | info_file_path = os.path.join(self.path, self.tag_file_name) 368 | if os.path.exists(info_file_path): 369 | self.info = _load_tag_file(info_file_path, encoding=self.encoding) 370 | 371 | self._load_manifests() 372 | 373 | def manifest_files(self): 374 | for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]: 375 | f = os.path.join(self.path, filename) 376 | if os.path.isfile(f): 377 | yield f 378 | 379 | def tagmanifest_files(self): 380 | for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]: 381 | f = os.path.join(self.path, filename) 382 | if os.path.isfile(f): 383 | yield f 384 | 385 | def compare_manifests_with_fs(self): 386 | """ 387 | Compare the filenames in the manifests to the filenames present on the 388 | local filesystem and returns two lists of the files which are only 389 | present in the manifests and the files which are only present on the 390 | local filesystem, respectively. 
391 | """ 392 | 393 | # We compare the filenames after Unicode normalization so we can 394 | # reliably detect normalization changes after bag creation: 395 | files_on_fs = set(normalize_unicode(i) for i in self.payload_files()) 396 | files_in_manifest = set( 397 | normalize_unicode(i) for i in self.payload_entries().keys() 398 | ) 399 | 400 | if self.version_info >= (0, 97): 401 | files_in_manifest.update(self.missing_optional_tagfiles()) 402 | 403 | only_on_fs = list() 404 | only_in_manifest = list() 405 | 406 | for i in files_on_fs.difference(files_in_manifest): 407 | only_on_fs.append(self.normalized_filesystem_names[i]) 408 | 409 | for i in files_in_manifest.difference(files_on_fs): 410 | only_in_manifest.append(self.normalized_manifest_names[i]) 411 | 412 | return only_in_manifest, only_on_fs 413 | 414 | def compare_fetch_with_fs(self): 415 | """Compares the fetch entries with the files actually 416 | in the payload, and returns a list of all the files 417 | that still need to be fetched. 
418 | """ 419 | 420 | files_on_fs = set(self.payload_files()) 421 | files_in_fetch = set(self.files_to_be_fetched()) 422 | 423 | return list(files_in_fetch - files_on_fs) 424 | 425 | def payload_files(self): 426 | """Returns a list of filenames which are present on the local filesystem""" 427 | payload_dir = os.path.join(self.path, "data") 428 | 429 | for dirpath, _, filenames in os.walk(payload_dir): 430 | for f in filenames: 431 | # Jump through some hoops here to make the payload files are 432 | # returned with the directory structure relative to the base 433 | # directory rather than the 434 | normalized_f = os.path.normpath(f) 435 | rel_path = os.path.relpath( 436 | os.path.join(dirpath, normalized_f), start=self.path 437 | ) 438 | 439 | self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path 440 | yield rel_path 441 | 442 | def payload_entries(self): 443 | """Return a dictionary of items""" 444 | # Don't use dict comprehension (compatibility with Python < 2.7) 445 | return dict( 446 | (key, value) 447 | for (key, value) in self.entries.items() 448 | if key.startswith("data" + os.sep) 449 | ) 450 | 451 | def save(self, processes=1, manifests=False): 452 | """ 453 | save will persist any changes that have been made to the bag 454 | metadata (self.info). 455 | 456 | If you have modified the payload of the bag (added, modified, 457 | removed files in the data directory) and want to regenerate manifests 458 | set the manifests parameter to True. The default is False since you 459 | wouldn't want a save to accidentally create a new manifest for 460 | a corrupted bag. 461 | 462 | If you want to control the number of processes that are used when 463 | recalculating checksums use the processes parameter. 
464 | """ 465 | # Error checking 466 | if not self.path: 467 | raise BagError(_("Bag.save() called before setting the path!")) 468 | 469 | if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK): 470 | raise BagError( 471 | _("Cannot save bag to non-existent or inaccessible directory %s") 472 | % self.path 473 | ) 474 | 475 | unbaggable = _can_bag(self.path) 476 | if unbaggable: 477 | LOGGER.error( 478 | _( 479 | "Missing write permissions for the following directories and files:\n%s" 480 | ), 481 | unbaggable, 482 | ) 483 | raise BagError(_("Missing permissions to move all files and directories")) 484 | 485 | unreadable_dirs, unreadable_files = _can_read(self.path) 486 | if unreadable_dirs or unreadable_files: 487 | if unreadable_dirs: 488 | LOGGER.error( 489 | _("The following directories do not have read permissions:\n%s"), 490 | unreadable_dirs, 491 | ) 492 | if unreadable_files: 493 | LOGGER.error( 494 | _("The following files do not have read permissions:\n%s"), 495 | unreadable_files, 496 | ) 497 | raise BagError( 498 | _("Read permissions are required to calculate file fixities") 499 | ) 500 | 501 | # Change working directory to bag directory so helper functions work 502 | old_dir = os.path.abspath(os.path.curdir) 503 | os.chdir(self.path) 504 | 505 | # Generate new manifest files 506 | if manifests: 507 | total_bytes, total_files = make_manifests( 508 | "data", processes, algorithms=self.algorithms, encoding=self.encoding 509 | ) 510 | 511 | # Update Payload-Oxum 512 | LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name) 513 | self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files) 514 | 515 | _make_tag_file(self.tag_file_name, self.info) 516 | 517 | # Update tag-manifest for changes to manifest & bag-info files 518 | for alg in self.algorithms: 519 | _make_tagmanifest_file(alg, self.path, encoding=self.encoding) 520 | 521 | # Reload the manifests 522 | self._load_manifests() 523 | 524 | os.chdir(old_dir) 525 | 526 | def 
tagfile_entries(self): 527 | return dict( 528 | (key, value) 529 | for (key, value) in self.entries.items() 530 | if not key.startswith("data" + os.sep) 531 | ) 532 | 533 | def missing_optional_tagfiles(self): 534 | """ 535 | From v0.97 we need to validate any tagfiles listed 536 | in the optional tagmanifest(s). As there is no mandatory 537 | directory structure for additional tagfiles we can 538 | only check for entries with missing files (not missing 539 | entries for existing files). 540 | """ 541 | for tagfilepath in self.tagfile_entries().keys(): 542 | if not os.path.isfile(os.path.join(self.path, tagfilepath)): 543 | yield tagfilepath 544 | 545 | def fetch_entries(self): 546 | """Load fetch.txt if present and iterate over its contents 547 | 548 | yields (url, size, filename) tuples 549 | 550 | raises BagError for errors such as an unsafe filename referencing 551 | data outside of the bag directory 552 | """ 553 | 554 | fetch_file_path = os.path.join(self.path, "fetch.txt") 555 | 556 | if os.path.isfile(fetch_file_path): 557 | with open_text_file( 558 | fetch_file_path, "r", encoding=self.encoding 559 | ) as fetch_file: 560 | for line in fetch_file: 561 | url, file_size, filename = line.strip().split(None, 2) 562 | 563 | if self._path_is_dangerous(filename): 564 | raise BagError( 565 | _('Path "%(payload_file)s" in "%(source_file)s" is unsafe') 566 | % { 567 | "payload_file": filename, 568 | "source_file": os.path.join(self.path, "fetch.txt"), 569 | } 570 | ) 571 | 572 | yield url, file_size, filename 573 | 574 | def files_to_be_fetched(self): 575 | """ 576 | Convenience wrapper for fetch_entries which returns only the 577 | local filename 578 | """ 579 | 580 | for url, file_size, filename in self.fetch_entries(): 581 | yield filename 582 | 583 | def has_oxum(self): 584 | return "Payload-Oxum" in self.info 585 | 586 | def validate(self, processes=1, fast=False, completeness_only=False): 587 | """Checks the structure and contents are valid. 
    def is_valid(self, processes=1, fast=False, completeness_only=False):
        """Returns validation success or failure as boolean.
        Optional processes and fast parameters passed directly to validate().
        """

        try:
            self.validate(
                processes=processes, fast=fast, completeness_only=completeness_only
            )
        except BagError:
            return False

        return True

    def _load_manifests(self):
        # Parse every payload manifest (and, for v0.97+, tag manifest) in the
        # bag, populating self.entries as {path: {algorithm: digest}} and
        # appending any newly-seen algorithm names to self.algorithms.
        self.entries = {}
        manifests = list(self.manifest_files())

        if self.version_info >= (0, 97):
            # v0.97+ requires that optional tagfiles are verified.
            manifests += list(self.tagmanifest_files())

        for manifest_filename in manifests:
            # Derive the algorithm name from the manifest filename, e.g.
            # "manifest-sha256.txt" -> "sha256":
            if manifest_filename.find("tagmanifest-") != -1:
                search = "tagmanifest-"
            else:
                search = "manifest-"
            alg = (
                os.path.basename(manifest_filename)
                .replace(search, "")
                .replace(".txt", "")
            )
            if alg not in self.algorithms:
                self.algorithms.append(alg)

            with open_text_file(
                manifest_filename, "r", encoding=self.encoding
            ) as manifest_file:
                if manifest_file.encoding.startswith("UTF"):
                    # We'll check the first character to see if it's a BOM:
                    if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
                        # We'll skip it either way by letting line decoding
                        # happen at the new offset but we will issue a warning
                        # for UTF-8 since the presence of a BOM is contrary to
                        # the BagIt specification:
                        if manifest_file.encoding == "UTF-8":
                            LOGGER.warning(
                                _(
                                    "%s is encoded using UTF-8 but contains an unnecessary"
                                    " byte-order mark, which is not in compliance with the"
                                    " BagIt RFC"
                                ),
                                manifest_file.name,
                            )
                    else:
                        manifest_file.seek(0)  # Pretend the first read never happened

                for line in manifest_file:
                    line = line.strip()

                    # Ignore blank lines and comments.
                    if line == "" or line.startswith("#"):
                        continue

                    entry = line.split(None, 1)

                    # Format is "CHECKSUM [*]FILENAME" (one entry per line)
                    if len(entry) != 2:
                        LOGGER.error(
                            _(
                                "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"
                            ),
                            {"bag": self, "algorithm": alg, "line": line},
                        )
                        continue

                    entry_hash = entry[0]
                    # A leading "*" flags binary mode in some checksum tools;
                    # it is not part of the filename.
                    entry_path = os.path.normpath(entry[1].lstrip("*"))
                    entry_path = _decode_filename(entry_path)

                    if self._path_is_dangerous(entry_path):
                        raise BagError(
                            _(
                                'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe'
                            )
                            % {
                                "payload_file": entry_path,
                                "manifest_file": manifest_file.name,
                            }
                        )

                    entry_hashes = self.entries.setdefault(entry_path, {})

                    if alg in entry_hashes:
                        # Duplicate listing for the same path and algorithm:
                        # identical values are an error in BagIt 1.0+ (only a
                        # warning before); conflicting values are always fatal.
                        warning_ctx = {
                            "bag": self,
                            "algorithm": alg,
                            "filename": entry_path,
                        }
                        if entry_hashes[alg] == entry_hash:
                            msg = _(
                                "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                " multiple times with the same value"
                            )
                            if self.version_info >= (1,):
                                raise BagError(msg % warning_ctx)
                            else:
                                LOGGER.warning(msg, warning_ctx)
                        else:
                            raise BagError(
                                _(
                                    "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                    " multiple times with conflicting values"
                                )
                                % warning_ctx
                            )

                    entry_hashes[alg] = entry_hash

        # Keep a Unicode-normalized view of the manifest names for later
        # comparison against what is actually present on the filesystem.
        self.normalized_manifest_names.update(
            (normalize_unicode(i), i) for i in self.entries.keys()
        )
    def _validate_structure(self):
        """
        Checks the structure of the bag to determine whether it conforms to the
        BagIt spec. Returns true on success, otherwise it will raise a
        BagValidationError exception.
        """

        self._validate_structure_payload_directory()
        self._validate_structure_tag_files()

    def _validate_structure_payload_directory(self):
        # The payload directory "data" is mandatory for every bag.
        data_dir_path = os.path.join(self.path, "data")

        if not os.path.isdir(data_dir_path):
            raise BagValidationError(
                _("Expected data directory %s does not exist") % data_dir_path
            )

    def _validate_structure_tag_files(self):
        # Note: we deviate somewhat from v0.96 of the spec in that it allows
        # other files and directories to be present in the base directory

        # At least one payload manifest and the bagit.txt declaration are
        # required for a structurally valid bag.
        if not list(self.manifest_files()):
            raise BagValidationError(_("No manifest files found"))
        if "bagit.txt" not in os.listdir(self.path):
            raise BagValidationError(
                _('Expected %s to contain "bagit.txt"') % self.path
            )

    def validate_fetch(self):
        """Validate the fetch.txt file

        Raises `BagError` for errors and otherwise returns no value
        """

        for url, file_size, filename in self.fetch_entries():
            # fetch_entries will raise a BagError for unsafe filenames
            # so at this point we will check only that the URL is minimally
            # well formed:
            parsed_url = urlparse(url)

            # each parsed url must resolve to a scheme and point to a netloc
            # if the scheme is file, netloc is not necessary
            if not (
                all((parsed_url.scheme, parsed_url.netloc))
                or parsed_url.scheme == "file"
            ):
                raise BagError(_("Malformed URL in fetch.txt: %s") % url)

    def _validate_contents(self, processes=1, fast=False, completeness_only=False):
        # Dispatch to the requested depth of validation: Payload-Oxum only
        # (fast), manifest/filesystem agreement (completeness_only), or full
        # checksum verification (the default).
        if fast and not self.has_oxum():
            raise BagValidationError(
                _("Fast validation requires bag-info.txt to include Payload-Oxum")
            )

        # Perform the fast file count + size check so we can fail early:
        self._validate_oxum()

        if fast:
            return

        self._validate_completeness()

        if completeness_only:
            return

        self._validate_entries(processes)
    def _validate_completeness(self):
        """
        Verify that the actual file manifests match the files in the data directory
        """
        errors = list()

        # First we'll make sure there's no mismatch between the filesystem
        # and the list of files in the manifest(s)
        only_in_manifests, only_on_fs = self.compare_manifests_with_fs()
        for path in only_in_manifests:
            e = FileMissing(path)
            LOGGER.warning(str(e))
            errors.append(e)
        for path in only_on_fs:
            e = UnexpectedFile(path)
            LOGGER.warning(str(e))
            errors.append(e)

        # Collect every problem before failing so the exception (and the log)
        # reports the complete list rather than just the first mismatch.
        if errors:
            raise BagValidationError(_("Bag is incomplete"), errors)
hashes for %s"), self) 895 | raise 896 | 897 | for rel_path, f_hashes, hashes in hash_results: 898 | for alg, computed_hash in f_hashes.items(): 899 | stored_hash = hashes[alg] 900 | if stored_hash.lower() != computed_hash: 901 | e = ChecksumMismatch( 902 | rel_path, alg, stored_hash.lower(), computed_hash 903 | ) 904 | LOGGER.warning(str(e)) 905 | errors.append(e) 906 | 907 | if errors: 908 | raise BagValidationError(_("Bag validation failed"), errors) 909 | 910 | def _validate_bagittxt(self): 911 | """ 912 | Verify that bagit.txt conforms to specification 913 | """ 914 | bagit_file_path = os.path.join(self.path, "bagit.txt") 915 | 916 | # Note that we are intentionally opening this file in binary mode so we can confirm 917 | # that it does not start with the UTF-8 byte-order-mark 918 | with open(bagit_file_path, "rb") as bagit_file: 919 | first_line = bagit_file.read(4) 920 | if first_line.startswith(codecs.BOM_UTF8): 921 | raise BagValidationError( 922 | _("bagit.txt must not contain a byte-order mark") 923 | ) 924 | 925 | def _path_is_dangerous(self, path): 926 | """ 927 | Return true if path looks dangerous, i.e. potentially operates 928 | outside the bagging directory structure, e.g. 
~/.bashrc, ../../../secrets.json, 929 | \\\\?\\c:\\, D:\\sys32\\cmd.exe 930 | """ 931 | if os.path.isabs(path): 932 | return True 933 | if os.path.expanduser(path) != path: 934 | return True 935 | if os.path.expandvars(path) != path: 936 | return True 937 | real_path = os.path.realpath(os.path.join(self.path, path)) 938 | real_path = os.path.normpath(real_path) 939 | bag_path = os.path.realpath(self.path) 940 | bag_path = os.path.normpath(bag_path) 941 | common = os.path.commonprefix((bag_path, real_path)) 942 | return not (common == bag_path) 943 | 944 | 945 | class BagError(Exception): 946 | pass 947 | 948 | 949 | class BagValidationError(BagError): 950 | def __init__(self, message, details=None): 951 | super(BagValidationError, self).__init__() 952 | 953 | if details is None: 954 | details = [] 955 | 956 | self.message = message 957 | self.details = details 958 | 959 | def __str__(self): 960 | if len(self.details) > 0: 961 | details = "; ".join([str(e) for e in self.details]) 962 | return "%s: %s" % (self.message, details) 963 | return self.message 964 | 965 | 966 | class ManifestErrorDetail(BagError): 967 | def __init__(self, path): 968 | super(ManifestErrorDetail, self).__init__() 969 | 970 | self.path = path 971 | 972 | 973 | class ChecksumMismatch(ManifestErrorDetail): 974 | def __init__(self, path, algorithm=None, expected=None, found=None): 975 | super(ChecksumMismatch, self).__init__(path) 976 | 977 | self.path = path 978 | self.algorithm = algorithm 979 | self.expected = expected 980 | self.found = found 981 | 982 | def __str__(self): 983 | return _( 984 | '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"' 985 | ) % { 986 | "path": str(self.path), 987 | "algorithm": self.algorithm, 988 | "expected": self.expected, 989 | "found": self.found, 990 | } 991 | 992 | 993 | class FileMissing(ManifestErrorDetail): 994 | def __str__(self): 995 | return _("%s exists in manifest but was not found on filesystem") % str( 996 | 
def posix_multiprocessing_worker_initializer():
    """Ignore SIGINT in multiprocessing workers on POSIX systems"""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


# The Unicode normalization form used here doesn't matter – all we care about
# is consistency since the input value will be preserved:


def normalize_unicode(s):
    """Return the NFC-normalized form of the given string"""
    return unicodedata.normalize("NFC", s)


def build_unicode_normalized_lookup_dict(filenames):
    """
    Return a dictionary mapping unicode-normalized filenames to as-encoded
    values to efficiently detect conflicts between the filesystem and manifests.

    This is necessary because some filesystems and utilities may automatically
    apply a different Unicode normalization form to filenames than was applied
    when the bag was originally created.

    The best known example of this is when a bag is created using a
    normalization form other than NFD and then transferred to a Mac where the
    HFS+ filesystem will transparently normalize filenames to a variant of NFD
    for every call:

    https://developer.apple.com/legacy/library/technotes/tn/tn1150.html#UnicodeSubtleties

    Windows is documented as storing filenames exactly as provided:

    https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx

    Linux performs no normalization in the kernel but it is technically
    valid for a filesystem to perform normalization, such as when an HFS+
    volume is mounted.

    See http://www.unicode.org/reports/tr15/ for a full discussion of
    equivalence and normalization in Unicode.
    """

    lookup = {}

    for original_name in filenames:
        normalized_name = normalize_unicode(original_name)
        previously_seen = lookup.get(normalized_name)
        if previously_seen is not None:
            # Two distinct encodings collapse to the same normalized form —
            # the bag would not be safely portable across filesystems.
            raise FileNormalizationConflict(original_name, previously_seen)
        lookup[normalized_name] = original_name

    return lookup
def _calc_hashes(args):
    """Hash one file for validation; worker entry point for the process pool.

    Takes a single (base_path, rel_path, hashes, algorithms) tuple because
    multiprocessing map() passes exactly one argument per call. Returns
    (rel_path, computed_hashes, stored_hashes).
    """
    base_path, rel_path, hashes, algorithms = args
    full_path = os.path.join(base_path, rel_path)

    # Fresh hash objects for this file, restricted to the algorithms which
    # were both listed in the manifest entry and requested for validation:
    f_hashers = {alg: hashlib.new(alg) for alg in hashes if alg in algorithms}

    try:
        f_hashes = _calculate_file_hashes(full_path, f_hashers)
    except BagValidationError as e:
        # Report the read failure as the "digest" for every algorithm so the
        # caller surfaces it as a mismatch instead of aborting the whole pool:
        f_hashes = {alg: str(e) for alg in f_hashers}

    return rel_path, f_hashes, hashes


def _calculate_file_hashes(full_path, f_hashers):
    """
    Returns a dictionary of (algorithm, hexdigest) values for the provided
    filename
    """
    LOGGER.info(_("Verifying checksum for file %s"), full_path)

    try:
        with open(full_path, "rb") as f:
            # Read in fixed-size blocks, feeding each block to every hasher so
            # the file is only read once no matter how many algorithms run:
            for block in iter(lambda: f.read(HASH_BLOCK_SIZE), b""):
                for hasher in f_hashers.values():
                    hasher.update(block)
    except (OSError, IOError) as e:
        raise BagValidationError(
            _("Could not read %(filename)s: %(error)s")
            % {"filename": full_path, "error": str(e)}
        )

    return {alg: h.hexdigest() for alg, h in f_hashers.items()}
def _load_tag_file(tag_file_name, encoding="utf-8-sig"):
    # Parse a tag file into a dict; the "utf-8-sig" default transparently
    # strips a leading byte-order mark if one is present.
    with open_text_file(tag_file_name, "r", encoding=encoding) as tag_file:
        # Store duplicate tags as list of vals
        # in order of parsing under the same key.
        tags = {}
        for name, value in _parse_tags(tag_file):
            if name not in tags:
                tags[name] = value
                continue

            # Second occurrence promotes the value to a list; later
            # occurrences append to it.
            if not isinstance(tags[name], list):
                tags[name] = [tags[name], value]
            else:
                tags[name].append(value)

        return tags


def _parse_tags(tag_file):
    """Parses a tag file, according to RFC 2822. This
    includes line folding, permitting extra-long
    field values.

    See http://www.faqs.org/rfcs/rfc2822.html for
    more information.
    """

    tag_name = None
    tag_value = None

    # Line folding is handled by yielding values only after we encounter
    # the start of a new tag, or if we pass the EOF.
    for num, line in enumerate(tag_file):
        # Skip over any empty or blank lines.
        if len(line) == 0 or line.isspace():
            continue
        elif line[0].isspace() and tag_value is not None:  # folded line
            tag_value += line
        else:
            # Starting a new tag; yield the last one.
            if tag_name:
                yield (tag_name, tag_value.strip())

            if ":" not in line:
                raise BagValidationError(
                    _("%(filename)s contains invalid tag: %(line)s")
                    % {
                        "line": line.strip(),
                        "filename": os.path.basename(tag_file.name),
                    }
                )

            parts = line.strip().split(":", 1)
            tag_name = parts[0].strip()
            tag_value = parts[1]

    # Passed the EOF. All done after this.
    if tag_name:
        yield (tag_name, tag_value.strip())


def _make_tag_file(bag_info_path, bag_info):
    # Serialize a tag dict back to disk; list values become repeated tags.
    # Keys are written in sorted order so output is deterministic.
    headers = sorted(bag_info.keys())
    with open_text_file(bag_info_path, "w") as f:
        for h in headers:
            values = bag_info[h]
            if not isinstance(values, list):
                values = [values]
            for txt in values:
                # strip CR, LF and CRLF so they don't mess up the tag file
                txt = re.sub(r"\n|\r|(\r\n)", "", str(txt))
                f.write("%s: %s\n" % (h, txt))
def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"):
    """Write tagmanifest-<alg>.txt in bag_dir.

    Hashes every tag file reported by _find_tag_files() with the given
    algorithm and writes one "<digest> <filename>" line per file.
    """
    tagmanifest_file = os.path.join(bag_dir, "tagmanifest-%s.txt" % alg)
    LOGGER.info(_("Creating %s"), tagmanifest_file)

    checksums = []
    for f in _find_tag_files(bag_dir):
        # Never include other tag manifests in a tag manifest:
        if re.match(r"^tagmanifest-.+\.txt$", f):
            continue
        with open(os.path.join(bag_dir, f), "rb") as fh:
            m = hashlib.new(alg)
            while True:
                block = fh.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                m.update(block)
            checksums.append((m.hexdigest(), f))

    # tagmanifest_file already includes bag_dir; joining it with bag_dir a
    # second time (as this code previously did) produced a duplicated,
    # broken path whenever bag_dir was a relative path.
    with open_text_file(
        tagmanifest_file, mode="w", encoding=encoding
    ) as tagmanifest:
        for digest, filename in checksums:
            tagmanifest.write("%s %s\n" % (digest, filename))
= os.path.join(dir_name, filename) 1315 | yield os.path.relpath(p, bag_dir) 1316 | 1317 | 1318 | def _walk(data_dir): 1319 | for dirpath, dirnames, filenames in os.walk(data_dir): 1320 | # if we don't sort here the order of entries is non-deterministic 1321 | # which makes it hard to test the fixity of tagmanifest-md5.txt 1322 | filenames.sort() 1323 | dirnames.sort() 1324 | for fn in filenames: 1325 | path = os.path.join(dirpath, fn) 1326 | # BagIt spec requires manifest to always use '/' as path separator 1327 | if os.path.sep != "/": 1328 | parts = path.split(os.path.sep) 1329 | path = "/".join(parts) 1330 | yield path 1331 | 1332 | 1333 | def _can_bag(test_dir): 1334 | """Scan the provided directory for files which cannot be bagged due to insufficient permissions""" 1335 | unbaggable = [] 1336 | 1337 | if not os.access(test_dir, os.R_OK): 1338 | # We cannot continue without permission to read the source directory 1339 | unbaggable.append(test_dir) 1340 | return unbaggable 1341 | 1342 | if not os.access(test_dir, os.W_OK): 1343 | unbaggable.append(test_dir) 1344 | 1345 | for dirpath, dirnames, filenames in os.walk(test_dir): 1346 | for directory in dirnames: 1347 | full_path = os.path.join(dirpath, directory) 1348 | if not os.access(full_path, os.W_OK): 1349 | unbaggable.append(full_path) 1350 | 1351 | return unbaggable 1352 | 1353 | 1354 | def _can_read(test_dir): 1355 | """ 1356 | returns ((unreadable_dirs), (unreadable_files)) 1357 | """ 1358 | unreadable_dirs = [] 1359 | unreadable_files = [] 1360 | 1361 | if not os.access(test_dir, os.R_OK): 1362 | unreadable_dirs.append(test_dir) 1363 | else: 1364 | for dirpath, dirnames, filenames in os.walk(test_dir): 1365 | for dn in dirnames: 1366 | full_path = os.path.join(dirpath, dn) 1367 | if not os.access(full_path, os.R_OK): 1368 | unreadable_dirs.append(full_path) 1369 | for fn in filenames: 1370 | full_path = os.path.join(dirpath, fn) 1371 | if not os.access(full_path, os.R_OK): 1372 | 
unreadable_files.append(full_path) 1373 | return (tuple(unreadable_dirs), tuple(unreadable_files)) 1374 | 1375 | 1376 | def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS): 1377 | LOGGER.info(_("Generating manifest lines for file %s"), filename) 1378 | 1379 | # For performance we'll read the file only once and pass it block 1380 | # by block to every requested hash algorithm: 1381 | hashers = get_hashers(algorithms) 1382 | 1383 | total_bytes = 0 1384 | 1385 | with open(filename, "rb") as f: 1386 | while True: 1387 | block = f.read(HASH_BLOCK_SIZE) 1388 | 1389 | if not block: 1390 | break 1391 | 1392 | total_bytes += len(block) 1393 | for hasher in hashers.values(): 1394 | hasher.update(block) 1395 | 1396 | decoded_filename = _decode_filename(filename) 1397 | 1398 | # We'll generate a list of results in roughly manifest format but prefixed with the algorithm: 1399 | results = [ 1400 | (alg, hasher.hexdigest(), decoded_filename, total_bytes) 1401 | for alg, hasher in hashers.items() 1402 | ] 1403 | 1404 | return results 1405 | 1406 | 1407 | def _encode_filename(s): 1408 | s = s.replace("\r", "%0D") 1409 | s = s.replace("\n", "%0A") 1410 | return s 1411 | 1412 | 1413 | def _decode_filename(s): 1414 | s = re.sub(r"%0D", "\r", s, re.IGNORECASE) 1415 | s = re.sub(r"%0A", "\n", s, re.IGNORECASE) 1416 | return s 1417 | 1418 | 1419 | # following code is used for command line program 1420 | 1421 | 1422 | class BagArgumentParser(argparse.ArgumentParser): 1423 | def __init__(self, *args, **kwargs): 1424 | argparse.ArgumentParser.__init__(self, *args, **kwargs) 1425 | self.set_defaults(bag_info={}) 1426 | 1427 | 1428 | class BagHeaderAction(argparse.Action): 1429 | def __call__(self, parser, namespace, values, option_string=None): 1430 | opt = option_string.lstrip("--") 1431 | opt_caps = "-".join([o.capitalize() for o in opt.split("-")]) 1432 | namespace.bag_info[opt_caps] = values 1433 | 1434 | 1435 | def _make_parser(): 1436 | parser = BagArgumentParser( 1437 
| formatter_class=argparse.RawDescriptionHelpFormatter, 1438 | description="bagit-python version %s\n\n%s\n" % (VERSION, __doc__.strip()), 1439 | ) 1440 | parser.add_argument( 1441 | "--processes", 1442 | type=int, 1443 | dest="processes", 1444 | default=1, 1445 | help=_( 1446 | "Use multiple processes to calculate checksums faster (default: %(default)s)" 1447 | ), 1448 | ) 1449 | parser.add_argument("--log", help=_("The name of the log file (default: stdout)")) 1450 | parser.add_argument( 1451 | "--quiet", 1452 | action="store_true", 1453 | help=_("Suppress all progress information other than errors"), 1454 | ) 1455 | parser.add_argument( 1456 | "--validate", 1457 | action="store_true", 1458 | help=_( 1459 | "Validate existing bags in the provided directories instead of" 1460 | " creating new ones" 1461 | ), 1462 | ) 1463 | parser.add_argument( 1464 | "--fast", 1465 | action="store_true", 1466 | help=_( 1467 | "Modify --validate behaviour to only test whether the bag directory" 1468 | " has the number of files and total size specified in Payload-Oxum" 1469 | " without performing checksum validation to detect corruption." 1470 | ), 1471 | ) 1472 | parser.add_argument( 1473 | "--completeness-only", 1474 | action="store_true", 1475 | help=_( 1476 | "Modify --validate behaviour to test whether the bag directory" 1477 | " has the expected payload specified in the checksum manifests" 1478 | " without performing checksum validation to detect corruption." 
1479 | ), 1480 | ) 1481 | 1482 | checksum_args = parser.add_argument_group( 1483 | _("Checksum Algorithms"), 1484 | _( 1485 | "Select the manifest algorithms to be used when creating bags" 1486 | " (default=%s)" 1487 | ) 1488 | % ", ".join(DEFAULT_CHECKSUMS), 1489 | ) 1490 | 1491 | for i in CHECKSUM_ALGOS: 1492 | alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper()) 1493 | checksum_args.add_argument( 1494 | "--%s" % i, 1495 | action="append_const", 1496 | dest="checksums", 1497 | const=i, 1498 | help=_("Generate %s manifest when creating a bag") % alg_name, 1499 | ) 1500 | 1501 | metadata_args = parser.add_argument_group(_("Optional Bag Metadata")) 1502 | for header in STANDARD_BAG_INFO_HEADERS: 1503 | metadata_args.add_argument( 1504 | "--%s" % header.lower(), 1505 | type=str, 1506 | action=BagHeaderAction, 1507 | default=argparse.SUPPRESS, 1508 | ) 1509 | 1510 | parser.add_argument( 1511 | "directory", 1512 | nargs="+", 1513 | help=_( 1514 | "Directory which will be converted into a bag in place" 1515 | " by moving any existing files into the BagIt structure" 1516 | " and creating the manifests and other metadata." 
1517 | ), 1518 | ) 1519 | 1520 | return parser 1521 | 1522 | 1523 | def _configure_logging(opts): 1524 | log_format = "%(asctime)s - %(levelname)s - %(message)s" 1525 | if opts.quiet: 1526 | level = logging.ERROR 1527 | else: 1528 | level = logging.INFO 1529 | if opts.log: 1530 | logging.basicConfig(filename=opts.log, level=level, format=log_format) 1531 | else: 1532 | logging.basicConfig(level=level, format=log_format) 1533 | 1534 | 1535 | def main(): 1536 | if "--version" in sys.argv: 1537 | print(_("bagit-python version %s") % VERSION) 1538 | sys.exit(0) 1539 | 1540 | parser = _make_parser() 1541 | args = parser.parse_args() 1542 | 1543 | if args.processes <= 0: 1544 | parser.error(_("The number of processes must be greater than 0")) 1545 | 1546 | if args.fast and not args.validate: 1547 | parser.error(_("--fast is only allowed as an option for --validate!")) 1548 | 1549 | if args.completeness_only and not args.validate: 1550 | parser.error( 1551 | _("--completeness-only is only allowed as an option for --validate!") 1552 | ) 1553 | 1554 | _configure_logging(args) 1555 | 1556 | rc = 0 1557 | for bag_dir in args.directory: 1558 | # validate the bag 1559 | if args.validate: 1560 | try: 1561 | bag = Bag(bag_dir) 1562 | # validate throws a BagError or BagValidationError 1563 | bag.validate( 1564 | processes=args.processes, 1565 | fast=args.fast, 1566 | completeness_only=args.completeness_only, 1567 | ) 1568 | if args.fast: 1569 | LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir) 1570 | elif args.completeness_only: 1571 | LOGGER.info( 1572 | _("%s is complete and valid according to Payload-Oxum"), bag_dir 1573 | ) 1574 | else: 1575 | LOGGER.info(_("%s is valid"), bag_dir) 1576 | except BagError as e: 1577 | LOGGER.error( 1578 | _("%(bag)s is invalid: %(error)s"), {"bag": bag_dir, "error": e} 1579 | ) 1580 | rc = 1 1581 | 1582 | # make the bag 1583 | else: 1584 | try: 1585 | make_bag( 1586 | bag_dir, 1587 | bag_info=args.bag_info, 1588 | 
processes=args.processes, 1589 | checksums=args.checksums, 1590 | ) 1591 | except Exception as exc: 1592 | LOGGER.error( 1593 | _("Failed to create bag in %(bag_directory)s: %(error)s"), 1594 | {"bag_directory": bag_dir, "error": exc}, 1595 | exc_info=True, 1596 | ) 1597 | rc = 1 1598 | 1599 | sys.exit(rc) 1600 | 1601 | 1602 | if __name__ == "__main__": 1603 | main() 1604 | -------------------------------------------------------------------------------- /bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a little benchmarking script to exercise bagit.make_bag and 5 | bagit.validate using 1-8 parallel processes. It will download some images 6 | from NASA for use in bagging the first time it is run. 7 | """ 8 | 9 | import ftplib 10 | import os 11 | import shutil 12 | import timeit 13 | 14 | import bagit 15 | 16 | # fetch some images from NASA to bag up 17 | 18 | if not os.path.isdir("bench-data"): 19 | print("fetching some images to bag up from nasa") 20 | os.mkdir("bench-data") 21 | ftp = ftplib.FTP("nssdcftp.gsfc.nasa.gov") 22 | ftp.login() 23 | 24 | ftp.cwd("/pub/misc/photo_gallery/hi-res/planetary/mars/") 25 | files = [] 26 | ftp.retrlines("NLST", files.append) 27 | 28 | for file in files: 29 | print(("fetching %s" % file)) 30 | fh = open(os.path.join("bench-data", file), "wb") 31 | ftp.retrbinary("RETR %s" % file, fh.write) 32 | fh.close() 33 | 34 | 35 | # create bags using 1-8 processes 36 | 37 | statement = """ 38 | import os 39 | import bagit 40 | 41 | if os.path.isdir('bench-data/data'): 42 | os.system("rm bench-data/bag*") 43 | os.system("mv bench-data/data/* bench-data/") 44 | os.system("rmdir bench-data/data") 45 | 46 | bagit.make_bag('bench-data', processes=%s) 47 | """ 48 | 49 | for p in range(1, 9): 50 | t = timeit.Timer(statement % p) 51 | print( 52 | ("create w/ %s processes: %.2f seconds " % (p, (10 * t.timeit(number=10) / 10))) 53 | ) 54 | 55 | 56 | # validate 
a bag with 1-8 processes 57 | 58 | shutil.copytree("bench-data", "bench-data-bag") 59 | bagit.make_bag("bench-data-bag") 60 | 61 | # validate bench-data using n processes 62 | statement = """ 63 | import os 64 | import bagit 65 | 66 | bag = bagit.Bag('bench-data-bag') 67 | bag.validate(processes=%s) 68 | """ 69 | 70 | # try 1-8 parallel processes 71 | for p in range(1, 9): 72 | t = timeit.Timer(statement % p) 73 | print( 74 | ( 75 | "validate w/ %s processes: %.2f seconds " 76 | % (p, (10 * t.timeit(number=10) / 10)) 77 | ) 78 | ) 79 | 80 | shutil.rmtree("bench-data-bag") 81 | -------------------------------------------------------------------------------- /locale/bagit-python.pot: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER 3 | # This file is distributed under the same license as the PACKAGE package. 4 | # FIRST AUTHOR , YEAR. 5 | # 6 | #, fuzzy 7 | msgid "" 8 | msgstr "" 9 | "Project-Id-Version: PACKAGE VERSION\n" 10 | "Report-Msgid-Bugs-To: \n" 11 | "POT-Creation-Date: 2018-06-26 10:28-0400\n" 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 13 | "Last-Translator: FULL NAME \n" 14 | "Language-Team: LANGUAGE \n" 15 | "Language: \n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=UTF-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | 20 | #, python-format 21 | msgid "" 22 | "\n" 23 | "BagIt is a directory, filename convention for bundling an arbitrary set of\n" 24 | "files with a manifest, checksums, and additional metadata. 
More about BagIt\n" 25 | "can be found at:\n" 26 | "\n" 27 | " http://purl.org/net/bagit\n" 28 | "\n" 29 | "bagit.py is a pure python drop in library and command line tool for " 30 | "creating,\n" 31 | "and working with BagIt directories.\n" 32 | "\n" 33 | "\n" 34 | "Command-Line Usage:\n" 35 | "\n" 36 | "Basic usage is to give bagit.py a directory to bag up:\n" 37 | "\n" 38 | " $ bagit.py my_directory\n" 39 | "\n" 40 | "This does a bag-in-place operation where the current contents will be moved\n" 41 | "into the appropriate BagIt structure and the metadata files will be " 42 | "created.\n" 43 | "\n" 44 | "You can bag multiple directories if you wish:\n" 45 | "\n" 46 | " $ bagit.py directory1 directory2\n" 47 | "\n" 48 | "Optionally you can provide metadata which will be stored in bag-info.txt:\n" 49 | "\n" 50 | " $ bagit.py --source-organization \"Library of Congress\" directory\n" 51 | "\n" 52 | "You can also select which manifest algorithms will be used:\n" 53 | "\n" 54 | " $ bagit.py --sha1 --md5 --sha256 --sha512 directory\n" 55 | "\n" 56 | "\n" 57 | "Using BagIt from your Python code:\n" 58 | "\n" 59 | " import bagit\n" 60 | " bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed " 61 | "Summers'})\n" 62 | " print(bag.entries)\n" 63 | "\n" 64 | "For more information or to contribute to bagit-python's development, please\n" 65 | "visit %(PROJECT_URL)s\n" 66 | msgstr "" 67 | 68 | msgid "" 69 | "The `checksum` argument for `make_bag` should be replaced with `checksums`" 70 | msgstr "" 71 | 72 | msgid "Bagging a parent of the current directory is not supported" 73 | msgstr "" 74 | 75 | #, python-format 76 | msgid "Creating bag for directory %s" 77 | msgstr "" 78 | 79 | #, python-format 80 | msgid "Bag directory %s does not exist" 81 | msgstr "" 82 | 83 | #, python-format 84 | msgid "" 85 | "Unable to write to the following directories and files:\n" 86 | "%s" 87 | msgstr "" 88 | 89 | msgid "Missing permissions to move all files and directories" 90 | 
msgstr "" 91 | 92 | #, python-format 93 | msgid "" 94 | "The following directories do not have read permissions:\n" 95 | "%s" 96 | msgstr "" 97 | 98 | #, python-format 99 | msgid "" 100 | "The following files do not have read permissions:\n" 101 | "%s" 102 | msgstr "" 103 | 104 | msgid "Read permissions are required to calculate file fixities" 105 | msgstr "" 106 | 107 | msgid "Creating data directory" 108 | msgstr "" 109 | 110 | #, python-format 111 | msgid "Moving %(source)s to %(destination)s" 112 | msgstr "" 113 | 114 | msgid "Creating bagit.txt" 115 | msgstr "" 116 | 117 | msgid "Creating bag-info.txt" 118 | msgstr "" 119 | 120 | #, python-format 121 | msgid "An error occurred creating a bag in %s" 122 | msgstr "" 123 | 124 | msgid "Use Bag.algorithms instead of Bag.algs" 125 | msgstr "" 126 | 127 | msgid "Use the Bag.version_info tuple instead of Bag.version" 128 | msgstr "" 129 | 130 | #, python-format 131 | msgid "Expected bagit.txt does not exist: %s" 132 | msgstr "" 133 | 134 | #, python-format 135 | msgid "Missing required tag in bagit.txt: %s" 136 | msgstr "" 137 | 138 | #, python-format 139 | msgid "Bag version numbers must be MAJOR.MINOR numbers, not %s" 140 | msgstr "" 141 | 142 | #, python-format 143 | msgid "Unsupported bag version: %s" 144 | msgstr "" 145 | 146 | #, python-format 147 | msgid "Unsupported encoding: %s" 148 | msgstr "" 149 | 150 | msgid "Bag.save() called before setting the path!" 
151 | msgstr "" 152 | 153 | #, python-format 154 | msgid "Cannot save bag to non-existent or inaccessible directory %s" 155 | msgstr "" 156 | 157 | #, python-format 158 | msgid "" 159 | "Missing write permissions for the following directories and files:\n" 160 | "%s" 161 | msgstr "" 162 | 163 | #, python-format 164 | msgid "Updating Payload-Oxum in %s" 165 | msgstr "" 166 | 167 | #, python-format 168 | msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe" 169 | msgstr "" 170 | 171 | #, python-format 172 | msgid "" 173 | "%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which " 174 | "is not in compliance with the BagIt RFC" 175 | msgstr "" 176 | 177 | #, python-format 178 | msgid "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s" 179 | msgstr "" 180 | 181 | #, python-format 182 | msgid "Path \"%(payload_file)s\" in manifest \"%(manifest_file)s\" is unsafe" 183 | msgstr "" 184 | 185 | #, python-format 186 | msgid "" 187 | "%(bag)s: %(algorithm)s manifest lists %(filename)s multiple times with the " 188 | "same value" 189 | msgstr "" 190 | 191 | #, python-format 192 | msgid "" 193 | "%(bag)s: %(algorithm)s manifest lists %(filename)s multiple times with " 194 | "conflicting values" 195 | msgstr "" 196 | 197 | #, python-format 198 | msgid "Expected data directory %s does not exist" 199 | msgstr "" 200 | 201 | msgid "No manifest files found" 202 | msgstr "" 203 | 204 | #, python-format 205 | msgid "Expected %s to contain \"bagit.txt\"" 206 | msgstr "" 207 | 208 | #, python-format 209 | msgid "Malformed URL in fetch.txt: %s" 210 | msgstr "" 211 | 212 | msgid "Fast validation requires bag-info.txt to include Payload-Oxum" 213 | msgstr "" 214 | 215 | msgid "bag-info.txt defines multiple Payload-Oxum values!" 216 | msgstr "" 217 | 218 | #, python-format 219 | msgid "Malformed Payload-Oxum value: %s" 220 | msgstr "" 221 | 222 | #, python-format 223 | msgid "" 224 | "Payload-Oxum validation failed. 
Expected %(oxum_file_count)d files and " 225 | "%(oxum_byte_count)d bytes but found %(found_file_count)d files and " 226 | "%(found_byte_count)d bytes" 227 | msgstr "" 228 | 229 | msgid "Bag validation failed" 230 | msgstr "" 231 | 232 | #, python-format 233 | msgid "Unable to calculate file hashes for %s" 234 | msgstr "" 235 | 236 | msgid "bagit.txt must not contain a byte-order mark" 237 | msgstr "" 238 | 239 | #, python-format 240 | msgid "" 241 | "%(path)s %(algorithm)s validation failed: expected=\"%(expected)s\" found=" 242 | "\"%(found)s\"" 243 | msgstr "" 244 | 245 | #, python-format 246 | msgid "%s exists in manifest but was not found on filesystem" 247 | msgstr "" 248 | 249 | #, python-format 250 | msgid "%s exists on filesystem but is not in the manifest" 251 | msgstr "" 252 | 253 | #, python-format 254 | msgid "" 255 | "Unicode normalization conflict for file \"%(file_a)s\" and \"%(file_b)s\"" 256 | msgstr "" 257 | 258 | #, python-format 259 | msgid "Disabling requested hash algorithm %s: hashlib does not support it" 260 | msgstr "" 261 | 262 | msgid "" 263 | "Unable to continue: hashlib does not support any of the requested algorithms!" 
264 | msgstr "" 265 | 266 | #, python-format 267 | msgid "Verifying checksum for file %s" 268 | msgstr "" 269 | 270 | #, python-format 271 | msgid "Could not read %(filename)s: %(error)s" 272 | msgstr "" 273 | 274 | #, python-format 275 | msgid "%(filename)s contains invalid tag: %(line)s" 276 | msgstr "" 277 | 278 | #, python-format 279 | msgid "Using %(process_count)d processes to generate manifests: %(algorithms)s" 280 | msgstr "" 281 | 282 | msgid "Expected the same number of files for each checksum" 283 | msgstr "" 284 | 285 | msgid "Expected the same number of bytes for each checksums" 286 | msgstr "" 287 | 288 | #, python-format 289 | msgid "Creating %s" 290 | msgstr "" 291 | 292 | #, python-format 293 | msgid "Generating manifest lines for file %s" 294 | msgstr "" 295 | 296 | #, python-format 297 | msgid "" 298 | "Use multiple processes to calculate checksums faster (default: %(default)s)" 299 | msgstr "" 300 | 301 | msgid "The name of the log file (default: stdout)" 302 | msgstr "" 303 | 304 | msgid "Suppress all progress information other than errors" 305 | msgstr "" 306 | 307 | msgid "" 308 | "Validate existing bags in the provided directories instead of creating new " 309 | "ones" 310 | msgstr "" 311 | 312 | msgid "" 313 | "Modify --validate behaviour to only test whether the bag directory has the " 314 | "number of files and total size specified in Payload-Oxum without performing " 315 | "checksum validation to detect corruption." 316 | msgstr "" 317 | 318 | msgid "" 319 | "Modify --validate behaviour to test whether the bag directory has the " 320 | "expected payload specified in the checksum manifests without performing " 321 | "checksum validation to detect corruption." 
322 | msgstr "" 323 | 324 | msgid "Checksum Algorithms" 325 | msgstr "" 326 | 327 | #, python-format 328 | msgid "" 329 | "Select the manifest algorithms to be used when creating bags (default=%s)" 330 | msgstr "" 331 | 332 | #, python-format 333 | msgid "Generate %s manifest when creating a bag" 334 | msgstr "" 335 | 336 | msgid "Optional Bag Metadata" 337 | msgstr "" 338 | 339 | msgid "" 340 | "Directory which will be converted into a bag in place by moving any existing " 341 | "files into the BagIt structure and creating the manifests and other metadata." 342 | msgstr "" 343 | 344 | #, python-format 345 | msgid "bagit-python version %s" 346 | msgstr "" 347 | 348 | msgid "The number of processes must be greater than 0" 349 | msgstr "" 350 | 351 | msgid "--fast is only allowed as an option for --validate!" 352 | msgstr "" 353 | 354 | #, python-format 355 | msgid "%s valid according to Payload-Oxum" 356 | msgstr "" 357 | 358 | #, python-format 359 | msgid "%s is valid" 360 | msgstr "" 361 | 362 | #, python-format 363 | msgid "%(bag)s is invalid: %(error)s" 364 | msgstr "" 365 | 366 | #, python-format 367 | msgid "Failed to create bag in %(bag_directory)s: %(error)s" 368 | msgstr "" 369 | -------------------------------------------------------------------------------- /locale/bagit.pot: -------------------------------------------------------------------------------- 1 | # SOME DESCRIPTIVE TITLE. 2 | # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER 3 | # This file is distributed under the same license as the PACKAGE package. 4 | # FIRST AUTHOR , YEAR.
5 | # 6 | #, fuzzy 7 | msgid "" 8 | msgstr "" 9 | "Project-Id-Version: PACKAGE VERSION\n" 10 | "Report-Msgid-Bugs-To: \n" 11 | "POT-Creation-Date: 2017-04-27 15:09-0400\n" 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 13 | "Last-Translator: FULL NAME \n" 14 | "Language-Team: LANGUAGE \n" 15 | "Language: \n" 16 | "MIME-Version: 1.0\n" 17 | "Content-Type: text/plain; charset=UTF-8\n" 18 | "Content-Transfer-Encoding: 8bit\n" 19 | 20 | #: bagit.py:126 21 | msgid "" 22 | "The `checksum` argument for `make_bag` should be replaced with `checksums`" 23 | msgstr "" 24 | 25 | #: bagit.py:134 26 | #, python-format 27 | msgid "Creating bag for directory %s" 28 | msgstr "" 29 | 30 | #: bagit.py:137 bagit.py:138 31 | #, python-format 32 | msgid "Bag directory %s does not exist" 33 | msgstr "" 34 | 35 | #: bagit.py:146 36 | #, python-format 37 | msgid "" 38 | "No write permissions for the following directories and files:\n" 39 | "%s" 40 | msgstr "" 41 | 42 | #: bagit.py:151 bagit.py:391 43 | #, python-format 44 | msgid "" 45 | "The following directories do not have read permissions:\n" 46 | "%s" 47 | msgstr "" 48 | 49 | #: bagit.py:154 bagit.py:394 50 | #, python-format 51 | msgid "" 52 | "The following files do not have read permissions:\n" 53 | "%s" 54 | msgstr "" 55 | 56 | #: bagit.py:157 57 | msgid "Creating data directory" 58 | msgstr "" 59 | 60 | #: bagit.py:166 bagit.py:170 61 | #, python-format 62 | msgid "Moving %(source)s to %(destination)s" 63 | msgstr "" 64 | 65 | #: bagit.py:181 66 | msgid "Creating bagit.txt" 67 | msgstr "" 68 | 69 | #: bagit.py:186 70 | msgid "Creating bag-info.txt" 71 | msgstr "" 72 | 73 | #: bagit.py:202 74 | #, python-format 75 | msgid "An error occurred creating a bag in %s" 76 | msgstr "" 77 | 78 | #: bagit.py:250 79 | msgid "Use Bag.algorithms instead of Bag.algs" 80 | msgstr "" 81 | 82 | #: bagit.py:385 83 | #, python-format 84 | msgid "" 85 | "no write permissions for the following directories and files:\n" 86 | "%s" 87 | msgstr "" 88 | 
89 | #: bagit.py:403 90 | #, python-format 91 | msgid "Updating %s" 92 | msgstr "" 93 | 94 | #: bagit.py:505 95 | #, python-format 96 | msgid "" 97 | "%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which " 98 | "is not in compliance with the BagIt RFC" 99 | msgstr "" 100 | 101 | #: bagit.py:523 102 | #, python-format 103 | msgid "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s" 104 | msgstr "" 105 | 106 | #: bagit.py:572 107 | msgid "Fast validation requires a Payload-Oxum in bag-info.txt" 108 | msgstr "" 109 | 110 | #: bagit.py:590 111 | msgid "bag-info.txt defines multiple Payload-Oxum values!" 112 | msgstr "" 113 | 114 | #: bagit.py:654 115 | #, python-format 116 | msgid "Unable to calculate file hashes for %s" 117 | msgstr "" 118 | 119 | #: bagit.py:852 120 | #, python-format 121 | msgid "Disabling requested hash algorithm %s: hashlib does not support it" 122 | msgstr "" 123 | 124 | #: bagit.py:888 125 | #, python-format 126 | msgid "Verifying checksum for file %s" 127 | msgstr "" 128 | 129 | #: bagit.py:981 130 | #, python-format 131 | msgid "Using %(process_count)d processes to generate manifests: %(algorithms)s" 132 | msgstr "" 133 | 134 | #: bagit.py:1029 135 | #, python-format 136 | msgid "Creating %s" 137 | msgstr "" 138 | 139 | #: bagit.py:1106 140 | #, python-format 141 | msgid "Generating manifest lines for file %s" 142 | msgstr "" 143 | 144 | #: bagit.py:1252 145 | #, python-format 146 | msgid "%s valid according to Payload-Oxum" 147 | msgstr "" 148 | 149 | #: bagit.py:1254 150 | #, python-format 151 | msgid "%s is valid" 152 | msgstr "" 153 | 154 | #: bagit.py:1256 155 | #, python-format 156 | msgid "%(bag)s is invalid: %(error)s" 157 | msgstr "" 158 | 159 | #: bagit.py:1267 160 | #, python-format 161 | msgid "Failed to create bag in %(bag_directory)s: %(error)s" 162 | msgstr "" 163 | -------------------------------------------------------------------------------- /locale/en/LC_MESSAGES/bagit-python.po: 
-------------------------------------------------------------------------------- 1 | # English translations for PACKAGE package. 2 | # Copyright (C) 2017 THE PACKAGE'S COPYRIGHT HOLDER 3 | # This file is distributed under the same license as the PACKAGE package. 4 | # Automatically generated, 2017. 5 | # 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: PACKAGE VERSION\n" 9 | "Report-Msgid-Bugs-To: \n" 10 | "POT-Creation-Date: 2018-06-26 10:28-0400\n" 11 | "PO-Revision-Date: 2017-04-27 15:02-0400\n" 12 | "Last-Translator: Automatically generated\n" 13 | "Language-Team: none\n" 14 | "Language: en\n" 15 | "MIME-Version: 1.0\n" 16 | "Content-Type: text/plain; charset=UTF-8\n" 17 | "Content-Transfer-Encoding: 8bit\n" 18 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 19 | 20 | #, python-format 21 | msgid "" 22 | "\n" 23 | "BagIt is a directory, filename convention for bundling an arbitrary set of\n" 24 | "files with a manifest, checksums, and additional metadata. More about BagIt\n" 25 | "can be found at:\n" 26 | "\n" 27 | " http://purl.org/net/bagit\n" 28 | "\n" 29 | "bagit.py is a pure python drop in library and command line tool for " 30 | "creating,\n" 31 | "and working with BagIt directories.\n" 32 | "\n" 33 | "\n" 34 | "Command-Line Usage:\n" 35 | "\n" 36 | "Basic usage is to give bagit.py a directory to bag up:\n" 37 | "\n" 38 | " $ bagit.py my_directory\n" 39 | "\n" 40 | "This does a bag-in-place operation where the current contents will be moved\n" 41 | "into the appropriate BagIt structure and the metadata files will be " 42 | "created.\n" 43 | "\n" 44 | "You can bag multiple directories if you wish:\n" 45 | "\n" 46 | " $ bagit.py directory1 directory2\n" 47 | "\n" 48 | "Optionally you can provide metadata which will be stored in bag-info.txt:\n" 49 | "\n" 50 | " $ bagit.py --source-organization \"Library of Congress\" directory\n" 51 | "\n" 52 | "You can also select which manifest algorithms will be used:\n" 53 | "\n" 54 | " $ bagit.py --sha1 --md5 
--sha256 --sha512 directory\n" 55 | "\n" 56 | "\n" 57 | "Using BagIt from your Python code:\n" 58 | "\n" 59 | " import bagit\n" 60 | " bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed " 61 | "Summers'})\n" 62 | " print(bag.entries)\n" 63 | "\n" 64 | "For more information or to contribute to bagit-python's development, please\n" 65 | "visit %(PROJECT_URL)s\n" 66 | msgstr "" 67 | 68 | msgid "" 69 | "The `checksum` argument for `make_bag` should be replaced with `checksums`" 70 | msgstr "" 71 | "The `checksum` argument for `make_bag` should be replaced with `checksums`" 72 | 73 | msgid "Bagging a parent of the current directory is not supported" 74 | msgstr "" 75 | 76 | #, python-format 77 | msgid "Creating bag for directory %s" 78 | msgstr "Creating bag for directory %s" 79 | 80 | #, python-format 81 | msgid "Bag directory %s does not exist" 82 | msgstr "Bag directory %s does not exist" 83 | 84 | #, python-format 85 | msgid "" 86 | "Unable to write to the following directories and files:\n" 87 | "%s" 88 | msgstr "" 89 | 90 | msgid "Missing permissions to move all files and directories" 91 | msgstr "" 92 | 93 | #, python-format 94 | msgid "" 95 | "The following directories do not have read permissions:\n" 96 | "%s" 97 | msgstr "" 98 | "The following directories do not have read permissions:\n" 99 | "%s" 100 | 101 | #, python-format 102 | msgid "" 103 | "The following files do not have read permissions:\n" 104 | "%s" 105 | msgstr "" 106 | "The following files do not have read permissions:\n" 107 | "%s" 108 | 109 | msgid "Read permissions are required to calculate file fixities" 110 | msgstr "" 111 | 112 | msgid "Creating data directory" 113 | msgstr "Creating data directory" 114 | 115 | #, python-format 116 | msgid "Moving %(source)s to %(destination)s" 117 | msgstr "Moving %(source)s to %(destination)s" 118 | 119 | msgid "Creating bagit.txt" 120 | msgstr "Creating bagit.txt" 121 | 122 | msgid "Creating bag-info.txt" 123 | msgstr "Creating bag-info.txt" 
124 | 125 | #, python-format 126 | msgid "An error occurred creating a bag in %s" 127 | msgstr "An error occurred creating a bag in %s" 128 | 129 | msgid "Use Bag.algorithms instead of Bag.algs" 130 | msgstr "Use Bag.algorithms instead of Bag.algs" 131 | 132 | msgid "Use the Bag.version_info tuple instead of Bag.version" 133 | msgstr "" 134 | 135 | #, python-format 136 | msgid "Expected bagit.txt does not exist: %s" 137 | msgstr "" 138 | 139 | #, python-format 140 | msgid "Missing required tag in bagit.txt: %s" 141 | msgstr "" 142 | 143 | #, python-format 144 | msgid "Bag version numbers must be MAJOR.MINOR numbers, not %s" 145 | msgstr "" 146 | 147 | #, python-format 148 | msgid "Unsupported bag version: %s" 149 | msgstr "" 150 | 151 | #, python-format 152 | msgid "Unsupported encoding: %s" 153 | msgstr "" 154 | 155 | msgid "Bag.save() called before setting the path!" 156 | msgstr "" 157 | 158 | #, python-format 159 | msgid "Cannot save bag to non-existent or inaccessible directory %s" 160 | msgstr "" 161 | 162 | #, python-format 163 | msgid "" 164 | "Missing write permissions for the following directories and files:\n" 165 | "%s" 166 | msgstr "" 167 | 168 | #, python-format 169 | msgid "Updating Payload-Oxum in %s" 170 | msgstr "" 171 | 172 | #, python-format 173 | msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe" 174 | msgstr "" 175 | 176 | #, python-format 177 | msgid "" 178 | "%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which " 179 | "is not in compliance with the BagIt RFC" 180 | msgstr "" 181 | "%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which " 182 | "is not in compliance with the BagIt RFC" 183 | 184 | #, python-format 185 | msgid "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s" 186 | msgstr "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s" 187 | 188 | #, python-format 189 | msgid "Path \"%(payload_file)s\" in manifest \"%(manifest_file)s\" is unsafe" 190 | msgstr "" 
191 | 192 | #, python-format 193 | msgid "" 194 | "%(bag)s: %(algorithm)s manifest lists %(filename)s multiple times with the " 195 | "same value" 196 | msgstr "" 197 | 198 | #, python-format 199 | msgid "" 200 | "%(bag)s: %(algorithm)s manifest lists %(filename)s multiple times with " 201 | "conflicting values" 202 | msgstr "" 203 | 204 | #, python-format 205 | msgid "Expected data directory %s does not exist" 206 | msgstr "" 207 | 208 | msgid "No manifest files found" 209 | msgstr "" 210 | 211 | #, python-format 212 | msgid "Expected %s to contain \"bagit.txt\"" 213 | msgstr "" 214 | 215 | #, python-format 216 | msgid "Malformed URL in fetch.txt: %s" 217 | msgstr "" 218 | 219 | msgid "Fast validation requires bag-info.txt to include Payload-Oxum" 220 | msgstr "" 221 | 222 | msgid "bag-info.txt defines multiple Payload-Oxum values!" 223 | msgstr "bag-info.txt defines multiple Payload-Oxum values!" 224 | 225 | #, python-format 226 | msgid "Malformed Payload-Oxum value: %s" 227 | msgstr "" 228 | 229 | #, python-format 230 | msgid "" 231 | "Payload-Oxum validation failed. 
Expected %(oxum_file_count)d files and " 232 | "%(oxum_byte_count)d bytes but found %(found_file_count)d files and " 233 | "%(found_byte_count)d bytes" 234 | msgstr "" 235 | 236 | msgid "Bag validation failed" 237 | msgstr "" 238 | 239 | #, python-format 240 | msgid "Unable to calculate file hashes for %s" 241 | msgstr "Unable to calculate file hashes for %s" 242 | 243 | msgid "bagit.txt must not contain a byte-order mark" 244 | msgstr "" 245 | 246 | #, python-format 247 | msgid "" 248 | "%(path)s %(algorithm)s validation failed: expected=\"%(expected)s\" found=" 249 | "\"%(found)s\"" 250 | msgstr "" 251 | 252 | #, python-format 253 | msgid "%s exists in manifest but was not found on filesystem" 254 | msgstr "" 255 | 256 | #, python-format 257 | msgid "%s exists on filesystem but is not in the manifest" 258 | msgstr "" 259 | 260 | #, python-format 261 | msgid "" 262 | "Unicode normalization conflict for file \"%(file_a)s\" and \"%(file_b)s\"" 263 | msgstr "" 264 | 265 | #, python-format 266 | msgid "Disabling requested hash algorithm %s: hashlib does not support it" 267 | msgstr "Disabling requested hash algorithm %s: hashlib does not support it" 268 | 269 | msgid "" 270 | "Unable to continue: hashlib does not support any of the requested algorithms!" 
271 | msgstr "" 272 | 273 | #, python-format 274 | msgid "Verifying checksum for file %s" 275 | msgstr "Verifying checksum for file %s" 276 | 277 | #, python-format 278 | msgid "Could not read %(filename)s: %(error)s" 279 | msgstr "" 280 | 281 | #, python-format 282 | msgid "%(filename)s contains invalid tag: %(line)s" 283 | msgstr "" 284 | 285 | #, python-format 286 | msgid "Using %(process_count)d processes to generate manifests: %(algorithms)s" 287 | msgstr "" 288 | 289 | msgid "Expected the same number of files for each checksum" 290 | msgstr "" 291 | 292 | msgid "Expected the same number of bytes for each checksums" 293 | msgstr "" 294 | 295 | #, python-format 296 | msgid "Creating %s" 297 | msgstr "Creating %s" 298 | 299 | #, python-format 300 | msgid "Generating manifest lines for file %s" 301 | msgstr "Generating manifest lines for file %s" 302 | 303 | #, python-format 304 | msgid "" 305 | "Use multiple processes to calculate checksums faster (default: %(default)s)" 306 | msgstr "" 307 | 308 | msgid "The name of the log file (default: stdout)" 309 | msgstr "" 310 | 311 | msgid "Suppress all progress information other than errors" 312 | msgstr "" 313 | 314 | msgid "" 315 | "Validate existing bags in the provided directories instead of creating new " 316 | "ones" 317 | msgstr "" 318 | 319 | msgid "" 320 | "Modify --validate behaviour to only test whether the bag directory has the " 321 | "number of files and total size specified in Payload-Oxum without performing " 322 | "checksum validation to detect corruption." 323 | msgstr "" 324 | 325 | msgid "" 326 | "Modify --validate behaviour to test whether the bag directory has the " 327 | "expected payload specified in the checksum manifests without performing " 328 | "checksum validation to detect corruption." 
329 | msgstr "" 330 | 331 | msgid "Checksum Algorithms" 332 | msgstr "" 333 | 334 | #, python-format 335 | msgid "" 336 | "Select the manifest algorithms to be used when creating bags (default=%s)" 337 | msgstr "" 338 | 339 | #, python-format 340 | msgid "Generate %s manifest when creating a bag" 341 | msgstr "" 342 | 343 | msgid "Optional Bag Metadata" 344 | msgstr "" 345 | 346 | msgid "" 347 | "Directory which will be converted into a bag in place by moving any existing " 348 | "files into the BagIt structure and creating the manifests and other metadata." 349 | msgstr "" 350 | 351 | #, python-format 352 | msgid "bagit-python version %s" 353 | msgstr "" 354 | 355 | msgid "The number of processes must be greater than 0" 356 | msgstr "" 357 | 358 | msgid "--fast is only allowed as an option for --validate!" 359 | msgstr "" 360 | 361 | #, python-format 362 | msgid "%s valid according to Payload-Oxum" 363 | msgstr "%s valid according to Payload-Oxum" 364 | 365 | #, python-format 366 | msgid "%s is valid" 367 | msgstr "%s is valid" 368 | 369 | #, python-format 370 | msgid "%(bag)s is invalid: %(error)s" 371 | msgstr "%(bag)s is invalid: %(error)s" 372 | 373 | #, python-format 374 | msgid "Failed to create bag in %(bag_directory)s: %(error)s" 375 | msgstr "Failed to create bag in %(bag_directory)s: %(error)s" 376 | 377 | #~ msgid "Fast validation requires a Payload-Oxum in bag-info.txt" 378 | #~ msgstr "Fast validation requires a Payload-Oxum in bag-info.txt" 379 | 380 | #~ msgid "" 381 | #~ "No write permissions for the following directories and files:\n" 382 | #~ "%s" 383 | #~ msgstr "" 384 | #~ "No write permissions for the following directories and files:\n" 385 | #~ "%s" 386 | 387 | #~ msgid "" 388 | #~ "no write permissions for the following directories and files:\n" 389 | #~ "%s" 390 | #~ msgstr "" 391 | #~ "no write permissions for the following directories and files:\n" 392 | #~ "%s" 393 | 394 | #~ msgid "Updating %s" 395 | #~ msgstr "Updating %s" 396 | 397 | #~
msgid "" 398 | #~ "Using %(process_count)d processes to generate manifests: %(algoritms)s" 399 | #~ msgstr "" 400 | #~ "Using %(process_count)d processes to generate manifests: %(algoritms)s" 401 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "setuptools-scm>=8"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bagit" 7 | dynamic = ["version"] 8 | description = "Create and validate BagIt packages" 9 | readme = {file = "README.rst", content-type = "text/x-rst"} 10 | authors = [ 11 | { name = "Ed Summers", email = "ehs@pobox.com" }, 12 | ] 13 | classifiers = [ 14 | "Intended Audience :: Developers", 15 | "License :: Public Domain", 16 | "Programming Language :: Python :: 3", 17 | "Topic :: Communications :: File Sharing", 18 | "Topic :: Software Development :: Libraries :: Python Modules", 19 | "Topic :: System :: Filesystems", 20 | ] 21 | 22 | [project.urls] 23 | Homepage = "https://libraryofcongress.github.io/bagit-python/" 24 | 25 | [tool] 26 | 27 | [tool.ruff] 28 | target-version = "py38" 29 | 30 | 31 | [tool.setuptools_scm] 32 | 33 | [tool.isort] 34 | line_length = 110 35 | default_section = "THIRDPARTY" 36 | known_first_party = "bagit" 37 | 38 | [tool.coverage.run] 39 | branch = true 40 | include = "bagit.py" 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from __future__ import absolute_import, print_function 5 | 6 | import glob 7 | import os 8 | import subprocess 9 | import sys 10 | 11 | from setuptools import setup 12 | 13 | description = "Create and validate BagIt packages" 14 | 15 | 16 | def get_message_catalogs(): 17 | message_catalogs = [] 18 | 19 | for po_file in 
glob.glob("locale/*/LC_MESSAGES/bagit-python.po"): 20 | mo_file = po_file.replace(".po", ".mo") 21 | 22 | if not os.path.exists(mo_file) or os.path.getmtime(mo_file) < os.path.getmtime( 23 | po_file 24 | ): 25 | try: 26 | subprocess.check_call(["msgfmt", "-o", mo_file, po_file]) 27 | except (OSError, subprocess.CalledProcessError) as exc: 28 | print( 29 | "Translation catalog %s could not be compiled (is gettext installed?) " 30 | " — translations will not be available for this language: %s" 31 | % (po_file, exc), 32 | file=sys.stderr, 33 | ) 34 | continue 35 | 36 | message_catalogs.append((os.path.dirname(mo_file), (mo_file,))) 37 | 38 | return message_catalogs 39 | 40 | 41 | setup( 42 | name="bagit", 43 | use_scm_version=True, 44 | url="https://libraryofcongress.github.io/bagit-python/", 45 | author="Ed Summers", 46 | author_email="ehs@pobox.com", 47 | py_modules=["bagit"], 48 | scripts=["bagit.py"], 49 | data_files=get_message_catalogs(), 50 | description=description, 51 | platforms=["POSIX"], 52 | setup_requires=["setuptools_scm"], 53 | classifiers=[ 54 | "License :: Public Domain", 55 | "Intended Audience :: Developers", 56 | "Topic :: Communications :: File Sharing", 57 | "Topic :: Software Development :: Libraries :: Python Modules", 58 | "Topic :: System :: Filesystems", 59 | "Programming Language :: Python :: 3", 60 | ], 61 | ) 62 | -------------------------------------------------------------------------------- /test-data/README: -------------------------------------------------------------------------------- 1 | public domain images obtained from flickr commons: 2 | 3 | http://www.flickr.com/photos/smithsonian/2584174182/ 4 | http://www.flickr.com/photos/smithsonian/4011399822/ 5 | http://www.flickr.com/photos/library_of_congress/2478433644/ 6 | 7 | 8 | -------------------------------------------------------------------------------- /test-data/loc/2478433644_2839c5e8b8_o_d.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LibraryOfCongress/bagit-python/da041808d2453da2209054a9f4c48187dc323c0a/test-data/loc/2478433644_2839c5e8b8_o_d.jpg -------------------------------------------------------------------------------- /test-data/loc/3314493806_6f1db86d66_o_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LibraryOfCongress/bagit-python/da041808d2453da2209054a9f4c48187dc323c0a/test-data/loc/3314493806_6f1db86d66_o_d.jpg -------------------------------------------------------------------------------- /test-data/si/2584174182_ffd5c24905_b_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LibraryOfCongress/bagit-python/da041808d2453da2209054a9f4c48187dc323c0a/test-data/si/2584174182_ffd5c24905_b_d.jpg -------------------------------------------------------------------------------- /test-data/si/4011399822_65987a4806_b_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LibraryOfCongress/bagit-python/da041808d2453da2209054a9f4c48187dc323c0a/test-data/si/4011399822_65987a4806_b_d.jpg -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from __future__ import absolute_import, division, print_function, unicode_literals 4 | 5 | import codecs 6 | import datetime 7 | import hashlib 8 | import logging 9 | import os 10 | import shutil 11 | import stat 12 | import sys 13 | import tempfile 14 | import unicodedata 15 | import unittest 16 | from os.path import join as j 17 | 18 | from unittest import mock 19 | from io import StringIO 20 | 21 | import bagit 22 | 23 | logging.basicConfig(filename="test.log", level=logging.DEBUG) 24 | 
stderr = logging.StreamHandler() 25 | stderr.setLevel(logging.WARNING) 26 | logging.getLogger().addHandler(stderr) 27 | 28 | # But we do want any exceptions raised in the logging path to be raised: 29 | logging.raiseExceptions = True 30 | 31 | 32 | def slurp_text_file(filename): 33 | with bagit.open_text_file(filename) as f: 34 | return f.read() 35 | 36 | 37 | class SelfCleaningTestCase(unittest.TestCase): 38 | """TestCase subclass which cleans up self.tmpdir after each test""" 39 | 40 | def setUp(self): 41 | super(SelfCleaningTestCase, self).setUp() 42 | 43 | self.starting_directory = ( 44 | os.getcwd() 45 | ) # FIXME: remove this after we stop changing directories in bagit.py 46 | self.tmpdir = tempfile.mkdtemp() 47 | if os.path.isdir(self.tmpdir): 48 | shutil.rmtree(self.tmpdir) 49 | shutil.copytree("test-data", self.tmpdir) 50 | 51 | def tearDown(self): 52 | # FIXME: remove this after we stop changing directories in bagit.py 53 | os.chdir(self.starting_directory) 54 | if os.path.isdir(self.tmpdir): 55 | # Clean up after tests which leave inaccessible files behind: 56 | 57 | os.chmod(self.tmpdir, 0o700) 58 | 59 | for dirpath, subdirs, filenames in os.walk(self.tmpdir, topdown=True): 60 | for i in subdirs: 61 | os.chmod(os.path.join(dirpath, i), 0o700) 62 | 63 | shutil.rmtree(self.tmpdir) 64 | 65 | super(SelfCleaningTestCase, self).tearDown() 66 | 67 | 68 | @mock.patch( 69 | "bagit.VERSION", new="1.5.4" 70 | ) # This avoids needing to change expected hashes on each release 71 | class TestSingleProcessValidation(SelfCleaningTestCase): 72 | def validate(self, bag, *args, **kwargs): 73 | return bag.validate(*args, **kwargs) 74 | 75 | def test_make_bag_sha1_sha256_manifest(self): 76 | bag = bagit.make_bag(self.tmpdir, checksum=["sha1", "sha256"]) 77 | # check that relevant manifests are created 78 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha1.txt"))) 79 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha256.txt"))) 80 | # check valid with 
two manifests 81 | self.assertTrue(self.validate(bag, fast=True)) 82 | 83 | def test_make_bag_md5_sha256_manifest(self): 84 | bag = bagit.make_bag(self.tmpdir, checksum=["md5", "sha256"]) 85 | # check that relevant manifests are created 86 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-md5.txt"))) 87 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha256.txt"))) 88 | # check valid with two manifests 89 | self.assertTrue(self.validate(bag, fast=True)) 90 | 91 | def test_make_bag_md5_sha1_sha256_manifest(self): 92 | bag = bagit.make_bag(self.tmpdir, checksum=["md5", "sha1", "sha256"]) 93 | # check that relevant manifests are created 94 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-md5.txt"))) 95 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha1.txt"))) 96 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha256.txt"))) 97 | # check valid with three manifests 98 | self.assertTrue(self.validate(bag, fast=True)) 99 | 100 | def test_validate_flipped_bit(self): 101 | bag = bagit.make_bag(self.tmpdir) 102 | readme = j(self.tmpdir, "data", "README") 103 | txt = slurp_text_file(readme) 104 | txt = "A" + txt[1:] 105 | with open(readme, "w") as r: 106 | r.write(txt) 107 | bag = bagit.Bag(self.tmpdir) 108 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 109 | # fast doesn't catch the flipped bit, since oxsum is the same 110 | self.assertTrue(self.validate(bag, fast=True)) 111 | self.assertTrue(self.validate(bag, completeness_only=True)) 112 | 113 | def test_validate_fast(self): 114 | bag = bagit.make_bag(self.tmpdir) 115 | self.assertEqual(self.validate(bag, fast=True), True) 116 | os.remove(j(self.tmpdir, "data", "loc", "2478433644_2839c5e8b8_o_d.jpg")) 117 | self.assertRaises(bagit.BagValidationError, self.validate, bag, fast=True) 118 | 119 | def test_validate_completeness(self): 120 | bag = bagit.make_bag(self.tmpdir) 121 | old_path = j(self.tmpdir, "data", "README") 122 | new_path = j(self.tmpdir, 
"data", "extra_file") 123 | os.rename(old_path, new_path) 124 | bag = bagit.Bag(self.tmpdir) 125 | self.assertTrue(self.validate(bag, fast=True)) 126 | with mock.patch.object(bag, "_validate_entries") as m: 127 | self.assertRaises( 128 | bagit.BagValidationError, self.validate, bag, completeness_only=True 129 | ) 130 | self.assertEqual(m.call_count, 0) 131 | 132 | def test_validate_fast_without_oxum(self): 133 | bag = bagit.make_bag(self.tmpdir) 134 | os.remove(j(self.tmpdir, "bag-info.txt")) 135 | bag = bagit.Bag(self.tmpdir) 136 | self.assertRaises(bagit.BagValidationError, self.validate, bag, fast=True) 137 | 138 | def test_validate_slow_without_oxum_extra_file(self): 139 | bag = bagit.make_bag(self.tmpdir) 140 | os.remove(j(self.tmpdir, "bag-info.txt")) 141 | with open(j(self.tmpdir, "data", "extra_file"), "w") as ef: 142 | ef.write("foo") 143 | bag = bagit.Bag(self.tmpdir) 144 | self.assertRaises(bagit.BagValidationError, self.validate, bag, fast=False) 145 | 146 | def test_validate_missing_directory(self): 147 | bagit.make_bag(self.tmpdir) 148 | 149 | tmp_data_dir = os.path.join(self.tmpdir, "data") 150 | shutil.rmtree(tmp_data_dir) 151 | 152 | bag = bagit.Bag(self.tmpdir) 153 | with self.assertRaises(bagit.BagValidationError) as error_catcher: 154 | bag.validate() 155 | 156 | self.assertEqual( 157 | "Expected data directory %s does not exist" % tmp_data_dir, 158 | str(error_catcher.exception), 159 | ) 160 | 161 | def test_validation_error_details(self): 162 | bag = bagit.make_bag( 163 | self.tmpdir, checksums=["md5"], bag_info={"Bagging-Date": "1970-01-01"} 164 | ) 165 | readme = j(self.tmpdir, "data", "README") 166 | txt = slurp_text_file(readme) 167 | txt = "A" + txt[1:] 168 | with open(readme, "w") as r: 169 | r.write(txt) 170 | 171 | bag = bagit.Bag(self.tmpdir) 172 | got_exception = False 173 | 174 | try: 175 | self.validate(bag) 176 | except bagit.BagValidationError as e: 177 | got_exception = True 178 | 179 | exc_str = str(e) 180 | self.assertIn( 181 
| 'data/README md5 validation failed: expected="8e2af7a0143c7b8f4de0b3fc90f27354" found="fd41543285d17e7c29cd953f5cf5b955"', 182 | exc_str, 183 | ) 184 | self.assertEqual(len(e.details), 1) 185 | 186 | readme_error = e.details[0] 187 | self.assertEqual( 188 | 'data/README md5 validation failed: expected="8e2af7a0143c7b8f4de0b3fc90f27354" found="fd41543285d17e7c29cd953f5cf5b955"', 189 | str(readme_error), 190 | ) 191 | self.assertIsInstance(readme_error, bagit.ChecksumMismatch) 192 | self.assertEqual(readme_error.algorithm, "md5") 193 | self.assertEqual(readme_error.path, "data/README") 194 | self.assertEqual(readme_error.expected, "8e2af7a0143c7b8f4de0b3fc90f27354") 195 | self.assertEqual(readme_error.found, "fd41543285d17e7c29cd953f5cf5b955") 196 | 197 | if not got_exception: 198 | self.fail("didn't get BagValidationError") 199 | 200 | def test_validation_completeness_error_details(self): 201 | bag = bagit.make_bag( 202 | self.tmpdir, checksums=["md5"], bag_info={"Bagging-Date": "1970-01-01"} 203 | ) 204 | 205 | old_path = j(self.tmpdir, "data", "README") 206 | new_path = j(self.tmpdir, "data", "extra") 207 | os.rename(old_path, new_path) 208 | 209 | # remove the bag-info.txt which contains the oxum to force a full 210 | # check of the manifest 211 | os.remove(j(self.tmpdir, "bag-info.txt")) 212 | 213 | bag = bagit.Bag(self.tmpdir) 214 | got_exception = False 215 | 216 | try: 217 | self.validate(bag) 218 | except bagit.BagValidationError as e: 219 | got_exception = True 220 | 221 | exc_str = str(e) 222 | self.assertIn("Bag is incomplete: ", exc_str) 223 | self.assertIn( 224 | "bag-info.txt exists in manifest but was not found on filesystem", 225 | exc_str, 226 | ) 227 | self.assertIn( 228 | "data/README exists in manifest but was not found on filesystem", 229 | exc_str, 230 | ) 231 | self.assertIn( 232 | "data/extra exists on filesystem but is not in the manifest", exc_str 233 | ) 234 | self.assertEqual(len(e.details), 3) 235 | 236 | if e.details[0].path == 
"bag-info.txt": 237 | baginfo_error = e.details[0] 238 | readme_error = e.details[1] 239 | else: 240 | baginfo_error = e.details[1] 241 | readme_error = e.details[0] 242 | 243 | self.assertEqual( 244 | str(baginfo_error), 245 | "bag-info.txt exists in manifest but was not found on filesystem", 246 | ) 247 | self.assertIsInstance(baginfo_error, bagit.FileMissing) 248 | self.assertEqual(baginfo_error.path, "bag-info.txt") 249 | 250 | self.assertEqual( 251 | str(readme_error), 252 | "data/README exists in manifest but was not found on filesystem", 253 | ) 254 | self.assertIsInstance(readme_error, bagit.FileMissing) 255 | self.assertEqual(readme_error.path, "data/README") 256 | 257 | error = e.details[2] 258 | self.assertEqual( 259 | str(error), "data/extra exists on filesystem but is not in the manifest" 260 | ) 261 | self.assertTrue(error, bagit.UnexpectedFile) 262 | self.assertEqual(error.path, "data/extra") 263 | 264 | if not got_exception: 265 | self.fail("didn't get BagValidationError") 266 | 267 | def test_bom_in_bagit_txt(self): 268 | bag = bagit.make_bag(self.tmpdir) 269 | BOM = codecs.BOM_UTF8.decode("utf-8") 270 | with open(j(self.tmpdir, "bagit.txt"), "r") as bf: 271 | bagfile = BOM + bf.read() 272 | with open(j(self.tmpdir, "bagit.txt"), "w") as bf: 273 | bf.write(bagfile) 274 | bag = bagit.Bag(self.tmpdir) 275 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 276 | 277 | def test_missing_file(self): 278 | bag = bagit.make_bag(self.tmpdir) 279 | os.remove(j(self.tmpdir, "data", "loc", "3314493806_6f1db86d66_o_d.jpg")) 280 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 281 | 282 | def test_handle_directory_end_slash_gracefully(self): 283 | bag = bagit.make_bag(self.tmpdir + "/") 284 | self.assertTrue(self.validate(bag)) 285 | bag2 = bagit.Bag(self.tmpdir + "/") 286 | self.assertTrue(self.validate(bag2)) 287 | 288 | def test_allow_extraneous_files_in_base(self): 289 | bag = bagit.make_bag(self.tmpdir) 290 | 
self.assertTrue(self.validate(bag)) 291 | f = j(self.tmpdir, "IGNOREFILE") 292 | with open(f, "w"): 293 | self.assertTrue(self.validate(bag)) 294 | 295 | def test_allow_extraneous_dirs_in_base(self): 296 | bag = bagit.make_bag(self.tmpdir) 297 | self.assertTrue(self.validate(bag)) 298 | d = j(self.tmpdir, "IGNOREDIR") 299 | os.mkdir(d) 300 | self.assertTrue(self.validate(bag)) 301 | 302 | def test_missing_tagfile_raises_error(self): 303 | bag = bagit.make_bag(self.tmpdir) 304 | self.assertTrue(self.validate(bag)) 305 | os.remove(j(self.tmpdir, "bagit.txt")) 306 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 307 | 308 | def test_missing_manifest_raises_error(self): 309 | bag = bagit.make_bag(self.tmpdir, checksums=["sha512"]) 310 | self.assertTrue(self.validate(bag)) 311 | os.remove(j(self.tmpdir, "manifest-sha512.txt")) 312 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 313 | 314 | def test_mixed_case_checksums(self): 315 | bag = bagit.make_bag(self.tmpdir, checksums=["md5"]) 316 | hashstr = {} 317 | # Extract entries only for the payload and ignore 318 | # entries from the tagmanifest file 319 | for key in bag.entries.keys(): 320 | if key.startswith("data" + os.sep): 321 | hashstr = bag.entries[key] 322 | hashstr = next(iter(hashstr.values())) 323 | manifest = slurp_text_file(j(self.tmpdir, "manifest-md5.txt")) 324 | 325 | manifest = manifest.replace(hashstr, hashstr.upper()) 326 | 327 | with open(j(self.tmpdir, "manifest-md5.txt"), "wb") as m: 328 | m.write(manifest.encode("utf-8")) 329 | 330 | # Since manifest-md5.txt file is updated, re-calculate its 331 | # md5 checksum and update it in the tagmanifest-md5.txt file 332 | hasher = hashlib.new("md5") 333 | contents = slurp_text_file(j(self.tmpdir, "manifest-md5.txt")).encode("utf-8") 334 | hasher.update(contents) 335 | with open(j(self.tmpdir, "tagmanifest-md5.txt"), "r") as tagmanifest: 336 | tagman_contents = tagmanifest.read() 337 | tagman_contents = 
tagman_contents.replace( 338 | bag.entries["manifest-md5.txt"]["md5"], hasher.hexdigest() 339 | ) 340 | with open(j(self.tmpdir, "tagmanifest-md5.txt"), "w") as tagmanifest: 341 | tagmanifest.write(tagman_contents) 342 | 343 | bag = bagit.Bag(self.tmpdir) 344 | self.assertTrue(self.validate(bag)) 345 | 346 | def test_unsafe_directory_entries_raise_error(self): 347 | bad_paths = None 348 | # This could be more granular, but ought to be 349 | # adequate. 350 | if os.name == "nt": 351 | bad_paths = ( 352 | r"C:\win32\cmd.exe", 353 | "\\\\?\\C:\\", 354 | "COM1:", 355 | "\\\\.\\COM56", 356 | "..\\..\\..\\win32\\cmd.exe", 357 | "data\\..\\..\\..\\win32\\cmd.exe", 358 | ) 359 | else: 360 | bad_paths = ( 361 | "../../../secrets.json", 362 | "~/.pgp/id_rsa", 363 | "/dev/null", 364 | "data/../../../secrets.json", 365 | ) 366 | hasher = hashlib.new("md5") 367 | corpus = "this is not a real checksum" 368 | hasher.update(corpus.encode("utf-8")) 369 | for bad_path in bad_paths: 370 | bagit.make_bag(self.tmpdir, checksums=["md5"]) 371 | with open(j(self.tmpdir, "manifest-md5.txt"), "wb+") as manifest_out: 372 | line = "%s %s\n" % (hasher.hexdigest(), bad_path) 373 | manifest_out.write(line.encode("utf-8")) 374 | self.assertRaises(bagit.BagError, bagit.Bag, self.tmpdir) 375 | 376 | def test_multiple_oxum_values(self): 377 | bag = bagit.make_bag(self.tmpdir) 378 | with open(j(self.tmpdir, "bag-info.txt"), "a") as baginfo: 379 | baginfo.write("Payload-Oxum: 7.7\n") 380 | bag = bagit.Bag(self.tmpdir) 381 | self.assertTrue(self.validate(bag, fast=True)) 382 | 383 | def test_validate_optional_tagfile(self): 384 | bag = bagit.make_bag(self.tmpdir, checksums=["md5"]) 385 | tagdir = tempfile.mkdtemp(dir=self.tmpdir) 386 | with open(j(tagdir, "tagfile"), "w") as tagfile: 387 | tagfile.write("test") 388 | relpath = j(tagdir, "tagfile").replace(self.tmpdir + os.sep, "") 389 | relpath.replace("\\", "/") 390 | with open(j(self.tmpdir, "tagmanifest-md5.txt"), "w") as tagman: 391 | # Incorrect 
checksum. 392 | tagman.write("8e2af7a0143c7b8f4de0b3fc90f27354 " + relpath + "\n") 393 | bag = bagit.Bag(self.tmpdir) 394 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 395 | 396 | hasher = hashlib.new("md5") 397 | contents = slurp_text_file(j(tagdir, "tagfile")).encode("utf-8") 398 | hasher.update(contents) 399 | with open(j(self.tmpdir, "tagmanifest-md5.txt"), "w") as tagman: 400 | tagman.write(hasher.hexdigest() + " " + relpath + "\n") 401 | bag = bagit.Bag(self.tmpdir) 402 | self.assertTrue(self.validate(bag)) 403 | 404 | # Missing tagfile. 405 | os.remove(j(tagdir, "tagfile")) 406 | bag = bagit.Bag(self.tmpdir) 407 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 408 | 409 | def test_validate_optional_tagfile_in_directory(self): 410 | bag = bagit.make_bag(self.tmpdir, checksums=["md5"]) 411 | tagdir = tempfile.mkdtemp(dir=self.tmpdir) 412 | 413 | if not os.path.exists(j(tagdir, "tagfolder")): 414 | os.makedirs(j(tagdir, "tagfolder")) 415 | 416 | with open(j(tagdir, "tagfolder", "tagfile"), "w") as tagfile: 417 | tagfile.write("test") 418 | relpath = j(tagdir, "tagfolder", "tagfile").replace(self.tmpdir + os.sep, "") 419 | relpath.replace("\\", "/") 420 | with open(j(self.tmpdir, "tagmanifest-md5.txt"), "w") as tagman: 421 | # Incorrect checksum. 422 | tagman.write("8e2af7a0143c7b8f4de0b3fc90f27354 " + relpath + "\n") 423 | bag = bagit.Bag(self.tmpdir) 424 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 425 | 426 | hasher = hashlib.new("md5") 427 | with open(j(tagdir, "tagfolder", "tagfile"), "r") as tf: 428 | contents = tf.read().encode("utf-8") 429 | hasher.update(contents) 430 | with open(j(self.tmpdir, "tagmanifest-md5.txt"), "w") as tagman: 431 | tagman.write(hasher.hexdigest() + " " + relpath + "\n") 432 | bag = bagit.Bag(self.tmpdir) 433 | self.assertTrue(self.validate(bag)) 434 | 435 | # Missing tagfile. 
436 | os.remove(j(tagdir, "tagfolder", "tagfile")) 437 | bag = bagit.Bag(self.tmpdir) 438 | self.assertRaises(bagit.BagValidationError, self.validate, bag) 439 | 440 | def test_sha1_tagfile(self): 441 | info = {"Bagging-Date": "1970-01-01", "Contact-Email": "ehs@pobox.com"} 442 | bag = bagit.make_bag(self.tmpdir, checksum=["sha1"], bag_info=info) 443 | self.assertTrue(os.path.isfile(j(self.tmpdir, "tagmanifest-sha1.txt"))) 444 | self.assertEqual( 445 | "f69110479d0d395f7c321b3860c2bc0c96ae9fe8", 446 | bag.entries["bag-info.txt"]["sha1"], 447 | ) 448 | 449 | def test_validate_unreadable_file(self): 450 | bag = bagit.make_bag(self.tmpdir, checksum=["md5"]) 451 | os.chmod(j(self.tmpdir, "data/loc/2478433644_2839c5e8b8_o_d.jpg"), 0) 452 | self.assertRaises(bagit.BagValidationError, self.validate, bag, fast=False) 453 | 454 | 455 | class TestMultiprocessValidation(TestSingleProcessValidation): 456 | def validate(self, bag, *args, **kwargs): 457 | return super(TestMultiprocessValidation, self).validate( 458 | bag, *args, processes=2, **kwargs 459 | ) 460 | 461 | @mock.patch("bagit.multiprocessing.Pool") 462 | def test_validate_pool_error(self, pool): 463 | # Simulate the Pool constructor raising a RuntimeError. 464 | pool.side_effect = RuntimeError 465 | bag = bagit.make_bag(self.tmpdir) 466 | # Previously, this raised UnboundLocalError if uninitialized. 
467 | with self.assertRaises(RuntimeError): 468 | self.validate(bag) 469 | 470 | 471 | @mock.patch( 472 | "bagit.VERSION", new="1.5.4" 473 | ) # This avoids needing to change expected hashes on each release 474 | class TestBag(SelfCleaningTestCase): 475 | def test_make_bag(self): 476 | info = {"Bagging-Date": "1970-01-01", "Contact-Email": "ehs@pobox.com"} 477 | bagit.make_bag(self.tmpdir, bag_info=info, checksums=["md5"]) 478 | 479 | # data dir should've been created 480 | self.assertTrue(os.path.isdir(j(self.tmpdir, "data"))) 481 | 482 | # check bagit.txt 483 | self.assertTrue(os.path.isfile(j(self.tmpdir, "bagit.txt"))) 484 | bagit_txt = slurp_text_file(j(self.tmpdir, "bagit.txt")) 485 | self.assertTrue("BagIt-Version: 0.97", bagit_txt) 486 | self.assertTrue("Tag-File-Character-Encoding: UTF-8", bagit_txt) 487 | 488 | # check manifest 489 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-md5.txt"))) 490 | manifest_txt = slurp_text_file(j(self.tmpdir, "manifest-md5.txt")).splitlines() 491 | self.assertIn("8e2af7a0143c7b8f4de0b3fc90f27354 data/README", manifest_txt) 492 | self.assertIn( 493 | "9a2b89e9940fea6ac3a0cc71b0a933a0 data/loc/2478433644_2839c5e8b8_o_d.jpg", 494 | manifest_txt, 495 | ) 496 | self.assertIn( 497 | "6172e980c2767c12135e3b9d246af5a3 data/loc/3314493806_6f1db86d66_o_d.jpg", 498 | manifest_txt, 499 | ) 500 | self.assertIn( 501 | "38a84cd1c41de793a0bccff6f3ec8ad0 data/si/2584174182_ffd5c24905_b_d.jpg", 502 | manifest_txt, 503 | ) 504 | self.assertIn( 505 | "5580eaa31ad1549739de12df819e9af8 data/si/4011399822_65987a4806_b_d.jpg", 506 | manifest_txt, 507 | ) 508 | 509 | # check bag-info.txt 510 | self.assertTrue(os.path.isfile(j(self.tmpdir, "bag-info.txt"))) 511 | bag_info_txt = slurp_text_file(j(self.tmpdir, "bag-info.txt")) 512 | bag_info_txt = bag_info_txt.splitlines() 513 | self.assertIn("Contact-Email: ehs@pobox.com", bag_info_txt) 514 | self.assertIn("Bagging-Date: 1970-01-01", bag_info_txt) 515 | self.assertIn("Payload-Oxum: 
991765.5", bag_info_txt) 516 | self.assertIn( 517 | "Bag-Software-Agent: bagit.py v1.5.4 ", 518 | bag_info_txt, 519 | ) 520 | 521 | # check tagmanifest-md5.txt 522 | self.assertTrue(os.path.isfile(j(self.tmpdir, "tagmanifest-md5.txt"))) 523 | tagmanifest_txt = slurp_text_file( 524 | j(self.tmpdir, "tagmanifest-md5.txt") 525 | ).splitlines() 526 | self.assertIn("9e5ad981e0d29adc278f6a294b8c2aca bagit.txt", tagmanifest_txt) 527 | self.assertIn( 528 | "a0ce6631a2a6d1a88e6d38453ccc72a5 manifest-md5.txt", tagmanifest_txt 529 | ) 530 | self.assertIn("0a6ffcffe67e9a34e44220f7ebcb4baa bag-info.txt", tagmanifest_txt) 531 | 532 | def test_make_bag_sha1_manifest(self): 533 | bagit.make_bag(self.tmpdir, checksum=["sha1"]) 534 | # check manifest 535 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha1.txt"))) 536 | manifest_txt = slurp_text_file(j(self.tmpdir, "manifest-sha1.txt")).splitlines() 537 | self.assertIn( 538 | "ace19416e605cfb12ab11df4898ca7fd9979ee43 data/README", manifest_txt 539 | ) 540 | self.assertIn( 541 | "4c0a3da57374e8db379145f18601b159f3cad44b data/loc/2478433644_2839c5e8b8_o_d.jpg", 542 | manifest_txt, 543 | ) 544 | self.assertIn( 545 | "62095aeddae2f3207cb77c85937e13c51641ef71 data/loc/3314493806_6f1db86d66_o_d.jpg", 546 | manifest_txt, 547 | ) 548 | self.assertIn( 549 | "e592194b3733e25166a631e1ec55bac08066cbc1 data/si/2584174182_ffd5c24905_b_d.jpg", 550 | manifest_txt, 551 | ) 552 | self.assertIn( 553 | "db49ef009f85a5d0701829f38d29f8cf9c5df2ea data/si/4011399822_65987a4806_b_d.jpg", 554 | manifest_txt, 555 | ) 556 | 557 | def test_make_bag_sha256_manifest(self): 558 | bagit.make_bag(self.tmpdir, checksum=["sha256"]) 559 | # check manifest 560 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha256.txt"))) 561 | manifest_txt = slurp_text_file( 562 | j(self.tmpdir, "manifest-sha256.txt") 563 | ).splitlines() 564 | self.assertIn( 565 | "b6df8058fa818acfd91759edffa27e473f2308d5a6fca1e07a79189b95879953 
data/loc/2478433644_2839c5e8b8_o_d.jpg", 566 | manifest_txt, 567 | ) 568 | self.assertIn( 569 | "1af90c21e72bb0575ae63877b3c69cfb88284f6e8c7820f2c48dc40a08569da5 data/loc/3314493806_6f1db86d66_o_d.jpg", 570 | manifest_txt, 571 | ) 572 | self.assertIn( 573 | "f065a4ae2bc5d47c6d046c3cba5c8cdfd66b07c96ff3604164e2c31328e41c1a data/si/2584174182_ffd5c24905_b_d.jpg", 574 | manifest_txt, 575 | ) 576 | self.assertIn( 577 | "45d257c93e59ec35187c6a34c8e62e72c3e9cfbb548984d6f6e8deb84bac41f4 data/si/4011399822_65987a4806_b_d.jpg", 578 | manifest_txt, 579 | ) 580 | 581 | def test_make_bag_sha512_manifest(self): 582 | bagit.make_bag(self.tmpdir, checksum=["sha512"]) 583 | # check manifest 584 | self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha512.txt"))) 585 | manifest_txt = slurp_text_file( 586 | j(self.tmpdir, "manifest-sha512.txt") 587 | ).splitlines() 588 | self.assertIn( 589 | "51fb9236a23795886cf42d539d580739245dc08f72c3748b60ed8803c9cb0e2accdb91b75dbe7d94a0a461827929d720ef45fe80b825941862fcde4c546a376d data/loc/2478433644_2839c5e8b8_o_d.jpg", 590 | manifest_txt, 591 | ) 592 | self.assertIn( 593 | "627c15be7f9aabc395c8b2e4c3ff0b50fd84b3c217ca38044cde50fd4749621e43e63828201fa66a97975e316033e4748fb7a4a500183b571ecf17715ec3aea3 data/loc/3314493806_6f1db86d66_o_d.jpg", 594 | manifest_txt, 595 | ) 596 | self.assertIn( 597 | "4cb4dafe39b2539536a9cb31d5addf335734cb91e2d2786d212a9b574e094d7619a84ad53f82bd9421478a7994cf9d3f44fea271d542af09d26ce764edbada46 data/si/2584174182_ffd5c24905_b_d.jpg", 598 | manifest_txt, 599 | ) 600 | self.assertIn( 601 | "af1c03483cd1999098cce5f9e7689eea1f81899587508f59ba3c582d376f8bad34e75fed55fd1b1c26bd0c7a06671b85e90af99abac8753ad3d76d8d6bb31ebd data/si/4011399822_65987a4806_b_d.jpg", 602 | manifest_txt, 603 | ) 604 | 605 | def test_make_bag_unknown_algorithm(self): 606 | self.assertRaises( 607 | ValueError, bagit.make_bag, self.tmpdir, checksum=["not-really-a-name"] 608 | ) 609 | 610 | def test_make_bag_with_empty_directory(self): 611 | 
tmpdir = tempfile.mkdtemp() 612 | try: 613 | bagit.make_bag(tmpdir) 614 | finally: 615 | shutil.rmtree(tmpdir) 616 | 617 | def test_make_bag_with_empty_directory_tree(self): 618 | tmpdir = tempfile.mkdtemp() 619 | path = j(tmpdir, "test1", "test2") 620 | try: 621 | os.makedirs(path) 622 | bagit.make_bag(tmpdir) 623 | finally: 624 | shutil.rmtree(tmpdir) 625 | 626 | def test_make_bag_with_bogus_directory(self): 627 | bogus_directory = os.path.realpath("this-directory-does-not-exist") 628 | 629 | with self.assertRaises(RuntimeError) as error_catcher: 630 | bagit.make_bag(bogus_directory) 631 | 632 | self.assertEqual( 633 | "Bag directory %s does not exist" % bogus_directory, 634 | str(error_catcher.exception), 635 | ) 636 | 637 | def test_make_bag_with_unreadable_source(self): 638 | os.chmod(self.tmpdir, 0) 639 | 640 | with self.assertRaises(bagit.BagError) as error_catcher: 641 | bagit.make_bag(self.tmpdir, checksum=["sha256"]) 642 | 643 | self.assertEqual( 644 | "Missing permissions to move all files and directories", 645 | str(error_catcher.exception), 646 | ) 647 | 648 | def test_make_bag_with_unreadable_subdirectory(self): 649 | # We'll set this write-only to exercise the second permission check in make_bag: 650 | os.chmod(j(self.tmpdir, "loc"), 0o200) 651 | 652 | with self.assertRaises(bagit.BagError) as error_catcher: 653 | bagit.make_bag(self.tmpdir, checksum=["sha256"]) 654 | 655 | self.assertEqual( 656 | "Read permissions are required to calculate file fixities", 657 | str(error_catcher.exception), 658 | ) 659 | 660 | def test_make_bag_with_unwritable_source(self): 661 | path_suffixes = ("", "loc") 662 | 663 | for path_suffix in reversed(path_suffixes): 664 | os.chmod(j(self.tmpdir, path_suffix), 0o500) 665 | 666 | with self.assertRaises(bagit.BagError) as error_catcher: 667 | bagit.make_bag(self.tmpdir, checksum=["sha256"]) 668 | 669 | self.assertEqual( 670 | "Missing permissions to move all files and directories", 671 | str(error_catcher.exception), 672 | 
) 673 | 674 | def test_make_bag_with_unreadable_file(self): 675 | os.chmod(j(self.tmpdir, "loc", "2478433644_2839c5e8b8_o_d.jpg"), 0) 676 | 677 | with self.assertRaises(bagit.BagError) as error_catcher: 678 | bagit.make_bag(self.tmpdir, checksum=["sha256"]) 679 | 680 | self.assertEqual( 681 | "Read permissions are required to calculate file fixities", 682 | str(error_catcher.exception), 683 | ) 684 | 685 | def test_make_bag_with_data_dir_present(self): 686 | os.mkdir(j(self.tmpdir, "data")) 687 | bagit.make_bag(self.tmpdir) 688 | 689 | # data dir should now contain another data dir 690 | self.assertTrue(os.path.isdir(j(self.tmpdir, "data", "data"))) 691 | 692 | def test_bag_class(self): 693 | info = {"Contact-Email": "ehs@pobox.com"} 694 | bag = bagit.make_bag(self.tmpdir, bag_info=info, checksums=["sha384"]) 695 | self.assertIsInstance(bag, bagit.Bag) 696 | self.assertEqual( 697 | set(bag.payload_files()), 698 | set( 699 | [ 700 | "data/README", 701 | "data/si/2584174182_ffd5c24905_b_d.jpg", 702 | "data/si/4011399822_65987a4806_b_d.jpg", 703 | "data/loc/2478433644_2839c5e8b8_o_d.jpg", 704 | "data/loc/3314493806_6f1db86d66_o_d.jpg", 705 | ] 706 | ), 707 | ) 708 | self.assertEqual( 709 | list(bag.manifest_files()), ["%s/manifest-sha384.txt" % self.tmpdir] 710 | ) 711 | 712 | def test_bag_string_representation(self): 713 | bag = bagit.make_bag(self.tmpdir) 714 | self.assertEqual(self.tmpdir, str(bag)) 715 | 716 | def test_has_oxum(self): 717 | bag = bagit.make_bag(self.tmpdir) 718 | self.assertTrue(bag.has_oxum()) 719 | 720 | def test_bag_constructor(self): 721 | bag = bagit.make_bag(self.tmpdir) 722 | bag = bagit.Bag(self.tmpdir) 723 | self.assertEqual(type(bag), bagit.Bag) 724 | self.assertEqual(len(list(bag.payload_files())), 5) 725 | 726 | def test_is_valid(self): 727 | bag = bagit.make_bag(self.tmpdir) 728 | bag = bagit.Bag(self.tmpdir) 729 | self.assertTrue(bag.is_valid()) 730 | with open(j(self.tmpdir, "data", "extra_file"), "w") as ef: 731 | ef.write("bar") 
732 | self.assertFalse(bag.is_valid()) 733 | 734 | def test_garbage_in_bagit_txt(self): 735 | bagit.make_bag(self.tmpdir) 736 | bagfile = """BagIt-Version: 0.97 737 | Tag-File-Character-Encoding: UTF-8 738 | ================================== 739 | """ 740 | with open(j(self.tmpdir, "bagit.txt"), "w") as bf: 741 | bf.write(bagfile) 742 | self.assertRaises(bagit.BagValidationError, bagit.Bag, self.tmpdir) 743 | 744 | def test_make_bag_multiprocessing(self): 745 | bagit.make_bag(self.tmpdir, processes=2) 746 | self.assertTrue(os.path.isdir(j(self.tmpdir, "data"))) 747 | 748 | def test_multiple_meta_values(self): 749 | baginfo = {"Multival-Meta": [7, 4, 8, 6, 8]} 750 | bag = bagit.make_bag(self.tmpdir, baginfo) 751 | meta = bag.info.get("Multival-Meta") 752 | self.assertEqual(type(meta), list) 753 | self.assertEqual(len(meta), len(baginfo["Multival-Meta"])) 754 | 755 | def test_unicode_bag_info(self): 756 | info = { 757 | "Test-BMP": "This element contains a \N{LATIN SMALL LETTER U WITH DIAERESIS}", 758 | "Test-SMP": "This element contains a \N{LINEAR B SYMBOL B049}", 759 | } 760 | 761 | bagit.make_bag(self.tmpdir, bag_info=info, checksums=["md5"]) 762 | 763 | bag_info_txt = slurp_text_file(j(self.tmpdir, "bag-info.txt")) 764 | for v in info.values(): 765 | self.assertIn(v, bag_info_txt) 766 | 767 | def test_unusual_bag_info_separators(self): 768 | bag = bagit.make_bag(self.tmpdir) 769 | 770 | with open(j(self.tmpdir, "bag-info.txt"), "a") as f: 771 | print("Test-Tag: 1", file=f) 772 | print("Test-Tag:\t2", file=f) 773 | print("Test-Tag\t: 3", file=f) 774 | print("Test-Tag\t:\t4", file=f) 775 | print("Test-Tag\t \t: 5", file=f) 776 | print("Test-Tag:\t \t 6", file=f) 777 | 778 | bag = bagit.Bag(self.tmpdir) 779 | bag.save(manifests=True) 780 | 781 | self.assertTrue(bag.is_valid()) 782 | self.assertEqual(bag.info["Test-Tag"], list(map(str, range(1, 7)))) 783 | 784 | def test_default_bagging_date(self): 785 | info = {"Contact-Email": "ehs@pobox.com"} 786 | 
def test_missing_tagmanifest_valid(self):
    # A tag manifest is optional: deleting it must not invalidate the bag.
    info = {"Contact-Email": "ehs@pobox.com"}
    bag = bagit.make_bag(self.tmpdir, bag_info=info, checksums=["md5"])
    self.assertTrue(bag.is_valid())
    os.remove(j(self.tmpdir, "tagmanifest-md5.txt"))
    self.assertTrue(bag.is_valid())

def test_carriage_return_manifest(self):
    # A bare carriage return in a filename must survive manifest round-trips.
    with open(j(self.tmpdir, "newline\r"), "w") as whatever:
        whatever.write("ugh")
    bag = bagit.make_bag(self.tmpdir)
    self.assertTrue(bag.is_valid())

def test_payload_permissions(self):
    perms = os.stat(self.tmpdir).st_mode

    # FIX: the original comments said "group", but S_IWOTH is the
    # world/other write bit. Our tmpdir should not be world-writable:
    self.assertEqual(perms & stat.S_IWOTH, 0)

    # If we make it world-writable, the resulting payload directory
    # should inherit the same permissions.
    new_perms = perms | stat.S_IWOTH
    self.assertTrue(perms != new_perms)
    os.chmod(self.tmpdir, new_perms)
    bagit.make_bag(self.tmpdir)
    payload_dir = j(self.tmpdir, "data")
    self.assertEqual(os.stat(payload_dir).st_mode, new_perms)

def test_save_bag_to_unwritable_directory(self):
    bag = bagit.make_bag(self.tmpdir, checksum=["sha256"])

    # Remove all permissions from the bag directory itself:
    os.chmod(self.tmpdir, 0)

    with self.assertRaises(bagit.BagError) as error_catcher:
        bag.save()

    self.assertEqual(
        "Cannot save bag to non-existent or inaccessible directory %s"
        % self.tmpdir,
        str(error_catcher.exception),
    )

def test_save_bag_with_unwritable_file(self):
    bag = bagit.make_bag(self.tmpdir, checksum=["sha256"])

    # An unreadable tag file makes fixity calculation impossible on save:
    os.chmod(os.path.join(self.tmpdir, "bag-info.txt"), 0)

    with self.assertRaises(bagit.BagError) as error_catcher:
        bag.save()

    self.assertEqual(
        "Read permissions are required to calculate file fixities",
        str(error_catcher.exception),
    )
def test_save_manifests(self):
    bag = bagit.make_bag(self.tmpdir)
    self.assertTrue(bag.is_valid())
    bag.save(manifests=True)
    self.assertTrue(bag.is_valid())
    with open(j(self.tmpdir, "data", "newfile"), "w") as nf:
        nf.write("newfile")
    # FIX: the original passed an extra positional ``bag`` argument here,
    # which bound to Bag.validate()'s ``processes`` parameter.
    self.assertRaises(bagit.BagValidationError, bag.validate, fast=False)
    # Re-saving the manifests picks up the new file and restores validity:
    bag.save(manifests=True)
    self.assertTrue(bag.is_valid())

def test_save_manifests_deleted_files(self):
    bag = bagit.make_bag(self.tmpdir)
    self.assertTrue(bag.is_valid())
    bag.save(manifests=True)
    self.assertTrue(bag.is_valid())
    os.remove(j(self.tmpdir, "data", "loc", "2478433644_2839c5e8b8_o_d.jpg"))
    # FIX: same spurious positional ``bag`` argument removed as above.
    self.assertRaises(bagit.BagValidationError, bag.validate, fast=False)
    bag.save(manifests=True)
    self.assertTrue(bag.is_valid())

def test_save_baginfo(self):
    bag = bagit.make_bag(self.tmpdir)

    # A scalar tag value round-trips through save()/reload:
    bag.info["foo"] = "bar"
    bag.save()
    bag = bagit.Bag(self.tmpdir)
    self.assertEqual(bag.info["foo"], "bar")
    self.assertTrue(bag.is_valid())

    # A list tag value round-trips as well:
    bag.info["x"] = ["a", "b", "c"]
    bag.save()
    b = bagit.Bag(self.tmpdir)
    self.assertEqual(b.info["x"], ["a", "b", "c"])
    self.assertTrue(bag.is_valid())

def test_save_baginfo_with_sha1(self):
    bag = bagit.make_bag(self.tmpdir, checksum=["sha1", "md5"])
    self.assertTrue(bag.is_valid())
    bag.save()

    bag.info["foo"] = "bar"
    bag.save()

    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(bag.is_valid())

def test_save_only_baginfo(self):
    bag = bagit.make_bag(self.tmpdir)
    with open(j(self.tmpdir, "data", "newfile"), "w") as nf:
        nf.write("newfile")
    bag.info["foo"] = "bar"
    # save() without manifests=True updates bag-info.txt only, so the
    # unmanifested payload file must still leave the bag invalid:
    bag.save()

    bag = bagit.Bag(self.tmpdir)
    self.assertEqual(bag.info["foo"], "bar")
    self.assertFalse(bag.is_valid())
def test_make_bag_with_newline(self):
    # Newlines in a tag value are stripped when written to bag-info.txt.
    bag = bagit.make_bag(self.tmpdir, {"test": "foo\nbar"})
    self.assertEqual(bag.info["test"], "foobar")

def test_unicode_in_tags(self):
    bagit.make_bag(self.tmpdir, {"test": "♡"})
    bag = bagit.Bag(self.tmpdir)
    self.assertEqual(bag.info["test"], "♡")

def test_filename_unicode_normalization(self):
    # We need to handle cases where the Unicode normalization form of a
    # filename has changed in-transit. This is hard to do portably in both
    # directions because OS X normalizes *all* filenames to an NFD variant,
    # so we start with a basic case: write the file using the NFD form
    # (which HFS+ preserves) and confirm the bag still validates after the
    # manifests are rewritten in NFC.
    test_filename = "Núñez Papers.txt"
    test_filename_nfd = unicodedata.normalize("NFD", test_filename)

    os.makedirs(j(self.tmpdir, "unicode-normalization"))
    nfd_path = j(self.tmpdir, "unicode-normalization", test_filename_nfd)
    with open(nfd_path, "w") as f:
        f.write("This is a test filename written using NFD normalization\n")

    bag = bagit.make_bag(self.tmpdir)
    bag.save()
    self.assertTrue(bag.is_valid())

    # Rewrite every payload manifest normalized to NFC:
    for manifest_path in bag.manifest_files():
        nfc_bytes = unicodedata.normalize(
            "NFC", slurp_text_file(manifest_path)
        ).encode("utf-8")
        with open(manifest_path, "wb") as f:
            f.write(nfc_bytes)

    # Regenerate the tag manifests so the rewritten files checksum cleanly:
    for alg in bag.algorithms:
        bagit._make_tagmanifest_file(alg, bag.path, encoding=bag.encoding)

    # Reload the whole bag from disk; it must still validate:
    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(bag.is_valid())
def test_open_bag_with_missing_bagit_txt(self):
    bagit.make_bag(self.tmpdir)

    os.unlink(j(self.tmpdir, "bagit.txt"))

    with self.assertRaises(bagit.BagError) as ctx:
        bagit.Bag(self.tmpdir)

    self.assertEqual(
        "Expected bagit.txt does not exist: %s/bagit.txt" % self.tmpdir,
        str(ctx.exception),
    )

def test_open_bag_with_malformed_bagit_txt(self):
    bagit.make_bag(self.tmpdir)

    # Truncate bagit.txt to zero bytes without deleting it:
    with open(j(self.tmpdir, "bagit.txt"), "w") as f:
        os.ftruncate(f.fileno(), 0)

    with self.assertRaises(bagit.BagError) as ctx:
        bagit.Bag(self.tmpdir)

    self.assertEqual(
        "Missing required tag in bagit.txt: BagIt-Version, Tag-File-Character-Encoding",
        str(ctx.exception),
    )

def test_open_bag_with_invalid_versions(self):
    bagit.make_bag(self.tmpdir)

    # Each of these fails the MAJOR.MINOR version-number format check:
    for bad_version in ("a.b", "2.", "0.1.2", "1.2.3"):
        with open(j(self.tmpdir, "bagit.txt"), "w") as f:
            f.write(
                "BagIt-Version: %s\nTag-File-Character-Encoding: UTF-8\n"
                % bad_version
            )

        with self.assertRaises(bagit.BagError) as ctx:
            bagit.Bag(self.tmpdir)

        self.assertEqual(
            "Bag version numbers must be MAJOR.MINOR numbers, not %s"
            % bad_version,
            str(ctx.exception),
        )

def test_open_bag_with_unsupported_version(self):
    bagit.make_bag(self.tmpdir)

    # Well-formed version number, but not one this library implements:
    with open(j(self.tmpdir, "bagit.txt"), "w") as f:
        f.write("BagIt-Version: 2.0\nTag-File-Character-Encoding: UTF-8\n")

    with self.assertRaises(bagit.BagError) as ctx:
        bagit.Bag(self.tmpdir)

    self.assertEqual("Unsupported bag version: 2.0", str(ctx.exception))

def test_open_bag_with_unknown_encoding(self):
    bagit.make_bag(self.tmpdir)

    with open(j(self.tmpdir, "bagit.txt"), "w") as f:
        f.write("BagIt-Version: 0.97\nTag-File-Character-Encoding: WTF-8\n")

    with self.assertRaises(bagit.BagError) as ctx:
        bagit.Bag(self.tmpdir)

    self.assertEqual("Unsupported encoding: WTF-8", str(ctx.exception))
class TestFetch(SelfCleaningTestCase):
    """Tests for fetch.txt loading, validation, and safety checks."""

    def setUp(self):
        super(TestFetch, self).setUp()

        # All of these tests will involve fetch.txt usage with an existing bag
        # so we'll simply create one:
        self.bag = bagit.make_bag(self.tmpdir)

    def test_fetch_loader(self):
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print(
                "https://photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - data/nasa/PIA21390.jpg",
                file=fetch_txt,
            )

        self.bag.save(manifests=True)
        self.bag.validate()

        # fetch_entries() yields (url, size, filename) tuples; "-" = unknown size.
        self.assertListEqual(
            [
                (
                    "https://photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg",
                    "-",
                    "data/nasa/PIA21390.jpg",
                )
            ],
            list(self.bag.fetch_entries()),
        )

        self.assertListEqual(
            ["data/nasa/PIA21390.jpg"], list(self.bag.files_to_be_fetched())
        )

        self.assertListEqual(
            ["data/nasa/PIA21390.jpg"], list(self.bag.compare_fetch_with_fs())
        )

    def test_fetch_validation(self):
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print(
                "https://photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - data/nasa/PIA21390.jpg",
                file=fetch_txt,
            )

        self.bag.save(manifests=True)

        with mock.patch.object(bagit.Bag, "validate_fetch") as mock_vf:
            self.bag.validate()
            self.assertTrue(
                mock_vf.called, msg="Bag.validate() should call Bag.validate_fetch()"
            )

    def test_fetch_unsafe_payloads(self):
        # An absolute destination path is a path-traversal hazard:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print(
                "https://photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - /etc/passwd",
                file=fetch_txt,
            )

        self.bag.save(manifests=True)

        expected_msg = 'Path "/etc/passwd" in "%s/fetch.txt" is unsafe' % self.tmpdir

        # We expect both validate() and fetch entry iteration to raise errors on
        # security hazards, so we'll test both:
        with self.assertRaises(bagit.BagError) as cm:
            self.bag.validate()

        self.assertEqual(expected_msg, str(cm.exception))

        # Note the use of list() to exhaust the fetch_entries generator:
        with self.assertRaises(bagit.BagError) as cm:
            list(self.bag.fetch_entries())

        self.assertEqual(expected_msg, str(cm.exception))

    def test_fetch_malformed_url(self):
        # A scheme-relative URL is not acceptable in fetch.txt:
        with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
            print(
                "//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - data/nasa/PIA21390.jpg",
                file=fetch_txt,
            )

        self.bag.save(manifests=True)

        expected_msg = (
            "Malformed URL in fetch.txt: //photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg"
        )

        with self.assertRaises(bagit.BagError) as cm:
            self.bag.validate_fetch()

        self.assertEqual(expected_msg, str(cm.exception))


class TestCLI(SelfCleaningTestCase):
    """Tests for the bagit.py command-line entry point."""

    def _invoke(self, argv):
        # Helper: run bagit.main() with a patched argv. main() always exits
        # via SystemExit; return the exit code for assertions.
        with self.assertRaises(SystemExit) as cm:
            with mock.patch.object(sys, "argv", argv):
                bagit.main()
        return cm.exception.code

    @mock.patch("sys.stderr", new_callable=StringIO)
    def test_directory_required(self, mock_stderr):
        code = self._invoke(["bagit.py"])

        self.assertEqual(code, 2)
        self.assertIn(
            "error: the following arguments are required: directory",
            mock_stderr.getvalue(),
        )

    @mock.patch("sys.stderr", new_callable=StringIO)
    def test_not_enough_processes(self, mock_stderr):
        code = self._invoke(["bagit.py", "--processes", "0", self.tmpdir])

        self.assertEqual(code, 2)
        self.assertIn(
            "error: The number of processes must be greater than 0",
            mock_stderr.getvalue(),
        )

    @mock.patch("sys.stderr", new_callable=StringIO)
    def test_fast_flag_without_validate(self, mock_stderr):
        bagit.make_bag(self.tmpdir)
        code = self._invoke(["bagit.py", "--fast", self.tmpdir])

        self.assertEqual(code, 2)
        self.assertIn(
            "error: --fast is only allowed as an option for --validate!",
            mock_stderr.getvalue(),
        )

    def test_invalid_fast_validate(self):
        bagit.make_bag(self.tmpdir)
        # Removing a payload file breaks the Payload-Oxum byte/file counts:
        os.remove(j(self.tmpdir, "data", "loc", "2478433644_2839c5e8b8_o_d.jpg"))

        with self.assertLogs() as captured:
            code = self._invoke(
                ["bagit.py", "--validate", "--completeness-only", self.tmpdir]
            )

        self.assertEqual(code, 1)
        self.assertIn(
            "%s is invalid: Payload-Oxum validation failed." % self.tmpdir,
            captured.records[0].getMessage(),
        )

    def test_valid_fast_validate(self):
        bagit.make_bag(self.tmpdir)

        with self.assertLogs() as captured:
            code = self._invoke(["bagit.py", "--validate", "--fast", self.tmpdir])

        self.assertEqual(code, 0)
        self.assertEqual(
            "%s valid according to Payload-Oxum" % self.tmpdir,
            captured.records[0].getMessage(),
        )

    @mock.patch("sys.stderr", new_callable=StringIO)
    def test_completeness_flag_without_validate(self, mock_stderr):
        bagit.make_bag(self.tmpdir)
        code = self._invoke(["bagit.py", "--completeness-only", self.tmpdir])

        self.assertEqual(code, 2)
        self.assertIn(
            "error: --completeness-only is only allowed as an option for --validate!",
            mock_stderr.getvalue(),
        )

    def test_invalid_completeness_validate(self):
        bagit.make_bag(self.tmpdir)
        # Renaming keeps the Payload-Oxum totals intact but breaks the
        # manifest listing, so only the completeness check can catch it:
        old_path = j(self.tmpdir, "data", "README")
        new_path = j(self.tmpdir, "data", "extra_file")
        os.rename(old_path, new_path)

        with self.assertLogs() as captured:
            code = self._invoke(
                ["bagit.py", "--validate", "--completeness-only", self.tmpdir]
            )

        self.assertEqual(code, 1)
        self.assertIn(
            "%s is invalid: Bag is incomplete" % self.tmpdir,
            captured.records[-1].getMessage(),
        )

    def test_valid_completeness_validate(self):
        bagit.make_bag(self.tmpdir)

        with self.assertLogs() as captured:
            code = self._invoke(
                ["bagit.py", "--validate", "--completeness-only", self.tmpdir]
            )

        self.assertEqual(code, 0)
        self.assertEqual(
            "%s is complete and valid according to Payload-Oxum" % self.tmpdir,
            captured.records[0].getMessage(),
        )

    def test_invalid_full_validate(self):
        bagit.make_bag(self.tmpdir)
        # Corrupt one byte of a payload file so only a full checksum pass fails:
        readme = j(self.tmpdir, "data", "README")
        txt = slurp_text_file(readme)
        txt = "A" + txt[1:]
        with open(readme, "w") as r:
            r.write(txt)

        with self.assertLogs() as captured:
            code = self._invoke(["bagit.py", "--validate", self.tmpdir])

        self.assertEqual(code, 1)
        self.assertIn("Bag validation failed", captured.records[-1].getMessage())

    def test_valid_full_validate(self):
        bagit.make_bag(self.tmpdir)

        with self.assertLogs() as captured:
            code = self._invoke(["bagit.py", "--validate", self.tmpdir])

        self.assertEqual(code, 0)
        self.assertEqual("%s is valid" % self.tmpdir, captured.records[-1].getMessage())

    def test_failed_create_bag(self):
        # An unreadable/unwritable directory cannot be bagged:
        os.chmod(self.tmpdir, 0)

        with self.assertLogs() as captured:
            code = self._invoke(["bagit.py", self.tmpdir])

        self.assertEqual(code, 1)
        self.assertIn(
            "Failed to create bag in %s" % self.tmpdir,
            captured.records[-1].getMessage(),
        )

    def test_create_bag(self):
        # FIX: removed a leftover debug loop that printed every captured
        # log record on each run.
        with self.assertLogs() as captured:
            code = self._invoke(["bagit.py", self.tmpdir])

        self.assertEqual(code, 0)


if __name__ == "__main__":
    unittest.main()