├── .gitignore ├── test-data ├── loc │ ├── 2478433644_2839c5e8b8_o_d.jpg │ └── 3314493806_6f1db86d66_o_d.jpg ├── si │ ├── 2584174182_ffd5c24905_b_d.jpg │ └── 4011399822_65987a4806_b_d.jpg └── README ├── .travis.yml ├── bench.py ├── setup.py ├── README.md ├── test.py └── bagit.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | bench-data 3 | build 4 | dist 5 | MANIFEST 6 | bagit.egg-info 7 | -------------------------------------------------------------------------------- /test-data/loc/2478433644_2839c5e8b8_o_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mistydemeo/bagit-python/master/test-data/loc/2478433644_2839c5e8b8_o_d.jpg -------------------------------------------------------------------------------- /test-data/loc/3314493806_6f1db86d66_o_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mistydemeo/bagit-python/master/test-data/loc/3314493806_6f1db86d66_o_d.jpg -------------------------------------------------------------------------------- /test-data/si/2584174182_ffd5c24905_b_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mistydemeo/bagit-python/master/test-data/si/2584174182_ffd5c24905_b_d.jpg -------------------------------------------------------------------------------- /test-data/si/4011399822_65987a4806_b_d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mistydemeo/bagit-python/master/test-data/si/4011399822_65987a4806_b_d.jpg -------------------------------------------------------------------------------- /test-data/README: -------------------------------------------------------------------------------- 1 | public domain images obtained from flickr commons: 2 | 3 | 
http://www.flickr.com/photos/smithsonian/2584174182/ 4 | http://www.flickr.com/photos/smithsonian/4011399822/ 5 | http://www.flickr.com/photos/library_of_congress/2478433644/ 6 | 7 | 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "2.6" 5 | script: python setup.py test 6 | install: 7 | # this can go away when this is resolved satisfactorily 8 | # https://github.com/travis-ci/travis-cookbooks/issues/155 9 | - "sudo rm -rf /dev/shm && sudo ln -s /run/shm /dev/shm" 10 | -------------------------------------------------------------------------------- /bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a little benchmarking script to exercise bagit.make_bag 5 | using 1-8 parallel processes. It will download some images from 6 | NASA for use in bagging the first time it is run. 
"""Packaging script for the bagit module and its command line tool."""

import sys

from setuptools import setup

# Compare the numeric version tuple instead of the ``sys.version`` string:
# string comparison is lexicographic and only gives the right answer by
# accident.  The parenthesised print parses on Python 3 as well, so users
# there actually see this message rather than a SyntaxError.
if sys.version_info < (2, 4) or sys.version_info >= (3, 0):
    print("python 2.4 - 2.7 is required")
    sys.exit(1)

description = """
This package can be used to create BagIt style packages of
digital content for safe transmission and digital preservation.
See: http://en.wikipedia.org/wiki/BagIt for more details.
"""

# for older pythons ...
# Only a failed import means the backport is needed; anything else should
# surface instead of being swallowed by a bare except.
requirements = []
try:
    import multiprocessing
except ImportError:
    requirements.append("multiprocessing")
try:
    import hashlib
except ImportError:
    requirements.append("hashlib")


setup(
    name = 'bagit',
    version = '1.3.5',
    url = 'http://github.com/LibraryOfCongress/bagit-python',
    author = 'Ed Summers',
    author_email = 'ehs@pobox.com',
    py_modules = ['bagit',],
    scripts = ['bagit.py'],
    description = description,
    platforms = ['POSIX'],
    test_suite = 'test',
    install_requires = requirements,
    classifiers = [
        'License :: Public Domain',
        'Intended Audience :: Developers',
        'Topic :: Communications :: File Sharing',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: System :: Filesystems',
        'Programming Language :: Python :: 2.4',
        'Programming Language :: Python :: 2.5',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7'
    ],
)
17 | 18 | Command Line Usage 19 | ------------------ 20 | 21 | When you install bagit you should get a command line program called bagit.py 22 | which you can use to turn an existing directory into a bag: 23 | 24 | bagit.py --contact-name 'John Kunze' /directory/to/bag 25 | 26 | You can pass in key/value metadata for the bag using options like 27 | `--contact-name` above, which get persisted to the bag-info.txt. For a 28 | complete list of bag-info.txt properties you can use as command line 29 | arguments see `--help`. 30 | 31 | Since calculating checksums can take a while when creating a bag, you may want 32 | to calculate them in parallel if you are on a multicore machine. You can do 33 | that with the `--processes` option: 34 | 35 | bagit.py --processes 4 /directory/to/bag 36 | 37 | To specify which checksum algorithm(s) to use when generating the manifest, 38 | use the --md5, --sha1, and/or --sha256 flags (MD5 is generated by default). 39 | 40 | bagit.py --sha1 /path/to/bag 41 | bagit.py --sha256 /path/to/bag 42 | 43 | If you would like to validate a bag you can use the --validate flag. 44 | 45 | bagit.py --validate /path/to/bag 46 | 47 | If you would like to take a quick look at the bag to see if it seems valid 48 | by just examining the structure of the bag, and comparing its payload-oxum (byte 49 | count and number of files) then use the `--fast` flag. 50 | 51 | bagit.py --validate --fast /path/to/bag 52 | 53 | Python Usage 54 | ------------ 55 | 56 | You can also use bagit programmatically in your own Python programs. To 57 | create a bag you would do this: 58 | 59 | ```python 60 | import bagit 61 | bag = bagit.make_bag('mydir', {'Contact-Name': 'John Kunze'}) 62 | ``` 63 | 64 | `make_bag` returns a Bag instance. 
If you have a bag already on disk and would 65 | like to create a Bag instance for it, simply call the constructor directly: 66 | 67 | ```python 68 | import bagit 69 | bag = bagit.Bag('/path/to/bag') 70 | ``` 71 | 72 | If you would like to see if a bag is valid, use its `is_valid` method: 73 | 74 | ```python 75 | bag = bagit.Bag('/path/to/bag') 76 | if bag.is_valid(): 77 | print "yay :)" 78 | else: 79 | print "boo :(" 80 | ``` 81 | 82 | If you'd like to get a detailed list of validation errors, 83 | execute the `validate` method and catch the `BagValidationError` 84 | exception. If the bag's manifest was invalid (and it wasn't caught by the 85 | payload oxum) the exception's `details` property will contain a list of 86 | `ManifestError`s that you can introspect on. Each ManifestError will be of 87 | type `ChecksumMismatch`, `FileMissing`, or `UnexpectedFile`. 88 | 89 | So for example if you want to print out checksums that failed to validate 90 | you can do this: 91 | 92 | ```python 93 | 94 | import bagit 95 | 96 | bag = bagit.Bag("/path/to/bag") 97 | 98 | try: 99 | bag.validate() 100 | 101 | except bagit.BagValidationError, e: 102 | for d in e.details: 103 | if isinstance(d, bagit.ChecksumMismatch): 104 | print "expected %s to have %s checksum of %s but found %s" % \ 105 | (d.path, d.algorithm, d.expected, d.found) 106 | ``` 107 | 108 | To iterate through a bag's manifest and retrieve checksums for the payload 109 | files use the bag's entries dictionary: 110 | 111 | ```python 112 | bag = bagit.Bag("/path/to/bag") 113 | 114 | for path, fixity in bag.entries.items(): 115 | print "path:%s md5:%s" % (path, fixity["md5"]) 116 | ``` 117 | 118 | Development 119 | ----------- 120 | 121 | % git clone git://github.com/LibraryOfCongress/bagit-python.git 122 | % cd bagit-python 123 | % python test.py 124 | 125 | If you'd like to see how increasing parallelization of bag creation on 126 | your system affects the time to create a bag try using the included bench 127 | utility: 128 
class TestBag(unittest.TestCase):
    """
    Tests for bagit.make_bag and bagit.Bag.

    Every test works on a private copy of the ``test-data`` directory, so the
    tests must be run from the directory that contains ``test-data``.
    """

    def setUp(self):
        # shutil.copytree needs a destination that does not exist yet, so
        # mkdtemp is only used to obtain a unique name and the directory
        # itself is removed again before copying.
        self.tmpdir = tempfile.mkdtemp()
        if os.path.isdir(self.tmpdir):
            shutil.rmtree(self.tmpdir)
        shutil.copytree('test-data', self.tmpdir)

    def tearDown(self):
        if os.path.isdir(self.tmpdir):
            shutil.rmtree(self.tmpdir)

    def _read_file(self, path, mode="r"):
        # Return the whole content of path; the handle is always closed.
        fh = open(path, mode)
        try:
            return fh.read()
        finally:
            fh.close()

    def _write_file(self, path, data, mode="w"):
        # Write data to path; the handle is always closed.
        fh = open(path, mode)
        try:
            fh.write(data)
        finally:
            fh.close()

    def test_make_bag(self):
        info = {'Bagging-Date': '1970-01-01', 'Contact-Email': 'ehs@pobox.com'}
        bag = bagit.make_bag(self.tmpdir, bag_info=info)

        # data dir should've been created
        self.assertTrue(os.path.isdir(j(self.tmpdir, 'data')))

        # check bagit.txt
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'bagit.txt')))
        bagit_txt = self._read_file(j(self.tmpdir, 'bagit.txt'))
        self.assertTrue('BagIt-Version: 0.97' in bagit_txt)
        self.assertTrue('Tag-File-Character-Encoding: UTF-8' in bagit_txt)

        # check manifest
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-md5.txt')))
        manifest_txt = self._read_file(j(self.tmpdir, 'manifest-md5.txt'))
        self.assertTrue('8e2af7a0143c7b8f4de0b3fc90f27354 data/README' in manifest_txt)
        self.assertTrue('9a2b89e9940fea6ac3a0cc71b0a933a0 data/loc/2478433644_2839c5e8b8_o_d.jpg' in manifest_txt)
        self.assertTrue('6172e980c2767c12135e3b9d246af5a3 data/loc/3314493806_6f1db86d66_o_d.jpg' in manifest_txt)
        self.assertTrue('38a84cd1c41de793a0bccff6f3ec8ad0 data/si/2584174182_ffd5c24905_b_d.jpg' in manifest_txt)
        self.assertTrue('5580eaa31ad1549739de12df819e9af8 data/si/4011399822_65987a4806_b_d.jpg' in manifest_txt)

        # check bag-info.txt
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'bag-info.txt')))
        bag_info_txt = self._read_file(j(self.tmpdir, 'bag-info.txt'))
        self.assertTrue('Contact-Email: ehs@pobox.com' in bag_info_txt)
        self.assertTrue('Bagging-Date: 1970-01-01' in bag_info_txt)
        self.assertTrue('Payload-Oxum: 991765.5' in bag_info_txt)
        self.assertTrue('Bag-Software-Agent: bagit.py ' in bag_info_txt)

        # check tagmanifest-md5.txt
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'tagmanifest-md5.txt')))
        tagmanifest_txt = self._read_file(j(self.tmpdir, 'tagmanifest-md5.txt'))
        self.assertTrue('9e5ad981e0d29adc278f6a294b8c2aca bagit.txt' in tagmanifest_txt)
        self.assertTrue('a0ce6631a2a6d1a88e6d38453ccc72a5 manifest-md5.txt' in tagmanifest_txt)
        self.assertTrue('6a5090e27cb29d5dda8a0142fbbdf37e bag-info.txt' in tagmanifest_txt)

    def test_make_bag_sha1_manifest(self):
        bag = bagit.make_bag(self.tmpdir, checksum=['sha1'])
        # check manifest
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-sha1.txt')))
        manifest_txt = self._read_file(j(self.tmpdir, 'manifest-sha1.txt'))
        self.assertTrue('ace19416e605cfb12ab11df4898ca7fd9979ee43 data/README' in manifest_txt)
        self.assertTrue('4c0a3da57374e8db379145f18601b159f3cad44b data/loc/2478433644_2839c5e8b8_o_d.jpg' in manifest_txt)
        self.assertTrue('62095aeddae2f3207cb77c85937e13c51641ef71 data/loc/3314493806_6f1db86d66_o_d.jpg' in manifest_txt)
        self.assertTrue('e592194b3733e25166a631e1ec55bac08066cbc1 data/si/2584174182_ffd5c24905_b_d.jpg' in manifest_txt)
        self.assertTrue('db49ef009f85a5d0701829f38d29f8cf9c5df2ea data/si/4011399822_65987a4806_b_d.jpg' in manifest_txt)

    def test_make_bag_sha256_manifest(self):
        bag = bagit.make_bag(self.tmpdir, checksum=['sha256'])
        # check manifest
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-sha256.txt')))
        manifest_txt = self._read_file(j(self.tmpdir, 'manifest-sha256.txt'))
        self.assertTrue('b6df8058fa818acfd91759edffa27e473f2308d5a6fca1e07a79189b95879953 data/loc/2478433644_2839c5e8b8_o_d.jpg' in manifest_txt)
        self.assertTrue('1af90c21e72bb0575ae63877b3c69cfb88284f6e8c7820f2c48dc40a08569da5 data/loc/3314493806_6f1db86d66_o_d.jpg' in manifest_txt)
        self.assertTrue('f065a4ae2bc5d47c6d046c3cba5c8cdfd66b07c96ff3604164e2c31328e41c1a data/si/2584174182_ffd5c24905_b_d.jpg' in manifest_txt)
        self.assertTrue('45d257c93e59ec35187c6a34c8e62e72c3e9cfbb548984d6f6e8deb84bac41f4 data/si/4011399822_65987a4806_b_d.jpg' in manifest_txt)

    def test_make_bag_sha1_sha256_manifest(self):
        bag = bagit.make_bag(self.tmpdir, checksum=['sha1', 'sha256'])
        # check that relevant manifests are created
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-sha1.txt')))
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-sha256.txt')))
        # check valid with two manifests
        self.assertTrue(bag.validate(fast=True))

    def test_make_bag_md5_sha256_manifest(self):
        bag = bagit.make_bag(self.tmpdir, checksum=['md5', 'sha256'])
        # check that relevant manifests are created
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-md5.txt')))
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-sha256.txt')))
        # check valid with two manifests
        self.assertTrue(bag.validate(fast=True))

    def test_make_bag_md5_sha1_sha256_manifest(self):
        bag = bagit.make_bag(self.tmpdir, checksum=['md5', 'sha1', 'sha256'])
        # check that relevant manifests are created
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-md5.txt')))
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-sha1.txt')))
        self.assertTrue(os.path.isfile(j(self.tmpdir, 'manifest-sha256.txt')))
        # check valid with three manifests
        self.assertTrue(bag.validate(fast=True))

    def test_make_bag_with_data_dir_present(self):
        os.mkdir(j(self.tmpdir, 'data'))
        bag = bagit.make_bag(self.tmpdir)

        # data dir should now contain another data dir
        self.assertTrue(os.path.isdir(j(self.tmpdir, 'data', 'data')))

    def test_bag_class(self):
        info = {'Contact-Email': 'ehs@pobox.com'}
        bag = bagit.make_bag(self.tmpdir, bag_info=info)
        self.assertTrue(isinstance(bag, bagit.Bag))
        self.assertEqual(set(bag.payload_files()), set([
            'data/README',
            'data/si/2584174182_ffd5c24905_b_d.jpg',
            'data/si/4011399822_65987a4806_b_d.jpg',
            'data/loc/2478433644_2839c5e8b8_o_d.jpg',
            'data/loc/3314493806_6f1db86d66_o_d.jpg']))
        self.assertEqual(list(bag.manifest_files()),
                         ['%s/manifest-md5.txt' % self.tmpdir])

    def test_has_oxum(self):
        bag = bagit.make_bag(self.tmpdir)
        self.assertTrue(bag.has_oxum())

    def test_bag_constructor(self):
        bag = bagit.make_bag(self.tmpdir)
        bag = bagit.Bag(self.tmpdir)
        self.assertEqual(type(bag), bagit.Bag)
        self.assertEqual(len(list(bag.payload_files())), 5)

    def test_validate_flipped_bit(self):
        bag = bagit.make_bag(self.tmpdir)
        readme = j(self.tmpdir, "data", "README")
        txt = self._read_file(readme)
        # same length, different content: the oxum stays the same
        txt = 'A' + txt[1:]
        self._write_file(readme, txt)
        bag = bagit.Bag(self.tmpdir)
        self.assertRaises(bagit.BagValidationError, bag.validate)
        # fast doesn't catch the flipped bit, since oxsum is the same
        self.assertTrue(bag.validate(fast=True))

    def test_validate_fast(self):
        bag = bagit.make_bag(self.tmpdir)
        self.assertEqual(bag.validate(fast=True), True)
        os.remove(j(self.tmpdir, "data", "loc",
                    "2478433644_2839c5e8b8_o_d.jpg"))
        self.assertRaises(bagit.BagValidationError, bag.validate, fast=True)

    def test_validate_fast_without_oxum(self):
        bag = bagit.make_bag(self.tmpdir)
        os.remove(j(self.tmpdir, "bag-info.txt"))
        bag = bagit.Bag(self.tmpdir)
        self.assertRaises(bagit.BagValidationError, bag.validate, fast=True)

    def test_validate_slow_without_oxum_extra_file(self):
        bag = bagit.make_bag(self.tmpdir)
        os.remove(j(self.tmpdir, "bag-info.txt"))
        self._write_file(j(self.tmpdir, "data", "extra_file"), "foo")
        bag = bagit.Bag(self.tmpdir)
        self.assertRaises(bagit.BagValidationError, bag.validate, fast=False)

    def test_validation_error_details(self):
        # sys.exc_info() is used to fetch the exception so this method does
        # not depend on a version-specific "except ... as/," spelling.
        import sys

        bag = bagit.make_bag(self.tmpdir)
        readme = j(self.tmpdir, "data", "README")
        txt = self._read_file(readme)
        txt = 'A' + txt[1:]
        self._write_file(readme, txt)

        extra_file = j(self.tmpdir, "data", "extra")
        self._write_file(extra_file, 'foo')

        # remove the bag-info.txt which contains the oxum to force a full
        # check of the manifest
        os.remove(j(self.tmpdir, "bag-info.txt"))

        bag = bagit.Bag(self.tmpdir)
        try:
            bag.validate()
        except bagit.BagValidationError:
            e = sys.exc_info()[1]
        else:
            # Fail here, before touching ``e``: without an exception the
            # name is unbound and the test would die with a NameError
            # instead of reporting a proper failure.
            self.fail("didn't get BagValidationError")

        self.assertEqual(str(e), "invalid bag: bag-info.txt exists in manifest but not found on filesystem ; data/extra exists on filesystem but is not in manifest ; data/README checksum validation failed (alg=md5 expected=8e2af7a0143c7b8f4de0b3fc90f27354 found=fd41543285d17e7c29cd953f5cf5b955)")
        self.assertEqual(len(e.details), 3)

        error = e.details[0]
        self.assertEqual(str(error), "bag-info.txt exists in manifest but not found on filesystem")
        self.assertTrue(isinstance(error, bagit.FileMissing))
        self.assertEqual(error.path, "bag-info.txt")

        error = e.details[1]
        self.assertEqual(str(error), "data/extra exists on filesystem but is not in manifest")
        self.assertTrue(isinstance(error, bagit.UnexpectedFile))
        self.assertEqual(error.path, "data/extra")

        error = e.details[2]
        self.assertEqual(str(error), "data/README checksum validation failed (alg=md5 expected=8e2af7a0143c7b8f4de0b3fc90f27354 found=fd41543285d17e7c29cd953f5cf5b955)")
        self.assertTrue(isinstance(error, bagit.ChecksumMismatch))
        self.assertEqual(error.algorithm, 'md5')
        self.assertEqual(error.path, 'data/README')
        self.assertEqual(error.expected, '8e2af7a0143c7b8f4de0b3fc90f27354')
        self.assertEqual(error.found, 'fd41543285d17e7c29cd953f5cf5b955')

    def test_is_valid(self):
        bag = bagit.make_bag(self.tmpdir)
        bag = bagit.Bag(self.tmpdir)
        self.assertTrue(bag.is_valid())
        self._write_file(j(self.tmpdir, "data", "extra_file"), "bar")
        self.assertFalse(bag.is_valid())

    def test_bom_in_bagit_txt(self):
        bag = bagit.make_bag(self.tmpdir)
        bagit_path = j(self.tmpdir, "bagit.txt")
        # prepend a UTF-8 byte order mark to the existing bagit.txt
        bagfile = codecs.BOM_UTF8
        bagfile += self._read_file(bagit_path, "rb")
        self._write_file(bagit_path, bagfile, "wb")
        bag = bagit.Bag(self.tmpdir)
        self.assertRaises(bagit.BagValidationError, bag.validate)

    def test_missing_file(self):
        bag = bagit.make_bag(self.tmpdir)
        os.remove(j(self.tmpdir, 'data', 'loc', '3314493806_6f1db86d66_o_d.jpg'))
        self.assertRaises(bagit.BagValidationError, bag.validate)

    def test_handle_directory_end_slash_gracefully(self):
        bag = bagit.make_bag(self.tmpdir + '/')
        self.assertTrue(bag.validate())
        bag2 = bagit.Bag(self.tmpdir + '/')
        self.assertTrue(bag2.validate())

    def test_allow_extraneous_files_in_base(self):
        bag = bagit.make_bag(self.tmpdir)
        self.assertTrue(bag.validate())
        f = j(self.tmpdir, "IGNOREFILE")
        # create an empty file and release the handle straight away
        open(f, 'w').close()
        self.assertTrue(bag.validate())

    def test_allow_extraneous_dirs_in_base(self):
        bag = bagit.make_bag(self.tmpdir)
        self.assertTrue(bag.validate())
        d = j(self.tmpdir, "IGNOREDIR")
        os.mkdir(d)
        self.assertTrue(bag.validate())

    def test_missing_tagfile_raises_error(self):
        bag = bagit.make_bag(self.tmpdir)
        self.assertTrue(bag.validate())
        os.remove(j(self.tmpdir, "bagit.txt"))
        self.assertRaises(bagit.BagValidationError, bag.validate)

    def test_missing_manifest_raises_error(self):
        bag = bagit.make_bag(self.tmpdir)
        self.assertTrue(bag.validate())
        os.remove(j(self.tmpdir, "manifest-md5.txt"))
        self.assertRaises(bagit.BagValidationError, bag.validate)

    def test_make_bag_multiprocessing(self):
        bag = bagit.make_bag(self.tmpdir, processes=2)
        self.assertTrue(os.path.isdir(j(self.tmpdir, 'data')))

    def test_mixed_case_checksums(self):
        bag = bagit.make_bag(self.tmpdir)
        manifest_path = j(self.tmpdir, "manifest-md5.txt")
        tagmanifest_path = j(self.tmpdir, "tagmanifest-md5.txt")

        fixity = {}
        # Extract entries only for the payload and ignore
        # entries from the tagmanifest file
        for key in bag.entries.iterkeys():
            if key.startswith('data' + os.sep):
                fixity = bag.entries[key]
        hashstr = fixity.itervalues().next()

        # upper-case one payload checksum in the manifest
        manifest = self._read_file(manifest_path, "r")
        manifest = manifest.replace(hashstr, hashstr.upper())
        self._write_file(manifest_path, manifest, "w")

        # Since manifest-md5.txt file is updated, re-calculate its
        # md5 checksum and update it in the tagmanifest-md5.txt file
        hasher = hashlib.new('md5')
        hasher.update(self._read_file(manifest_path, "r"))
        tagmanifest = self._read_file(tagmanifest_path, "r")
        tagmanifest = tagmanifest.replace(
            bag.entries['manifest-md5.txt']['md5'], hasher.hexdigest())
        self._write_file(tagmanifest_path, tagmanifest, "w")

        bag = bagit.Bag(self.tmpdir)
        self.assertTrue(bag.validate())

    def test_multiple_oxum_values(self):
        bag = bagit.make_bag(self.tmpdir)
        # append a second, different Payload-Oxum line to bag-info.txt
        self._write_file(j(self.tmpdir, "bag-info.txt"),
                         'Payload-Oxum: 7.7\n', "a")
        bag = bagit.Bag(self.tmpdir)
        self.assertTrue(bag.validate(fast=True))

    def test_multiple_meta_values(self):
        baginfo = {"Multival-Meta": [7, 4, 8, 6, 8]}
        bag = bagit.make_bag(self.tmpdir, baginfo)
        meta = bag.info.get("Multival-Meta")
        self.assertEqual(type(meta), list)
        self.assertEqual(len(meta), len(baginfo["Multival-Meta"]))

    def test_validate_optional_tagfile(self):
        bag = bagit.make_bag(self.tmpdir)
        tagdir = tempfile.mkdtemp(dir=self.tmpdir)
        tagfile_path = j(tagdir, "tagfile")
        tagmanifest_path = j(self.tmpdir, "tagmanifest-md5.txt")
        self._write_file(tagfile_path, "test")

        relpath = tagfile_path.replace(self.tmpdir + os.sep, "")
        # str.replace returns a new string; the result has to be kept for
        # the separator normalisation to have any effect.
        relpath = relpath.replace("\\", "/")

        # Incorrect checksum.
        self._write_file(tagmanifest_path,
                         "8e2af7a0143c7b8f4de0b3fc90f27354 " + relpath + "\n")
        bag = bagit.Bag(self.tmpdir)
        self.assertRaises(bagit.BagValidationError, bag.validate)

        # Correct checksum.
        hasher = hashlib.new("md5")
        hasher.update(self._read_file(tagfile_path, "rb"))
        self._write_file(tagmanifest_path,
                         hasher.hexdigest() + " " + relpath + "\n")
        bag = bagit.Bag(self.tmpdir)
        self.assertTrue(bag.validate())

        # Missing tagfile.
        os.remove(tagfile_path)
        bag = bagit.Bag(self.tmpdir)
        self.assertRaises(bagit.BagValidationError, bag.validate)

    def test_default_bagging_date(self):
        info = {'Contact-Email': 'ehs@pobox.com'}
        bag = bagit.make_bag(self.tmpdir, bag_info=info)
        bag_info_txt = self._read_file(j(self.tmpdir, 'bag-info.txt'))
        self.assertTrue('Contact-Email: ehs@pobox.com' in bag_info_txt)
        today = datetime.date.strftime(datetime.date.today(), "%Y-%m-%d")
        self.assertTrue('Bagging-Date: %s' % today in bag_info_txt)

    def test_missing_tagmanifest_valid(self):
        info = {'Contact-Email': 'ehs@pobox.com'}
        bag = bagit.make_bag(self.tmpdir, bag_info=info)
        self.assertEqual(bag.is_valid(), True)
        os.remove(j(self.tmpdir, 'tagmanifest-md5.txt'))
        self.assertEqual(bag.is_valid(), True)

    def test_carriage_return_manifest(self):
        # a payload file whose name ends in a carriage return
        self._write_file(j(self.tmpdir, "newline\r"), "ugh")
        bag = bagit.make_bag(self.tmpdir)
        self.assertEqual(bag.is_valid(), True)
def make_bag(bag_dir, bag_info=None, processes=1, checksum=None):
    """
    Convert a given directory into a bag, in place.

    Everything currently inside bag_dir is moved into a new ``data``
    sub-directory.  One payload manifest per checksum algorithm, bagit.txt,
    bag-info.txt and tagmanifest-md5.txt are then written next to it.

    bag_dir   -- path of the directory to bag
    bag_info  -- optional dictionary of key/value pairs for bag-info.txt; a
                 list value is written as one line per item.  Bagging-Date
                 and Bag-Software-Agent are filled in unless supplied and
                 Payload-Oxum is always set.  The dictionary passed in is
                 not modified.
    processes -- handed through to _make_manifest
    checksum  -- list of checksum algorithm names, ['md5'] when omitted

    Returns a Bag for bag_dir.  Raises RuntimeError when bag_dir is not a
    directory and calls sys.exit() when its content cannot be moved or read.
    The process working directory is changed while bagging and is restored
    on every way out of the function.
    """
    bag_dir = os.path.abspath(bag_dir)
    logging.info("creating bag for directory %s", bag_dir)
    # assume md5 checksum if not specified
    if not checksum:
        checksum = ['md5']

    if not os.path.isdir(bag_dir):
        logging.error("no such bag directory %s", bag_dir)
        raise RuntimeError("no such bag directory %s" % bag_dir)

    # Work on a copy so the caller's dictionary is never modified.
    if bag_info is None:
        bag_info = {}
    else:
        bag_info = dict(bag_info)

    old_dir = os.path.abspath(os.path.curdir)
    os.chdir(bag_dir)

    # The outer try/finally restores the working directory on every exit
    # path.  That includes sys.exit(): SystemExit is not an Exception
    # subclass on current Pythons, so an except clause alone would leave the
    # process inside bag_dir.
    try:
        try:
            unbaggable = _can_bag(os.curdir)
            if unbaggable:
                logging.error("no write permissions for the following directories and files: \n%s", unbaggable)
                sys.exit("\nNot all files/folders can be moved.")

            unreadable_dirs, unreadable_files = _can_read(os.curdir)
            if unreadable_dirs:
                logging.error("The following directories do not have read permissions: \n%s", unreadable_dirs)
            if unreadable_files:
                logging.error("The following files do not have read permissions: \n%s", unreadable_files)
            if unreadable_dirs or unreadable_files:
                sys.exit("\nRead permissions are required to calculate file fixities.")

            logging.info("creating data dir")
            # Collect the payload in a uniquely named directory first so an
            # existing entry called "data" is moved like any other file.
            temp_data = tempfile.mkdtemp(dir=os.getcwd())

            for f in os.listdir('.'):
                if os.path.abspath(f) == temp_data:
                    continue
                new_f = os.path.join(temp_data, f)
                logging.info("moving %s to %s", f, new_f)
                os.rename(f, new_f)

            logging.info("moving %s to %s", temp_data, 'data')
            os.rename(temp_data, 'data')

            # checksum is never empty here, so Oxum is always bound after
            # the loop.
            for c in checksum:
                logging.info("writing manifest-%s.txt", c)
                Oxum = _make_manifest('manifest-%s.txt' % c, 'data', processes, c)

            logging.info("writing bagit.txt")
            txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
            bagit_txt = open("bagit.txt", "wb")
            try:
                bagit_txt.write(txt)
            finally:
                bagit_txt.close()

            logging.info("writing bag-info.txt")
            # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overidden
            if 'Bagging-Date' not in bag_info:
                bag_info['Bagging-Date'] = date.strftime(date.today(), "%Y-%m-%d")
            if 'Bag-Software-Agent' not in bag_info:
                bag_info['Bag-Software-Agent'] = 'bagit.py '
            bag_info['Payload-Oxum'] = Oxum

            bag_info_txt = open("bag-info.txt", "wb")
            try:
                for h in sorted(bag_info):
                    # v0.97 support for multiple instances of any meta item.
                    values = bag_info[h]
                    if not isinstance(values, list):
                        values = [values]
                    for val in values:
                        bag_info_txt.write("%s: %s\n" % (h, val))
            finally:
                bag_info_txt.close()

            _make_tagmanifest_file('tagmanifest-md5.txt', bag_dir)

        except Exception:
            # Log, then re-raise with a bare raise so the original traceback
            # is kept.
            logging.error(sys.exc_info()[1])
            raise
    finally:
        os.chdir(old_dir)

    return Bag(bag_dir)
185 | bagit_file_path = os.path.join(self.path, "bagit.txt") 186 | 187 | if not isfile(bagit_file_path): 188 | raise BagError("No bagit.txt found: %s" % bagit_file_path) 189 | 190 | self.tags = tags = _load_tag_file(bagit_file_path) 191 | 192 | try: 193 | self.version = tags["BagIt-Version"] 194 | self.encoding = tags["Tag-File-Character-Encoding"] 195 | except KeyError, e: 196 | raise BagError("Missing required tag in bagit.txt: %s" % e) 197 | 198 | if self.version == "0.95": 199 | self.tag_file_name = "package-info.txt" 200 | elif self.version in ["0.96", "0.97"]: 201 | self.tag_file_name = "bag-info.txt" 202 | else: 203 | raise BagError("Unsupported bag version: %s" % self.version) 204 | 205 | if not self.encoding.lower() == "utf-8": 206 | raise BagValidationError("Unsupported encoding: %s" % self.encoding) 207 | 208 | info_file_path = os.path.join(self.path, self.tag_file_name) 209 | if os.path.exists(info_file_path): 210 | self.info = _load_tag_file(info_file_path, duplicates=True) 211 | 212 | self._load_manifests() 213 | 214 | def manifest_files(self): 215 | for filename in ["manifest-%s.txt" % a for a in checksum_algos]: 216 | f = os.path.join(self.path, filename) 217 | if isfile(f): 218 | yield f 219 | 220 | def tagmanifest_files(self): 221 | for filename in ["tagmanifest-%s.txt" % a for a in checksum_algos]: 222 | f = os.path.join(self.path, filename) 223 | if isfile(f): 224 | yield f 225 | 226 | def compare_manifests_with_fs(self): 227 | files_on_fs = set(map(_encode_filename, self.payload_files())) 228 | files_in_manifest = set(self.payload_entries().keys()) 229 | 230 | if self.version == "0.97": 231 | files_in_manifest = files_in_manifest | set(self.missing_optional_tagfiles()) 232 | 233 | return (list(files_in_manifest - files_on_fs), 234 | list(files_on_fs - files_in_manifest)) 235 | 236 | def compare_fetch_with_fs(self): 237 | """Compares the fetch entries with the files actually 238 | in the payload, and returns a list of all the files 239 | that 
still need to be fetched. 240 | """ 241 | 242 | files_on_fs = set(self.payload_files()) 243 | files_in_fetch = set(self.files_to_be_fetched()) 244 | 245 | return list(files_in_fetch - files_on_fs) 246 | 247 | def payload_files(self): 248 | payload_dir = os.path.join(self.path, "data") 249 | 250 | for dirpath, dirnames, filenames in os.walk(payload_dir): 251 | for f in filenames: 252 | # Jump through some hoops here to make the payload files come out 253 | # looking like data/dir/file, rather than having the entire path. 254 | rel_path = os.path.join(dirpath, os.path.normpath(f.replace('\\', '/'))) 255 | rel_path = rel_path.replace(self.path + os.path.sep, "", 1) 256 | yield rel_path 257 | 258 | def payload_entries(self): 259 | # Don't use dict comprehension (compatibility with Python < 2.7) 260 | return dict((key, value) for (key, value) in self.entries.iteritems() \ 261 | if key.startswith("data" + os.sep)) 262 | 263 | def tagfile_entries(self): 264 | return dict((key, value) for (key, value) in self.entries.iteritems() \ 265 | if not key.startswith("data" + os.sep)) 266 | 267 | def missing_optional_tagfiles(self): 268 | """ 269 | From v0.97 we need to validate any tagfiles listed 270 | in the optional tagmanifest(s). As there is no mandatory 271 | directory structure for additional tagfiles we can 272 | only check for entries with missing files (not missing 273 | entries for existing files). 
274 | """ 275 | for tagfilepath in self.tagfile_entries().keys(): 276 | if not os.path.isfile(os.path.join(self.path, tagfilepath)): 277 | yield tagfilepath 278 | 279 | def fetch_entries(self): 280 | fetch_file_path = os.path.join(self.path, "fetch.txt") 281 | 282 | if isfile(fetch_file_path): 283 | fetch_file = open(fetch_file_path, 'rb') 284 | 285 | try: 286 | for line in fetch_file: 287 | parts = line.strip().split(None, 2) 288 | yield (parts[0], parts[1], parts[2]) 289 | except Exception, e: 290 | fetch_file.close() 291 | raise e 292 | 293 | fetch_file.close() 294 | 295 | def files_to_be_fetched(self): 296 | for f, size, path in self.fetch_entries(): 297 | yield f 298 | 299 | def has_oxum(self): 300 | return self.info.has_key('Payload-Oxum') 301 | 302 | def validate(self, fast=False): 303 | """Checks the structure and contents are valid. If you supply 304 | the parameter fast=True the Payload-Oxum (if present) will 305 | be used to check that the payload files are present and 306 | accounted for, instead of re-calculating fixities and 307 | comparing them against the manifest. By default validate() 308 | will re-calculate fixities (fast=False). 309 | """ 310 | self._validate_structure() 311 | self._validate_bagittxt() 312 | self._validate_contents(fast=fast) 313 | return True 314 | 315 | def is_valid(self, fast=False): 316 | """Returns validation success or failure as boolean. 317 | Optional fast parameter passed directly to validate(). 318 | """ 319 | try: 320 | self.validate(fast=fast) 321 | except BagError, e: 322 | return False 323 | return True 324 | 325 | def _load_manifests(self): 326 | manifests = list(self.manifest_files()) 327 | 328 | if self.version == "0.97": 329 | # v0.97 requires that optional tagfiles are verified. 
330 | manifests += list(self.tagmanifest_files()) 331 | 332 | for manifest_file in manifests: 333 | if not manifest_file.find("tagmanifest-") is -1: 334 | search = "tagmanifest-" 335 | else: 336 | search = "manifest-" 337 | alg = os.path.basename(manifest_file).replace(search, "").replace(".txt", "") 338 | self.algs.append(alg) 339 | 340 | manifest_file = open(manifest_file, 'rb') 341 | 342 | try: 343 | for line in manifest_file: 344 | line = line.strip() 345 | 346 | # Ignore blank lines and comments. 347 | if line == "" or line.startswith("#"): continue 348 | 349 | entry = line.split(None, 1) 350 | 351 | # Format is FILENAME *CHECKSUM 352 | if len(entry) != 2: 353 | logging.error("%s: Invalid %s manifest entry: %s", self, alg, line) 354 | continue 355 | 356 | entry_hash = entry[0] 357 | entry_path = os.path.normpath(entry[1].lstrip("*")) 358 | 359 | if self.entries.has_key(entry_path): 360 | if self.entries[entry_path].has_key(alg): 361 | logging.warning("%s: Duplicate %s manifest entry: %s", self, alg, entry_path) 362 | 363 | self.entries[entry_path][alg] = entry_hash 364 | else: 365 | self.entries[entry_path] = {} 366 | self.entries[entry_path][alg] = entry_hash 367 | finally: 368 | manifest_file.close() 369 | 370 | def _validate_structure(self): 371 | """Checks the structure of the bag, determining if it conforms to the 372 | BagIt spec. Returns true on success, otherwise it will raise 373 | a BagValidationError exception. 
374 | """ 375 | self._validate_structure_payload_directory() 376 | self._validate_structure_tag_files() 377 | 378 | def _validate_structure_payload_directory(self): 379 | data_dir_path = os.path.join(self.path, "data") 380 | 381 | if not isdir(data_dir_path): 382 | raise BagValidationError("Missing data directory") 383 | 384 | def _validate_structure_tag_files(self): 385 | # Note: we deviate somewhat from v0.96 of the spec in that it allows 386 | # other files and directories to be present in the base directory 387 | if len(list(self.manifest_files())) == 0: 388 | raise BagValidationError("Missing manifest file") 389 | if "bagit.txt" not in os.listdir(self.path): 390 | raise BagValidationError("Missing bagit.txt") 391 | 392 | def _validate_contents(self, fast=False): 393 | if fast and not self.has_oxum(): 394 | raise BagValidationError("cannot validate Bag with fast=True if Bag lacks a Payload-Oxum") 395 | self._validate_oxum() # Fast 396 | if not fast: 397 | self._validate_entries() # *SLOW* 398 | 399 | def _validate_oxum(self): 400 | oxum = self.info.get('Payload-Oxum') 401 | if oxum == None: return 402 | 403 | # If multiple Payload-Oxum tags (bad idea) 404 | # use the first listed in bag-info.txt 405 | if type(oxum) is list: 406 | oxum = oxum[0] 407 | 408 | byte_count, file_count = oxum.split('.', 1) 409 | 410 | if not byte_count.isdigit() or not file_count.isdigit(): 411 | raise BagError("Invalid oxum: %s" % oxum) 412 | 413 | byte_count = long(byte_count) 414 | file_count = long(file_count) 415 | total_bytes = 0 416 | total_files = 0 417 | 418 | for payload_file in self.payload_files(): 419 | payload_file = os.path.join(self.path, payload_file) 420 | total_bytes += os.stat(payload_file).st_size 421 | total_files += 1 422 | 423 | if file_count != total_files or byte_count != total_bytes: 424 | raise BagValidationError("Oxum error. Found %s files and %s bytes on disk; expected %s files and %s bytes." 
% (total_files, total_bytes, file_count, byte_count)) 425 | 426 | def _validate_entries(self): 427 | """ 428 | Verify that the actual file contents match the recorded hashes stored in the manifest files 429 | """ 430 | errors = list() 431 | 432 | # First we'll make sure there's no mismatch between the filesystem 433 | # and the list of files in the manifest(s) 434 | only_in_manifests, only_on_fs = self.compare_manifests_with_fs() 435 | for path in only_in_manifests: 436 | e = FileMissing(path) 437 | logging.warning(str(e)) 438 | errors.append(e) 439 | for path in only_on_fs: 440 | e = UnexpectedFile(path) 441 | logging.warning(str(e)) 442 | errors.append(e) 443 | 444 | # To avoid the overhead of reading the file more than once or loading 445 | # potentially massive files into memory we'll create a dictionary of 446 | # hash objects so we can open a file, read a block and pass it to 447 | # multiple hash objects 448 | 449 | hashers = {} 450 | for alg in self.algs: 451 | try: 452 | hashers[alg] = hashlib.new(alg) 453 | except KeyError: 454 | logging.warning("Unable to validate file contents using unknown %s hash algorithm", alg) 455 | 456 | if not hashers: 457 | raise RuntimeError("%s: Unable to validate bag contents: none of the hash algorithms in %s are supported!" 
% (self, self.algs)) 458 | 459 | for rel_path, hashes in self.entries.items(): 460 | full_path = os.path.join(self.path, rel_path) 461 | 462 | # Create a clone of the default empty hash objects: 463 | f_hashers = dict( 464 | (alg, hashlib.new(alg)) for alg, h in hashers.items() if alg in hashes 465 | ) 466 | 467 | try: 468 | f_hashes = self._calculate_file_hashes(full_path, f_hashers) 469 | except BagValidationError, e: 470 | f_hashes = dict() # continue with no hashes 471 | # Any unhandled exceptions are probably fatal 472 | except: 473 | logging.exception("unable to calculate file hashes for %s: %s", self, full_path) 474 | raise 475 | 476 | for alg, computed_hash in f_hashes.items(): 477 | stored_hash = hashes[alg] 478 | if stored_hash.lower() != computed_hash: 479 | e = ChecksumMismatch(rel_path, alg, stored_hash.lower(), computed_hash) 480 | logging.warning(str(e)) 481 | errors.append(e) 482 | 483 | if errors: 484 | raise BagValidationError("invalid bag", errors) 485 | 486 | def _validate_bagittxt(self): 487 | """ 488 | Verify that bagit.txt conforms to specification 489 | """ 490 | bagit_file_path = os.path.join(self.path, "bagit.txt") 491 | bagit_file = open(bagit_file_path, 'rb') 492 | try: 493 | first_line = bagit_file.readline() 494 | if first_line.startswith(codecs.BOM_UTF8): 495 | raise BagValidationError("bagit.txt must not contain a byte-order mark") 496 | finally: 497 | bagit_file.close() 498 | 499 | 500 | def _calculate_file_hashes(self, full_path, f_hashers): 501 | """ 502 | Returns a dictionary of (algorithm, hexdigest) values for the provided 503 | filename 504 | """ 505 | if not os.path.exists(full_path): 506 | raise BagValidationError("%s does not exist" % full_path) 507 | 508 | f = open(full_path, 'rb') 509 | 510 | f_size = os.stat(full_path).st_size 511 | 512 | while True: 513 | block = f.read(1048576) 514 | if not block: 515 | break 516 | [ i.update(block) for i in f_hashers.values() ] 517 | f.close() 518 | 519 | return dict( 520 | (alg, 
h.hexdigest()) for alg, h in f_hashers.items() 521 | ) 522 | 523 | class BagError(Exception): 524 | pass 525 | 526 | class BagValidationError(BagError): 527 | def __init__(self, message, details=[]): 528 | self.message = message 529 | self.details = details 530 | def __str__(self): 531 | if len(self.details) > 0: 532 | details = " ; ".join([str(e) for e in self.details]) 533 | return "%s: %s" % (self.message, details) 534 | return self.message 535 | 536 | class ManifestErrorDetail(): 537 | def __init__(self, path): 538 | self.path = path 539 | 540 | class ChecksumMismatch(ManifestErrorDetail): 541 | def __init__(self, path, algorithm=None, expected=None, found=None): 542 | self.path = path 543 | self.algorithm = algorithm 544 | self.expected = expected 545 | self.found = found 546 | def __str__(self): 547 | return "%s checksum validation failed (alg=%s expected=%s found=%s)" % (self.path, self.algorithm, self.expected, self.found) 548 | 549 | class FileMissing(ManifestErrorDetail): 550 | def __str__(self): 551 | return "%s exists in manifest but not found on filesystem" % self.path 552 | 553 | class UnexpectedFile(ManifestErrorDetail): 554 | def __str__(self): 555 | return "%s exists on filesystem but is not in manifest" % self.path 556 | 557 | 558 | def _load_tag_file(tag_file_name, duplicates=False): 559 | """ 560 | If duplicates is True then allow duplicate entries 561 | for a given tag. This is desirable for bag-info.txt 562 | metadata in v0.97 of the spec. 563 | """ 564 | tag_file = open(tag_file_name, 'rb') 565 | 566 | try: 567 | if not duplicates: 568 | return dict(_parse_tags(tag_file)) 569 | 570 | # Store duplicate tags as list of vals 571 | # in order of parsing under the same key. 
572 | tags = {} 573 | for name, value in _parse_tags(tag_file): 574 | if not name in tags.keys(): 575 | tags[name] = value 576 | continue 577 | 578 | if not type(tags[name]) is list: 579 | tags[name] = [tags[name], value] 580 | else: 581 | tags[name].append(value) 582 | return tags 583 | 584 | finally: 585 | tag_file.close() 586 | 587 | def _parse_tags(file): 588 | """Parses a tag file, according to RFC 2822. This 589 | includes line folding, permitting extra-long 590 | field values. 591 | 592 | See http://www.faqs.org/rfcs/rfc2822.html for 593 | more information. 594 | """ 595 | 596 | tag_name = None 597 | tag_value = None 598 | 599 | # Line folding is handled by yielding values 600 | # only after we encounter the start of a new 601 | # tag, or if we pass the EOF. 602 | for num, line in enumerate(file): 603 | # If byte-order mark ignore it for now. 604 | if 0 == num: 605 | if line.startswith(codecs.BOM_UTF8): 606 | line = line.lstrip(codecs.BOM_UTF8) 607 | 608 | # Skip over any empty or blank lines. 609 | if len(line) == 0 or line.isspace(): 610 | continue 611 | 612 | if line[0].isspace(): # folded line 613 | tag_value += line.strip() 614 | else: 615 | # Starting a new tag; yield the last one. 616 | if tag_name: 617 | yield (tag_name, tag_value) 618 | 619 | parts = line.strip().split(':', 1) 620 | tag_name = parts[0].strip() 621 | tag_value = parts[1].strip() 622 | 623 | # Passed the EOF. All done after this. 
624 | if tag_name: 625 | yield (tag_name, tag_value) 626 | 627 | 628 | def _make_manifest(manifest_file, data_dir, processes, algorithm='md5'): 629 | logging.info('writing manifest with %s processes' % processes) 630 | 631 | # avoid using multiprocessing unless it is required since 632 | # multiprocessing doesn't work in some environments (mod_wsgi, etc) 633 | 634 | if algorithm == 'md5': 635 | manifest_line = _manifest_line_md5 636 | elif algorithm == 'sha1': 637 | manifest_line = _manifest_line_sha1 638 | elif algorithm == 'sha256': 639 | manifest_line = _manifest_line_sha256 640 | 641 | 642 | if processes > 1: 643 | pool = multiprocessing.Pool(processes=processes) 644 | checksums = pool.map(manifest_line, _walk(data_dir)) 645 | pool.close() 646 | pool.join() 647 | else: 648 | checksums = map(manifest_line, _walk(data_dir)) 649 | 650 | manifest = open(manifest_file, 'wb') 651 | num_files = 0 652 | total_bytes = 0 653 | 654 | for digest, filename, bytes in checksums: 655 | num_files += 1 656 | total_bytes += bytes 657 | manifest.write("%s %s\n" % (digest, _encode_filename(filename))) 658 | manifest.close() 659 | return "%s.%s" % (total_bytes, num_files) 660 | 661 | 662 | def _make_tagmanifest_file(tagmanifest_file, bag_dir): 663 | files = [f for f in listdir(bag_dir) if isfile(join(bag_dir, f))] 664 | checksums = [] 665 | for f in files: 666 | if f == tagmanifest_file: 667 | continue 668 | fh = open(join(bag_dir, f), 'rb') 669 | m = hashlib.md5() 670 | while True: 671 | bytes = fh.read(16384) 672 | if not bytes: 673 | break 674 | m.update(bytes) 675 | checksums.append((m.hexdigest(), f)) 676 | fh.close() 677 | 678 | tagmanifest = open(join(bag_dir, tagmanifest_file), 'wb') 679 | for digest, filename in checksums: 680 | tagmanifest.write('%s %s\n' % (digest, filename)) 681 | tagmanifest.close() 682 | 683 | 684 | def _walk(data_dir): 685 | for dirpath, dirnames, filenames in os.walk(data_dir): 686 | # if we don't sort here the order of entries is non-deterministic 
687 | # which makes it hard to test the fixity of tagmanifest-md5.txt 688 | filenames.sort() 689 | dirnames.sort() 690 | for fn in filenames: 691 | path = os.path.join(dirpath, fn) 692 | # BagIt spec requires manifest to always use '/' as path separator 693 | if os.path.sep != '/': 694 | parts = path.split(os.path.sep) 695 | path = '/'.join(parts) 696 | yield path 697 | 698 | def _can_bag(test_dir): 699 | """returns (unwriteable files/folders) 700 | """ 701 | unwriteable = [] 702 | for inode in os.listdir(test_dir): 703 | if not os.access(os.path.join(test_dir, inode), os.W_OK): 704 | unwriteable.append(os.path.join(os.path.abspath(test_dir), inode)) 705 | return tuple(unwriteable) 706 | 707 | def _can_read(test_dir): 708 | """ 709 | returns ((unreadable_dirs), (unreadable_files)) 710 | """ 711 | unreadable_dirs = [] 712 | unreadable_files = [] 713 | for dirpath, dirnames, filenames in os.walk(test_dir): 714 | for dn in dirnames: 715 | if not os.access(os.path.join(dirpath, dn), os.R_OK): 716 | unreadable_dirs.append(os.path.join(dirpath, dn)) 717 | for fn in filenames: 718 | if not os.access(os.path.join(dirpath, fn), os.R_OK): 719 | unreadable_files.append(os.path.join(dirpath, fn)) 720 | return (tuple(unreadable_dirs), tuple(unreadable_files)) 721 | 722 | def _manifest_line_md5(filename): 723 | return _manifest_line(filename, 'md5') 724 | 725 | def _manifest_line_sha1(filename): 726 | return _manifest_line(filename, 'sha1') 727 | 728 | def _manifest_line_sha256(filename): 729 | return _manifest_line(filename, 'sha256') 730 | 731 | def _manifest_line(filename, algorithm='md5'): 732 | fh = open(filename, 'rb') 733 | if algorithm == 'md5': 734 | m = hashlib.md5() 735 | elif algorithm == 'sha1': 736 | m = hashlib.sha1() 737 | elif algorithm == 'sha256': 738 | m = hashlib.sha256() 739 | 740 | total_bytes = 0 741 | while True: 742 | bytes = fh.read(16384) 743 | total_bytes += len(bytes) 744 | if not bytes: break 745 | m.update(bytes) 746 | fh.close() 747 | 748 | 
return (m.hexdigest(), _decode_filename(filename), total_bytes) 749 | 750 | def _encode_filename(s): 751 | s = s.replace("\r", "%0D") 752 | s = s.replace("\n", "%0A") 753 | return s 754 | 755 | def _decode_filename(s): 756 | s = re.sub("%0D", "\r", s, re.IGNORECASE) 757 | s = re.sub("%0A", "\n", s, re.IGNORECASE) 758 | return s 759 | 760 | 761 | # following code is used for command line program 762 | 763 | class BagOptionParser(optparse.OptionParser): 764 | def __init__(self, *args, **opts): 765 | self.bag_info = {} 766 | optparse.OptionParser.__init__(self, *args, **opts) 767 | 768 | def _bag_info_store(option, opt, value, parser): 769 | opt = opt.lstrip('--') 770 | opt_caps = '-'.join([o.capitalize() for o in opt.split('-')]) 771 | parser.bag_info[opt_caps] = value 772 | 773 | def _make_opt_parser(): 774 | parser = BagOptionParser(usage='usage: %prog [options] dir1 dir2 ...') 775 | parser.add_option('--processes', action='store', type="int", 776 | dest='processes', default=1, 777 | help='parallelize checksums generation') 778 | parser.add_option('--log', action='store', dest='log') 779 | parser.add_option('--quiet', action='store_true', dest='quiet') 780 | parser.add_option('--validate', action='store_true', dest='validate') 781 | parser.add_option('--fast', action='store_true', dest='fast') 782 | 783 | # optionally specify which checksum algorithm(s) to use when creating a bag 784 | # NOTE: could generate from checksum_algos ? 
785 | parser.add_option('--md5', action='append_const', dest='checksum', 786 | const='md5', help='Generate MD5 manifest when creating a bag (default)') 787 | parser.add_option('--sha1', action='append_const', dest='checksum', 788 | const='sha1', help='Generate SHA1 manifest when creating a bag') 789 | parser.add_option('--sha256', action='append_const', dest='checksum', 790 | const='sha256', help='Generate SHA-256 manifest when creating a bag') 791 | 792 | for header in _bag_info_headers: 793 | parser.add_option('--%s' % header.lower(), type="string", 794 | action='callback', callback=_bag_info_store) 795 | return parser 796 | 797 | def _configure_logging(opts): 798 | log_format="%(asctime)s - %(levelname)s - %(message)s" 799 | if opts.quiet: 800 | level = logging.ERROR 801 | else: 802 | level = logging.INFO 803 | if opts.log: 804 | logging.basicConfig(filename=opts.log, level=level, format=log_format) 805 | else: 806 | logging.basicConfig(level=level, format=log_format) 807 | 808 | if __name__ == '__main__': 809 | opt_parser = _make_opt_parser() 810 | opts, args = opt_parser.parse_args() 811 | _configure_logging(opts) 812 | log = logging.getLogger() 813 | 814 | rc = 0 815 | for bag_dir in args: 816 | 817 | # validate the bag 818 | if opts.validate: 819 | try: 820 | bag = Bag(bag_dir) 821 | # validate throws a BagError or BagValidationError 822 | valid = bag.validate(fast=opts.fast) 823 | if opts.fast: 824 | log.info("%s valid according to Payload-Oxum", bag_dir) 825 | else: 826 | log.info("%s is valid", bag_dir) 827 | except BagError, e: 828 | log.info("%s is invalid: %s", bag_dir, e) 829 | rc = 1 830 | 831 | # make the bag 832 | else: 833 | make_bag(bag_dir, bag_info=opt_parser.bag_info, 834 | processes=opts.processes, 835 | checksum=opts.checksum) 836 | 837 | sys.exit(rc) 838 | --------------------------------------------------------------------------------