├── .gitignore ├── test ├── rerun.sh ├── iterate.sh ├── test.sh ├── test_1.sh ├── test_2.sh ├── test_3.sh ├── test_1.py └── test_4.sh └── bgzip_md5.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /target 3 | .cache 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /test/rerun.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | rerun -i target ./test/iterate.sh 4 | -------------------------------------------------------------------------------- /test/iterate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | clear 4 | rm -rf target 5 | pycodestyle bgzip_md5.py 6 | ./test/test.sh 7 | date 8 | -------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run all the tests. 
4 | 5 | # TODO: test -b option 6 | 7 | rm -rf target/test.log 8 | mkdir -p target # in case target already exists 9 | 10 | for t in test/test_?.sh; do 11 | echo $t | tee -a target/test.log 12 | $t 2>>target/test.log 13 | done 14 | -------------------------------------------------------------------------------- /test/test_1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ROOT="$(pwd)" 4 | TARGET="$ROOT"/target/test_1 5 | 6 | rm -rf "$TARGET" 7 | mkdir -p "$TARGET" 8 | 9 | cd "$TARGET"/ 10 | echo hello world > input 11 | md5sum input > expected.md5 12 | 13 | cd "$ROOT" 14 | ./bgzip_md5.py "$TARGET"/input 15 | 16 | cd "$TARGET"/ 17 | gzip -cd input.gz | diff input - 18 | diff expected.md5 input.md5 19 | -------------------------------------------------------------------------------- /test/test_2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ROOT="$(pwd)" 4 | TARGET="$ROOT"/target/test_2 5 | 6 | rm -rf "$TARGET" 7 | mkdir -p "$TARGET"/{in,out} 8 | 9 | cd "$TARGET"/in 10 | echo hello world a > input_a 11 | echo hello world b > input_b 12 | md5sum input_a > expected_a.md5 13 | md5sum input_b > expected_b.md5 14 | 15 | cd "$ROOT" 16 | ./bgzip_md5.py -d "$TARGET"/out "$TARGET"/in/input_? 
17 | 18 | cd "$TARGET"/ 19 | for x in a b; do 20 | gzip -cd out/input_${x}.gz | diff in/input_${x} - 21 | diff in/expected_${x}.md5 out/input_${x}.md5 22 | done 23 | -------------------------------------------------------------------------------- /test/test_3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ROOT="$(pwd)" 4 | TARGET="$ROOT"/target/test_3 5 | 6 | rm -rf "$TARGET" 7 | mkdir -p "$TARGET"/{in,out} 8 | 9 | cd "$TARGET"/in 10 | echo hello world a > input_a 11 | echo hello world b > input_b 12 | md5sum input_a > expected_a.md5 13 | md5sum input_b > expected_b.md5 14 | 15 | cd "$ROOT" 16 | ./bgzip_md5.py -bv -d "$TARGET"/out "$TARGET"/in/input_? 17 | 18 | cd "$TARGET"/ 19 | for x in a b; do 20 | gzip -cd out/input_${x}.gz | diff in/input_${x} - 21 | diff in/expected_${x}.md5 out/input_${x}.md5 22 | done 23 | -------------------------------------------------------------------------------- /test/test_1.py: -------------------------------------------------------------------------------- 1 | """Testing basic operation of bgzip_md5, minimal use case.""" 2 | 3 | from pathlib import Path 4 | from shutil import rmtree 5 | from subprocess import run, PIPE 6 | 7 | 8 | ORIGINAL_FILE_CONTENTS = 'hello world\n' 9 | 10 | 11 | def test_1(): 12 | target = Path('target', 'test_1') 13 | if target.exists(): 14 | rmtree(target) 15 | target.mkdir(parents=True) 16 | input_file = target / 'input' 17 | input_file.write_text(ORIGINAL_FILE_CONTENTS) 18 | expected_md5 = run(['md5sum', 'input'], 19 | cwd=target, 20 | stdout=PIPE, 21 | check=True).stdout 22 | run(['./bgzip_md5.py', input_file], check=True) 23 | output_file = target / 'input.gz' 24 | md5_file = target / 'input.md5' 25 | assert md5_file.read_bytes() == expected_md5, 'checksum' 26 | decompressed_data = run(['gzip', '-cd', output_file], 27 | stdout=PIPE, 28 | check=True).stdout.decode('ascii') 29 | assert decompressed_data == ORIGINAL_FILE_CONTENTS, 
'decompression' 30 | -------------------------------------------------------------------------------- /test/test_4.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ROOT="$(pwd)" 4 | TARGET="$ROOT"/target/test_4 5 | 6 | rm -rf "$TARGET" 7 | mkdir -p "$TARGET" 8 | 9 | cd "$TARGET" 10 | 11 | echo hello world a > input_a 12 | echo hello world b > input_b 13 | 14 | md5sum input_a > input_a.md5 15 | md5sum input_a > input_b.md5 # Deliberate error! 16 | 17 | bgzip input_a 18 | bgzip input_b 19 | 20 | cd "$ROOT" 21 | ./bgzip_md5.py -cv "$TARGET"/input_a 2> "$TARGET"/test_4a.log 22 | if [[ $? != 0 ]]; then 23 | echo "got unexpected error; see test_4a.log" 24 | fi 25 | if [ $(cut -d' ' -f3- target/test_4/test_4a.log | grep ^ERROR | wc -l) -ne 0 ]; then 26 | echo "wrong number of ERROR lines in test_4a.log" 27 | fi 28 | 29 | ./bgzip_md5.py -cv "$TARGET"/input_b 2> "$TARGET"/test_4b.log 30 | if [[ $? == 0 ]]; then 31 | echo "did not get expected error; see test_4b.log" 32 | fi 33 | if [ $(cut -d' ' -f3- target/test_4/test_4b.log | grep ^ERROR | wc -l) -ne 2 ]; then 34 | echo "wrong number of ERROR lines in test_4b.log" 35 | fi 36 | 37 | ./bgzip_md5.py -cv "$TARGET"/input_{a,b} 2> "$TARGET"/test_4c.log 38 | if [[ $? == 0 ]]; then 39 | echo "did not get expected error; see test_4c.log" 40 | fi 41 | if [ $(cut -d' ' -f3- target/test_4/test_4c.log | grep ^ERROR | wc -l) -ne 2 ]; then 42 | echo "wrong number of ERROR lines in test_4c.log" 43 | fi 44 | 45 | ./bgzip_md5.py -cv "$TARGET"/input_{a,c,a,d,a} 2> "$TARGET"/test_4d.log 46 | if [[ $? 
#!/usr/bin/env python3

"""Filter each named file through MD5 and bgzip in parallel.
If an input file is named "foo", files "foo.gz" and "foo.md5" are written.
"""

import argparse
import logging
import os
from pathlib import Path
import re
import shlex
import subprocess
import sys


__version__ = '1.1.0-rc1'
MD5_LENGTH = 32  # The MD5 hex digest is always 32 characters.
# An md5sum line is "<digest> <mode><name>": one space, then a mode
# character (' ' for text, '*' for binary), then the file name.  The mode
# character must not be captured as part of the name.
MD5_PAT = re.compile(r'([0-9a-f]{32}) [ *](.+)')

logger = logging.getLogger(__name__)


def main():
    """Command-line entry point.

    Parses the arguments, processes every input file and exits with
    status 1 if any file produced an error, else 0.
    """
    args = parse_args()
    config_logging(args)
    logger.debug('starting %r', args)
    compressing = not args.check
    checking = args.both or args.check
    encountered_errors = run(args.dest_dir, args.input_files,
                             compressing, checking)
    logger.info('finished')
    logging.shutdown()
    sys.exit(1 if encountered_errors else 0)


def parse_args(argv=None):
    """Parse the command line and return the argparse namespace.

    argv: list of argument strings; None (the default) means sys.argv[1:].

    Exits through parser.error (status 2) when --check is combined with
    --dest_dir or --both.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--dest_dir',
                        help='If not specified, outputs are written'
                        ' to the same directories as the corresponding'
                        ' original files. Conflicts with --check.')
    parser.add_argument('-b', '--both', action='store_true',
                        help='both compress and check the results'
                        ' by decompressing and recomputing the MD5.'
                        ' Conflicts with --check.')
    parser.add_argument('-c', '--check', action='store_true',
                        help='In this case, input_files must be the .gz'
                        ' files. Only check that the decompressed .gz'
                        ' files have the correct MD5s. Conflicts with'
                        ' --dest_dir and --both.')
    parser.add_argument('input_files', nargs='*')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(__version__))
    args = parser.parse_args(argv)
    if (args.dest_dir or args.both) and args.check:
        parser.error('Cannot have --check and also --dest_dir or --both.')
    return args


def config_logging(args):
    """Configure root logging (DEBUG if args.verbose, else INFO) and rebind
    the module-level logger to the name 'bgzip_md5'."""
    global logger
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s %(levelname)-8s %(name)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    logger = logging.getLogger('bgzip_md5')


def run(dest_dir, input_files, compressing, checking):
    """Compress and/or check every input file.

    dest_dir: output directory (created if needed); falsy means each
        output goes next to its input file.
    input_files: iterable of input paths.
    compressing: write "<name>.gz" and "<name>.md5" for each input.
    checking: verify "<name>.gz" against "<name>.md5".

    Returns True if any file produced an error, else False.  A failure on
    one file is logged and does not stop the remaining files.
    """
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    encountered_errors = False
    for input_file_path in input_files:
        logger.info('processing %r', input_file_path)
        input_dir_path, file_name = os.path.split(input_file_path)
        output_dir_path = dest_dir or input_dir_path
        output_file_base = os.path.join(output_dir_path, file_name)
        gz_file_path = output_file_base + '.gz'
        md5_file_path = output_file_base + '.md5'
        if compressing:
            try:
                compress(input_file_path, file_name,
                         gz_file_path, md5_file_path)
            except (subprocess.CalledProcessError, ChildProcessError,
                    OSError):
                logger.exception('error compressing %s', input_file_path)
                encountered_errors = True
                # There is nothing trustworthy to check for this file.
                continue
        if checking:
            logger.debug('checking %s', gz_file_path)
            try:
                if not check(gz_file_path, md5_file_path):
                    encountered_errors = True
            except NoRegularFile:
                logger.error('not a regular file: %s', gz_file_path)
                encountered_errors = True
            except Exception:
                logger.exception('error checking %s', gz_file_path)
                encountered_errors = True
    return encountered_errors


def compress(input_file_path, file_name, gz_file_path, md5_file_path):
    """Compress a single file, while computing the MD5 of the original.

    Writes the bgzip output to gz_file_path and an md5sum-style line
    (digest followed by file_name) to md5_file_path.

    Raises subprocess.CalledProcessError if the shell pipeline fails and
    ChildProcessError if md5sum's output is not of the expected form.
    """
    # pipefail makes a failure of tee (for instance an unreadable input
    # file) fail the whole pipeline; without it only md5sum's status
    # counts and the MD5 of empty input would be recorded silently.
    # NOTE(review): bash may not wait for the >(bgzip ...) process
    # substitution before exiting -- confirm the .gz is complete on return.
    template = "set -o pipefail; tee >(bgzip > {1}) < {0} | md5sum"
    cmd = template.format(shlex.quote(input_file_path),
                          shlex.quote(gz_file_path))
    logger.debug(cmd)
    proc = subprocess.run(['bash', '-c', cmd],
                          stdin=subprocess.DEVNULL,
                          stdout=subprocess.PIPE,
                          universal_newlines=True,
                          check=True)
    md5sum_line = proc.stdout.rstrip()
    # md5sum names standard input "-"; that placeholder is replaced by the
    # real file name.  An explicit check survives "python -O".
    if not md5sum_line.endswith('-'):
        raise ChildProcessError('unexpected md5sum output {!r} for {}'.format(
            md5sum_line, input_file_path))
    md5sum_out_line = '{}{}\n'.format(md5sum_line[:-1], file_name)
    with open(md5_file_path, 'w') as fout:
        fout.write(md5sum_out_line)
    logger.debug(md5sum_out_line.rstrip())


def check(gz_file_path, md5_file_path):
    """Compare the checksum of the uncompressed data to the checksum stored
    in the MD5 file.  Return True if they match, else log the mismatch and
    return False.

    Raises NoRegularFile if gz_file_path is not a regular file; errors
    from the child processes or from reading the MD5 file propagate.
    """
    # Computed first so that a missing .gz file is reported exactly once,
    # through NoRegularFile, before any other complaint is logged.
    observed = compute_md5_of_uncompressed_data(gz_file_path)
    gz_path = Path(gz_file_path)
    md5_path = Path(md5_file_path)
    names_ok = file_names_are_consistent(gz_path, md5_path)
    with md5_path.open() as fin:
        raw_line = fin.readline()
    contents_ok = md5_line_matches(raw_line, observed, gz_path, md5_path)
    return names_ok and contents_ok


def file_names_are_consistent(gz_path, md5_path):
    """Return True if the two Paths end in .gz and .md5 and share a stem;
    log one error per violated rule otherwise."""
    ok = True
    if gz_path.suffix != '.gz':
        ok = False
        logger.error('bad gz file name: %s', gz_path)
    if md5_path.suffix != '.md5':
        ok = False
        logger.error('bad MD5 file name: %s', md5_path)
    if gz_path.stem != md5_path.stem:
        ok = False
        logger.error('mismatching names for gz and MD5 files: %s %s',
                     gz_path, md5_path)
    return ok


def md5_line_matches(raw_line, observed, gz_path, md5_path):
    """Return True if raw_line (the first line of the MD5 file) is a valid
    md5sum line whose digest equals observed and whose file name equals
    the stem of gz_path; log one error per mismatch otherwise."""
    m = MD5_PAT.match(raw_line)
    if not m:
        logger.error('illegal MD5')
        return False
    ok = True
    expected_md5, expected_file_name = m.groups()
    if expected_md5 != observed:
        ok = False
        logger.error('MD5 mismatch for %s, %s != %s',
                     gz_path, expected_md5, observed)
    if expected_file_name != gz_path.stem:
        ok = False
        logger.error('File name mismatch inside MD5 file %s (%s) ~ %s',
                     md5_path, expected_file_name, gz_path)
    return ok


def compute_md5_of_uncompressed_data(gz_file_path):
    """Return the hex digest of the corresponding uncompressed data.

    Raises NoRegularFile if gz_file_path is not a regular file and
    ChildProcessError if gzip or md5sum exits with a nonzero status.
    """
    # I want "zcat", which on most linux systems is the same as "gunzip -c",
    # which on all unix systems is the same as "gzip -c -d".  On BSD, we
    # would have to use gzcat rather than zcat, and gzcat is not a thing
    # on most Linux systems.  Best to be completely explicit.
    gz_path = Path(gz_file_path)
    if not gz_path.is_file():
        raise NoRegularFile(str(gz_path))
    # "--" keeps a path that begins with "-" from being read as an option.
    zcat = subprocess.Popen(['gzip', '-c', '-d', '--', gz_file_path],
                            stdin=subprocess.DEVNULL,
                            stdout=subprocess.PIPE)
    md5sum = subprocess.Popen('md5sum',
                              stdin=zcat.stdout,
                              stdout=subprocess.PIPE)
    # Drop the parent's copy of the pipe so that gzip receives SIGPIPE,
    # instead of blocking forever, if md5sum exits early.
    zcat.stdout.close()
    out, _ = md5sum.communicate()
    zcat.wait()
    if md5sum.returncode:
        raise ChildProcessError('md5sum returned error {} for {}'.format(
            md5sum.returncode, gz_file_path))
    if zcat.returncode:
        raise ChildProcessError('gzip -c -d returned error {} for {}'.format(
            zcat.returncode, gz_file_path))
    # We know that this is hexadecimal digits in ASCII.
    return out[:MD5_LENGTH].decode('ascii')


class NoRegularFile(Exception):
    """Either the file is missing or there is something not a file there."""


# NOTE: within this module this name shadows the builtin ChildProcessError.
class ChildProcessError(Exception):
    """Child process returned nonzero returncode."""


if __name__ == '__main__':
    main()