├── bgzip_md5.py ├── bgzip_md5_v2.py └── bgzip_md5_v3.py /bgzip_md5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Filter each named file through MD5 and bgzip in parallel. 4 | If an input file is named "foo", files 5 | """ 6 | 7 | import argparse 8 | import logging 9 | import os 10 | import shlex 11 | import subprocess 12 | import sys 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def main(): 19 | args = parse_args() 20 | config_logging(args) 21 | logger.debug('starting %r', args) 22 | compressing = not args.check 23 | checking = args.both or args.check 24 | run(args.dest_dir, args.input_files, compressing, checking) 25 | logger.info('finished') 26 | logging.shutdown() 27 | 28 | 29 | def parse_args(): 30 | parser = argparse.ArgumentParser(description=__doc__) 31 | parser.add_argument('-d', '--dest_dir', 32 | help='If not specified, outputs are written' 33 | ' to the same directories as the corresponding' 34 | ' original files. Conflicts with --check.') 35 | parser.add_argument('-b', '--both', action='store_true', 36 | help='both compress and check the results' 37 | ' by decompressing and recomputing the MD5.' 38 | ' Conflicts with --check.') 39 | parser.add_argument('-c', '--check', action='store_true', 40 | help='In this case, input_files must by the .gz' 41 | ' files. Only check that the decompressed .gz' 42 | ' files have the correct MD5s. Conflicts with' 43 | ' --dest_dir and --check.') 44 | parser.add_argument('input_files', nargs='*') 45 | parser.add_argument('-v', '--verbose', action='store_true') 46 | args = parser.parse_args() 47 | if (args.dest_dir or args.both) and args.check: 48 | parser.error('Cannot have --check and also --dest_dir or --check.') 49 | return args 50 | 51 | 52 | def config_logging(args): 53 | global logger 54 | level = logging.DEBUG if args.verbose else logging.INFO 55 | logging.basicConfig( 56 | level=level, 57 | format='%(asctime)s %(levelname)-8s %(name)s: %(message)s', 58 | datefmt='%Y-%m-%d %H:%M:%S' 59 | ) 60 | logger = logging.getLogger('bgzip_md5') 61 | 62 | 63 | def run(dest_dir, input_files, compressing, checking): 64 | if dest_dir: 65 | os.makedirs(dest_dir, exist_ok=True) 66 | for input_file_path in input_files: 67 | logger.info('processing %r', input_file_path) 68 | input_dir_path, file_name = os.path.split(input_file_path) 69 | output_dir_path = dest_dir or input_dir_path 70 | output_file_base = os.path.join(output_dir_path, file_name) 71 | gz_file_path = output_file_base + '.gz' 72 | md5_file_path = output_file_base + '.md5' 73 | if compressing: 74 | compress(input_file_path, file_name, gz_file_path, md5_file_path) 75 | if checking: 76 | pass 77 | 78 | 79 | def compress(input_file_path, file_name, gz_file_path, md5_file_path): 80 | """Compress a single file, while computing the MD5 of the original.""" 81 | template = "tee >(bgzip > {1}) < {0} | md5sum" 82 | cmd = template.format(shlex.quote(input_file_path), 83 | shlex.quote(gz_file_path)) 84 | logger.debug(cmd) 85 | proc = subprocess.run(['bash', '-c', cmd], 86 | stdin=subprocess.DEVNULL, 87 | stdout=subprocess.PIPE, 88 | universal_newlines=True, 89 | check=True) 90 | md5sum_line = proc.stdout.rstrip() 91 | assert md5sum_line[-1] == '-' 92 | md5sum_out_line = '{}{}\n'.format(md5sum_line[:-1], file_name) 93 | with open(md5_file_path, 'w') as fout: 94 | fout.write(md5sum_out_line) 95 | logger.debug(md5sum_out_line.rstrip()) 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /bgzip_md5_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Filter each named file through MD5 and bgzip in parallel. 4 | If an input file is named "foo", files 5 | """ 6 | 7 | import argparse 8 | import logging 9 | import os 10 | import shlex 11 | import subprocess 12 | import sys 13 | 14 | import gzip 15 | import hashlib 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | MD5_LENGTH = 32 # The MD5 hex digest is always 32 characters. 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | config_logging(args) 26 | logger.debug('starting %r', args) 27 | compressing = not args.check 28 | checking = args.both or args.check 29 | exit_code = run(args.dest_dir, args.input_files, compressing, checking) 30 | logger.info('finished') 31 | logging.shutdown() 32 | sys.exit(exit_code) 33 | 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser(description=__doc__) 37 | parser.add_argument('-d', '--dest_dir', 38 | help='If not specified, outputs are written' 39 | ' to the same directories as the corresponding' 40 | ' original files. Conflicts with --check.') 41 | parser.add_argument('-b', '--both', action='store_true', 42 | help='both compress and check the results' 43 | ' by decompressing and recomputing the MD5.' 44 | ' Conflicts with --check.') 45 | parser.add_argument('-c', '--check', action='store_true', 46 | help='In this case, input_files must by the .gz' 47 | ' files. Only check that the decompressed .gz' 48 | ' files have the correct MD5s. Conflicts with' 49 | ' --dest_dir and --check.') 50 | parser.add_argument('input_files', nargs='*') 51 | parser.add_argument('-v', '--verbose', action='store_true') 52 | args = parser.parse_args() 53 | if (args.dest_dir or args.both) and args.check: 54 | parser.error('Cannot have --check and also --dest_dir or --check.') 55 | return args 56 | 57 | 58 | def config_logging(args): 59 | global logger 60 | level = logging.DEBUG if args.verbose else logging.INFO 61 | logging.basicConfig( 62 | level=level, 63 | format='%(asctime)s %(levelname)-8s %(name)s: %(message)s', 64 | datefmt='%Y-%m-%d %H:%M:%S' 65 | ) 66 | logger = logging.getLogger('bgzip_md5') 67 | 68 | 69 | def run(dest_dir, input_files, compressing, checking): 70 | if dest_dir: 71 | os.makedirs(dest_dir, exist_ok=True) 72 | encountered_errors = False 73 | for input_file_path in input_files: 74 | logger.info('processing %r', input_file_path) 75 | input_dir_path, file_name = os.path.split(input_file_path) 76 | output_dir_path = dest_dir or input_dir_path 77 | output_file_base = os.path.join(output_dir_path, file_name) 78 | gz_file_path = output_file_base + '.gz' 79 | md5_file_path = output_file_base + '.md5' 80 | if compressing: 81 | compress(input_file_path, file_name, gz_file_path, md5_file_path) 82 | if checking: 83 | logger.debug('checking %s', gz_file_path) 84 | try: 85 | hashes_match = check(gz_file_path, md5_file_path) 86 | if not hashes_match: 87 | encountered_errors = True 88 | except Exception as e: 89 | logger.exception('error checking %s', gz_file_path) 90 | encountered_errors = True 91 | return encountered_errors 92 | 93 | 94 | def compress(input_file_path, file_name, gz_file_path, md5_file_path): 95 | """Compress a single file, while computing the MD5 of the original.""" 96 | template = "tee >(bgzip > {1}) < {0} | md5sum" 97 | cmd = template.format(shlex.quote(input_file_path), 98 | shlex.quote(gz_file_path)) 99 | logger.debug(cmd) 100 | proc = subprocess.run(['bash', '-c', cmd], 101 | stdin=subprocess.DEVNULL, 102 | stdout=subprocess.PIPE, 103 | universal_newlines=True, 104 | check=True) 105 | md5sum_line = proc.stdout.rstrip() 106 | assert md5sum_line[-1] == '-' 107 | md5sum_out_line = '{}{}\n'.format(md5sum_line[:-1], file_name) 108 | with open(md5_file_path, 'w') as fout: 109 | fout.write(md5sum_out_line) 110 | logger.debug(md5sum_out_line.rstrip()) 111 | 112 | 113 | def check(gz_file_path, md5_file_path): 114 | """Compare the checksum of the uncompressed data to the checksum stored in 115 | the MD5 file. Return True if they match, else log the missmatch and 116 | return False.""" 117 | observed = compute_md5_of_uncompressed_data(gz_file_path) 118 | with open(md5_file_path) as fin: 119 | expected = fin.readline()[:MD5_LENGTH] 120 | if expected != observed: 121 | logger.error('MD5 mismatch for %, %s != %s', gz_file_path, expected, observed) 122 | return False 123 | return True 124 | 125 | 126 | HASH_BLOCK_SIZE = 128 * 2 ** 10 # 128 kB 127 | """ 128 | def compute_md5_of_uncompressed_data(gz_file_path): 129 | #Return the hex digest of the corresponding uncompressed data.# 130 | md5_of_uncompressed_data = hashlib.md5() 131 | with gzip.open(gz_file_path) as fin: 132 | while True: 133 | data = fin.read(HASH_BLOCK_SIZE) 134 | if not data: 135 | break 136 | md5_of_uncompressed_data.update(data) 137 | return md5_of_uncompressed_data.hexdigest() 138 | """ 139 | 140 | def compute_md5_of_uncompressed_data(gz_file_path): 141 | """Return the hex digest of the corresponding uncompressed data.""" 142 | zcat = subprocess.Popen(['zcat', gz_file_path], 143 | stdin=subprocess.DEVNULL, 144 | stdout=subprocess.PIPE) 145 | md5sum = subprocess.Popen('md5sum', 146 | stdin=zcat.stdout, 147 | stdout=subprocess.PIPE) 148 | out, err = md5sum.communicate() 149 | if md5sum.returncode: 150 | raise Exception('md5sum returned error %s for %s', md5sum.returncode, gz_file_path) 151 | zcat.wait() 152 | if zcat.returncode: 153 | raise Exception('zcat returned error %s for %s', zcat.returncode, gz_file_path) 154 | return out[:MD5_LENGTH].decode('ascii') 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /bgzip_md5_v3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Filter each named file through MD5 and bgzip in parallel. 4 | If an input file is named "foo", files 5 | """ 6 | 7 | import argparse 8 | import logging 9 | import os 10 | import shlex 11 | import subprocess 12 | import sys 13 | 14 | import gzip 15 | import hashlib 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | MD5_LENGTH = 32 # The MD5 hex digest is always 32 characters. 21 | 22 | 23 | def main(): 24 | args = parse_args() 25 | config_logging(args) 26 | logger.debug('starting %r', args) 27 | compressing = not args.check 28 | checking = args.both or args.check 29 | exit_code = run(args.dest_dir, args.input_files, compressing, checking) 30 | logger.info('finished') 31 | logging.shutdown() 32 | sys.exit(exit_code) 33 | 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser(description=__doc__) 37 | parser.add_argument('-d', '--dest_dir', 38 | help='If not specified, outputs are written' 39 | ' to the same directories as the corresponding' 40 | ' original files. Conflicts with --check.') 41 | parser.add_argument('-b', '--both', action='store_true', 42 | help='both compress and check the results' 43 | ' by decompressing and recomputing the MD5.' 44 | ' Conflicts with --check.') 45 | parser.add_argument('-c', '--check', action='store_true', 46 | help='In this case, input_files must by the .gz' 47 | ' files. Only check that the decompressed .gz' 48 | ' files have the correct MD5s. Conflicts with' 49 | ' --dest_dir and --check.') 50 | parser.add_argument('input_files', nargs='*') 51 | parser.add_argument('-v', '--verbose', action='store_true') 52 | args = parser.parse_args() 53 | if (args.dest_dir or args.both) and args.check: 54 | parser.error('Cannot have --check and also --dest_dir or --check.') 55 | return args 56 | 57 | 58 | def config_logging(args): 59 | global logger 60 | level = logging.DEBUG if args.verbose else logging.INFO 61 | logging.basicConfig( 62 | level=level, 63 | format='%(asctime)s %(levelname)-8s %(name)s: %(message)s', 64 | datefmt='%Y-%m-%d %H:%M:%S' 65 | ) 66 | logger = logging.getLogger('bgzip_md5') 67 | 68 | 69 | def run(dest_dir, input_files, compressing, checking): 70 | if dest_dir: 71 | os.makedirs(dest_dir, exist_ok=True) 72 | encountered_errors = False 73 | for input_file_path in input_files: 74 | logger.info('processing %r', input_file_path) 75 | input_dir_path, file_name = os.path.split(input_file_path) 76 | output_dir_path = dest_dir or input_dir_path 77 | output_file_base = os.path.join(output_dir_path, file_name) 78 | gz_file_path = output_file_base + '.gz' 79 | md5_file_path = output_file_base + '.md5' 80 | if compressing: 81 | compress(input_file_path, file_name, gz_file_path, md5_file_path) 82 | if checking: 83 | logger.debug('checking %s', gz_file_path) 84 | try: 85 | hashes_match = check(gz_file_path, md5_file_path) 86 | if not hashes_match: 87 | encountered_errors = True 88 | except Exception as e: 89 | logger.exception('error checking %s', gz_file_path) 90 | encountered_errors = True 91 | return encountered_errors 92 | 93 | 94 | def compress(input_file_path, file_name, gz_file_path, md5_file_path): 95 | """Compress a single file, while computing the MD5 of the original.""" 96 | template = "tee >(bgzip > {1}) < {0} | md5sum" 97 | cmd = template.format(shlex.quote(input_file_path), 98 | shlex.quote(gz_file_path)) 99 | logger.debug(cmd) 100 | proc = subprocess.run(['bash', '-c', cmd], 101 | stdin=subprocess.DEVNULL, 102 | stdout=subprocess.PIPE, 103 | universal_newlines=True, 104 | check=True) 105 | md5sum_line = proc.stdout.rstrip() 106 | assert md5sum_line[-1] == '-' 107 | md5sum_out_line = '{}{}\n'.format(md5sum_line[:-1], file_name) 108 | with open(md5_file_path, 'w') as fout: 109 | fout.write(md5sum_out_line) 110 | logger.debug(md5sum_out_line.rstrip()) 111 | 112 | 113 | def check(gz_file_path, md5_file_path): 114 | """Compare the checksum of the uncompressed data to the checksum stored in 115 | the MD5 file. Return True if they match, else log the missmatch and 116 | return False.""" 117 | observed = compute_md5_of_uncompressed_data(gz_file_path) 118 | with open(md5_file_path) as fin: 119 | expected = fin.readline()[:MD5_LENGTH] 120 | if expected != observed: 121 | logger.error('MD5 mismatch for %s, %s != %s', gz_file_path, expected, observed) 122 | return False 123 | return True 124 | 125 | 126 | HASH_BLOCK_SIZE = 128 * 2 ** 10 # 128 kB 127 | """" 128 | def compute_md5_of_uncompressed_data(gz_file_path): 129 | #Return the hex digest of the corresponding uncompressed data.# 130 | md5_of_uncompressed_data = hashlib.md5() 131 | with gzip.open(gz_file_path) as fin: 132 | while True: 133 | data = fin.read(HASH_BLOCK_SIZE) 134 | if not data: 135 | break 136 | md5_of_uncompressed_data.update(data) 137 | return md5_of_uncompressed_data.hexdigest() 138 | """ 139 | 140 | def compute_md5_of_uncompressed_data(gz_file_path): 141 | """Return the hex digest of the corresponding uncompressed data.""" 142 | gunzip = subprocess.Popen(['gunzip', '-c', gz_file_path], 143 | stdin=subprocess.DEVNULL, 144 | stdout=subprocess.PIPE) 145 | md5sum = subprocess.Popen('md5sum', 146 | stdin=gunzip.stdout, 147 | stdout=subprocess.PIPE) 148 | out, err = md5sum.communicate() 149 | if md5sum.returncode: 150 | raise Exception('md5sum returned error %s for %s', md5sum.returncode, gz_file_path) 151 | gunzip.wait() 152 | if gunzip.returncode: 153 | raise Exception('gunzip returned error %s for %s', gunzip.returncode, gz_file_path) 154 | # We know that this is hexadecimal digits in ASCII. 155 | result = out[:MD5_LENGTH].decode('ascii') 156 | return result 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | --------------------------------------------------------------------------------