├── v2a ├── test ├── test.py ├── vcf_batch.yml └── test.tsv ├── README.md └── vcfs_to_aspera.py /v2a: -------------------------------------------------------------------------------- 1 | vcfs_to_aspera.py -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | print(*sys.argv) 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # batch_vcfs 2 | 3 | This tool will 4 | 5 | 1. Read the YAML file designated on the command line. 6 | 2. Open the XLSX file named in the YAML file. 7 | 3. Run a worker script for each SNP and indel VCF file whose path is specified in the XLSX file. 8 | 9 | The worker script instances will be run in parallel as child processes with a maximum number of simultaneous instances. 10 | -------------------------------------------------------------------------------- /test/vcf_batch.yml: -------------------------------------------------------------------------------- 1 | batch: 1 2 | date: 2017-06-16 3 | worklist_file: test.tsv 4 | batch_dest_root: '/tmp/testing/aspera/share/submissions/staging/test/one/{batch_name}' 5 | batch_name: VTE_Mayo_test{batch}_{date} 6 | source_root: '{batch_name}' 7 | snp_dir: '{batch_dest_root}/{batch_name}_SNPs' 8 | indel_dir: '{batch_dest_root}/{batch_name}_indels' 9 | program_args: 10 | - python3 11 | - test.py 12 | - -d 13 | -------------------------------------------------------------------------------- /test/test.tsv: -------------------------------------------------------------------------------- 1 | sample_id_XYZ_id lane_barcode batch vcf_batch bam_file bam_path snp_file snp_path indel_file indel_path 2 | XYZ644215 ABCVCALXX-3 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-3.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-3/ABCVCALXX-3.bam ABCVCALXX-3_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-3/variants/ABCVCALXX-3_snp.vcf ABCVCALXX-3_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-3/variants/ABCVCALXX-3_indel.vcf 3 | XYZ748674 DEFYMALXX-3 TEST_batch62_2017-05-23 demo_vcfs_b04 DEFYMALXX-3.bam /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-3/DEFYMALXX-3.bam DEFYMALXX-3_snp.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-3/variants/DEFYMALXX-3_snp.vcf DEFYMALXX-3_indel.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-3/variants/DEFYMALXX-3_indel.vcf 4 | XYZ704713 DEFYMALXX-1 TEST_batch62_2017-05-23 demo_vcfs_b04 DEFYMALXX-1.bam /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-1/DEFYMALXX-1.bam DEFYMALXX-1_snp.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-1/variants/DEFYMALXX-1_snp.vcf DEFYMALXX-1_indel.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-1/variants/DEFYMALXX-1_indel.vcf 5 | XYZ899378 DEFYMALXX-4 TEST_batch62_2017-05-23 demo_vcfs_b04 DEFYMALXX-4.bam /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-4/DEFYMALXX-4.bam DEFYMALXX-4_snp.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-4/variants/DEFYMALXX-4_snp.vcf DEFYMALXX-4_indel.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-4/variants/DEFYMALXX-4_indel.vcf 6 | XYZ683535 ABCVCALXX-5 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-5.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-5/ABCVCALXX-5.bam ABCVCALXX-5_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-5/variants/ABCVCALXX-5_snp.vcf ABCVCALXX-5_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-5/variants/ABCVCALXX-5_indel.vcf 7 | XYZ411370 ABCVCALXX-1 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-1.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-1/ABCVCALXX-1.bam ABCVCALXX-1_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-1/variants/ABCVCALXX-1_snp.vcf ABCVCALXX-1_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-1/variants/ABCVCALXX-1_indel.vcf 8 | XYZ737853 DEFYMALXX-2 TEST_batch62_2017-05-23 demo_vcfs_b04 DEFYMALXX-2.bam /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-2/DEFYMALXX-2.bam DEFYMALXX-2_snp.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-2/variants/DEFYMALXX-2_snp.vcf DEFYMALXX-2_indel.vcf /tmp/d100/E00253/170503_ADEFYMALXX/Sample_DEFYMALXX-2/variants/DEFYMALXX-2_indel.vcf 9 | XYZ266532 ABCVCALXX-8 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-8.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-8/ABCVCALXX-8.bam ABCVCALXX-8_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-8/variants/ABCVCALXX-8_snp.vcf ABCVCALXX-8_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-8/variants/ABCVCALXX-8_indel.vcf 10 | XYZ193291 ABCVCALXX-6 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-6.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-6/ABCVCALXX-6.bam ABCVCALXX-6_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-6/variants/ABCVCALXX-6_snp.vcf ABCVCALXX-6_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-6/variants/ABCVCALXX-6_indel.vcf 11 | XYZ674503 ABCVCALXX-4 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-4.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-4/ABCVCALXX-4.bam ABCVCALXX-4_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-4/variants/ABCVCALXX-4_snp.vcf ABCVCALXX-4_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-4/variants/ABCVCALXX-4_indel.vcf 12 | XYZ268598 ABCNYALXX-1 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCNYALXX-1.bam /tmp/d140/E00270/170507_BABCNYALXX/Sample_ABCNYALXX-1/ABCNYALXX-1.bam ABCNYALXX-1_snp.vcf /tmp/d140/E00270/170507_BABCNYALXX/Sample_ABCNYALXX-1/variants/ABCNYALXX-1_snp.vcf ABCNYALXX-1_indel.vcf /tmp/d140/E00270/170507_BABCNYALXX/Sample_ABCNYALXX-1/variants/ABCNYALXX-1_indel.vcf 13 | XYZ508726 ABCVCALXX-2 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-2.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-2/ABCVCALXX-2.bam ABCVCALXX-2_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-2/variants/ABCVCALXX-2_snp.vcf ABCVCALXX-2_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-2/variants/ABCVCALXX-2_indel.vcf 14 | XYZ227686 ABCVCALXX-7 TEST_batch62_2017-05-23 demo_vcfs_b04 ABCVCALXX-7.bam /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-7/ABCVCALXX-7.bam ABCVCALXX-7_snp.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-7/variants/ABCVCALXX-7_snp.vcf ABCVCALXX-7_indel.vcf /tmp/d140/E00270/170507_AABCVCALXX/Sample_ABCVCALXX-7/variants/ABCVCALXX-7_indel.vcf 15 | -------------------------------------------------------------------------------- /vcfs_to_aspera.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | """Batch process a collection of VCF files (SNPs and indels) that are 4 | listed in an XLSX file which is itself listed in a YAML file. 5 | 6 | Sample YAML: 7 | 8 | batch: 1 9 | date: 2017-06-16 10 | worklist_file: test.tsv 11 | batch_dest_root: '/tmp/testing/aspera/share/submissions/staging/test/one/{batch_name}' 12 | batch_name: VTE_Mayo_test{batch}_{date} 13 | source_root: '{batch_name}' 14 | snp_dir: '{batch_dest_root}/{batch_name}_SNPs' 15 | indel_dir: '{batch_dest_root}/{batch_name}_indels' 16 | """ 17 | 18 | import argparse 19 | import os 20 | from pprint import pprint # TODO: replace by logging later 21 | import subprocess 22 | import sys 23 | import time 24 | 25 | import pandas as pd 26 | import yaml 27 | 28 | 29 | DEFAULT_MAX_WORKERS = 2 30 | 31 | def main(): 32 | args = parse_args() 33 | config = load_yaml_config(args.config_file) 34 | if args.max_workers is not None: 35 | config.max_workers = args.max_workers 36 | # TODO: what if user sets to < 1? 37 | run(config) 38 | 39 | 40 | def parse_args(): 41 | parser = argparse.ArgumentParser(description=__doc__) 42 | parser.add_argument('config_file', help='a YAML file') 43 | parser.add_argument('-w', '--max_workers', type=int, 44 | help='default {}'.format(DEFAULT_MAX_WORKERS)) 45 | args = parser.parse_args() 46 | return args 47 | 48 | 49 | def load_yaml_config(config_file): 50 | with open(config_file) as ymlfile: 51 | yam = yaml.load(ymlfile) 52 | 53 | class BatchInfo: 54 | pass 55 | 56 | config = BatchInfo() 57 | config_dict = config.__dict__ 58 | config_dict.update(yam) 59 | config.batch = '{0:02d}'.format(config.batch) 60 | config.date = str(config.date) 61 | # TODO: Is there a more elegant way? 62 | config.batch_name = config.batch_name.format_map(config_dict) 63 | config.source_root = config.source_root.format_map(config_dict) 64 | config.batch_dest_root = config.batch_dest_root.format_map(config_dict) 65 | config.snp_dir = config.snp_dir.format_map(config_dict) 66 | config.indel_dir = config.indel_dir.format_map(config_dict) 67 | 68 | if not os.path.exists(config.source_root): 69 | os.mkdir(config.source_root) 70 | # TODO: replace this kind of printing by logging. 71 | pprint(config.__dict__) 72 | return config 73 | 74 | 75 | def run(config): 76 | for dir_path in (config.snp_dir, config.indel_dir): 77 | if not os.path.exists(dir_path): 78 | os.makedirs(dir_path) 79 | 80 | # TODO: have to be in the directory with the excel file 81 | print(repr(config.worklist_file)) 82 | if config.worklist_file.endswith('.xlsx'): 83 | vcfs = pd.read_excel(config.worklist_file) 84 | else: 85 | vcfs = pd.read_table(config.worklist_file) 86 | vcfs.rename( # The input names are not always consistent. 87 | columns={'indel_vcf_path': 'indel_path', 'snp_vcf_path': 'snp_path'} 88 | ) 89 | snp_paths = vcfs.snp_path 90 | indel_paths = vcfs.indel_path 91 | 92 | # Checking the numbers 93 | NUM_SNP_VCFS = len(snp_paths) 94 | NUM_INDEL_VCFS = len(indel_paths) 95 | 96 | if NUM_SNP_VCFS == NUM_INDEL_VCFS: 97 | print("equal", NUM_SNP_VCFS) 98 | else: 99 | print("not equal", NUM_SNP_VCFS, "is not", NUM_INDEL_VCFS, file=sys.stderr) 100 | 101 | worklist = [] # Pairs of (input file & destination dir) 102 | for snp_path in snp_paths: 103 | worklist.append((snp_path, config.snp_dir)) 104 | for indel_path in indel_paths: 105 | worklist.append((indel_path, config.indel_dir)) 106 | pprint(worklist) 107 | 108 | max_workers = getattr(config, 'max_workers', DEFAULT_MAX_WORKERS) 109 | # Running the compression and checksum script 110 | handle_backlog_with_workers(worklist, config.program_args, max_workers) 111 | 112 | 113 | def handle_backlog_with_workers(worklist, fixed_args, max_workers): 114 | """Run a loop with a maximum number of subprocesses. Consumes""" 115 | backlog = worklist[::-1] # Using pop() will iterate in original order. 116 | workers = [] 117 | while backlog or workers: 118 | while backlog and (len(workers) < max_workers): 119 | unit_of_work = backlog.pop() 120 | vcf_path, dest_dir_path = unit_of_work 121 | print('starting', vcf_path) 122 | worker = Worker(fixed_args, vcf_path, dest_dir_path) 123 | print('started', worker.vcf_name) 124 | workers.append(worker) 125 | for index, worker in enumerate(workers): 126 | worker.proc.poll() 127 | if worker.proc.returncode is not None: 128 | print('finished', worker.vcf_name) 129 | if worker.proc.returncode: # nonzero means error 130 | # Log the error and do whatever. 131 | print('error', worker.vcf_name) 132 | del workers[index] 133 | time.sleep(1) 134 | 135 | 136 | class Worker: 137 | """Composite of a subprocess (proc) and an open file (file). The stderr 138 | and stdout are combined into that single output file.""" 139 | def __init__(self, fixed_args, vcf_path, dest_dir_path): 140 | os.makedirs(dest_dir_path, exist_ok=True) 141 | args = fixed_args + [dest_dir_path, vcf_path] 142 | vcf_name = os.path.basename(vcf_path) 143 | log_name = vcf_name + '.log' 144 | log_path = os.path.join(dest_dir_path, log_name) 145 | self.vcf_name = vcf_name 146 | self.file = open(log_path, 'wb') 147 | self.proc = subprocess.Popen(args, 148 | stdout=self.file, 149 | stderr=subprocess.STDOUT) 150 | 151 | 152 | if __name__ == '__main__': 153 | main() 154 | --------------------------------------------------------------------------------