├── .gitattributes ├── .gitignore ├── Download_FTP_ENA.py ├── HapCUT2VCF.py ├── MUMmerSNPs2VCF.py ├── MUMmer_Helper ├── group_reads_by_len.py ├── scaffold_to_contig.py ├── submit_MUM_folder.py └── submit_MUM_merge_delta.py ├── README.md └── VCFsplit4HapCUT2.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Download_FTP_ENA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Download fastq files from ENA. 6 | """ 7 | 8 | from sys import argv 9 | import re 10 | import os 11 | 12 | script, first, second = argv 13 | working_dir = first 14 | data_file = second 15 | 16 | #dir = os.path.dirname(os.path.realpath(__file__)) 17 | #data_file_full = os.path.join(dir,data_file) 18 | data_file_full = data_file 19 | 20 | master_file = "run_ENA_download_Py.sh" 21 | 22 | file_out = open(master_file, 'w') 23 | file_out.write("#!/bin/sh\n") 24 | file_out.write("cd "+working_dir + "\n") 25 | 26 | num = 0 27 | run_position = 0 28 | ftp_position = 0 29 | with open(data_file_full, 'r') as file_in: 30 | for line in file_in.readlines(): 31 | buffer = re.split(r'\t', line.strip()) 32 | num += 1 33 | if num == 1 : 34 | for x in range(0,len(buffer)): 35 | if buffer[x] == 'run_accession' : 36 | run_position = x 37 | if buffer[x] == 'fastq_ftp' : 38 | ftp_position = x 39 | else: 40 | run_id = buffer[run_position] 41 | ftp_id_string = buffer[ftp_position] 42 | 43 | ftp_ids = re.split(r'\;', ftp_id_string) 44 | fq_num = 0 45 | for ftp_addr in ftp_ids : 46 | fq_num += 1 47 | sh_file = 'runPy_'+ str(num-1)+ '_'+ run_id+'_'+ str(fq_num) +'.sh' 48 | # master shell file 49 | file_out.write('qsub -q copyq '+ sh_file + " \n") 50 | 51 | with open(sh_file, 'w') as file_shell : 52 | file_shell.write("#!/bin/sh\n") 53 | file_shell.write("cd "+ working_dir + "\n") 54 | file_shell.write("wget "+ ftp_addr) 55 | 56 | file_out.close() 57 | ## Author : lxue@uga.edu 58 | 59 | 60 | -------------------------------------------------------------------------------- /HapCUT2VCF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This code convert a output file from HapCUT2(https://github.com/pjedge/hapcut2) into VCF format. 6 | Usage: 7 | HapCUT2VCF.py hapcut2_table output_VCF original_VCF(optional) 8 | 9 | Because hapcut2_table only keep phased variants. Unphased variants can be recovered if the original 10 | input VCF file for HapCUT2 is provided. The variants will be combined into the same output file. 11 | Each block in HapCUT2 table is stored as haplotype block in VCF file, and marked using PS tag. 12 | The numbering of PS starts from 1 for each run. The output tables can be concatenated before converting 13 | to generate unique PS IDs in the whole genome. 14 | 15 | Author : lxue@uga.edu 16 | """ 17 | 18 | import re 19 | import gzip 20 | import sys 21 | from sys import argv 22 | 23 | 24 | 25 | ################# 26 | ### FUNCTIONS ### 27 | ################# 28 | 29 | 30 | class VarTable: 31 | """Read variant table from HapCUT2 output """ 32 | def __init__(self): 33 | self.hap_vcf = {} 34 | self.delete_list = {} 35 | self.vcf_head = [] 36 | self.vcf_content = [] 37 | 38 | def load_haptable(self,input_filel): 39 | ps_tracking = 0 40 | with open(input_file,"r") as hap_table : 41 | for line in hap_table: 42 | if line.startswith('*****') or line.startswith('BLOCK'): 43 | if line.startswith('BLOCK'): 44 | ps_tracking += 1 45 | continue 46 | records = line.rstrip().split("\t") 47 | idx, hap1,hap2, chrom, position, ref, alt = records[0:7] 48 | idx = int(idx) 49 | if hap1 == '-' or hap2 == '-': 50 | self.delete_list[idx] = 1 51 | else : 52 | gt_out = hap1+'|'+hap2+':'+str(ps_tracking) # add PS number 53 | out_line = [chrom, position,'.',ref,alt,'30','PASS','.','GT:PS',gt_out] 54 | self.hap_vcf[idx] = out_line 55 | # 0 1 2 3 4 5 6 7 8 9 56 | # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 57 | 58 | def write_VCF_haponly(self,vcf_out): 59 | OUT = open(vcf_out, "w") 60 | OUT.write('##fileformat=VCFv4.1' + "\n") 61 | OUT.write('##FORMAT=' + "\n") 62 | out_line = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'sample1'] 63 | OUT.write("\t".join(out_line) + "\n") 64 | keys_order = sorted( self.hap_vcf.keys()) 65 | for idx_order in keys_order : 66 | out_line = self.hap_vcf[idx_order] 67 | OUT.write("\t".join(out_line) + "\n") 68 | 69 | def replace_gt(self,format, data, hap_data): 70 | format_records = format.split(':') 71 | gt_pos = format_records.index('GT') 72 | hap_list = hap_data.split(':') 73 | newdata_list = data.split(':') 74 | newdata_list[gt_pos] = hap_list[0] # change GT 75 | newdata_list.append(hap_list[1]) # append PS record 76 | return format+':PS', ':'.join(newdata_list) 77 | 78 | def check_ori_vcf(self, vcf_original, output_file): 79 | check_begin = re.compile("^#") 80 | tracking = 0 81 | OUT = open(output_file, "w") 82 | with open(vcf_original, "r") as INPUT: 83 | for line in INPUT: 84 | if check_begin.match(line): 85 | OUT.write(line) # report head lines 86 | else : 87 | tracking += 1 88 | if tracking in self.delete_list.keys(): 89 | continue 90 | # filter deleted records 91 | record = line.rstrip().split("\t") 92 | # record[7] = '.' 93 | # if phased replace the genotype 94 | if tracking in self.hap_vcf.keys(): 95 | hap_out = self.hap_vcf[tracking] 96 | if record[0] != hap_out[0] or record[1] != hap_out[1]: 97 | sys.exit("Order in original VCF doesn't match with HapCUT2 output") 98 | else : 99 | # change the genotype of 100 | new_format, new_data = self.replace_gt(record[8],record[9],hap_out[9]) 101 | record[8] = new_format 102 | record[9] = new_data 103 | # write the vcf line 104 | OUT.write("\t".join(record)+"\n") # report content lines 105 | OUT.close() 106 | # End of class 107 | 108 | 109 | ################# 110 | ### MAIN ### 111 | ################# 112 | 113 | if __name__ == "__main__": 114 | if len(argv) == 4 : 115 | script, input_file, output_file, vcf_original = argv 116 | elif len(argv) == 3 : 117 | script, input_file, output_file = argv 118 | vcf_original = '' 119 | else : 120 | sys.exit("Usage: HapCUT2VCF hapcut2_table output_VCF original_VCF(optional)") 121 | # read HapCUT2 output 122 | VarTab = VarTable () 123 | VarTab.load_haptable(input_file) 124 | if vcf_original == '': 125 | VarTab.write_VCF_haponly(output_file) 126 | else : 127 | VarTab.check_ori_vcf(vcf_original,output_file) 128 | -------------------------------------------------------------------------------- /MUMmerSNPs2VCF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This code take output from show-snps. 5 | The options should be set as: 6 | show-snps -Clr -x 1 -T mum.delta.filter >mum.delta.filterX.snps 7 | Usage: 8 | python3.4 MUMmerSNPs2VCF.py mum.delta.filterX.snps mum_filterX.snps.vcf 9 | 10 | 11 | Keywords: MUMmer show-snps VCF MUMmer2VCF 12 | """ 13 | 14 | 15 | from sys import argv 16 | script, input_file, output_file = argv 17 | 18 | OUT = open(output_file,"w") 19 | OUT.write('##fileformat=VCFv4.1'+"\n") 20 | OUT.write('##FORMAT='+"\n") 21 | out_line = ['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','sample1'] 22 | OUT.write("\t".join(out_line)+"\n") 23 | vcf_out = [] 24 | 25 | 26 | def check_buff(indel_buff_in): 27 | allele_ref = indel_buff_in[0][1] 28 | allele_alt = indel_buff_in[0][2] 29 | ref_id = indel_buff_in[0][12] 30 | if allele_ref == '.': 31 | # insertion 32 | pos = indel_buff_in[0][0] 33 | # In MUMmer format, the coordinate of '.' is the coordinate of the last nt so, this position is kept. 34 | ref_start = indel_buff_in[0][8][0] 35 | direction = indel_buff_in[0][11] 36 | alt_out = '' 37 | if direction == '-1': 38 | for line_l in indel_buff_in : 39 | alt_out = line_l[2]+alt_out 40 | else : 41 | for line_l in indel_buff_in : 42 | alt_out += line_l[2] 43 | alt_out = ref_start+alt_out 44 | out_line = [ref_id,pos,'.',ref_start,alt_out,'30','PASS','.','GT','1/1'] 45 | vcf_out.append(out_line) 46 | elif allele_alt == '.': 47 | # deletion 48 | pos = str(int(indel_buff_in[0][0])-1) 49 | # the coordinate here in the reference is correct, but we need the coordinate of last nt. 50 | # In VCF format, we need check the last nt. 51 | alt_start = indel_buff_in[0][8][0] # first nt in context 52 | ref_out = alt_start 53 | for line_l in indel_buff_in : 54 | ref_out += line_l[1] 55 | out_line = [ref_id,pos,'.',ref_out,alt_start,'30','PASS','.','GT','1/1'] 56 | vcf_out.append(out_line) 57 | else : 58 | sys.exit("Both in and del\n") 59 | 60 | ##################################### 61 | # initiation 62 | start = 0 63 | last_pos = 0 64 | last_ref = '' 65 | in_del_start = 0 66 | indel_buff = [] 67 | ################################## 68 | 69 | with open (input_file,"r") as INPUT: 70 | for line in INPUT: 71 | line = line.rstrip() 72 | if len(line)< 1: 73 | continue 74 | elif start == 0 and line[0] == '[': 75 | start = 1 76 | elif start == 1: 77 | line_list = line.split("\t") 78 | ref_id = line_list[12] 79 | pos = line_list[0] 80 | allele_ref = line_list[1] 81 | allele_alt = line_list[2] 82 | if allele_ref == '.' or allele_alt == '.': 83 | # insertion deletion 84 | if in_del_start == 0: 85 | in_del_start = 1 86 | indel_buff.append(line_list) 87 | else : 88 | if allele_ref == '.' : 89 | if ref_id == last_ref and int(pos) == last_pos : 90 | indel_buff.append(line_list) 91 | else : # new insertion 92 | check_buff(indel_buff) 93 | indel_buff = [] 94 | indel_buff.append(line_list) 95 | elif allele_alt == '.': 96 | if ref_id == last_ref and int(pos) == last_pos + 1: 97 | indel_buff.append(line_list) 98 | else: # new deletion 99 | check_buff(indel_buff) 100 | indel_buff = [] 101 | indel_buff.append(line_list) 102 | else : 103 | # SNP 104 | if in_del_start == 1: 105 | check_buff(indel_buff) 106 | indel_buff = [] 107 | in_del_start = 0 108 | ## write SNP regard less of last records 109 | out_line = [ref_id,pos,'.',allele_ref,allele_alt,'30','PASS','.','GT','1/1'] 110 | vcf_out.append(out_line) 111 | ## 112 | last_pos = int(pos) 113 | last_ref = ref_id 114 | ############### 115 | 116 | 117 | # Write VCF 118 | new_list1 = sorted(vcf_out, key=lambda x: int(x[1])) 119 | new_list = sorted(new_list1, key=lambda x: x[0]) 120 | for line_new in new_list: 121 | OUT.write("\t".join(line_new)+"\n") 122 | OUT.close() 123 | ## Author : lxue@uga.edu 124 | 125 | -------------------------------------------------------------------------------- /MUMmer_Helper/group_reads_by_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from sys import argv 5 | 6 | code, fasta_file, output_file, size_limit0 = argv 7 | 8 | 9 | #fasta_file = "Potra01-genome.fa" 10 | #output_file = "Potra01" 11 | #size_limit0 = 10000000 12 | 13 | 14 | size_limit = int(size_limit0) 15 | 16 | num = 0 17 | len_test = 0 18 | # read fasta 19 | fasta = open(fasta_file, "rU") 20 | for line in "".join(fasta.readlines()).split(">")[1:]: 21 | line=line.split("\n") 22 | seqid = line[0].split()[0].replace("|","") 23 | seqstr = "".join(line[1:]).upper() 24 | seqlen = len(seqstr) 25 | len_test += seqlen 26 | 27 | 28 | if len_test > size_limit or num == 0 : # initial 29 | if num > 0: 30 | handleOUT.close() 31 | 32 | len_test = 0 33 | num += 1 34 | print ("Open file "+str(num)+"\n") 35 | file_out = output_file + "_p_" + str(num) + ".fasta" 36 | handleOUT = open(file_out, "w") 37 | 38 | # write out all the time 39 | handleOUT.write(">" + seqid + "\n") 40 | handleOUT.write(seqstr + "\n") 41 | 42 | handleOUT.close() 43 | -------------------------------------------------------------------------------- /MUMmer_Helper/scaffold_to_contig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from sys import argv 5 | import re 6 | 7 | code, fasta_file, output_file, lim_in = argv 8 | 9 | 10 | #fasta_file = "Potra01-genome.fa" 11 | #output_file = "Potra01" 12 | #size_limit0 = 10000000 13 | 14 | file_out = open(output_file, 'w') 15 | 16 | lim = int(lim_in) 17 | 18 | 19 | pattern_in = 'N{'+ str(lim)+',}' 20 | pattern = re.compile(pattern_in) 21 | total_num = 0 22 | fasta = open(fasta_file, "rU") 23 | for line in "".join(fasta.readlines()).split(">")[1:]: 24 | line = line.split("\n") 25 | seqid = line[0].split()[0].replace("|", "") 26 | seqstr = "".join(line[1:]).upper() 27 | out_split = pattern.split(seqstr) 28 | num_part = 0 29 | 30 | if len(out_split)>1 : 31 | for seq_part in out_split: 32 | num_part += 1 33 | seqid_new = seqid + '_'+ str(num_part) 34 | file_out.write(">" + seqid_new + "\n") 35 | file_out.write(seq_part + "\n") 36 | else : 37 | file_out.write(">" + seqid + "\n") 38 | file_out.write(seqstr + "\n") 39 | 40 | file_out.close() 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /MUMmer_Helper/submit_MUM_folder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from sys import argv 5 | import glob 6 | import os 7 | 8 | script, working_dir , data_dir, reference = argv 9 | 10 | #master hash 11 | master_file = "sub_MUM_Py.sh" 12 | 13 | file_out = open(master_file, 'w') 14 | file_out.write("#!/bin/sh\n") 15 | file_out.write("cd "+working_dir + "\n") 16 | 17 | fq_num = 0 18 | #data_dir = "/escratch4/lxue/lxue_Aug_23/SNP_717v2/12Comparative/02MUMeach" 19 | for filename in sorted(glob.glob(os.path.join(data_dir, '*.fasta'))): 20 | dir,file=os.path.split(filename) 21 | fq_num += 1 22 | sh_file = 'run' + str(fq_num) + '_' +file[0:-6] +'.sh' 23 | print(sh_file) 24 | 25 | file_out.write('qsub -q rcc-m128-30d ' + sh_file + " \n") 26 | with open(sh_file, 'w') as file_shell: 27 | file_shell.write("#!/bin/sh\n") 28 | file_shell.write("cd " + working_dir + "\n") 29 | file_shell.write("/usr/local/mummer/latest/nucmer -prefix="+file[0:-6]+" \\\n") 30 | file_shell.write( reference + " \\\n") 31 | file_shell.write(filename+"\n") 32 | 33 | 34 | 35 | file_out.close() 36 | ## Author : lxue@uga.edu 37 | 38 | 39 | -------------------------------------------------------------------------------- /MUMmer_Helper/submit_MUM_merge_delta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from sys import argv 5 | import glob 6 | import os 7 | 8 | script, data_dir, reference, query, method, output_file= argv 9 | 10 | #master hash 11 | 12 | 13 | file_out = open(output_file, 'w') 14 | file_out.write(reference+" "+ query+ "\n") 15 | method_in = method.upper() 16 | file_out.write(method_in+"\n") 17 | 18 | fq_num = 0 19 | #data_dir = "/escratch4/lxue/lxue_Aug_23/SNP_717v2/12Comparative/02MUMeach" 20 | for filename in sorted(glob.glob(os.path.join(data_dir, '*.delta'))): 21 | #dir,file=os.path.split(filename) 22 | print(filename) 23 | fq_num += 1 24 | with open(filename, "rU") as f: 25 | lines = f.readlines()[2:] 26 | for line in lines: 27 | line_n = line.rstrip() 28 | file_out.write(line_n+"\n") 29 | 30 | print ("total file"+str(fq_num)) 31 | file_out.close() 32 | ## Author : lxue@uga.edu 33 | 34 | #/usr/local/mummer/latest/delta-filter -1 Potra01_p_9.delta >Potra01_p_9.delta.1filter 35 | #/usr/local/mummer/latest/show-coords -rclT Potra01_p_9.delta.1filter > Potra01_p_9.1delta.coords 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PythonNGSTools 2 | Scripts for NGS data processing 3 | 4 | ## HapCUT2VCF.py 5 | This code convert a output file from HapCUT2(https://github.com/pjedge/hapcut2) into VCF format. 6 | 7 | Usage(python3): 8 | ```python 9 | python HapCUT2VCF.py hapcut2_table output_VCF original_VCF(optional) 10 | ``` 11 | Because hapcut2_table only keep phased variants. Unphased variants can be recovered if the original 12 | input VCF file for HapCUT2 is provided. The variants will be combined into the same output file. 13 | Each block in HapCUT2 table is stored as haplotype block in VCF file, and marked using PS tag. 14 | The numbering of PS starts from 1 for each run. The output tables can be concatenated before converting 15 | to generate unique PS IDs in the whole genome. 16 | 17 | 18 | ## VCFsplit4HapCUT2.py 19 | This code split a VCF file into files of smaller size. It can be used to run HapCUT2 in parallel. 20 | 21 | Usage(python3): 22 | ```python 23 | python VCFsplit4HapCUT2.py vcf_input line_number_per_file 24 | ``` 25 | 26 | 27 | ## MUMmerSNPs2VCF.py 28 | This code converts output file from show-snps of MUMmer into VCF format. 29 | "-x 1" option should be turned on so that reference fasta is not needed. 30 | 31 | Usage: 32 | ``` 33 | show-snps -Clr -x 1 -T mum.delta.filter >mum.delta.filterX.snps 34 | python3.4 MUMmerSNPs2VCF.py mum.delta.filterX.snps mum_filterX.snps.vcf 35 | ``` 36 | 37 | Notes for the code: 38 | To get the correct converted VCF files from MUMmer/snps: 39 | 1) You need to check the reference sequence to rebuild insertion and deletion. 40 | Instead of reading original reference fasta file, I used "show-snps -x 1", so that the surrounding nucleotides are also reported. 41 | 2) For the insertions, if the query sequences are reversely mapped to the references, the orders of nucleotides in query sequence are reversely reported. 42 | So, they needed to be concatenated in reverse order. 43 | 3) The coordinates of insertion and deletions. 44 | For insertions, the coordinates in MUMmer/snps are the coordinates of nucleotides before insertions. They need to be kept as the same in VCF files. 45 | For deletions, the coordinates in MUMmer/snps are of the nucleotides that are deleted. The coordinates in VCF should be : first_position_of_deletion_block - 1. 46 | 47 | 48 | 49 | ## Download_FTP_ENA.py 50 | 51 | 52 | Download fastq files from ENA. 53 | 54 | Usage: 55 | 1. Search the project in ENA to get the information page of one study. 56 | 2. Click "Read Files" tag next to "Navigation". 57 | 3. Download the TEXT file (save as ENA_description.txt). 58 | 4. Write the shell scripts to download fastq files (The master shell submit downloading jobs to one queue on clusters). 59 | ``` 60 | python2.7 Download_FTP_ENA.py ./ ENA_description.txt 61 | ``` 62 | 63 | ./ : is the current directory and can be changed into the directory to store the fastq files. 64 | 65 | ## MUMmer helper 66 | Some tools to split large query file, submit jobs to clusters and merge the delta output from MUMmer 67 | 68 | 69 | -------------------------------------------------------------------------------- /VCFsplit4HapCUT2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This code split a VCF file into files of smaller size. It can be used to run HapCUT2 in parallel. 6 | Usage: 7 | python VCFsplit4HapCUT2.py vcf_input line_number_per_file 8 | Author : lxue@uga.edu 9 | """ 10 | 11 | import re 12 | from sys import argv 13 | import os.path 14 | 15 | 16 | ################# 17 | ### FUNCTIONS ### 18 | ################# 19 | 20 | class batch_vcf: 21 | def __init__(self,file_in, batch_tracking): 22 | self.head = [] 23 | self.content = [] 24 | self.file_out = file_in[0:-4]+'_P'+str(batch_tracking)+'.vcf' 25 | 26 | def write(self): 27 | VCFOUT = open(self.file_out, "w") 28 | VCFOUT.write("".join(self.head)) # write head lines 29 | VCFOUT.write("".join(self.content)) # write head lines 30 | VCFOUT.close() 31 | 32 | 33 | class VariantTable: 34 | """Hold the VCF records in memory""" 35 | 36 | def __init__(self, file_in): 37 | self.file_full = file_in 38 | dir, self.file_short = os.path.split(file_in) 39 | self.head = [] 40 | 41 | def split(self, batch_line_num): 42 | check_begin = re.compile("^#") 43 | batch_num = 1 44 | b_vcf = batch_vcf(self.file_short,batch_num) 45 | line_num = 0 46 | with open(self.file_full, "r") as INPUT: 47 | for line in INPUT: 48 | if check_begin.match(line): 49 | self.head.append(line) 50 | else: 51 | line_num += 1 52 | b_vcf.content.append(line) 53 | if(line_num >= batch_line_num): 54 | # write and initiate 55 | b_vcf.head = self.head 56 | b_vcf.write() 57 | # init 58 | batch_num += 1 59 | line_num = 0 60 | b_vcf = batch_vcf(self.file_short,batch_num) 61 | # just append as it is 62 | # end of for 63 | b_vcf.head = self.head 64 | b_vcf.write() # last batch in memory 65 | 66 | 67 | ################# 68 | ### MAIN ### 69 | ################# 70 | 71 | if __name__ == "__main__": 72 | script, input_file, batch_line_num = argv 73 | batch_line_num = int(batch_line_num) 74 | VarTab = VariantTable(input_file) 75 | VarTab.split(batch_line_num) 76 | --------------------------------------------------------------------------------