├── .gitattributes
├── .gitignore
├── Download_FTP_ENA.py
├── HapCUT2VCF.py
├── MUMmerSNPs2VCF.py
├── MUMmer_Helper
    ├── group_reads_by_len.py
    ├── scaffold_to_contig.py
    ├── submit_MUM_folder.py
    └── submit_MUM_merge_delta.py
├── README.md
└── VCFsplit4HapCUT2.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 


--------------------------------------------------------------------------------
/Download_FTP_ENA.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | Download fastq files from ENA.
 6 | """
 7 | 
 8 | from sys import argv
 9 | import re
10 | import os
11 | 
# Command line: <working_dir> <ENA "Read Files" tab-separated report>
script, first, second = argv
working_dir = first
data_file   = second

data_file_full  = data_file

# Master script that submits one download job per fastq file.
master_file = "run_ENA_download_Py.sh"

run_position = 0
ftp_position = 0
# Both handles are context-managed so the master script is flushed and closed
# even when a malformed row aborts the run.
with open(master_file, 'w') as file_out, open(data_file_full, 'r') as file_in:
    file_out.write("#!/bin/sh\n")
    file_out.write("cd " + working_dir + "\n")

    for num, line in enumerate(file_in, 1):
        # Strip only the line terminator: stripping tabs would drop empty
        # leading/trailing columns and shift every column index.
        columns = line.rstrip("\r\n").split("\t")

        if num == 1:
            # Header row: locate the two columns this script needs.
            names = [name.strip() for name in columns]
            try:
                run_position = names.index('run_accession')
                ftp_position = names.index('fastq_ftp')
            except ValueError:
                raise SystemExit("Header of " + data_file_full +
                                 " must contain 'run_accession' and 'fastq_ftp' columns")
            continue

        if len(columns) <= max(run_position, ftp_position):
            continue  # blank or truncated row: nothing to download

        run_id = columns[run_position].strip()
        # One run may list several fastq files separated by ';'.
        ftp_ids = [addr.strip() for addr in columns[ftp_position].split(';')]

        fq_num = 0
        for ftp_addr in ftp_ids:
            if not ftp_addr:
                continue  # run without a fastq URL: do not emit an empty wget job
            fq_num += 1
            # num - 1 is the 1-based data-row number (the header is line 1).
            sh_file = 'runPy_' + str(num - 1) + '_' + run_id + '_' + str(fq_num) + '.sh'
            # master shell file
            file_out.write('qsub -q copyq ' + sh_file + " \n")

            with open(sh_file, 'w') as file_shell:
                file_shell.write("#!/bin/sh\n")
                file_shell.write("cd " + working_dir + "\n")
                file_shell.write("wget " + ftp_addr + "\n")
57 | ## Author : lxue@uga.edu
58 |    
59 |    
60 | 


--------------------------------------------------------------------------------
/HapCUT2VCF.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | This code convert a output file from HapCUT2(https://github.com/pjedge/hapcut2) into VCF format.
  6 | Usage:
  7 | HapCUT2VCF.py   hapcut2_table  output_VCF   original_VCF(optional)
  8 | 
  9 | Because hapcut2_table only keep phased variants. Unphased variants can be recovered if the original 
 10 | input VCF file for HapCUT2 is provided. The variants will be combined into the same output file.
 11 | Each block in HapCUT2 table is stored as haplotype block in VCF file, and marked using PS tag.
 12 | The numbering of PS starts from 1 for each run. The output tables can be concatenated before converting
 13 | to generate unique PS IDs in the whole genome.
 14 | 
 15 | Author : lxue@uga.edu
 16 | """
 17 | 
 18 | import re
 19 | import gzip
 20 | import sys
 21 | from sys import argv
 22 | 
 23 | 
 24 | 
 25 | #################
 26 | ### FUNCTIONS ###
 27 | #################
 28 | 
 29 | 
 30 | class  VarTable:
 31 |     """Read variant table from HapCUT2 output """
 32 |     def __init__(self):
 33 |         self.hap_vcf = {}
 34 |         self.delete_list = {}
 35 |         self.vcf_head = []
 36 |         self.vcf_content = []
 37 | 
 38 |     def load_haptable(self,input_filel):
 39 |         ps_tracking = 0
 40 |         with open(input_file,"r") as hap_table :
 41 |             for line in hap_table:
 42 |                 if line.startswith('*****') or line.startswith('BLOCK'):
 43 |                     if line.startswith('BLOCK'):
 44 |                         ps_tracking += 1
 45 |                     continue
 46 |                 records = line.rstrip().split("\t")
 47 |                 idx, hap1,hap2, chrom, position, ref, alt = records[0:7]
 48 |                 idx = int(idx)
 49 |                 if hap1 == '-' or hap2 == '-':
 50 |                     self.delete_list[idx] = 1
 51 |                 else :
 52 |                     gt_out = hap1+'|'+hap2+':'+str(ps_tracking) # add PS number
 53 |                     out_line = [chrom, position,'.',ref,alt,'30','PASS','.','GT:PS',gt_out]
 54 |                     self.hap_vcf[idx] = out_line
 55 |                     #  0     1   2   3   4   5       6       7        8       9
 56 |                     # CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample1
 57 | 
 58 |     def write_VCF_haponly(self,vcf_out):
 59 |         OUT = open(vcf_out, "w")
 60 |         OUT.write('##fileformat=VCFv4.1' + "\n")
 61 |         OUT.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' + "\n")
 62 |         out_line = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'sample1']
 63 |         OUT.write("\t".join(out_line) + "\n")
 64 |         keys_order = sorted( self.hap_vcf.keys())
 65 |         for idx_order in keys_order :
 66 |             out_line = self.hap_vcf[idx_order]
 67 |             OUT.write("\t".join(out_line) + "\n")
 68 | 
 69 |     def replace_gt(self,format, data, hap_data):
 70 |         format_records = format.split(':')
 71 |         gt_pos = format_records.index('GT')
 72 |         hap_list = hap_data.split(':')
 73 |         newdata_list = data.split(':')
 74 |         newdata_list[gt_pos] = hap_list[0]  # change GT
 75 |         newdata_list.append(hap_list[1])          # append PS record
 76 |         return format+':PS',  ':'.join(newdata_list)
 77 | 
 78 |     def check_ori_vcf(self, vcf_original, output_file):
 79 |         check_begin = re.compile("^#")
 80 |         tracking = 0
 81 |         OUT = open(output_file, "w")
 82 |         with open(vcf_original, "r") as INPUT:
 83 |             for line in INPUT:
 84 |                 if check_begin.match(line):
 85 |                     OUT.write(line) # report head lines
 86 |                 else :
 87 |                     tracking += 1
 88 |                     if tracking in self.delete_list.keys():
 89 |                         continue
 90 |                     # filter deleted records
 91 |                     record = line.rstrip().split("\t")
 92 |                     # record[7] = '.'
 93 |                     # if phased replace the genotype
 94 |                     if tracking in self.hap_vcf.keys():
 95 |                         hap_out = self.hap_vcf[tracking]
 96 |                         if record[0] != hap_out[0] or   record[1] != hap_out[1]:
 97 |                             sys.exit("Order in original VCF doesn't match with HapCUT2 output")
 98 |                         else :
 99 |                             # change the genotype of
100 |                             new_format, new_data = self.replace_gt(record[8],record[9],hap_out[9])
101 |                             record[8] = new_format
102 |                             record[9] = new_data
103 |                     # write the vcf line
104 |                     OUT.write("\t".join(record)+"\n")  # report content lines
105 |         OUT.close()
106 | # End of class
107 | 
108 | 
109 | #################
110 | ###   MAIN    ###
111 | #################
112 | 
if __name__ == "__main__":
    # Accept two mandatory arguments plus an optional original VCF.
    arg_total = len(argv)
    if arg_total not in (3, 4):
        sys.exit("Usage: HapCUT2VCF  hapcut2_table  output_VCF   original_VCF(optional)")
    script, input_file, output_file = argv[0:3]
    vcf_original = argv[3] if arg_total == 4 else ''

    # read HapCUT2 output
    VarTab = VarTable ()
    VarTab.load_haptable(input_file)

    # With an original VCF the phasing is merged into it; otherwise only the
    # phased variants are written.
    if vcf_original != '':
        VarTab.check_ori_vcf(vcf_original,output_file)
    else :
        VarTab.write_VCF_haponly(output_file)
128 | 


--------------------------------------------------------------------------------
/MUMmerSNPs2VCF.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | This code take output from show-snps.
  5 | The options should be set as:
  6 | show-snps -Clr -x 1  -T mum.delta.filter  >mum.delta.filterX.snps
  7 | Usage:
  8 | python3.4 MUMmerSNPs2VCF.py mum.delta.filterX.snps mum_filterX.snps.vcf
  9 | 
 10 | 
 11 | Keywords: MUMmer show-snps VCF MUMmer2VCF
 12 | """
 13 | 
 14 | 
 15 | from sys import argv
 16 | script, input_file, output_file = argv
 17 | 
 18 | OUT = open(output_file,"w")
 19 | OUT.write('##fileformat=VCFv4.1'+"\n")
 20 | OUT.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'+"\n")
 21 | out_line = ['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','sample1']
 22 | OUT.write("\t".join(out_line)+"\n")                        
 23 | vcf_out = []
 24 | 
 25 | 
 26 | def check_buff(indel_buff_in):
 27 |     allele_ref = indel_buff_in[0][1]
 28 |     allele_alt = indel_buff_in[0][2]
 29 |     ref_id  = indel_buff_in[0][12]    
 30 |     if allele_ref == '.':
 31 |         # insertion 
 32 |         pos  = indel_buff_in[0][0]
 33 |         # In MUMmer format, the coordinate of '.' is the coordinate of the last nt so, this position is kept.
 34 |         ref_start = indel_buff_in[0][8][0]
 35 |         direction = indel_buff_in[0][11]
 36 |         alt_out = ''
 37 |         if direction == '-1':
 38 |             for line_l in indel_buff_in :
 39 |                 alt_out = line_l[2]+alt_out
 40 |         else :
 41 |             for line_l in indel_buff_in :
 42 |                 alt_out += line_l[2]
 43 |         alt_out = ref_start+alt_out
 44 |         out_line = [ref_id,pos,'.',ref_start,alt_out,'30','PASS','.','GT','1/1']
 45 |         vcf_out.append(out_line)
 46 |     elif allele_alt == '.':
 47 |         # deletion
 48 |         pos  = str(int(indel_buff_in[0][0])-1)
 49 |         # the coordinate here in the reference is correct, but we need the coordinate of last nt.
 50 |         # In VCF format, we need check the last nt.
 51 |         alt_start = indel_buff_in[0][8][0] # first nt in context
 52 |         ref_out = alt_start
 53 |         for line_l in indel_buff_in :
 54 |             ref_out += line_l[1]
 55 |         out_line = [ref_id,pos,'.',ref_out,alt_start,'30','PASS','.','GT','1/1']
 56 |         vcf_out.append(out_line)
 57 |     else :
 58 |         sys.exit("Both in and del\n")
 59 | 
 60 | #####################################
 61 | # initiation 
 62 | start  = 0
 63 | last_pos = 0
 64 | last_ref = ''
 65 | in_del_start = 0
 66 | indel_buff = []
 67 | ##################################
 68 | 
 69 | with open (input_file,"r") as INPUT:
 70 |     for line in INPUT:
 71 |         line = line.rstrip()
 72 |         if len(line)< 1:
 73 |             continue
 74 |         elif start == 0 and line[0] == '[':
 75 |             start = 1
 76 |         elif start == 1:
 77 |             line_list = line.split("\t")
 78 |             ref_id  = line_list[12]
 79 |             pos  = line_list[0]
 80 |             allele_ref = line_list[1]
 81 |             allele_alt = line_list[2]
 82 |             if allele_ref == '.' or allele_alt == '.':
 83 |                 # insertion     deletion
 84 |                 if in_del_start == 0:
 85 |                     in_del_start = 1
 86 |                     indel_buff.append(line_list)
 87 |                 else :
 88 |                     if allele_ref == '.' :
 89 |                         if ref_id == last_ref and int(pos) == last_pos :
 90 |                             indel_buff.append(line_list)
 91 |                         else : # new insertion
 92 |                             check_buff(indel_buff)
 93 |                             indel_buff = []
 94 |                             indel_buff.append(line_list)
 95 |                     elif allele_alt == '.':
 96 |                         if ref_id == last_ref and int(pos) == last_pos + 1:
 97 |                             indel_buff.append(line_list)
 98 |                         else:  # new deletion
 99 |                             check_buff(indel_buff)
100 |                             indel_buff = []
101 |                             indel_buff.append(line_list)
102 |             else :
103 |                 # SNP
104 |                 if in_del_start == 1:
105 |                     check_buff(indel_buff)
106 |                     indel_buff = []
107 |                     in_del_start = 0
108 |                 ## write SNP regard less of last records
109 |                 out_line = [ref_id,pos,'.',allele_ref,allele_alt,'30','PASS','.','GT','1/1']
110 |                 vcf_out.append(out_line)
111 |             ##
112 |             last_pos = int(pos)
113 |             last_ref = ref_id
114 | ###############
115 | 
116 | 
117 | #  Write VCF
# Order by reference ID, then numerically by position; the sort is stable, so
# records with equal keys keep the order in which they were collected.
ordered_records = sorted(vcf_out, key=lambda record: (record[0], int(record[1])))
for record in ordered_records:
    OUT.write("\t".join(record)+"\n")
OUT.close()
123 | ## Author : lxue@uga.edu
124 |    
125 | 


--------------------------------------------------------------------------------
/MUMmer_Helper/group_reads_by_len.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from sys import argv
 5 | 
 6 | code, fasta_file, output_file, size_limit0 = argv
 7 | 
 8 | 
 9 | #fasta_file  = "Potra01-genome.fa"
10 | #output_file = "Potra01"
11 | #size_limit0 = 10000000
12 | 
13 | 
14 | size_limit = int(size_limit0)
15 | 
16 | num = 0
17 | len_test = 0
18 | # read fasta
19 | fasta = open(fasta_file, "rU")
20 | for line in "".join(fasta.readlines()).split(">")[1:]:
21 |     line=line.split("\n")
22 |     seqid = line[0].split()[0].replace("|","")
23 |     seqstr = "".join(line[1:]).upper()
24 |     seqlen = len(seqstr)
25 |     len_test += seqlen
26 | 
27 | 
28 |     if len_test > size_limit or num == 0 : # initial
29 |         if num > 0:
30 |             handleOUT.close()
31 | 
32 |         len_test = 0
33 |         num += 1
34 |         print ("Open file "+str(num)+"\n")
35 |         file_out = output_file + "_p_" + str(num) + ".fasta"
36 |         handleOUT = open(file_out, "w")
37 | 
38 |     # write out all the time
39 |     handleOUT.write(">" + seqid + "\n")
40 |     handleOUT.write(seqstr + "\n")
41 | 
42 | handleOUT.close()
43 | 


--------------------------------------------------------------------------------
/MUMmer_Helper/scaffold_to_contig.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from sys import argv
 5 | import re
 6 | 
 7 | code, fasta_file, output_file, lim_in  = argv
 8 | 
 9 | 
10 | #fasta_file  = "Potra01-genome.fa"
11 | #output_file = "Potra01"
12 | #size_limit0 = 10000000
13 | 
14 | file_out = open(output_file, 'w')
15 | 
16 | lim =  int(lim_in)
17 | 
18 | 
19 | pattern_in = 'N{'+ str(lim)+',}'
20 | pattern = re.compile(pattern_in)
21 | total_num = 0
22 | fasta = open(fasta_file, "rU")
23 | for line in "".join(fasta.readlines()).split(">")[1:]:
24 |     line = line.split("\n")
25 |     seqid = line[0].split()[0].replace("|", "")
26 |     seqstr = "".join(line[1:]).upper()
27 |     out_split = pattern.split(seqstr)
28 |     num_part = 0
29 | 
30 |     if len(out_split)>1 :
31 |         for seq_part in out_split:
32 |             num_part += 1
33 |             seqid_new = seqid + '_'+ str(num_part)
34 |             file_out.write(">" + seqid_new + "\n")
35 |             file_out.write(seq_part + "\n")
36 |     else :
37 |         file_out.write(">" + seqid + "\n")
38 |         file_out.write(seqstr + "\n")
39 | 
40 | file_out.close()
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/MUMmer_Helper/submit_MUM_folder.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from sys import argv
 5 | import glob
 6 | import os
 7 | 
 8 | script, working_dir , data_dir, reference = argv
 9 | 
10 | #master hash
11 | master_file = "sub_MUM_Py.sh"
12 | 
13 | file_out = open(master_file, 'w')
14 | file_out.write("#!/bin/sh\n")
15 | file_out.write("cd "+working_dir + "\n")
16 | 
17 | fq_num  = 0
18 | #data_dir = "/escratch4/lxue/lxue_Aug_23/SNP_717v2/12Comparative/02MUMeach"
19 | for filename in sorted(glob.glob(os.path.join(data_dir, '*.fasta'))):
20 |     dir,file=os.path.split(filename)
21 |     fq_num += 1
22 |     sh_file = 'run' + str(fq_num) + '_' +file[0:-6] +'.sh'
23 |     print(sh_file)
24 | 
25 |     file_out.write('qsub -q rcc-m128-30d ' + sh_file + " \n")
26 |     with open(sh_file, 'w') as file_shell:
27 |         file_shell.write("#!/bin/sh\n")
28 |         file_shell.write("cd " + working_dir + "\n")
29 |         file_shell.write("/usr/local/mummer/latest/nucmer    -prefix="+file[0:-6]+" \\\n")
30 |         file_shell.write( reference + " \\\n")
31 |         file_shell.write(filename+"\n")
32 | 
33 | 
34 | 
35 | file_out.close()
36 | ## Author : lxue@uga.edu
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/MUMmer_Helper/submit_MUM_merge_delta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from sys import argv
 5 | import glob
 6 | import os
 7 | 
 8 | script,  data_dir, reference, query, method, output_file= argv
 9 | 
10 | #master hash
11 | 
12 | 
13 | file_out = open(output_file, 'w')
14 | file_out.write(reference+" "+ query+ "\n")
15 | method_in = method.upper()
16 | file_out.write(method_in+"\n")
17 | 
18 | fq_num  = 0
19 | #data_dir = "/escratch4/lxue/lxue_Aug_23/SNP_717v2/12Comparative/02MUMeach"
20 | for filename in sorted(glob.glob(os.path.join(data_dir, '*.delta'))):
21 |     #dir,file=os.path.split(filename)
22 |     print(filename)
23 |     fq_num += 1
24 |     with open(filename, "rU") as f:
25 |         lines = f.readlines()[2:]
26 |         for line in lines:
27 |             line_n = line.rstrip()
28 |             file_out.write(line_n+"\n")
29 | 
30 | print ("total file"+str(fq_num))
31 | file_out.close()
32 | ## Author : lxue@uga.edu
33 | 
34 | #/usr/local/mummer/latest/delta-filter -1  Potra01_p_9.delta  >Potra01_p_9.delta.1filter
35 | #/usr/local/mummer/latest/show-coords -rclT Potra01_p_9.delta.1filter  > Potra01_p_9.1delta.coords
36 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PythonNGSTools
 2 | Scripts for NGS data processing
 3 | 
 4 | ## HapCUT2VCF.py   
 5 | This code converts an output file from HapCUT2 (https://github.com/pjedge/hapcut2) into VCF format.
 6 | 
 7 | Usage(python3):  
 8 | ```python
 9 | python HapCUT2VCF.py  hapcut2_table  output_VCF   original_VCF(optional)  
10 | ```
11 | Because hapcut2_table keeps only phased variants, unphased variants can be recovered if the original 
12 | input VCF file for HapCUT2 is provided. The variants will be combined into the same output file.
13 | Each block in HapCUT2 table is stored as haplotype block in VCF file, and marked using PS tag.
14 | The numbering of PS starts from 1 for each run. The output tables can be concatenated before converting
15 | to generate unique PS IDs in the whole genome.
16 | 
17 | 
18 | ## VCFsplit4HapCUT2.py  
19 | This code splits a VCF file into smaller files.  It can be used to run HapCUT2 in parallel.
20 | 
21 | Usage(python3):
22 | ```python
23 | python VCFsplit4HapCUT2.py  vcf_input  line_number_per_file
24 | ```
25 | 
26 | 
27 | ## MUMmerSNPs2VCF.py  
28 | This code converts output file from show-snps of MUMmer into VCF format.
29 | "-x 1" option should be turned on so that reference fasta is not needed.  
30 | 
31 | Usage:  
32 | ```
33 | show-snps -Clr -x 1  -T mum.delta.filter  >mum.delta.filterX.snps  
34 | python3.4 MUMmerSNPs2VCF.py mum.delta.filterX.snps  mum_filterX.snps.vcf  
35 | ```
36 | 
37 | Notes for the code:  
38 | To get the correct converted VCF files from MUMmer/snps:  
39 | 1) You need to check the reference sequence to rebuild insertion and deletion.   
40 | Instead of reading original reference fasta file, I used "show-snps -x 1", so that the surrounding nucleotides are also reported.   
41 | 2) For insertions, if the query sequence is mapped to the reference in reverse orientation, the inserted nucleotides are reported in reverse order.   
42 | So, they need to be concatenated in reverse order.   
43 | 3) The coordinates of insertion and deletions.   
44 | For insertions, the coordinates in MUMmer/snps are the coordinates of nucleotides before insertions. They need to be kept as the same in VCF files.   
45 | For deletions, the coordinates in MUMmer/snps are of the nucleotides that are deleted. The coordinates in VCF should be : first_position_of_deletion_block - 1.  
46 | 
47 | 
48 | 
49 | ## Download_FTP_ENA.py
50 | 
51 | 
52 | Download fastq files from ENA.
53 | 
54 | Usage:  
55 | 1. Search the project in ENA to get the information page of one study.  
56 | 2. Click "Read Files" tag next to "Navigation".  
57 | 3. Download the TEXT file (save as ENA_description.txt).  
58 | 4. Write the shell scripts to download fastq files (The master shell submit downloading jobs to one queue on clusters).  
59 | ```
60 | python2.7 Download_FTP_ENA.py  ./  ENA_description.txt  
61 | ```
62 | 
63 | ./ : is the current directory and can be changed into the directory to store the fastq files.  
64 | 
65 | ## MUMmer helper
66 | Some tools to split large query file, submit jobs to clusters and merge the delta output from MUMmer
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/VCFsplit4HapCUT2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | This code split a VCF file into files of smaller size.  It can be used to run HapCUT2 in parallel.
 6 | Usage:
 7 | python VCFsplit4HapCUT2.py  vcf_input  line_number_per_file
 8 | Author : lxue@uga.edu
 9 | """
10 | 
11 | import re
12 | from sys import argv
13 | import os.path
14 | 
15 | 
16 | #################
17 | ### FUNCTIONS ###
18 | #################
19 | 
class batch_vcf:
    """One output chunk of a split VCF: header lines, record lines and target path.

    file_in        -- name of the VCF being split; the chunk is named after it
    batch_tracking -- chunk number, appended to the name as '_P<number>'
    """
    def __init__(self,file_in, batch_tracking):
        self.head = []      # header lines, each including its line terminator
        self.content = []   # record lines, each including its line terminator
        # Strip whatever extension the input has instead of blindly cutting
        # four characters, which mangles names that do not end in '.vcf'.
        stem = os.path.splitext(file_in)[0]
        self.file_out = stem+'_P'+str(batch_tracking)+'.vcf'

    def write(self):
        """Write the header lines followed by the record lines to file_out."""
        with open(self.file_out, "w") as VCFOUT:
            VCFOUT.write("".join(self.head))  # write head lines
            VCFOUT.write("".join(self.content))  # write content lines
31 | 
32 | 
class VariantTable:
    """Split a VCF file into chunks of a fixed number of records."""

    def __init__(self, file_in):
        self.file_full = file_in
        # Chunks are named after the bare file name, without its directory.
        dir, self.file_short = os.path.split(file_in)
        self.head = []

    def split(self, batch_line_num):
        """Write chunks of at most `batch_line_num` records, each with the full header.

        Lines starting with '#' are collected as the header; every other line is
        a record. Chunk files are named by batch_vcf from the input's bare file
        name and a running chunk number starting at 1. An input without records
        still produces chunk 1 holding only the header.
        """
        self.head = []  # start clean so a repeated call does not duplicate the header
        batch_num = 1
        written = 0
        b_vcf = batch_vcf(self.file_short,batch_num)
        with open(self.file_full, "r") as INPUT:
            for line in INPUT:
                if line.startswith('#'):
                    self.head.append(line)
                    continue
                b_vcf.content.append(line)
                if len(b_vcf.content) >= batch_line_num:
                    # write and initiate
                    b_vcf.head =  self.head
                    b_vcf.write()
                    written += 1
                    batch_num += 1
                    b_vcf = batch_vcf(self.file_short,batch_num)
        # Last batch in memory. Skip it when it is empty, which happens when the
        # record count is an exact multiple of the batch size; otherwise a
        # header-only chunk would be produced.
        if b_vcf.content or written == 0:
            b_vcf.head = self.head
            b_vcf.write()
65 | 
66 | 
67 | #################
68 | ###   MAIN    ###
69 | #################
70 | 
if __name__ == "__main__":
    # Arguments: the VCF to split and the number of records per output file.
    script, input_file, batch_line_num = argv
    batch_line_num = int(batch_line_num)
    VarTab = VariantTable(input_file)
    VarTab.split(batch_line_num)
76 | 


--------------------------------------------------------------------------------