├── .gitignore ├── etc └── SimpleJBrowser.conf ├── bin ├── calc_gap_cnt.py ├── get_genes_region_from_gff.py ├── split_fasta_by_id.py ├── extract_gene_from_gff.py ├── get_seq_from_range.py ├── split_fasta_by_chr.py ├── average_fpkm.py ├── filter_cds.py ├── convert_collinearity_from_MCScanX_to_Circos.py ├── get_gff_with_list.py ├── convert_gbff_to_fasta.py ├── split_cmd_with_parts.py ├── get_chr_len.py ├── quick_extract_fastx.py ├── split_fasta_by_count.py ├── find_gff_ovlp_regions.py ├── modify_geno_with_snp_mummer.py ├── StatAgp.py ├── split_ctg_with_agp.py ├── convert_simple_for_circos.py ├── subVCF.py ├── extract_vcf.py ├── bam_cov.py ├── extract_fasta_with_bed.py ├── convert_QTL_info.py ├── convert_chr_to_ctg_with_agp.py ├── StatAgpDetail.py ├── extract_all_sv_from_nucmer_delta.py ├── dup_dotplot.pl ├── merge_bed_regions.py ├── get_seq_with_bed.py ├── get_genes_from_range.py ├── eval_filled_gaps.py ├── nucmer_extract_all_sv.py ├── calc_gene_ovlp_te.py ├── group_SNP_exon_and_intron.py ├── group_exon_and_intron.py ├── nucmer_statistics.py ├── eval_synteny.py ├── rename_ID.py ├── simple_ANGSD_without_errorCorrect.py ├── check_cds.py ├── nucmer_statistics_all_sv.py ├── transfer_gff3_with_agp.py ├── sort_gff3.py ├── quick_mask_genome.py ├── SeqStat.py ├── approximate_cnv.py ├── simple_ANGSD.py ├── remove_region_by_blast_result.py ├── convert_anchorwave.py ├── blast2heatmap.py ├── SimContigs.py ├── SimCollapse.py ├── simple_JBrowser.py ├── SentieonSNP_filter.py ├── SimSID.py └── easyGoKegg.R ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode 3 | .idea 4 | *.pyc 5 | __pycache__ -------------------------------------------------------------------------------- /etc/SimpleJBrowser.conf: -------------------------------------------------------------------------------- 1 | [path] 2 | samtools= 3 | bam2wig= 4 | wig2bw= 5 | JBrowser= 6 | -------------------------------------------------------------------------------- /bin/calc_gap_cnt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def calc_gap_cnt(in_fa): 6 | total_cnt = 0 7 | with open(in_fa, 'r') as fin: 8 | id = '' 9 | gap_cnt = 0 10 | for line in fin: 11 | if line[0] == '>': 12 | if id != '': 13 | print("%s\t%d"%(id, gap_cnt)) 14 | total_cnt += gap_cnt 15 | id = line.strip()[1:] 16 | gap_cnt = 0 17 | last_base ='' 18 | else: 19 | for i in range(len(line.strip())): 20 | if line[i].lower() == 'n' and last_base != 'n': 21 | gap_cnt += 1 22 | last_base = line[i].lower() 23 | print("%s\t%d"%(id, gap_cnt)) 24 | total_cnt += gap_cnt 25 | print("Total\t%d"%total_cnt) 26 | 27 | 28 | if __name__ == "__main__": 29 | if len(sys.argv) < 2: 30 | print("Usage: python "+sys.argv[0]+" ") 31 | else: 32 | in_fa = sys.argv[1] 33 | calc_gap_cnt(in_fa) 34 | -------------------------------------------------------------------------------- /bin/get_genes_region_from_gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_genes_region_from_gff(gene_list, in_gff, out_bed): 6 | genes = [] 7 | with open(gene_list, 'r') as f_in: 8 | for line in f_in: 9 | genes.append(line.strip()) 10 | 11 | with open(in_gff, 'r') as f_in: 12 | with open(out_bed, 'w') as f_out: 13 | for line in f_in: 14 | if line[0] == '#' or line.strip() == '': 15 | continue 16 | data = line.strip().split() 17 | if data[2] != 'gene': 18 
| continue 19 | id = data[8].split(';')[1].split('=')[1] 20 | if id in genes: 21 | f_out.write(data[0]+'\t'+data[3]+'\t'+data[4]+'\t'+id+'\n') 22 | 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) < 4: 26 | print("Usage: python "+sys.argv[0]+" <gene_list> <in_gff> <out_bed>") 27 | else: 28 | proc, gene_list, in_gff, out_bed = sys.argv 29 | get_genes_region_from_gff(gene_list, in_gff, out_bed) 30 | 31 | -------------------------------------------------------------------------------- /bin/split_fasta_by_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def split_fasta_by_id(fasta_file, out_folder): 6 | if os.path.exists(out_folder) == False: 7 | os.mkdir(out_folder) 8 | seq_db = {} 9 | with open(fasta_file, 'r') as f_fasta: 10 | seq = '' 11 | seq_id = '' 12 | for line in f_fasta: 13 | if line[0] == ">": 14 | if seq != '': 15 | seq_db[seq_id] = seq 16 | seq_id = line.strip().split()[0] 17 | seq = '' 18 | else: 19 | seq += line.strip() 20 | seq_db[seq_id] = seq 21 | 22 | for seq_id in seq_db: 23 | f_out = open(out_folder+"/"+seq_id[1:]+".fasta", 'w') 24 | f_out.write(seq_id+"\n"+seq_db[seq_id]+"\n") 25 | f_out.close() 26 | 27 | 28 | if __name__ == "__main__": 29 | if len(sys.argv) < 3: 30 | print("Notice: script for splitting fasta into several files, each containing a single sequence") 31 | print("Usage: python " + sys.argv[0] + " <in_fasta> <out_dir>") 32 | else: 33 | in_fasta = sys.argv[1] 34 | out_dir = sys.argv[2] 35 | split_fasta_by_id(in_fasta, out_dir) 36 | -------------------------------------------------------------------------------- /bin/extract_gene_from_gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def extract_gene_from_gff(in_list, in_gff, out_bed): 6 | id_list = [] 7 | with open(in_list, 'r') as f_in: 8 | for line in f_in: 9 | id_list.append(line.strip()) 10 | 11 | with open(in_gff, 'r') as f_in: 12 | with open(out_bed, 'w') as f_out: 13 | for line in f_in: 14 | if line[0] == '#' or line.strip() == '': 15 | continue 16 | data = line.strip().split() 17 | if data[2] != 'gene': 18 | continue 19 | id = data[8].split(';')[0].split("=")[1] 20 | if id in id_list: 21 | f_out.write(data[0]+'\t'+data[3]+'\t'+data[4]+'\t'+id+'\n') 22 | 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) < 4: 26 | print("Notice: this script is used to extract regions of genes listed in the list file from the gff file") 27 | print("Usage: python "+sys.argv[0]+" <in_list> <in_gff> <out_bed>") 28 | else: 29 | in_list = sys.argv[1] 30 | in_gff = sys.argv[2] 31 | out_bed = sys.argv[3] 32 | extract_gene_from_gff(in_list, in_gff, out_bed) 33 | 34 | -------------------------------------------------------------------------------- /bin/get_seq_from_range.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_seq(in_fasta, in_bed, out_fasta): 6 | seq_db = {} 7 | with open(in_fasta, 'r') as f_in: 8 | id = '' 9 | seq = '' 10 | for line in f_in: 11 | if line[0] == '>': 12 | if seq != '': 13 | seq_db[id] = seq 14 | id = line.strip()[1:] 15 | seq = '' 16 | else: 17 | seq += line.strip() 18 | seq_db[id] = seq 19 | 20 | with open(in_bed, 'r') as f_in: 21 | with open(out_fasta, 'w') as f_out: 22 | for line in f_in: 23 | data = line.strip().split() 24 | chrn = data[0] 25 | s = int(data[1]) 26 | e = int(data[2]) 27 | f_out.write(">"+chrn+"["+str(s)+":"+str(e)+"]\n"+seq_db[chrn][s:e+1]+"\n") 28 | 29 | 30 | if __name__ == "__main__": 31 
| if len(sys.argv) < 4: 32 | print("Notice: extract sequences with bed file") 33 | print("Usage: python "+sys.argv[0]+" <in_fasta> <in_bed> <out_fasta>") 34 | else: 35 | in_fasta = sys.argv[1] 36 | in_bed = sys.argv[2] 37 | out_fasta = sys.argv[3] 38 | get_seq(in_fasta, in_bed, out_fasta) 39 | 40 | -------------------------------------------------------------------------------- /bin/split_fasta_by_chr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def split_fasta_by_chr(fasta_file, out_folder): 6 | if os.path.exists(out_folder) == False: 7 | os.mkdir(out_folder) 8 | seq_db = {} 9 | with open(fasta_file, 'r') as f_fasta: 10 | seq = '' 11 | seq_id = '' 12 | for line in f_fasta: 13 | if line[0] == ">": 14 | if seq != '': 15 | seq_db[seq_id] = seq 16 | seq_id = line.strip() 17 | seq = '' 18 | else: 19 | seq += line.strip() 20 | seq_db[seq_id] = seq 21 | 22 | for seq_id in seq_db: 23 | if seq_id[:4].lower() != '>chr': 24 | continue 25 | f_out = open(out_folder+"/"+seq_id[1:]+".fasta", 'w') 26 | f_out.write(seq_id+"\n"+seq_db[seq_id]+"\n") 27 | f_out.close() 28 | 29 | 30 | if __name__ == "__main__": 31 | if len(sys.argv) < 3: 32 | print("Notice: script for splitting fasta into several files, each containing a single chromosome") 33 | print("Usage: python " + sys.argv[0] + " <in_fasta> <out_dir>") 34 | else: 35 | in_fasta = sys.argv[1] 36 | out_dir = sys.argv[2] 37 | split_fasta_by_chr(in_fasta, out_dir) 38 | -------------------------------------------------------------------------------- /bin/average_fpkm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import numpy as np 4 | 5 | 6 | def average_fpkm(in_fpkm, out_avg): 7 | group_list = [] 8 | last_smp = '' 9 | with open(in_fpkm, 'r') as fin: 10 | with open(out_avg, 'w') as fout: 11 | for line in fin: 12 | data = line.strip().split() 13 | if data[0] == 'gene_id': 14 | fout.write("gene_id") 15 | for i in range(1, len(data)): 16 | smp = data[i][:-1] 17 | if smp != last_smp: 18 | last_smp = smp 19 | group_list.append([]) 20 | fout.write("\t%s"%smp) 21 | group_list[-1].append(i) 22 | fout.write("\n") 23 | else: 24 | fout.write("%s"%data[0]) 25 | for idxs in group_list: 26 | vals = [] 27 | for idx in idxs: 28 | vals.append(float(data[idx])) 29 | fout.write("\t%.2f"%np.average(vals)) 30 | fout.write("\n") 31 | 32 | 33 | if __name__ == "__main__": 34 | if len(sys.argv) < 3: 35 | print("Usage: python %s <in_fpkm> <out_avg>"%sys.argv[0]) 36 | else: 37 | in_fpkm, out_avg = sys.argv[1:] 38 | average_fpkm(in_fpkm, out_avg) 39 | -------------------------------------------------------------------------------- /bin/filter_cds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def filter_cds(in_cds, out_cds): 6 | print("Loading cds") 7 | cds_db = {} 8 | with open(in_cds, 'r') as fin: 9 | for line in fin: 10 | if line[0] == '>': 11 | id = line.strip().split()[0][1:] 12 | cds_db[id] = [] 13 | else: 14 | cds_db[id].append(line.strip().upper()) 15 | 16 | for id in cds_db: 17 | cds_db[id] = ''.join(cds_db[id]) 18 | print("Filtering cds") 19 | start_codon = set(["ATG"]) 20 | stop_codon = set(["TAG", "TAA", "TGA"]) 21 | 22 | with open(out_cds, 'w') as fout: 23 | for id in sorted(cds_db): 24 | cds_len = len(cds_db[id]) 25 | cds_start = cds_db[id][:3] 26 | cds_stop = cds_db[id][-3:] 27 | if (cds_len%3 != 0) or (cds_start not in start_codon) or (cds_stop not in stop_codon): 28 | 
fout.write(">%s\n%s\n"%(id, cds_db[id])) 29 | 30 | print("Finished") 31 | 32 | 33 | if __name__ == "__main__": 34 | if len(sys.argv) < 3: 35 | print("Usage: python %s "%sys.argv[0]) 36 | else: 37 | in_cds, out_cds = sys.argv[1:] 38 | filter_cds(in_cds, out_cds) 39 | 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Shengcheng Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /bin/convert_collinearity_from_MCScanX_to_Circos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_col(in_col, in_gff, out_txt): 6 | id_db = {} 7 | with open(in_gff, 'r') as f_gff: 8 | for line in f_gff: 9 | data = line.strip().split() 10 | id_db[data[1]] = data[0][:2]+"Chr"+data[0][2:]+"\t"+data[2]+"\t"+data[3]+"\n" 11 | i = 0 12 | with open(in_col, 'r') as f_col: 13 | with open(out_txt, 'w') as f_out: 14 | for line in f_col: 15 | if line[0] == "#": 16 | continue 17 | data = line.strip().split('\t') 18 | id_1 = data[1] 19 | id_2 = data[2] 20 | if id_1 not in id_db or id_2 not in id_db: 21 | continue 22 | f_out.write("link"+str(i)+"\t"+id_db[id_1]) 23 | f_out.write("link"+str(i)+"\t"+id_db[id_2]) 24 | i += 1 25 | 26 | 27 | if __name__ == "__main__": 28 | if len(sys.argv) < 4: 29 | print("Notice: script for converting collinearity file from MCScanX result to link file for Circos") 30 | print("Usage: python " + sys.argv[0] + " ") 31 | else: 32 | in_col = sys.argv[1] 33 | in_gff = sys.argv[2] 34 | out_txt = sys.argv[3] 35 | get_col(in_col, in_gff, out_txt) 36 | 37 | -------------------------------------------------------------------------------- /bin/get_gff_with_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | 5 | 6 | def get_gff_with_list(in_gff, in_list, out_gff): 7 | print("Loading list") 8 | use_id = {} 9 | with open(in_list, 'r') as fin: 10 | for line in fin: 11 | use_id[line.strip()] = 1 12 | 13 | print("Filter gff3") 14 | with open(in_gff, 'r') as fin: 15 | with open(out_gff, 'w') as fout: 16 | fout.write("#gff-version 3\n") 17 | is_write = False 18 | for line in fin: 19 | if line.strip() == '' or line[0] == '#': 20 | continue 21 
| data = line.strip().split() 22 | if data[2] == 'gene': 23 | if "Name" in data[8]: 24 | regexp = r'Name=(.*)' 25 | else: 26 | regexp = r'ID=(.*)' 27 | id = re.findall(regexp, data[8])[0].split(';')[0] 28 | if id in use_id: 29 | is_write = True 30 | else: 31 | is_write = False 32 | if is_write: 33 | fout.write(line) 34 | 35 | print("Finished") 36 | 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) < 4: 40 | print("Usage: python %s <in_gff> <in_list> <out_gff>"%sys.argv[0]) 41 | else: 42 | in_gff, in_list, out_gff = sys.argv[1:] 43 | get_gff_with_list(in_gff, in_list, out_gff) 44 | -------------------------------------------------------------------------------- /bin/convert_gbff_to_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def convert_gbff_to_fasta(in_gbff, out_fa): 6 | print("Converting") 7 | with open(in_gbff, 'r') as fin: 8 | with open(out_fa, 'w') as fout: 9 | cnt = 0 10 | err_cnt = 0 11 | for line in fin: 12 | data = line.strip().split() 13 | if len(data) == 0: 14 | continue 15 | if data[0] == 'LOCUS': 16 | cnt += 1 17 | gn = data[1] 18 | gn_len = int(data[2]) 19 | seq_len = 0 20 | fout.write(">%s\n"%gn) 21 | is_write = False 22 | elif data[0] == 'ORIGIN': 23 | is_write = True 24 | elif data[0] == '//': 25 | if gn_len != seq_len: 26 | err_cnt += 1 27 | print("\tERROR: %s Declared length: %sbp, current length: %dbp"%(gn, gn_len, seq_len)) 28 | is_write = False 29 | else: 30 | if is_write: 31 | seq = ''.join(data[1:]) 32 | seq_len += len(seq) 33 | fout.write(seq+'\n') 34 | print("Total converted %d, error count %d"%(cnt, err_cnt)) 35 | print("Finished") 36 | 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) < 3: 40 | print("Usage: python %s <in_gbff> <out_fasta>"%sys.argv[0]) 41 | else: 42 | in_gbff, out_fa = sys.argv[1:] 43 | convert_gbff_to_fasta(in_gbff, out_fa) 44 | -------------------------------------------------------------------------------- /bin/split_cmd_with_parts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import multiprocessing 4 | 5 | 6 | def write_cmd(fn, cmd_list): 7 | print("\tWriting %s"%fn) 8 | with open(fn, 'w') as fout: 9 | fout.write("".join(cmd_list)) 10 | 11 | 12 | def split_cmd(in_cmd, np, out_str, ts): 13 | print("Loading cmds") 14 | cmd_list = [] 15 | with open(in_cmd, 'r') as fin: 16 | for line in fin: 17 | cmd_list.append(line) 18 | 19 | print("Splitting commands") 20 | pool = multiprocessing.Pool(processes=ts) 21 | cmd_per_file = int(round(len(cmd_list)/np, 0)) 22 | for i in range(0, np): 23 | fn = out_str%(i+1) 24 | if i < np-1: 25 | pool.apply_async(write_cmd, (fn, cmd_list[i*cmd_per_file: (i+1)*cmd_per_file],)) 26 | else: 27 | pool.apply_async(write_cmd, (fn, cmd_list[i*cmd_per_file:],)) 28 | pool.close() 29 | pool.join() 30 | print("Finished") 31 | 32 | 33 | if __name__ == "__main__": 34 | if len(sys.argv) < 5: 35 | print("Usage: python "+sys.argv[0]+" <in_cmd> <parts> <out_str> <threads>") 36 | print("\t<out_str> is a string containing %d as file index, like run_%d.sh") 37 | else: 38 | in_cmd, np, out_str, ts = sys.argv[1:] 39 | split_cmd(in_cmd, int(np), out_str, int(ts)) 40 | 41 | -------------------------------------------------------------------------------- /bin/get_chr_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | 5 | 6 | def get_chr_len(in_fasta, out_file, is_chr_only): 7 | if is_chr_only != "T" and is_chr_only != "F": 8 | print("Error: is_chr_only must be T or F") 9 | exit(0) 10 | dict_len = {} 
11 | if in_fasta[-3:].lower() == '.gz': 12 | f_in = gzip.open(in_fasta, 'rt') 13 | else: 14 | f_in = open(in_fasta, 'r') 15 | chrn = '' 16 | seq = '' 17 | for line in f_in: 18 | if line[0] == '>': 19 | if seq != '': 20 | dict_len[chrn] = len(seq) 21 | chrn = line.strip().split()[0][1:] 22 | seq = '' 23 | else: 24 | seq += line.strip() 25 | dict_len[chrn] = len(seq) 26 | f_in.close() 27 | 28 | with open(out_file, 'w') as f_out: 29 | chr_list = sorted(dict_len.keys()) 30 | for chrn in chr_list: 31 | if is_chr_only == "T" and chrn[:3].lower() != 'chr': 32 | continue 33 | else: 34 | f_out.write(chrn+"\t"+str(dict_len[chrn])+"\n") 35 | 36 | 37 | if __name__ == "__main__": 38 | if len(sys.argv) < 4: 39 | print("Notice: script for calculating length of chromosomes in fasta file") 40 | print("Usage: python "+sys.argv[0]+" <in_fasta> <out_file> <is_chr_only(T/F)>") 41 | else: 42 | f_fasta = sys.argv[1] 43 | f_out = sys.argv[2] 44 | is_chr_only = sys.argv[3] 45 | get_chr_len(f_fasta, f_out, is_chr_only) 46 | -------------------------------------------------------------------------------- /bin/quick_extract_fastx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | import time 5 | 6 | 7 | def quick_extract_reads(in_fx, in_li, out_fx): 8 | print("\033[32m%s\033[0m Starting"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())))) 9 | read_db = {} 10 | with open(in_li, 'r') as fin: 11 | for line in fin: 12 | data = line.strip().split() 13 | read_db[data[0]] = '' 14 | 15 | fn = in_fx.split('.') 16 | if fn[-1].lower() == 'gz': 17 | fin = gzip.open(in_fx, 'rt') 18 | else: 19 | fin = open(in_fx, 'r') 20 | 21 | fn = out_fx.split('.') 22 | if fn[-1].lower() == 'gz': 23 | fout = gzip.open(out_fx, 'wt') 24 | else: 25 | fout = open(out_fx, 'w') 26 | 27 | is_write = False 28 | cnt = 0 29 | for line in fin: 30 | if cnt%2==0 and (line[0] == '>' or line[0] == '@'): 31 | id = line.strip().split()[0][1:] 32 | if id in read_db: 33 | is_write = True 34 | fout.write(line) 35 | else: 36 | is_write = False 37 | else: 38 | if is_write: 39 | fout.write(line) 40 | cnt += 1 41 | fin.close() 42 | fout.close() 43 | print("\033[32m%s\033[0m Finished"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())))) 44 | 45 | 46 | if __name__ == "__main__": 47 | if len(sys.argv) < 4: 48 | print("Usage: python "+sys.argv[0]+" <in_fastx> <in_list> <out_fastx>") 49 | else: 50 | in_fx, in_li, out_fx = sys.argv[1:] 51 | quick_extract_reads(in_fx, in_li, out_fx) 52 | 53 | -------------------------------------------------------------------------------- /bin/split_fasta_by_count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def split_fasta_by_count(in_fa, is_seq, cnt, out_dir): 6 | if not os.path.isdir(out_dir): 7 | os.mkdir(out_dir) 8 | 9 | seq_db = {} 10 | id = '' 11 | seq = '' 12 | with open(in_fa, 'r') as f_in: 13 | for line in f_in: 14 | if line[0] == '>': 15 | if seq != '': 16 | seq_db[id] = seq 17 | id = line.strip()[1:] 18 | seq = '' 19 | else: 20 | seq += line 21 | seq_db[id] = seq 22 | 23 | total_seq_cnt = len(seq_db) 24 | tmp_cnt = int(round(total_seq_cnt*1.0/cnt+0.5)) 25 | if is_seq: 26 | file_cnt = tmp_cnt 27 | seq_cnt = cnt 28 | else: 29 | file_cnt = cnt 30 | seq_cnt = tmp_cnt 31 | fn = os.path.basename(in_fa).replace('.fasta', '').replace('.fa', '') 32 | id_list = list(seq_db.keys()) 33 | for i in range(0, file_cnt): 34 | with open(os.path.join(out_dir, fn+"_"+str(i)+".fa"), 'w') as f_out: 35 | for j in range(0, seq_cnt): 36 | index = 
i*seq_cnt+j 37 | if index < len(id_list): 38 | f_out.write(">%s\n%s"%(id_list[index], seq_db[id_list[index]])) 39 | 40 | 41 | if __name__ == "__main__": 42 | if len(sys.argv) < 5: 43 | print("Usage: python "+sys.argv[0]+" <in_fasta> <s/f> <count> <out_dir>") 44 | else: 45 | in_fa = sys.argv[1] 46 | if sys.argv[2].lower() == 's': 47 | is_seq = True 48 | else: 49 | is_seq = False 50 | cnt = int(sys.argv[3]) 51 | out_dir = sys.argv[4] 52 | split_fasta_by_count(in_fa, is_seq, cnt, out_dir) 53 | 54 | -------------------------------------------------------------------------------- /bin/find_gff_ovlp_regions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def find_ovlp(in_gff3, out_bed): 6 | gff3_db = {} 7 | with open(in_gff3, 'r') as fin: 8 | for line in fin: 9 | if line[0] == '#' or line.strip() == '': 10 | continue 11 | data = line.strip().split() 12 | if data[2] != 'gene': 13 | continue 14 | chrn = data[0] 15 | sp = int(data[3]) 16 | ep = int(data[4]) 17 | dir = data[6] 18 | gn = data[8].split(";")[1].split("=")[1] 19 | if chrn not in gff3_db: 20 | gff3_db[chrn] = {} 21 | if dir not in gff3_db[chrn]: 22 | gff3_db[chrn][dir] = [] 23 | gff3_db[chrn][dir].append([sp, ep, gn]) 24 | 25 | with open(out_bed, 'w') as fout: 26 | for chrn in sorted(gff3_db): 27 | for dir in sorted(gff3_db[chrn]): 28 | tmp_list = [] 29 | last_e = 0 30 | pos_list = sorted(gff3_db[chrn][dir]) 31 | for i in range(0, len(pos_list)): 32 | s, e, gn = pos_list[i] 33 | if len(tmp_list) == 0: 34 | tmp_list.append(pos_list[i]) 35 | last_e = e 36 | elif s <= last_e: 37 | tmp_list.append(pos_list[i]) 38 | last_e = max(last_e, e) 39 | else: 40 | if len(tmp_list) > 1: 41 | for ts, te, tgn in tmp_list: 42 | fout.write("%s\t%d\t%d\t%s\t%s\n"%(chrn, ts, te, dir, tgn)) 43 | fout.write("###\n") 44 | tmp_list = [pos_list[i]] 45 | last_e = e 46 | if len(tmp_list) > 1: 47 | for ts, te, tgn in tmp_list: 48 | fout.write("%s\t%d\t%d\t%s\t%s\n"%(chrn, ts, te, dir, tgn)) 49 | fout.write("###\n") 50 | 51 | 52 | if __name__ == "__main__": 53 | if len(sys.argv) < 3: 54 | print("Usage: python "+sys.argv[0]+" <in_gff3> <out_bed>") 55 | else: 56 | in_gff3, out_bed = sys.argv[1:] 57 | find_ovlp(in_gff3, out_bed) 58 | -------------------------------------------------------------------------------- /bin/modify_geno_with_snp_mummer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def modify_geno(in_geno, in_snp, col, out_geno): 6 | snp_db = {} 7 | with open(in_snp, 'r') as f_in: 8 | for line in f_in: 9 | data = line.strip().split() 10 | if data == []: 11 | continue 12 | if data[0].isdigit() == False: 13 | continue 14 | chrn = data[-1] 15 | if chrn not in snp_db: 16 | snp_db[chrn] = {} 17 | pos = int(data[3]) 18 | snp_db[chrn][pos] = data[1].replace('.', 'N') + '/'+data[1].replace('.', 'N') 19 | 20 | cnt_ovlp_snp = 0 21 | with open(in_geno, 'r') as f_in: 22 | with open(out_geno, 'w') as f_out: 23 | for line in f_in: 24 | if line[0] == "#": 25 | f_out.write(line) 26 | continue 27 | data = line.strip().split() 28 | for i in range(0, len(data)): 29 | if i == col: 30 | if data[0] in snp_db and int(data[1]) in snp_db[data[0]]: 31 | cnt_ovlp_snp += 1 32 | f_out.write(snp_db[data[0]][int(data[1])]) 33 | else: 34 | f_out.write(data[i]) 35 | else: 36 | f_out.write(data[i]) 37 | if i < len(data)-1: 38 | f_out.write('\t') 39 | f_out.write('\n') 40 | print(cnt_ovlp_snp) 41 | 42 | 43 | if __name__ == "__main__": 44 | if len(sys.argv) < 5: 45 | print("Notice: modify column in geno file with snp result generated by show-snps of mummer") 46 | print("Usage: python "+sys.argv[0]+" <in_geno> <in_snp> <col> <out_geno>") 47 | else: 48 | in_geno = sys.argv[1] 49 | in_snp 
= sys.argv[2] 50 | col = int(sys.argv[3]) 51 | out_geno = sys.argv[4] 52 | modify_geno(in_geno, in_snp, col, out_geno) 53 | 54 | -------------------------------------------------------------------------------- /bin/StatAgp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | 5 | def stat_agp(in_agp): 6 | asm_db = {} 7 | total_tig = 0 8 | unchor_tig = 0 9 | unchor_tig_size = 0 10 | asm_size = 0 11 | with open(in_agp, 'r') as fin: 12 | for line in fin: 13 | data = line.strip().split() 14 | if data[4] == 'U': 15 | continue 16 | chrn = data[0] 17 | total_tig += 1 18 | ep = int(data[2]) 19 | if data[0] != data[5] and chrn[:3] != 'tig' and chrn[:3] != 'utg' and chrn[:3] != 'ctg': 20 | allele = chrn[-1] 21 | chrn = chrn[:-1] 22 | if chrn not in asm_db: 23 | asm_db[chrn] = {} 24 | if allele not in asm_db[chrn]: 25 | asm_db[chrn][allele] = ep 26 | asm_db[chrn][allele] = ep 27 | else: 28 | unchor_tig += 1 29 | unchor_tig_size += ep 30 | for chrn in asm_db: 31 | for allele in asm_db[chrn]: 32 | asm_size += asm_db[chrn][allele] 33 | print("\t%s"%('\t'.join(sorted(asm_db[chrn])))) 34 | 35 | for chrn in sorted(asm_db): 36 | print("%s"%chrn, end='') 37 | for allele in sorted(asm_db[chrn]): 38 | print("\t%s"%("{:,}".format(asm_db[chrn][allele])), end='') 39 | print("") 40 | print("No. of unanchored contigs\t%s"%("{:,}".format(unchor_tig))) 41 | print("Unanchored sequences (Mb)\t%s"%("{:,}".format(unchor_tig_size*1.0/1e6))) 42 | print("Total no. of contigs\t%s"%("{:,}".format(total_tig))) 43 | print("Total assembled size (Mb)\t%s"%("{:,}".format(asm_size*1.0/1e6))) 44 | 45 | 46 | if __name__ == "__main__": 47 | if len(sys.argv) < 2: 48 | print("Usage: python %s "%sys.argv[0]) 49 | else: 50 | in_agp = sys.argv[1] 51 | stat_agp(in_agp) 52 | -------------------------------------------------------------------------------- /bin/split_ctg_with_agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | 5 | 6 | def split_fa(in_fa, in_agp, out_dir): 7 | if not os.path.exists(out_dir): 8 | os.makedirs(out_dir) 9 | 10 | fa_db = {} 11 | with open(in_fa, 'r') as fin: 12 | for line in fin: 13 | if line[0] == '>': 14 | id = line.strip().split()[0][1:] 15 | fa_db[id] = [] 16 | else: 17 | fa_db[id].append(line.strip()) 18 | 19 | for id in fa_db: 20 | fa_db[id] = ''.join(fa_db[id]) 21 | 22 | chr_ctgs = {} 23 | with open(in_agp, 'r') as fin: 24 | for line in fin: 25 | if line.strip() == "" or line[0] == '#': 26 | continue 27 | data = line.strip().split() 28 | if data[4] != 'W': 29 | continue 30 | chrn = data[0] 31 | ctg = data[5] 32 | if chrn==ctg: 33 | chrn = 'Unanchored' 34 | if chrn not in chr_ctgs: 35 | chr_ctgs[chrn] = [] 36 | chr_ctgs[chrn].append(ctg) 37 | 38 | for chrn in chr_ctgs: 39 | out_fn = os.path.join(out_dir, "%s.fasta"%chrn) 40 | with open(out_fn, 'w') as fout: 41 | for id in chr_ctgs[chrn]: 42 | fout.write(">%s\n%s\n"%(id, fa_db[id])) 43 | 44 | 45 | if __name__ == "__main__": 46 | if len(sys.argv) < 4: 47 | print("Usage: python %s "%sys.argv[0]) 48 | else: 49 | in_fa, in_agp, out_dir = sys.argv[1:] 50 | split_fa(in_fa, in_agp, out_dir) 51 | 52 | -------------------------------------------------------------------------------- /bin/convert_simple_for_circos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | 5 | 6 | def 
convert_simple_for_circos(in_simple, in_gff3_files, out_link): 7 | print("Loading gff3") 8 | id_db = {} 9 | for in_gff3 in in_gff3_files.split(','): 10 | with open(in_gff3, 'r') as fin: 11 | for line in fin: 12 | if line.strip() == '' or line[0] == '#': 13 | continue 14 | data = line.strip().split() 15 | if data[2]!='gene': 16 | continue 17 | chrn = data[0] 18 | sp = int(data[3]) 19 | ep = int(data[4]) 20 | if 'Name' in data[8]: 21 | id = re.findall(r'Name=(.*)', data[8])[0].split(';')[0] 22 | else: 23 | id = re.findall(r'ID=(.*)', data[8])[0].split(';')[0] 24 | id_db[id] = [chrn, sp, ep] 25 | 26 | print("Loading and writing link") 27 | with open(in_simple, 'r') as fin: 28 | with open(out_link, 'w') as fout: 29 | for line in fin: 30 | data = line.strip().split() 31 | achrn = id_db[data[0]][0] 32 | asp = min(id_db[data[0]][1], id_db[data[1]][1]) 33 | aep = max(id_db[data[0]][2], id_db[data[1]][2]) 34 | bchrn = id_db[data[2]][0] 35 | bsp = min(id_db[data[2]][1], id_db[data[3]][1]) 36 | bep = max(id_db[data[2]][2], id_db[data[3]][2]) 37 | fout.write("%s\t%d\t%d\t%s\t%d\t%d\n"%(achrn, asp, aep, bchrn, bsp, bep)) 38 | 39 | print("Finished") 40 | 41 | 42 | if __name__ == "__main__": 43 | if len(sys.argv) < 4: 44 | print("Usage: python %s "%sys.argv[0]) 45 | else: 46 | in_simple, in_gff3_files, out_link = sys.argv[1:] 47 | convert_simple_for_circos(in_simple, in_gff3_files, out_link) 48 | -------------------------------------------------------------------------------- /bin/subVCF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | 5 | 6 | def subVCF(in_vcf, in_list, out_vcf, missing_rate): 7 | sp_list = [] 8 | 9 | with open(in_list, 'r') as f_in: 10 | for line in f_in: 11 | if line.strip() != '': 12 | sp_list.append(line.strip()) 13 | 14 | if in_vcf.split('.')[-1] == 'gz': 15 | f_in = gzip.open(in_vcf, 'rt') 16 | else: 17 | f_in = open(in_vcf, 'r') 18 | 19 | 20 | if out_vcf.split('.')[-1] == 'gz': 21 | f_out = gzip.open(out_vcf, 'wt') 22 | else: 23 | f_out = open(out_vcf, 'w') 24 | 25 | col_db = [] 26 | for line in f_in: 27 | data = line.strip().split() 28 | pub_info = data[0:9] 29 | if data[0][0] == "#": 30 | for i in range(9, len(data)): 31 | if data[i] in sp_list: 32 | col_db.append(i) 33 | 34 | cnt_mis = 0 35 | out_str = '' 36 | for i in col_db: 37 | out_str += "\t"+data[i] 38 | if data[i] == './.' 
or data[i] == '.|.': 39 | cnt_mis += 1 40 | if len(col_db) > 0 and cnt_mis*1.0/len(col_db) > missing_rate: 41 | continue 42 | f_out.write("\t".join(pub_info)) 43 | f_out.write(out_str+"\n") 44 | 45 | f_in.close() 46 | f_out.close() 47 | 48 | 49 | if __name__ == "__main__": 50 | if len(sys.argv) < 4: 51 | print("Notice: script to extract vcf file with list file, default missing rate 0.4") 52 | print("Usage: python "+sys.argv[0]+" []") 53 | else: 54 | in_vcf = sys.argv[1] 55 | in_list = sys.argv[2] 56 | out_vcf = sys.argv[3] 57 | if len(sys.argv) == 5: 58 | missing_rate = float(sys.argv[4]) 59 | else: 60 | missing_rate = 0.4 61 | subVCF(in_vcf, in_list, out_vcf, missing_rate) 62 | 63 | -------------------------------------------------------------------------------- /bin/extract_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def bin_search(bed_list, pos): 6 | s = 0 7 | e = len(bed_list)-1 8 | while s <= e: 9 | mid = (s+e)//2 10 | if bed_list[mid][0] < pos: 11 | s = mid+1 12 | elif bed_list[mid][0] > pos: 13 | e = mid-1 14 | else: 15 | return True 16 | if bed_list[e][0] <= pos and bed_list[e][1] >= pos: 17 | return True 18 | else: 19 | return False 20 | 21 | 22 | def extract_vcf(in_vcf, in_bed, out_vcf): 23 | print("Loading BED file") 24 | bed_db = {} 25 | with open(in_bed, 'r') as f_in: 26 | for line in f_in: 27 | if line.strip() == "": 28 | continue 29 | data = line.strip().split() 30 | chrn = data[0] 31 | sr = int(data[1]) 32 | er = int(data[2]) 33 | if chrn not in bed_db: 34 | bed_db[chrn] = [] 35 | bed_db[chrn].append([sr, er]) 36 | 37 | print("Extracting VCF") 38 | last_chrn = "" 39 | with open(in_vcf, 'r') as f_in: 40 | with open(out_vcf, 'w') as f_out: 41 | for line in f_in: 42 | if line.strip() == "": 43 | continue 44 | if line[0] == '#': 45 | f_out.write(line) 46 | else: 47 | data = line.strip().split() 48 | chrn = data[0] 49 | if chrn != last_chrn: 50 | print("\tExtracting %s"%chrn) 51 | last_chrn = chrn 52 | if chrn not in bed_db: 53 | continue 54 | pos = int(data[1]) 55 | if bin_search(bed_db[chrn], pos): 56 | f_out.write(line) 57 | 58 | print("Finished") 59 | 60 | 61 | if __name__ == "__main__": 62 | if len(sys.argv) < 4: 63 | print("Usage: python "+sys.argv[0]+" ") 64 | else: 65 | proc, in_vcf, in_bed, out_vcf = sys.argv 66 | extract_vcf(in_vcf, in_bed, out_vcf) 67 | 68 | -------------------------------------------------------------------------------- /bin/bam_cov.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pysam 3 | import multiprocessing 4 | import argparse 5 | 6 | 7 | def get_opts(): 8 | group = argparse.ArgumentParser() 9 | group.add_argument("-b", "--bam", help="Input bam file, must be indexed", required=True) 10 | group.add_argument("-o", "--output", help="Output statistic", required=True) 11 | group.add_argument("-t", "--threads", help="Threads, default=10", type=int, default=10) 12 | return group.parse_args() 13 | 14 | 15 | def sub_cov(in_bam, chrn, chrl): 16 | bins = [0 for _ in range(chrl)] 17 | with pysam.AlignmentFile(in_bam, 'rb') as fin: 18 | for read in fin.fetch(chrn): 19 | for pos in read.get_reference_positions(): 20 | bins[pos] = 1 21 | return chrn, chrl, sum(bins) 22 | 23 | 24 | def main(): 25 | opts = get_opts() 26 | in_bam = opts.bam 27 | out_stat = opts.output 28 | threads = opts.threads 29 | chr_name = [] 30 | chr_len = [] 31 | with pysam.AlignmentFile(in_bam, 'rb') as fin: 32 | 
chr_name = fin.references 33 | chr_len = fin.lengths 34 | 35 | res = [] 36 | pool = multiprocessing.Pool(processes=threads) 37 | for _ in range(len(chr_name)): 38 | r = pool.apply_async(sub_cov, (in_bam, chr_name[_], chr_len[_], )) 39 | res.append(r) 40 | pool.close() 41 | pool.join() 42 | 43 | with open(out_stat, 'w') as fout: 44 | total_covl = 0 45 | total_chrl = 0 46 | for r in res: 47 | chrn, chrl, covl = r.get() 48 | fout.write("%s\t%d\t%d\t%f\n"%(chrn, chrl, covl, covl*1./chrl)) 49 | total_covl += covl 50 | total_chrl += chrl 51 | fout.write("Total\t%d\t%d\t%f\n"%(total_chrl, total_covl, total_covl*1./total_chrl)) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /bin/extract_fasta_with_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def rev_seq(seq): 6 | rseq = "" 7 | base_db = {"A": "T", "T": "A", "C": "G", "G": "C"} 8 | for base in seq[::-1]: 9 | if base in base_db: 10 | rseq += base_db[base] 11 | else: 12 | rseq += base 13 | return rseq 14 | 15 | 16 | def extract_fa_with_bed(in_fa, in_bed, out_fa): 17 | print("Loading fasta") 18 | fa_db = {} 19 | with open(in_fa, 'r') as fin: 20 | seq = "" 21 | id = "" 22 | for line in fin: 23 | if line[0] == '>': 24 | if seq: 25 | fa_db[id] = seq 26 | id = line.strip().split()[0][1:] 27 | seq = "" 28 | else: 29 | seq += line.strip().upper() 30 | if seq: 31 | fa_db[id] = seq 32 | 33 | print("Loading bed and writing fasta") 34 | with open(in_bed, 'r') as fin: 35 | with open(out_fa, 'w') as fout: 36 | for line in fin: 37 | data = line.strip().split() 38 | chrn = data[0] 39 | sp = int(data[1])-1 40 | ep = int(data[2]) 41 | direct = data[3] 42 | id = data[4] 43 | seq = fa_db[chrn][sp: ep] 44 | if direct == '-': 45 | seq = rev_seq(seq) 46 | fout.write(">%s\n%s\n"%(id, seq)) 47 | print("Finished") 48 | 49 | 50 | if __name__ == "__main__": 51 | if len(sys.argv) < 4: 52 | print("Usage: python %s <in_fa> <in_bed> <out_fa>"%sys.argv[0]) 53 | print("Notice: bed should be 5 columns: \"chrom, start, end, strand, id\", positions should be 1-based") 54 | else: 55 | in_fa, in_bed, out_fa = sys.argv[1:] 56 | extract_fa_with_bed(in_fa, in_bed, out_fa) 57 | -------------------------------------------------------------------------------- /bin/convert_QTL_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_tig_pos_of_chr(in_file): 6 | pos_db = {} 7 | with open(in_file, 'r') as f_in: 8 | for line in f_in: 9 | if line.strip() == '': 10 | continue 11 | data = line.strip().split() 12 | if data[4] == 'U': 13 | continue 14 | chrn = data[0] 15 | spos = int(data[1]) 16 | epos = int(data[2]) 17 | tig = data[5] 18 | direct = data[-1] 19 | pos_db[tig] = [chrn, spos, epos, direct] 20 | return pos_db 21 | 22 | 23 | def convert_QTL_info(in_QTL, in_agp, out_QTL): 24 | tig_on_chr = get_tig_pos_of_chr(in_agp) 25 | with open(in_QTL, 'r') as fin: 26 | with open(out_QTL, 'w') as fout: 27 | for line in fin: 28 | data = line.strip().split() 29 | if data[0] == 'Pop': 30 | data.extend(['ActChr', 'Left_Pos', 'Right_Pos', 'Direct']) 31 | else: 32 | ltig, lpos = data[4].split('_') 33 | rtig, rpos = data[5].split('_') 34 | lpos = int(lpos) 35 | rpos = int(rpos) 36 | lchr, lsp, lep, ld = tig_on_chr[ltig] 37 | rchr, rsp, rep, rd = tig_on_chr[rtig] 38 | if lchr != rchr: 39 | print(data[1]+"\t"+lchr+"\t"+rchr) 40 | continue 41 | if ld == '-': 42 
| lpos = lep-lpos+1 43 | else: 44 | lpos = lsp+lpos-1 45 | if rd == '-': 46 | rpos = rep-rpos+1 47 | else: 48 | rpos = rsp+rpos-1 49 | if lpos > rpos: 50 | tmp = lpos 51 | lpos = rpos 52 | rpos = tmp 53 | direct = "-" 54 | else: 55 | direct = "+" 56 | data.extend([lchr, str(lpos), str(rpos), direct]) 57 | fout.write("%s\n"%'\t'.join(data)) 58 | 59 | 60 | if __name__ == "__main__": 61 | if len(sys.argv) < 4: 62 | print("Usage: python "+sys.argv[0]+" ") 63 | else: 64 | in_QTL, in_agp, out_QTL = sys.argv[1:] 65 | convert_QTL_info(in_QTL, in_agp, out_QTL) 66 | -------------------------------------------------------------------------------- /bin/convert_chr_to_ctg_with_agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def reverse_seq(seq): 6 | base_db = {"A": "T", "T": "A", "G": "C", "C": "G"} 7 | rev_seq = ''.join([base_db[_] if _ in base_db else _ for _ in seq[::-1]]) 8 | return rev_seq 9 | 10 | 11 | def convert_chr_to_ctg(in_fa, in_agp, out_fa): 12 | print("Loading genome file") 13 | fa_db = {} 14 | with open(in_fa, 'r') as fin: 15 | for line in fin: 16 | if line[0] == '>': 17 | id = line.strip().split()[0][1:] 18 | fa_db[id] = [] 19 | else: 20 | fa_db[id].append(line.strip().upper()) 21 | 22 | for id in fa_db: 23 | fa_db[id] = ''.join(fa_db[id]) 24 | 25 | print("Loading AGP file") 26 | ctg_db = {} 27 | with open(in_agp, 'r') as fin: 28 | for line in fin: 29 | if line.strip() == "" or line[0] == '#': 30 | continue 31 | data = line.strip().split() 32 | if data[4] != 'W': 33 | continue 34 | chrn = data[0] 35 | sp = int(data[1]) - 1 36 | ep = int(data[2]) 37 | ctg = data[5] 38 | direct = data[-1] 39 | ctg_db[ctg] = [chrn, sp, ep, direct] 40 | 41 | print("Writing contig file") 42 | with open(out_fa, 'w') as fout: 43 | for ctg in sorted(ctg_db): 44 | chrn, sp, ep, direct = ctg_db[ctg] 45 | seq = fa_db[chrn][sp: ep] 46 | if direct == '-': 47 | seq = reverse_seq(seq) 48 | fout.write(">%s\n%s\n" % (ctg, seq)) 49 | 50 | print("Finished") 51 | 52 | 53 | if __name__ == "__main__": 54 | if len(sys.argv) < 4: 55 | print("Usage: python %s " % sys.argv[0]) 56 | else: 57 | in_fa, in_agp, out_fa = sys.argv[1:] 58 | convert_chr_to_ctg(in_fa, in_agp, out_fa) 59 | -------------------------------------------------------------------------------- /bin/StatAgpDetail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | 5 | def stat_agp(in_agp, out_csv): 6 | asm_db = {} 7 | unanc_cnt = 0 8 | unanc_len = 0 9 | anc_cnt = 0 10 | anc_len = 0 11 | gap_cnt = 0 12 | gap_len = 0 13 | with open(in_agp, 'r') as fin: 14 | for line in fin: 15 | data = line.strip().split() 16 | if data[4] == 'U': 17 | gap_cnt += 1 18 | gap_len += 100 19 | else: 20 | chrn = data[0] 21 | if chrn[:3] != 'tig' and chrn[:3] != 'utg' and chrn[:3] != 'ctg' and data[0] != data[5]: 22 | allele = chrn[-1] 23 | chrn = chrn[:-1] 24 | if chrn not in asm_db: 25 | asm_db[chrn] = {} 26 | if allele not in asm_db[chrn]: 27 | asm_db[chrn][allele] = {'cnt': 0, 'len': 0} 28 | asm_db[chrn][allele]['cnt'] += 1 29 | asm_db[chrn][allele]['len'] = int(data[2]) 30 | anc_cnt += 1 31 | anc_len += int(data[7]) 32 | else: 33 | unanc_cnt += 1 34 | unanc_len += int(data[2]) 35 | 36 | for chrn in asm_db: 37 | break 38 | with open(out_csv, 'w') as fout: 39 | fout.write(",%s\n"%(',,'.join(sorted(asm_db[chrn])))) 40 | for chrn in sorted(asm_db): 41 | info = [chrn] 42 | for allele in sorted(asm_db[chrn]): 43 | 
info.append("\"%s\""%("{:,}".format(asm_db[chrn][allele]['cnt']))) 44 | info.append("\"%s\""%("{:,}".format(asm_db[chrn][allele]['len']))) 45 | fout.write("%s\n"%(','.join(info))) 46 | fout.write("Anchored contigs,\"%s\",\"%s\"\n"%("{:,}".format(anc_cnt), "{:,}".format(anc_len/1e6))) 47 | fout.write("Unanchored contigs,\"%s\",\"%s\"\n"%("{:,}".format(unanc_cnt), "{:,}".format(unanc_len/1e6))) 48 | fout.write("Gaps,\"%s\",\"%s\"\n"%("{:,}".format(gap_cnt), "{:,}".format(gap_len/1e6))) 49 | fout.write("Total,\"%s\",\"%s\"\n"%("{:,}".format(anc_cnt+unanc_cnt), "{:,}".format((anc_len+unanc_len)/1e6))) 50 | 51 | 52 | if __name__ == "__main__": 53 | if len(sys.argv) < 3: 54 | print("Usage: python %s <in_agp> <out_csv>"%sys.argv[0]) 55 | else: 56 | in_agp, out_csv = sys.argv[1:] 57 | stat_agp(in_agp, out_csv) 58 | -------------------------------------------------------------------------------- /bin/extract_all_sv_from_nucmer_delta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def nucmer_extract(in_delta, out_pre): 6 | print("Running delta-filter") 7 | cmd = "delta-filter -gqr "+in_delta+" > "+in_delta+".filtered" 8 | print("Running command: "+cmd) 9 | os.system(cmd) 10 | print("Extracting") 11 | data_db = {} 12 | last_INDEL_pos = {} 13 | r_chr_len_db = {} 14 | q_chr_len_db = {} 15 | #sv_list = ['SNP', 'INDEL', 'JMP', 'INV', 'DUP', 'BRK'] 16 | with os.popen("show-diff -H "+in_delta+".filtered", 'r') as f_in: 17 | for line in f_in: 18 | data = line.strip().split() 19 | if len(data) < 5: 20 | continue 21 | chrn = data[0] 22 | sv = data[1] 23 | if sv not in data_db: 24 | data_db[sv] = {} 25 | if chrn not in data_db[sv]: 26 | data_db[sv][chrn] = [] 27 | sp = data[2] 28 | ep = data[3] 29 | data_db[sv][chrn].append([sp, ep]) 30 | 31 | data_db['SNP'] = {} 32 | data_db['INDEL'] = {} 33 | with os.popen("show-snps -ClrT "+in_delta+".filtered", 'r') as f_in: 34 | for line in f_in: 35 | data = line.strip().split() 36 | if len(data) == 0 or data[0].isdigit() == False: 37 | continue 38 | 39 | pos = int(data[0]) 40 | r_chrn = data[-2] 41 | if r_chrn not in data_db['SNP']: 42 | data_db['SNP'][r_chrn] = [] 43 | if r_chrn not in data_db['INDEL']: 44 | data_db['INDEL'][r_chrn] = [] 45 | if data[1] != '.' 
and data[2] != '.': 46 | data_db['SNP'][r_chrn].append([data[0], data[1], data[2]]) 47 | else: 48 | data_db['INDEL'][r_chrn].append([data[0], data[1], data[2]]) 49 | 50 | print("Writing data") 51 | for sv in data_db: 52 | with open(out_pre+"."+sv+".txt", 'w') as fout: 53 | for chrn in sorted(data_db[sv]): 54 | for data in data_db[sv][chrn]: 55 | fout.write("%s\t%s\n"%(chrn, '\t'.join(data))) 56 | print("Success") 57 | 58 | 59 | if __name__ == "__main__": 60 | if len(sys.argv) < 3: 61 | print("Usage: python "+sys.argv[0]+" <in_delta> <out_prefix>") 62 | else: 63 | in_delta, out_pre = sys.argv[1:] 64 | nucmer_extract(in_delta, out_pre) 65 | -------------------------------------------------------------------------------- /bin/dup_dotplot.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use Getopt::Std; 3 | getopts "g:r:q:n:t:"; 4 | 5 | if ((!defined $opt_g)or(!defined $opt_r)or(!defined $opt_q)or(!defined $opt_n)){ 6 | die"****************************************************************************************** 7 | Usage: perl $0 -g reference_genome -r ref_id -q query_id -n number_of_dup -t threads 8 | ref_id: reference cds and bed name, like: Sb, Sb.cds and Sb.bed must exist 9 | query_id: query cds and bed name, like: Os 10 | number_of_dup: number of duplications 11 | threads: default 1 12 | ******************************************************************************************\n"; 13 | } 14 | 15 | my $genome = $opt_g; 16 | my $ref_name = $opt_r; 17 | my $qry_name = $opt_q; 18 | my $dup_n = $opt_n; 19 | if (!defined $opt_t){ 20 | $threads = "1"; 21 | } 22 | else{ 23 | $threads = $opt_t; 24 | } 25 | 26 | my %sbcdsdb; 27 | open(IN, $ref_name.".cds") or die""; 28 | while(<IN>){ 29 | chomp; 30 | if(/>/){ 31 | $gene = $_; 32 | $gene =~ s/\s.*//g; 33 | $gene =~ s/>//g; 34 | }else{ 35 | $sbcdsdb{$gene} .= $_; 36 | } 37 | } 38 | close IN; 39 | 40 | my $size=(-s $genome); 41 | my $gmap = "gmap"; 42 | if($size>2**32){ 43 | $gmap = "gmapl"; 44 | } 45 | system("gmap_build -D . -d DB ".$genome); 46 | system($gmap." -D . -d DB -t ".$threads." -f 2 -n ".$dup_n." ".$ref_name.".cds"." > ".$qry_name.".gff3"); 47 | 48 | open(CDS, "> ".$qry_name.".cds") or die""; 49 | open(BED, "> ".$qry_name.".bed") or die""; 50 | my $count = 0; 51 | open(IN, "grep 'gene' ".$qry_name.".gff3|") or die""; 52 | while(<IN>){ 53 | chomp; 54 | $count++; 55 | my @data = split(/\s+/,$_); 56 | my $gid = $data[0]; 57 | my $a = $data[3]; 58 | my $b = $data[4]; 59 | my $gene = $1 if(/Name=(\S+)/); 60 | $gene =~ s/;.*//g; 61 | my $cds = $sbcdsdb{$gene}; 62 | $gene = $gene."_".$count; 63 | print CDS ">$gene\n$cds\n"; 64 | print BED "$gid $a $b $gene 0 $data[6]\n"; 65 | } 66 | close IN; 67 | close CDS; 68 | close BED; 69 | 70 | system(" python -m jcvi.compara.catalog ortholog ".$qry_name." 
".$ref_name); 71 | 72 | -------------------------------------------------------------------------------- /bin/merge_bed_regions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | def merge_reions(region_db, md): 5 | new_regions = {} 6 | for chrn in region_db: 7 | new_regions[chrn] = [] 8 | tmp_list = [] 9 | tmp_gns = [[]] 10 | for region in sorted(region_db[chrn]): 11 | s = region[0] 12 | e = region[1] 13 | gn = region[2] 14 | if len(tmp_list) == 0: 15 | tmp_list.append(s) 16 | laste = e 17 | else: 18 | if s > laste+md: 19 | tmp_list.append(laste) 20 | tmp_list.append(s) 21 | tmp_gns.append([]) 22 | laste = e 23 | else: 24 | if e > laste: 25 | laste = e 26 | tmp_gns[-1].extend(gn) 27 | tmp_list.append(laste) 28 | for i in range(0, len(tmp_list), 2): 29 | s = tmp_list[i] 30 | e = tmp_list[i+1] 31 | gns = ''.join(tmp_gns[int(i/2)]) 32 | gns = list(set(gns.split(','))) 33 | new_gns = [] 34 | for gn in gns: 35 | if gn != '': 36 | new_gns.append(gn) 37 | gns = ','.join(new_gns) 38 | new_regions[chrn].append([s, e, gns]) 39 | return new_regions 40 | 41 | 42 | def read_bed(in_bed): 43 | bed_db = {} 44 | with open(in_bed, 'r') as f_in: 45 | for line in f_in: 46 | if line.strip() == '': 47 | continue 48 | data = line.strip().split() 49 | chrn = data[0] 50 | sp = int(data[1]) 51 | ep = int(data[2]) 52 | if len(data) > 3: 53 | gn = data[-1] 54 | else: 55 | gn = '' 56 | if sp > ep: 57 | temp = sp 58 | sp = ep 59 | ep = temp 60 | if chrn not in bed_db: 61 | bed_db[chrn] = [] 62 | bed_db[chrn].append([sp, ep, gn]) 63 | return bed_db 64 | 65 | 66 | def merge_regions_in_bed(in_bed, out_bed, md): 67 | ori_regions = read_bed(in_bed) 68 | new_regions = merge_reions(ori_regions, md) 69 | with open(out_bed, 'w') as f_out: 70 | for chrn in sorted(new_regions): 71 | for region in new_regions[chrn]: 72 | f_out.write("%s\t%d\t%d\t%s\n"%(chrn, region[0], region[1], region[2])) 73 | 74 | 75 | if __name__ == "__main__": 76 | if len(sys.argv) < 4: 77 | print("Usage: python "+sys.argv[0]+" ") 78 | else: 79 | in_bed, out_bed, md = sys.argv[1:] 80 | md = int(md) 81 | merge_regions_in_bed(in_bed, out_bed, md) 82 | -------------------------------------------------------------------------------- /bin/get_seq_with_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | 5 | 6 | def reverse_seq(seq): 7 | base_db = {"A": "T", "T": "A", "C": "G", "G": "C"} 8 | rev_seq = "".join([base_db[_] if _ in base_db else _ for _ in seq.upper()[::-1]]) 9 | return rev_seq 10 | 11 | 12 | def get_seq_with_bed(in_fa, in_bed, out_fa): 13 | print("Loading bed") 14 | bed_db = {} 15 | with open(in_bed, "r") as fin: 16 | for line in fin: 17 | data = line.strip().split() 18 | chrn = data[0] 19 | sp = int(data[1]) - 1 20 | ep = int(data[2]) 21 | if len(data) > 4: 22 | direct = data[3] 23 | else: 24 | direct = "+" 25 | gid = data[-1] 26 | if chrn not in bed_db: 27 | bed_db[chrn] = [] 28 | bed_db[chrn].append([sp, ep, direct, gid]) 29 | 30 | print("Extracting") 31 | if in_fa.endswith(".gz"): 32 | fin = gzip.open(in_fa, "rt") 33 | else: 34 | fin = open(in_fa, "r") 35 | 36 | fa_db = {} 37 | for line in fin: 38 | if line[0] == ">": 39 | cid = line.strip()[1:] 40 | fa_db[cid] = [] 41 | else: 42 | fa_db[cid].append(line.strip()) 43 | 44 | fin.close() 45 | for _ in fa_db: 46 | fa_db[_] = "".join(fa_db[_]) 47 | 48 | with open(out_fa, "w") as fout: 49 | for cid in bed_db: 50 | for sp, 
ep, direct, gid in bed_db[cid]: 51 | fout.write( 52 | ">%s\n%s\n" 53 | % ( 54 | gid, 55 | ( 56 | fa_db[cid][sp:ep] 57 | if direct == "+" 58 | else reverse_seq(fa_db[cid][sp:ep]) 59 | ), 60 | ) 61 | ) 62 | 63 | print("Finished") 64 | 65 | 66 | if __name__ == "__main__": 67 | if len(sys.argv) < 4: 68 | print("Usage: python %s <in_fa> <in_bed> <out_fa>" % sys.argv[0]) 69 | else: 70 | in_fa, in_bed, out_fa = sys.argv[1:] 71 | get_seq_with_bed(in_fa, in_bed, out_fa) 72 | -------------------------------------------------------------------------------- /bin/get_genes_from_range.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def search(listGene, x, y, td): 6 | res = "" 7 | for line in listGene: 8 | data = line.split('\t') 9 | if (x >= int(data[0]) and x <= int(data[1])) or (y >= int(data[0]) and y <= int(data[1])) or (x <= int(data[0]) and y >= int(data[1])): 10 | if x >= int(data[0]): 11 | start_pos = x 12 | else: 13 | start_pos = int(data[0]) 14 | if y <= int(data[1]): 15 | end_pos = y 16 | else: 17 | end_pos = int(data[1]) 18 | if (end_pos-start_pos+1)*1.0/(y-x+1) >= td: 19 | res = res+data[2]+'\n' 20 | return res 21 | 22 | 23 | def get_genes_from_range(f_gff, f_bed, f_out, td): 24 | gff = open(f_gff,'r') 25 | dictGene = {} 26 | for line in gff: 27 | data = line.strip().split('\t') 28 | if(len(data) > 3): 29 | if(data[2] == "gene"): 30 | if data[0] not in dictGene: 31 | dictGene[data[0]] = [] 32 | dictGene[data[0]].append(data[3]+'\t'+data[4]+'\t'+data[8].split(';')[0][3:]) 33 | gff.close() 34 | 35 | for key in dictGene: 36 | dictGene[key].sort(key=lambda rec: int(rec.split('\t')[0])) 37 | bed = open(f_bed,'r') 38 | out = open(f_out,'w') 39 | for line in bed: 40 | data = line.strip().split('\t') 41 | if len(data) < 3 or line[0] == '#': 42 | continue 43 | ss = int(data[1]) 44 | se = int(data[2]) 45 | if data[0] in dictGene: 46 | res = search(dictGene[data[0]], ss, se, td) 47 | if res != "": 48 | rs = res.split('\n') 49 | out.write(line.strip()) 50 | for r in rs: 51 | if r != "": 52 | out.write('\t'+r) 53 | out.write('\n') 54 | out.close() 55 | bed.close() 56 | 57 | 58 | if __name__ == "__main__": 59 | if len(sys.argv) < 5: 60 | print("Notice: script for getting genes overlapping ranges in a bed file") 61 | print("Usage: python "+sys.argv[0]+" <in_gff> <in_bed> <out_file> <min_ovlp_percent>") 62 | else: 63 | f_gff = sys.argv[1] 64 | f_bed = sys.argv[2] 65 | f_out = sys.argv[3] 66 | td = float(sys.argv[4])/100.0 67 | get_genes_from_range(f_gff, f_bed, f_out, td) 68 | -------------------------------------------------------------------------------- /bin/eval_filled_gaps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def search_gaps(seq): 6 | gaps_db = [] 7 | cnt_n = 0 8 | for i in range(0, len(seq)): 9 | if seq[i].lower() == 'n': 10 | if cnt_n == 0: 11 | s = i 12 | cnt_n += 1 13 | else: 14 | if cnt_n != 0: 15 | e = i 16 | gaps_db.append([s, e-1]) 17 | cnt_n = 0 18 | if cnt_n != 0: 19 | gaps_db.append([s, len(seq)-1]) 20 | cnt_n = 0 21 | for region in gaps_db: 22 | cnt_n += region[1]-region[0]+1 23 | return gaps_db, cnt_n 24 | 25 | 26 | def calc_gaps(seq): 27 | cnt_n = 0 28 | for i in range(0, len(seq)): 29 | if seq[i].lower() == 'n': 30 | cnt_n += 1 31 | return cnt_n 32 | 33 | 34 | def make_seq_db(in_fasta): 35 | seq_db = {} 36 | with open(in_fasta, 
'r') as f_in: 37 | id = '' 38 | seq = '' 39 | for line in f_in: 40 | if line[0] == ">": 41 | if seq != '': 42 | seq_db[id] = seq 43 | id = line.strip() 44 | seq = '' 45 | else: 46 | seq += line.strip() 47 | seq_db[id] = seq 48 | return seq_db 49 | 50 | 51 | def eval_filled_gaps(ref_fasta, query_fasta, result_file): 52 | print("Reading reference fasta") 53 | ref_seq_db = make_seq_db(ref_fasta) 54 | 55 | print("Reading query fasta") 56 | query_seq_db = make_seq_db(query_fasta) 57 | 58 | print("Evaluating") 59 | with open(result_file, 'w') as f_out: 60 | for id in ref_seq_db: 61 | ref_gaps_db, ref_gaps_cnt = search_gaps(ref_seq_db[id]) 62 | query_gaps_cnt = calc_gaps(query_seq_db[id]) 63 | f_out.write(id[1:]+"\n") 64 | if ref_gaps_cnt != 0: 65 | f_out.write("Filled %0.2f%%\n"%((ref_gaps_cnt-query_gaps_cnt)*1.0/ref_gaps_cnt*100.0)) 66 | else: 67 | f_out.write("No gaps\n") 68 | if len(ref_gaps_db) != 0: 69 | for region in ref_gaps_db: 70 | s = region[0] 71 | e = region[1] 72 | f_out.write("Region %d-%d:\n%s\n"%(s, e, query_seq_db[id][s:e+1])) 73 | f_out.write("\n") 74 | print("Success") 75 | 76 | 77 | if __name__ == "__main__": 78 | if len(sys.argv) < 4: 79 | print("Notice: this script is used to evaluate how well gaps have been filled") 80 | print("Usage: python "+sys.argv[0]+" <ref_fasta> <query_fasta> <result_file>") 81 | else: 82 | ref_fasta = sys.argv[1] 83 | query_fasta = sys.argv[2] 84 | result_file = sys.argv[3] 85 | eval_filled_gaps(ref_fasta, query_fasta, result_file) 86 | 87 | -------------------------------------------------------------------------------- /bin/nucmer_extract_all_sv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def nucmer_extract(r_fa, q_fa, out_pre, ts): 6 | print("Running nucmer") 7 | cmd = "nucmer -p "+out_pre+" "+r_fa+" "+q_fa+" -t "+ts 8 | print("Running command: "+cmd) 9 | os.system(cmd) 10 | 11 | print("Running delta-filter") 12 | cmd = "delta-filter -gqr "+out_pre+".delta > "+out_pre+".filtered" 13 | print("Running command: "+cmd) 14 | os.system(cmd) 15 | 16 | print("Extracting") 17 | data_db = {} 18 | last_INDEL_pos = {} 19 | r_chr_len_db = {} 20 | q_chr_len_db = {} 21 | #sv_list = ['SNP', 'INDEL', 'JMP', 'INV', 'DUP', 'BRK'] 22 | with os.popen("show-diff -H "+out_pre+".filtered", 'r') as f_in: 23 | for line in f_in: 24 | data = line.strip().split() 25 | if len(data) < 5: 26 | continue 27 | chrn = data[0] 28 | sv = data[1] 29 | if sv not in data_db: 30 | data_db[sv] = {} 31 | if chrn not in data_db[sv]: 32 | data_db[sv][chrn] = [] 33 | sp = data[2] 34 | ep = data[3] 35 | data_db[sv][chrn].append([sp, ep]) 36 | 37 | data_db['SNP'] = {} 38 | data_db['INDEL'] = {} 39 | with os.popen("show-snps -ClrT "+out_pre+".filtered", 'r') as f_in: 40 | for line in f_in: 41 | data = line.strip().split() 42 | if len(data) == 0 or data[0].isdigit() == False: 43 | continue 44 | 45 | pos = int(data[0]) 46 | r_chrn = data[-2] 47 | if r_chrn not in data_db['SNP']: 48 | data_db['SNP'][r_chrn] = [] 49 | if r_chrn not in data_db['INDEL']: 50 | data_db['INDEL'][r_chrn] = [] 51 | if data[1] != '.' 
and data[2] != '.': 52 | data_db['SNP'][r_chrn].append([data[0], data[1], data[2]]) 53 | else: 54 | data_db['INDEL'][r_chrn].append([data[0], data[1], data[2]]) 55 | 56 | print("Writing data") 57 | with open(out_pre+".sv.txt", 'w') as fall: 58 | for sv in data_db: 59 | with open(out_pre+"."+sv+".txt", 'w') as fout: 60 | for chrn in sorted(data_db[sv]): 61 | for data in data_db[sv][chrn]: 62 | if sv != "SNP" and sv != "INDEL": 63 | fall.write("%s\t%s\t%s\n"%(chrn, '\t'.join(data), sv)) 64 | fout.write("%s\t%s\n"%(chrn, '\t'.join(data))) 65 | print("Success") 66 | 67 | 68 | if __name__ == "__main__": 69 | if len(sys.argv) < 5: 70 | print("Usage: python "+sys.argv[0]+" ") 71 | else: 72 | r_fa, q_fa, out_pre, ts = sys.argv[1:] 73 | nucmer_extract(r_fa, q_fa, out_pre, ts) 74 | -------------------------------------------------------------------------------- /bin/calc_gene_ovlp_te.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def merge_regions(regions): 6 | tmp_regions = [] 7 | last_ep = 0 8 | for sp, ep in sorted(regions): 9 | if tmp_regions == []: 10 | tmp_regions.append(sp) 11 | last_ep = ep 12 | else: 13 | if sp > last_ep: 14 | tmp_regions.append(last_ep) 15 | tmp_regions.append(sp) 16 | last_ep = ep 17 | else: 18 | if ep > last_ep: 19 | last_ep = ep 20 | tmp_regions.append(last_ep) 21 | new_regions = [] 22 | for i in range(0, len(tmp_regions)-1, 2): 23 | new_regions.append([tmp_regions[i], tmp_regions[i+1]]) 24 | return new_regions 25 | 26 | 27 | def calc_ovlp_ratio(regions, sp, ep): 28 | tmp_tes = [] 29 | for rsp, rep in regions: 30 | ovlp = min(ep, rep)-max(sp, rsp)+1 31 | if ovlp <= 0: 32 | continue 33 | tmp_tes.append([max(sp, rsp), min(ep, rep)]) 34 | 35 | ovlp_len = 0 36 | if len(tmp_tes) != 0: 37 | for msp, mep in merge_regions(tmp_tes): 38 | ovlp_len += mep-msp+1 39 | 40 | return ovlp_len*1.0/(ep-sp+1) 41 | 42 | 43 | def calc_gene_ovlp_te(gene_gff3, TE_gffs, ovlp_stat): 44 | print("Loading TEs") 45 | TE_db = {} 46 | for te in TE_gffs.split(','): 47 | print("\tLoading: %s"%te) 48 | with open(te, 'r') as fin: 49 | for line in fin: 50 | if line[0] == '#': 51 | continue 52 | data = line.strip().split() 53 | tig = data[0] 54 | sp = int(data[3]) 55 | ep = int(data[4]) 56 | if sp > ep: 57 | sp, ep = ep, sp 58 | if tig not in TE_db: 59 | TE_db[tig] = [] 60 | TE_db[tig].append([sp, ep]) 61 | 62 | for tig in TE_db: 63 | TE_db[tig] = sorted(TE_db[tig]) 64 | 65 | print("Reading gene gff3 and calculating overlaps") 66 | with open(gene_gff3, 'r') as fin: 67 | with open(ovlp_stat, 'w') as fout: 68 | for line in fin: 69 | if line[0] == '#': 70 | continue 71 | data = line.strip().split() 72 | if data[2] != 'gene': 73 | continue 74 | tig = data[0] 75 | gid = data[8].split(';')[0].split('=')[1] 76 | sp = int(data[3]) 77 | ep = int(data[4]) 78 | if sp > ep: 79 | sp, ep = ep, sp 80 | if tig not in TE_db: 81 | continue 82 | fout.write("%s\t%f\n"%(gid, 100*calc_ovlp_ratio(TE_db[tig], sp, ep))) 83 | print("Finished") 84 | 85 | 86 | if __name__ == "__main__": 87 | if len(sys.argv) < 4: 88 | print("Usage: python "+sys.argv[0]+" ") 89 | else: 90 | gene_gff3, TE_gffs, ovlp_stat = sys.argv[1:] 91 | calc_gene_ovlp_te(gene_gff3, TE_gffs, ovlp_stat) 92 | -------------------------------------------------------------------------------- /bin/group_SNP_exon_and_intron.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import time 4 | import gzip 5 | 6 
7 | def search_pos(in_gff, in_snp, out_file):
8 |     pos_db = {}
9 |     if in_gff.split('.')[-1] == 'gz':
10 |         f_gff = gzip.open(in_gff, 'rt')
11 |     else:
12 |         f_gff = open(in_gff, 'r')
13 | 
14 |     for line in f_gff:
15 |         if line[0] == '#' or line.strip() == '':
16 |             continue
17 |         data = line.strip().split()
18 |         chrn = data[0]
19 |         s_pos = int(data[3])
20 |         e_pos = int(data[4])
21 |         name = data[8].split(';')[0].split('=')[1]
22 |         if "G" in name:
23 |             name = '.'.join(name.split('.')[:2])
24 |         else:
25 |             name = name.split('.')[0]
26 |         type = data[2]
27 |         if chrn not in pos_db:
28 |             pos_db[chrn] = {}
29 |         if name not in pos_db[chrn]:
30 |             pos_db[chrn][name] = {}
31 |             pos_db[chrn][name]['gene'] = ()
32 |             pos_db[chrn][name]['exon'] = []
33 |         if type == 'gene':
34 |             pos_db[chrn][name]['gene'] = (s_pos, e_pos)
35 |         elif type == 'exon':
36 |             pos_db[chrn][name]['exon'].append((s_pos, e_pos))
37 |         else:
38 |             continue
39 |     f_gff.close()
40 | 
41 |     if in_snp.split('.')[-1] == 'gz':
42 |         f_snp = gzip.open(in_snp, 'rt')
43 |     else:
44 |         f_snp = open(in_snp, 'r')
45 | 
46 |     if out_file.split('.')[-1] == 'gz':
47 |         f_out = gzip.open(out_file, 'wt')
48 |     else:
49 |         f_out = open(out_file, 'w')
50 | 
51 |     for line in f_snp:
52 |         if line[0] == '#' or line.strip() == '':
53 |             continue
54 |         data = line.strip().split()
55 |         chrn = data[0]
56 |         pos = int(data[1])
57 |         if chrn not in pos_db:
58 |             continue
59 |         is_found = False
60 |         for name in pos_db[chrn]:
61 |             s_pos, e_pos = pos_db[chrn][name]['gene']
62 |             if s_pos <= pos and pos <= e_pos:
63 |                 is_found = True
64 |                 break
65 |         if is_found:
66 |             is_exon = False
67 |             for (s_pos, e_pos) in pos_db[chrn][name]['exon']:
68 |                 if s_pos <= pos and pos <= e_pos:
69 |                     is_exon = True
70 |                     break
71 |             if is_exon:
72 |                 f_out.write(line.strip()+"\t"+name+"\t"+"exon\n")
73 |             else:
74 |                 f_out.write(line.strip()+"\t"+name+"\t"+"intron\n")
75 |     f_snp.close()
76 |     f_out.close()
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     if len(sys.argv) < 4:
81 |         print("Notice: classify each position in a SNP file as exonic or intronic based on a GFF file")
82 |         print("Usage: python "+sys.argv[0]+" <in_gff> <in_snp> <out_file>")
83 |     else:
84 |         s_time = time.time()
85 |         in_gff = sys.argv[1]
86 |         in_snp = sys.argv[2]
87 |         out_file = sys.argv[3]
88 |         search_pos(in_gff, in_snp, out_file)
89 |         e_time = time.time()
90 |         print("cost time " + str(e_time-s_time))
91 | 
92 | 
--------------------------------------------------------------------------------
/bin/group_exon_and_intron.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import time
4 | import gzip
5 | 
6 | 
7 | def search_pos(in_gff, in_vcf, out_file):
8 |     pos_db = {}
9 |     if in_gff.split('.')[-1] == 'gz':
10 |         f_gff = gzip.open(in_gff, 'rt')
11 |     else:
12 |         f_gff = open(in_gff, 'r')
13 | 
14 |     for line in f_gff:
15 |         if line[0] == '#' or line.strip() == '':
16 |             continue
17 |         data = line.strip().split()
18 |         chrn = data[0]
19 |         s_pos = int(data[3])
20 |         e_pos = int(data[4])
21 |         name = data[8].split(';')[0].split('=')[1]
22 |         if "G" in name:
23 |             name = '.'.join(name.split('.')[:2])
24 |         else:
25 |             name = name.split('.')[0]
26 |         type = data[2]
27 |         if chrn not in pos_db:
28 |             pos_db[chrn] = {}
29 |         if name not in pos_db[chrn]:
30 |             pos_db[chrn][name] = {}
31 |             pos_db[chrn][name]['gene'] = ()
32 |             pos_db[chrn][name]['exon'] = []
33 |         if type == 'gene':
34 |             pos_db[chrn][name]['gene'] = (s_pos, e_pos)
35 |         elif type == 'exon':
36 |             pos_db[chrn][name]['exon'].append((s_pos, e_pos))
37 |         else:
38 |             continue
39 |     f_gff.close()
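    # pos_db layout built above (shared with group_SNP_exon_and_intron.py):
    #   pos_db[chrn][name] == {'gene': (start, end), 'exon': [(start, end), ...]}
    # e.g. pos_db['Chr01']['Gene01'] == {'gene': (100, 900),
    #                                    'exon': [(100, 250), (600, 900)]}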
40 | 
41 |     if in_vcf.split('.')[-1] == 'gz':
42 |         f_vcf = gzip.open(in_vcf, 'rt')
43 |     else:
44 |         f_vcf = open(in_vcf, 'r')
45 | 
46 |     if out_file.split('.')[-1] == 'gz':
47 |         f_out = gzip.open(out_file, 'wt')
48 |     else:
49 |         f_out = open(out_file, 'w')
50 | 
51 |     for line in f_vcf:
52 |         if line[0] == '#' or line.strip() == '':
53 |             continue
54 |         data = line.strip().split()
55 |         chrn = data[0]
56 |         pos = int(data[1])
57 |         if chrn not in pos_db:
58 |             continue
59 |         is_found = False
60 |         for name in pos_db[chrn]:
61 |             s_pos, e_pos = pos_db[chrn][name]['gene']
62 |             if s_pos <= pos and pos <= e_pos:
63 |                 is_found = True
64 |                 break
65 |         if is_found:
66 |             is_exon = False
67 |             for (s_pos, e_pos) in pos_db[chrn][name]['exon']:
68 |                 if s_pos <= pos and pos <= e_pos:
69 |                     is_exon = True
70 |                     break
71 |             if is_exon:
72 |                 f_out.write(chrn+"\t"+name+"\t"+str(pos)+"\t"+"exon\n")
73 |             else:
74 |                 f_out.write(chrn+"\t"+name+"\t"+str(pos)+"\t"+"intron\n")
75 |     f_vcf.close()
76 |     f_out.close()
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     if len(sys.argv) < 4:
81 |         print("Notice: classify each position in a VCF file as exonic or intronic based on a GFF file")
82 |         print("Usage: python "+sys.argv[0]+" <in_gff> <in_vcf> <out_file>")
83 |     else:
84 |         s_time = time.time()
85 |         in_gff = sys.argv[1]
86 |         in_vcf = sys.argv[2]
87 |         out_file = sys.argv[3]
88 |         search_pos(in_gff, in_vcf, out_file)
89 |         e_time = time.time()
90 |         print("cost time " + str(e_time-s_time))
91 | 
92 | 
--------------------------------------------------------------------------------
/bin/nucmer_statistics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys, os
3 | 
4 | 
5 | def nucmer_statistics(r_fa, q_fa, out_pre, ts):
6 |     print("Running nucmer")
7 |     cmd = "nucmer -p "+out_pre+" "+r_fa+" "+q_fa+" -t "+ts
8 |     print("Running command: "+cmd)
9 |     os.system(cmd)
10 | 
11 |     print("Running delta-filter")
12 |     cmd = "delta-filter -gqr "+out_pre+".delta > "+out_pre+".filtered"
13 |     print("Running command: "+cmd)
14 |     os.system(cmd)
15 |     print("Statistics")
16 |     data_db = {}
17 |     last_INDEL_pos = {}
18 |     r_chr_len_db = {}
19 |     q_chr_len_db = {}
20 |     sv_list = ['SNP', 'INDEL']
21 |     with os.popen("show-snps -ClrT "+out_pre+".filtered", 'r') as f_in:
22 |         for line in f_in:
23 |             data = line.strip().split()
24 |             if len(data) == 0 or data[0].isdigit() == False:
25 |                 continue
26 | 
27 |             pos = int(data[0])
28 |             r_chrn = data[-2]
29 |             q_chrn = data[-1]
30 | 
31 |             if r_chrn not in data_db:
32 |                 data_db[r_chrn] = {'SNP': {'count': 0, 'size': 0}, 'INDEL': {'count': 0, 'size': 0}}
33 |                 last_INDEL_pos[r_chrn] = 0
34 |                 r_chr_len_db[r_chrn] = int(data[6])
35 |             if q_chrn not in q_chr_len_db:
36 |                 q_chr_len_db[q_chrn] = int(data[7])
37 | 
38 |             if data[1] != '.' and data[2] != '.':
39 |                 data_db[r_chrn]['SNP']['count'] += 1
40 |                 data_db[r_chrn]['SNP']['size'] += 1
41 |                 last_INDEL_pos[r_chrn] = 0
42 |             else:
43 |                 if pos - last_INDEL_pos[r_chrn] > 1:
44 |                     data_db[r_chrn]['INDEL']['count'] += 1
45 |                 data_db[r_chrn]['INDEL']['size'] += 1
46 |                 last_INDEL_pos[r_chrn] = pos
47 | 
48 |     total_size = {}
49 |     total_count = {}
50 |     for chrn in data_db:
51 |         total_size[chrn] = 0
52 |         total_count[chrn] = 0
53 |         for type in data_db[chrn]:
54 |             total_size[chrn] += data_db[chrn][type]['size']
55 |             total_count[chrn] += data_db[chrn][type]['count']
56 | 
57 |     with open(out_pre+".statistics", 'w') as f_out:
58 |         f_out.write("Type\tNumber\tSize\tSize/TotalVar\tSize/ChrSize\n")
59 |         for chrn in data_db:
60 |             for type in sv_list:
61 |                 if type not in data_db[chrn]:
62 |                     continue
63 |                 f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%(type, data_db[chrn][type]['count'], data_db[chrn][type]['size'], data_db[chrn][type]['size']*1.0/total_size[chrn], data_db[chrn][type]['size']*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn])))
64 |             f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%("Total", total_count[chrn], total_size[chrn], total_size[chrn]*1.0/total_size[chrn], total_size[chrn]*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn])))
65 |     print("Success")
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     if len(sys.argv) < 5:
70 |         print("Usage: python "+sys.argv[0]+" <ref_fasta> <qry_fasta> <out_prefix> <threads>")
71 |     else:
72 |         r_fa, q_fa, out_pre, ts = sys.argv[1:]
73 |         nucmer_statistics(r_fa, q_fa, out_pre, ts)
74 | 
--------------------------------------------------------------------------------
/bin/eval_synteny.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import re
4 | import numpy as np
5 | import bisect
6 | 
7 | 
8 | def get_opts():
9 |     group = argparse.ArgumentParser()
10 |     group.add_argument('-r', '--ref', help='ref.bed', required=True)
11 |     group.add_argument('-q', '--qry', help='qry.bed', required=True)
12 | 
13 |     return group.parse_args()
14 | 
15 | # Longest increasing subsequence
16 | def LIS(arr):
17 |     min_num = [-1]
18 |     for n in arr:
19 |         k = bisect.bisect_left(min_num, n)
20 |         if len(min_num) == k:
21 |             min_num.append(n)
22 |         else:
23 |             min_num[k] = n
24 |     return len(min_num)-1
25 | 
26 | 
27 | def eval_synteny(ref_file, qry_file):
28 |     print("Loading bed files")
29 |     ref_db = {}
30 |     qry_db = {}
31 |     ref_list = []
32 |     qry_list = []
33 |     with open(ref_file, 'r') as fin:
34 |         for line in fin:
35 |             data = line.strip().split()
36 |             chrn = data[0]
37 |             gid = data[3]
38 |             if chrn not in ref_db:
39 |                 ref_db[chrn] = []
40 |                 ref_list.append(chrn)
41 |             ref_db[chrn].append(gid)
42 | 
43 |     with open(qry_file, 'r') as fin:
44 |         for line in fin:
45 |             data = line.strip().split()
46 |             chrn = data[0]
47 |             gid = data[3]
48 |             if chrn not in qry_db:
49 |                 qry_db[chrn] = []
50 |                 qry_list.append(chrn)
51 |             qry_db[chrn].append(gid)
52 | 
53 |     print("Comparing")
54 |     total_lis_values = 0
55 |     total_gene_cnt = 0
56 | 
57 |     print("Result")
58 |     for qchr in qry_list:
59 |         max_score = 0
60 |         max_lis_val = 0
61 |         max_gene_cnt = 0
62 |         max_chr = ""
63 |         for rchr in ref_list:
64 |             ref_idx_db = {ref_db[rchr][idx]: idx for idx in range(len(ref_db[rchr]))}
65 |             tmp_order_list = [ref_idx_db[qry_db[qchr][idx]]
66 |                               if qry_db[qchr][idx] in ref_idx_db
67 |                               else -1
68 |                               for idx in range(len(qry_db[qchr]))]
69 |             region_order_list = []
70 |             for _ in tmp_order_list:
71 |                 if _ != -1:
72 |                     region_order_list.append(_)
73 |             chr_gene_cnt = (len(ref_db[rchr])+len(qry_db[qchr]))/2.
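            # LIS() above computes the length of the longest strictly
            # increasing subsequence by patience sorting (O(n log n) via
            # bisect); e.g. LIS([3, 1, 2, 5, 4]) == 3 for 1, 2, 4. Taking the
            # max with the reversed list on the next line also credits
            # chromosomes that are syntenic in the opposite orientation.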
74 | lis_value = max(LIS(region_order_list), LIS(region_order_list[::-1])) 75 | score = lis_value*100./chr_gene_cnt 76 | if score > max_score: 77 | max_lis_val = lis_value 78 | max_gene_cnt = chr_gene_cnt 79 | max_score = score 80 | max_chr = rchr 81 | print("%s match %s: %.4f%%"%(qchr, max_chr, max_score)) 82 | total_lis_values += max_lis_val 83 | total_gene_cnt += max_gene_cnt 84 | print("Total: %.4f%%"%(total_lis_values*100./total_gene_cnt)) 85 | 86 | 87 | if __name__ == "__main__": 88 | opts = get_opts() 89 | ref_file = opts.ref 90 | qry_file = opts.qry 91 | eval_synteny(ref_file, qry_file) 92 | -------------------------------------------------------------------------------- /bin/rename_ID.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def generate_new_id_by_gff(chr_pre, in_gff): 6 | id_db = {} 7 | gff_db = {} 8 | is_first_id = True 9 | with open(in_gff, 'r') as f_in: 10 | for line in f_in: 11 | data = line.strip().split() 12 | if line[0] == '#' or len(data) < 9: 13 | continue 14 | if data[2] == 'gene': 15 | chrn = data[0] 16 | #if chrn[:3].lower() != 'chr': 17 | # continue 18 | s_p = int(data[3]) 19 | id = data[8].split(";")[0].split("=")[1] 20 | if is_first_id: 21 | print("Check ID: %s"%id) 22 | is_first_id = False 23 | if chrn not in id_db: 24 | id_db[chrn] = [] 25 | id_db[chrn].append([s_p, id]) 26 | gff_db[id] = [] 27 | gff_db[id].append(line) 28 | 29 | new_id_db = {} 30 | chr_base = {} 31 | ordered_id = [] 32 | with open("rename_list.txt", 'w') as f_out: 33 | tig_base = 10 34 | for chrn in sorted(id_db): 35 | base = 10 36 | for info in sorted(id_db[chrn]): 37 | if chrn[:3].lower() == 'chr': 38 | idx = int(chrn[3:]) 39 | new_id = chr_pre+".%02dG%07d"%(idx, base) 40 | base += 10 41 | else: 42 | new_id = chr_pre+".%08d"%(tig_base) 43 | tig_base += 10 44 | # new_id = info[1].replace('.', '').replace('G', '') 45 | f_out.write("%s\t%d\t%s\t%s\n"%(chrn, info[0], info[1], new_id)) 46 | ordered_id.append(info[1]) 47 | new_id_db[info[1]] = new_id 48 | return new_id_db, gff_db, ordered_id 49 | 50 | 51 | def rename_id(chr_pre, in_gff, out_gff, in_fastas, out_fastas): 52 | print("Generating rename list") 53 | rename_id_db, gff_db, ordered_id = generate_new_id_by_gff(chr_pre, in_gff) 54 | 55 | print("Dealing gff") 56 | with open(out_gff, 'w') as fout: 57 | fout.write("###gff version 3\n") 58 | for id in ordered_id: 59 | for line in gff_db[id]: 60 | fout.write(line.replace(id, rename_id_db[id])) 61 | fout.write("\n") 62 | 63 | print("Dealing fasta") 64 | in_fasta_list = in_fastas.split(',') 65 | out_fasta_list = out_fastas.split(',') 66 | for i in range(0, len(in_fasta_list)): 67 | print("\tDealing %s"%in_fasta_list[i]) 68 | with open(in_fasta_list[i], 'r') as f_in: 69 | with open(out_fasta_list[i], 'w') as f_out: 70 | suf = in_fasta_list[i].split('.')[-1].lower() 71 | if suf == "fasta" or suf == "fa": 72 | for line in f_in: 73 | if line[0] == '>': 74 | id = line.strip()[1:] 75 | if id in rename_id_db: 76 | line = line.replace(id, rename_id_db[id]) 77 | f_out.write(line) 78 | else: 79 | for line in f_in: 80 | id = line.strip().split()[0] 81 | if id in rename_id_db: 82 | f_out.write(line.replace(id, rename_id_db[id])) 83 | else: 84 | f_out.write(line) 85 | print("Finished") 86 | 87 | 88 | if __name__ == "__main__": 89 | if len(sys.argv) < 5: 90 | print("Usage: python "+sys.argv[0]+" ") 91 | print("Notice: sort and rename id with in_gff, and rename them in fasta files") 92 | print("Example: python 
"+sys.argv[0]+" CB5 in.gff out.gff 1.fasta,2.fasta 1.new.fasta,2.new.fasta") 93 | else: 94 | chr_pre, in_gff, out_gff, in_fastas, out_fastas = sys.argv[1:] 95 | rename_id(chr_pre, in_gff, out_gff, in_fastas, out_fastas) 96 | -------------------------------------------------------------------------------- /bin/simple_ANGSD_without_errorCorrect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def help_message(): 6 | print("Usage: python "+sys.argv[0]+" -l -r [-out -p ]") 7 | 8 | 9 | def parse_options(ARGV): 10 | opt_dict = {} 11 | for i in range(0, len(ARGV), 2): 12 | opt = ARGV[i] 13 | if opt == '-l': 14 | opt_dict['list'] = ARGV[i+1] 15 | elif opt == '-p': 16 | opt_dict['path'] = ARGV[i+1] 17 | elif opt == '-r': 18 | opt_dict['regions'] = ARGV[i+1] 19 | elif opt == '-out': 20 | opt_dict['out_name'] = ARGV[i+1] 21 | elif opt == '-h': 22 | help_message() 23 | exit(0) 24 | return opt_dict 25 | 26 | 27 | def run_abbababa(opts): 28 | if 'path' not in opts: 29 | bam_path = './' 30 | else: 31 | bam_path = opts['path'] 32 | bam_files = [] 33 | for fn in os.listdir(bam_path): 34 | if fn[-4:] == '.bam': 35 | bam_files.append(os.path.join(bam_path, fn)) 36 | 37 | print("Indexing bams") 38 | for fn in bam_files: 39 | fn = fn.split('/')[-1] 40 | if os.path.isfile(os.path.join('./', fn+'.bai')) == False: 41 | cmd = 'samtools index '+fn 42 | print("Running command: "+cmd) 43 | os.system(cmd) 44 | 45 | print("Done\nReading list") 46 | list_db = {} 47 | with open(opts['list'], 'r') as f_in: 48 | for line in f_in: 49 | data = line.strip().split() 50 | if data[1] not in list_db: 51 | list_db[data[1]] = {} 52 | list_db[data[1]]['name'] = [] 53 | list_db[data[1]]['path'] = [] 54 | for fn in bam_files: 55 | if data[0] in fn: 56 | list_db[data[1]]['path'].append(fn) 57 | list_db[data[1]]['name'].append(data[0]) 58 | 59 | if 'out_name' not in opts: 60 | out_name = "Outgroup" 61 | else: 62 | out_name = opts['out_name'] 63 | 64 | print("Done\nGenerate bam.filelist sizeFile.size popNames.name bamWithErrors.filelist errorList.error") 65 | with open("bam.filelist", "w") as f_list: 66 | with open("sizeFile.size", "w") as f_size: 67 | with open("popNames.name", "w") as f_pop: 68 | for subgroup in list_db: 69 | if subgroup != out_name: 70 | f_pop.write(subgroup+'\n') 71 | f_list.write('\n'.join(list_db[subgroup]['path'])+'\n') 72 | group_size = len(list_db[subgroup]['name']) 73 | f_size.write(str(group_size)+'\n') 74 | f_list.write(list_db[out_name]['path'][0]+'\n') 75 | f_pop.write(out_name+'\n') 76 | f_size.write('1\n') 77 | 78 | print("Done\nDo abbababa") 79 | if "regions" not in opts: 80 | regions_file = "regions.txt" 81 | else: 82 | regions_file = opts["regions"] 83 | 84 | cmd = "ANGSD -doAbbababa2 1 -bam bam.filelist -sizeFile sizeFile.size -doCounts 1 -out bam.Angsd -rf "+regions_file+" -useLast 1 -minQ 20 -minMapQ 30" 85 | print("Running command: "+cmd) 86 | os.system(cmd) 87 | 88 | print("Done\n") 89 | 90 | cmd = "Rscript DSTAT angsdFile=\"bam.Angsd\" out=\"result\" sizeFile=sizeFile.size nameFile=popNames.name" 91 | print("Running command: "+cmd) 92 | os.system(cmd) 93 | print("Done\nSuccess") 94 | 95 | 96 | if __name__ == "__main__": 97 | if len(sys.argv) == 1: 98 | help_message() 99 | else: 100 | opts = parse_options(sys.argv[1:]) 101 | run_abbababa(opts) 102 | -------------------------------------------------------------------------------- /bin/check_cds.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from enum import Enum 4 | 5 | 6 | class CDS_Type(Enum): 7 | VALID = 1 8 | LENGTH_ERROR = 2 9 | MISSING_START_CODON = 3 10 | MISSING_STOP_CODON = 4 11 | EARLY_STOP_CODON = 5 12 | 13 | 14 | def get_opts(): 15 | group = argparse.ArgumentParser() 16 | group.add_argument("-i", "--input", help="Input CDS file", required=True) 17 | group.add_argument( 18 | "--detail", help="If set, output detail information", action="store_true" 19 | ) 20 | group.add_argument( 21 | "-o", 22 | "--output", 23 | help="Output summary file, if not set, output to stdout", 24 | default="", 25 | ) 26 | return group.parse_args() 27 | 28 | 29 | def check_cds(in_cds, is_detail, out_summary): 30 | start_codon = set(["ATG"]) 31 | stop_codon = set(["TAG", "TAA", "TGA"]) 32 | cds_db = {} 33 | with open(in_cds, "r") as fin: 34 | for line in fin: 35 | if line.strip() == "": 36 | continue 37 | if line[0] == ">": 38 | gid = line.strip().split()[0][1:] 39 | cds_db[gid] = [] 40 | else: 41 | cds_db[gid].append(line.strip().upper()) 42 | 43 | for gid in cds_db: 44 | cds_db[gid] = "".join(cds_db[gid]) 45 | 46 | detail_db = {} 47 | for gid in cds_db: 48 | detail_db[gid] = CDS_Type.VALID 49 | if len(cds_db[gid]) % 3 != 0: 50 | detail_db[gid] = CDS_Type.LENGTH_ERROR 51 | else: 52 | if cds_db[gid][:3] not in start_codon: 53 | detail_db[gid] = CDS_Type.MISSING_START_CODON 54 | elif cds_db[gid][-3:] not in stop_codon: 55 | detail_db[gid] = CDS_Type.MISSING_STOP_CODON 56 | else: 57 | for _ in range(3, len(cds_db[gid]) - 3, 3): 58 | if cds_db[gid][_ : _ + 3] in stop_codon: 59 | detail_db[gid] = CDS_Type.EARLY_STOP_CODON 60 | 61 | # Valid, length error, missing start codon, missing stop codon, early stop codon 62 | summary_info = [0, 0, 0, 0, 0] 63 | for gid in detail_db: 64 | summary_info[detail_db[gid].value - 1] += 1 65 | 66 | out_info = [] 67 | out_info.append("# Summary") 68 | out_info.append("Valid: %d" % summary_info[0]) 69 | out_info.append("Length error: %d" % summary_info[1]) 70 | out_info.append("Missing start codon: %d" % summary_info[2]) 71 | out_info.append("Missing stop codon: %d" % summary_info[3]) 72 | out_info.append("Early stop codon: %d" % summary_info[4]) 73 | 74 | if is_detail: 75 | out_info.append("") 76 | out_info.append("# Error detail") 77 | for gid in sorted(detail_db): 78 | if detail_db[gid] != CDS_Type.VALID: 79 | out_info.append("%s: %s" % (gid, detail_db[gid].name)) 80 | 81 | if out_summary: 82 | with open(out_summary, "w") as fout: 83 | fout.write("%s\n" % ("\n".join(out_info))) 84 | else: 85 | print("%s" % ("\n".join(out_info))) 86 | 87 | 88 | if __name__ == "__main__": 89 | opts = get_opts() 90 | in_cds = opts.input 91 | is_detail = True if opts.detail else False 92 | out_summary = opts.output 93 | check_cds(in_cds, is_detail, out_summary) 94 | -------------------------------------------------------------------------------- /bin/nucmer_statistics_all_sv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def nucmer_statistics(r_fa, q_fa, out_pre, ts): 6 | print("Running nucmer") 7 | cmd = "nucmer -p "+out_pre+" "+r_fa+" "+q_fa+" -t "+ts 8 | print("Running command: "+cmd) 9 | os.system(cmd) 10 | 11 | print("Running delta-filter") 12 | cmd = "delta-filter -gqr "+out_pre+".delta > "+out_pre+".filtered" 13 | print("Running command: "+cmd) 14 | os.system(cmd) 15 | print("Statisitcs") 16 | data_db = 
{} 17 | last_INDEL_pos = {} 18 | r_chr_len_db = {} 19 | q_chr_len_db = {} 20 | sv_list = ['SNP', 'INDEL', 'JMP', 'INV', 'DUP', 'BRK'] 21 | with os.popen("show-diff -H "+out_pre+".filtered", 'r') as f_in: 22 | for line in f_in: 23 | data = line.strip().split() 24 | if len(data) < 5: 25 | continue 26 | chrn = data[0] 27 | sv = data[1] 28 | if chrn not in data_db: 29 | data_db[chrn] = {} 30 | if sv not in sv_list: 31 | continue 32 | if sv not in data_db[chrn]: 33 | data_db[chrn][sv] = {'count': 0, 'size': 0} 34 | sv_size = abs(int(data[-1])) 35 | data_db[chrn][sv]['count'] += 1 36 | data_db[chrn][sv]['size'] += sv_size 37 | 38 | 39 | with os.popen("show-snps -ClrT "+out_pre+".filtered", 'r') as f_in: 40 | for line in f_in: 41 | data = line.strip().split() 42 | if len(data) == 0 or data[0].isdigit() == False: 43 | continue 44 | 45 | pos = int(data[0]) 46 | r_chrn = data[-2] 47 | q_chrn = data[-1] 48 | if 'SNP' not in data_db[r_chrn]: 49 | data_db[r_chrn]['SNP'] = {'count': 0, 'size':0} 50 | if 'INDEL' not in data_db[r_chrn]: 51 | data_db[r_chrn]['INDEL'] = {'count': 0, 'size':0} 52 | if r_chrn not in r_chr_len_db: 53 | last_INDEL_pos[r_chrn] = 0 54 | r_chr_len_db[r_chrn] = int(data[6]) 55 | if q_chrn not in q_chr_len_db: 56 | q_chr_len_db[q_chrn] = int(data[7]) 57 | 58 | if data[1] != '.' and data[2] != '.': 59 | data_db[r_chrn]['SNP']['count'] += 1 60 | data_db[r_chrn]['SNP']['size'] += 1 61 | last_INDEL_pos[r_chrn] = 0 62 | else: 63 | if pos - last_INDEL_pos[r_chrn] > 1: 64 | data_db[r_chrn]['INDEL']['count'] += 1 65 | data_db[r_chrn]['INDEL']['size'] += 1 66 | last_INDEL_pos[r_chrn] = pos 67 | 68 | total_size = {} 69 | total_count = {} 70 | for chrn in data_db: 71 | total_size[chrn] = 0 72 | total_count[chrn] = 0 73 | for type in data_db[chrn]: 74 | total_size[chrn] += data_db[chrn][type]['size'] 75 | total_count[chrn] += data_db[chrn][type]['count'] 76 | 77 | with open(out_pre+".statistics", 'w') as f_out: 78 | f_out.write("Type\tNumber\tSize\tSize/TotalVar\tSize/ChrSize\n") 79 | for chrn in data_db: 80 | for type in sv_list: 81 | if type not in data_db[chrn]: 82 | continue 83 | f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%(type, data_db[chrn][type]['count'], data_db[chrn][type]['size'], data_db[chrn][type]['size']*1.0/total_size[chrn], data_db[chrn][type]['size']*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn]))) 84 | f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%("Total", total_count[chrn], total_size[chrn], total_size[chrn]*1.0/total_size[chrn], total_size[chrn]*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn]))) 85 | print("Success") 86 | 87 | 88 | if __name__ == "__main__": 89 | if len(sys.argv) < 5: 90 | print("Usage: python "+sys.argv[0]+" ") 91 | else: 92 | r_fa, q_fa, out_pre, ts = sys.argv[1:] 93 | nucmer_statistics(r_fa, q_fa, out_pre, ts) 94 | -------------------------------------------------------------------------------- /bin/transfer_gff3_with_agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_gene_ctg(chr_list, sp, ep): 6 | infos = [] 7 | for info in chr_list: 8 | chsp = info[0] 9 | chep = info[1] 10 | ovlp = min(chep, ep)-max(chsp, sp) 11 | if ovlp>=0: 12 | infos.append([ovlp, info]) 13 | if infos != []: 14 | return sorted(infos, reverse=True)[0][1] 15 | else: 16 | return [] 17 | 18 | 19 | def trans_anno(in_gff3, in_old_agp, in_new_agp, out_gff3): 20 | print("Reading gff3") 21 | gff3_db = {} 22 | with open(in_gff3, 'r') as fin: 23 | for line in fin: 24 | if line[0] == '#' or line.strip() == '': 25 | 
continue 26 | data = line.strip().split() 27 | if data[2] == 'gene': 28 | ID = data[8].split(';')[0].split('=')[-1] 29 | if ID not in gff3_db: 30 | gff3_db[ID] = [] 31 | gff3_db[ID].append(line) 32 | 33 | print("Reading old agp") 34 | old_agp_db = {} 35 | with open(in_old_agp, 'r') as fin: 36 | for line in fin: 37 | data = line.strip().split() 38 | if data[4] == 'U': 39 | continue 40 | chrn = data[0] 41 | sp = int(data[1]) 42 | ep = int(data[2]) 43 | direct = data[-1] 44 | tig = data[5] 45 | if chrn not in old_agp_db: 46 | old_agp_db[chrn] = [] 47 | old_agp_db[chrn].append([sp, ep, tig, direct]) 48 | 49 | for chrn in old_agp_db: 50 | old_agp_db[chrn] = sorted(old_agp_db[chrn]) 51 | 52 | print("Reading new agp") 53 | new_agp_db = {} 54 | with open(in_new_agp, 'r') as fin: 55 | for line in fin: 56 | data = line.strip().split() 57 | if data[4] == 'U': 58 | continue 59 | tig = data[5] 60 | chrn = data[0] 61 | sp = int(data[1]) 62 | ep = int(data[2]) 63 | direct = data[-1] 64 | new_agp_db[tig] = [chrn, sp, ep, direct] 65 | 66 | print("Writing new gff3") 67 | with open(out_gff3, 'w') as fout: 68 | fout.write("###gff version 3\n") 69 | for id in sorted(gff3_db): 70 | for i in range(0, len(gff3_db[id])): 71 | data = gff3_db[id][i].split() 72 | if data[2] == 'gene': 73 | break 74 | chrn = data[0] 75 | #if chrn not in old_agp_db: 76 | # continue 77 | sp = int(data[3]) 78 | ep = int(data[4]) 79 | match_ctg = get_gene_ctg(old_agp_db[chrn], sp, ep) 80 | if match_ctg == []: 81 | print(id, data) 82 | else: 83 | csp, cep, tig, tdir = match_ctg 84 | nchrn, nsp, nep, ndir = new_agp_db[tig] 85 | for line in gff3_db[id]: 86 | data = line.strip().split() 87 | gsp = int(data[3]) 88 | gep = int(data[4]) 89 | gdir = data[6] 90 | if tdir == '+': 91 | gts = gsp-csp+1 92 | gte = gep-csp+1 93 | else: 94 | gts = cep-gep+1 95 | gte = cep-gsp+1 96 | if gdir == tdir: 97 | gtd = '+' 98 | else: 99 | gtd = '-' 100 | if ndir == '+': 101 | gns = nsp+gts-1 102 | gne = nsp+gte-1 103 | else: 104 | gns = nep-gte+1 105 | gne = nep-gts+1 106 | if gtd == ndir: 107 | gnd = '+' 108 | else: 109 | gnd = '-' 110 | if gns <= 0 or gne <= 0: 111 | continue 112 | data[0] = nchrn 113 | data[3] = str(gns) 114 | data[4] = str(gne) 115 | data[6] = gnd 116 | fout.write('\t'.join(data)+'\n') 117 | fout.write('\n') 118 | print("Finished") 119 | 120 | 121 | if __name__ == "__main__": 122 | if len(sys.argv) < 5: 123 | print("Usage: python "+sys.argv[0]+" ") 124 | else: 125 | in_gff3, in_old_agp, in_new_agp, out_gff3 = sys.argv[1:] 126 | trans_anno(in_gff3, in_old_agp, in_new_agp, out_gff3) 127 | -------------------------------------------------------------------------------- /bin/sort_gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | 5 | 6 | def generate_new_id_by_gff(chr_pre, in_gff): 7 | id_db = {} 8 | gff_db = {} 9 | is_first_id = True 10 | with open(in_gff, 'r') as f_in: 11 | for line in f_in: 12 | data = line.strip().split() 13 | if line[0] == '#' or len(data) < 9: 14 | continue 15 | if data[2] == 'gene': 16 | chrn = data[0] 17 | s_p = int(data[3]) 18 | id = data[8].split(";")[0].split("=")[1] 19 | if is_first_id: 20 | print("Check ID: %s" % id) 21 | is_first_id = False 22 | if chrn not in id_db: 23 | id_db[chrn] = [] 24 | id_db[chrn].append([s_p, id]) 25 | gff_db[id] = [] 26 | gff_db[id].append(line) 27 | 28 | new_id_db = {} 29 | ordered_id = [] 30 | tig_base = 10 31 | for chrn in sorted(id_db, key=lambda x: int(re.findall(r'\d+', x)[0]) if 
len(re.findall(r'\d+', x)) > 0 else 1000): 32 | base = 10 33 | for info in sorted(id_db[chrn]): 34 | if chrn[:3].lower() == 'chr': 35 | idx, hap = re.findall(r"(\d+)([A-Z]*)", chrn)[0] 36 | idx = int(idx) 37 | if not hap: 38 | hap = "G" 39 | new_id = chr_pre + ".%02d%s%07d" % (idx, hap, base) 40 | base += 10 41 | else: 42 | new_id = chr_pre + ".%08d" % tig_base 43 | tig_base += 10 44 | ordered_id.append(info[1]) 45 | new_id_db[info[1]] = new_id 46 | return new_id_db, gff_db, ordered_id 47 | 48 | 49 | def rename_id(chr_pre, in_gff, out_gff): 50 | print("Generating rename list") 51 | rename_id_db, gff_db, ordered_id = generate_new_id_by_gff(chr_pre, in_gff) 52 | 53 | print("Dealing gff3") 54 | with open(out_gff, 'w') as fout: 55 | fout.write("###gff version 3\n") 56 | for ori_id in ordered_id: 57 | mrna_idx = 1 58 | for line in gff_db[ori_id]: 59 | data = line.strip().split() 60 | if data[2] == 'gene': 61 | gid = rename_id_db[ori_id] 62 | data[8] = "ID=%s;Name=%s" % (gid, gid) 63 | elif data[2] == 'mRNA': 64 | mrid = "%s.t%d" % (gid, mrna_idx) 65 | mrna_idx += 1 66 | other_idx_db = {} 67 | data[8] = "ID=%s;Name=%s;Parent=%s" % (mrid, mrid, gid) 68 | else: 69 | feature = data[2] 70 | if feature not in other_idx_db: 71 | other_idx_db[feature] = 1 72 | other_id = "%s.%s%d" % (mrid, feature, other_idx_db[feature]) 73 | other_idx_db[feature] += 1 74 | data[8] = "ID=%s;Name=%s;Parent=%s" % (other_id, other_id, mrid) 75 | fout.write("%s\n" % ("\t".join(data))) 76 | fout.write("###\n") 77 | print("Finished") 78 | 79 | 80 | if __name__ == "__main__": 81 | if len(sys.argv) < 3: 82 | print("Usage: python " + sys.argv[0] + " ") 83 | print("Notice: sort and rename id with in_gff by coordinate, the chromosome ID should be like: Chr01 for mono " 84 | "assembly, Chr01A for phased assembly.") 85 | print("Example: python " + sys.argv[0] + " CB5 in.gff3 out.gff3") 86 | else: 87 | chr_pre, in_gff3, out_gff3 = sys.argv[1:] 88 | rename_id(chr_pre, in_gff3, out_gff3) 89 | -------------------------------------------------------------------------------- /bin/quick_mask_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | import multiprocessing 4 | 5 | 6 | def read_bed(in_bed, th): 7 | win_regions = {} 8 | with open(in_bed, 'r') as f_in: 9 | for line in f_in: 10 | if line.strip() == '': 11 | continue 12 | data = line.strip().split() 13 | if int(data[3]) > 150: 14 | chrn = data[0] 15 | sp = int(data[1])-1 16 | ep = int(data[2])-1 17 | if chrn not in win_regions: 18 | win_regions[chrn] = [] 19 | win_regions[chrn].append([sp, ep]) 20 | return win_regions 21 | 22 | 23 | def merge_regions(ori_regions): 24 | new_regions = {} 25 | for chrn in ori_regions: 26 | new_regions[chrn] = [] 27 | tmp_regions = [] 28 | last_e = 0 29 | for region in sorted(ori_regions[chrn]): 30 | sr = region[0] 31 | er = region[1] 32 | if last_e == 0: 33 | tmp_regions.append(sr) 34 | last_e = er 35 | if sr > last_e: 36 | tmp_regions.append(last_e) 37 | tmp_regions.append(sr) 38 | last_e = er 39 | else: 40 | if er > last_e: 41 | last_e = er 42 | tmp_regions.append(last_e) 43 | for i in range(0, len(tmp_regions), 2): 44 | new_regions[chrn].append([tmp_regions[i], tmp_regions[i+1]]) 45 | return new_regions 46 | 47 | 48 | def read_fasta(in_fa): 49 | seq_db = {} 50 | seq_id_list = [] 51 | with open(in_fa, 'r') as f_in: 52 | id = '' 53 | seq = '' 54 | for line in f_in: 55 | if line[0] == '>': 56 | if seq != '': 57 | seq_db[id] = seq 58 | id = line.strip()[1:] 59 | 
seq_id_list.append(id) 60 | seq = '' 61 | else: 62 | seq += line.strip() 63 | seq_db[id] = seq 64 | return seq_db, seq_id_list 65 | 66 | 67 | def mask_fasta(id_list, seq_db, win_regions): 68 | for id in id_list: 69 | new_seq = '' 70 | sp = 0 71 | if id in win_regions: 72 | for region in win_regions[id]: 73 | if region[0] > sp: 74 | new_seq += seq_db[id][sp: region[0]] 75 | new_seq += 'N'*(region[1]-region[0]+1) 76 | sp = region[1]+1 77 | if sp < len(seq_db[id]): 78 | new_seq += seq_db[id][sp: len(seq_db[id])] 79 | else: 80 | new_seq = seq_db[id] 81 | 82 | with open(id+'.tmp', 'w') as f_out: 83 | f_out.write(">%s\n%s\n"%(id, new_seq)) 84 | 85 | 86 | def quick_mask_genome(in_fa, in_bed, out_fa, th, ts): 87 | print("Reading fasta") 88 | seq_db, seq_id_list = read_fasta(in_fa) 89 | 90 | print("Reading bed") 91 | win_regions = merge_regions(read_bed(in_bed, th)) 92 | 93 | task_per_thread = int(len(seq_id_list)/ts) 94 | 95 | task_list = [] 96 | 97 | print("Masking genome") 98 | for i in range(0, ts): 99 | if i < ts-1: 100 | t = multiprocessing.Process(target=mask_fasta, args=(seq_id_list[i*task_per_thread: (i+1)*task_per_thread], seq_db, win_regions)) 101 | else: 102 | t = multiprocessing.Process(target=mask_fasta, args=(seq_id_list[i*task_per_thread:], seq_db, win_regions)) 103 | task_list.append(t) 104 | 105 | for t in task_list: 106 | t.start() 107 | 108 | for t in task_list: 109 | t.join() 110 | 111 | print("Merging") 112 | if os.path.exists(out_fa): 113 | os.remove(out_fa) 114 | for id in seq_id_list: 115 | os.system("cat "+id+".tmp >> "+out_fa) 116 | os.remove(id+".tmp") 117 | print("Success") 118 | 119 | 120 | if __name__ == "__main__": 121 | if len(sys.argv) < 6: 122 | print("Usage: python "+sys.argv[0]+" ") 123 | else: 124 | in_fa = sys.argv[1] 125 | in_bed = sys.argv[2] 126 | out_fa = sys.argv[3] 127 | th = int(sys.argv[4]) 128 | ts = int(sys.argv[5]) 129 | quick_mask_genome(in_fa, in_bed, out_fa, th, ts) 130 | -------------------------------------------------------------------------------- /bin/SeqStat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import pysam 5 | import gzip 6 | 7 | 8 | def read_fasta(in_file): 9 | if in_file.split('.')[-1] == 'gz': 10 | fin = gzip.open(in_file, 'rt') 11 | else: 12 | fin = open(in_file, 'r') 13 | seq_len = [] 14 | for line in fin: 15 | if line[0] == '>': 16 | seq_len.append(0) 17 | else: 18 | seq_len[-1] += len(line.strip()) 19 | fin.close() 20 | return sorted(seq_len, reverse=True) 21 | 22 | 23 | def read_fastq(in_file): 24 | if in_file.split('.')[-1] == 'gz': 25 | fin = gzip.open(in_file, 'rt') 26 | else: 27 | fin = open(in_file) 28 | seq_len = [] 29 | cnt = 0 30 | for line in fin: 31 | if cnt%4 == 1: 32 | seq_len.append(len(line.strip())) 33 | cnt += 1 34 | fin.close() 35 | return sorted(seq_len, reverse=True) 36 | 37 | 38 | def read_bam(in_file): 39 | seq_len = [] 40 | ''' 41 | with os.popen("samtools view %s"%in_file, 'r') as fin: 42 | for line in fin: 43 | seq_len.append(len(line.strip().split()[9])) 44 | ''' 45 | with pysam.AlignmentFile(in_file, 'rb', check_sq=False) as fin: 46 | for line in fin: 47 | seq_len.append(line.query_length) 48 | return sorted(seq_len, reverse=True) 49 | 50 | 51 | def check_file_type(in_file): 52 | data = in_file.split('.') 53 | if data[-1] == 'gz': 54 | with gzip.open(in_file, 'rt') as fin: 55 | for line in fin: 56 | break 57 | if line[0] == '>': 58 | return "fa" 59 | elif line[0] == '@': 60 | return "fq" 61 | else: 62 
| return "" 63 | elif data[-1] == 'bam': 64 | return "bam" 65 | else: 66 | with open(in_file, 'r') as fin: 67 | for line in fin: 68 | break 69 | if line[0] == '>': 70 | return "fa" 71 | elif line[0] == '@': 72 | return "fq" 73 | else: 74 | return "" 75 | 76 | 77 | def seq_stat(in_file, out_stat): 78 | file_type = check_file_type(in_file) 79 | if file_type == 'fa': 80 | seq_len = read_fasta(in_file) 81 | elif file_type == 'fq': 82 | seq_len = read_fastq(in_file) 83 | elif file_type == 'bam': 84 | seq_len = read_bam(in_file) 85 | else: 86 | print("Unsupport file type") 87 | sys.exit() 88 | seq_cnt = len(seq_len) 89 | min_len = seq_len[-1] 90 | max_len = seq_len[0] 91 | total_size = sum(seq_len) 92 | ave_len = total_size*1.0/seq_cnt 93 | n_threshold = [] 94 | n_values = [] 95 | n_labels = [] 96 | for i in range(90, 40, -10): 97 | n_threshold.append(i/100.0*total_size) 98 | n_values.append(0) 99 | n_labels.append("N%d:\t"%i) 100 | cur_size = 0 101 | cnt_500 = 0 102 | cnt_2k = 0 103 | for i in range(0, seq_cnt): 104 | cur_size += seq_len[i] 105 | for j in range(0, len(n_values)): 106 | if n_values[j] == 0 and cur_size >= n_threshold[j]: 107 | n_values[j] = seq_len[i] 108 | if seq_len[i] > 500: 109 | cnt_500 += 1 110 | if seq_len[i] > 2000: 111 | cnt_2k += 1 112 | n_info = [] 113 | for i in range(0, len(n_values)): 114 | n_info.append("%s%d\n"%(n_labels[i], n_values[i])) 115 | info = "number of seq:\t%d\nmin length:\t%d\nmax length:\t%d\ntotal size:\t%d\n%sAverage length:\t%d\nTotal number (>500bp):\t%d\nTotal number (>2000bp):\t%d"%(seq_cnt, min_len, max_len, total_size, ''.join(n_info), ave_len, cnt_500, cnt_2k) 116 | if out_stat == "": 117 | print(info) 118 | else: 119 | with open(out_stat, 'w') as fout: 120 | fout.write("%s\n"%info) 121 | 122 | 123 | if __name__ == "__main__": 124 | if len(sys.argv) < 2: 125 | print("Usage: python %s [out_stat]"%sys.argv[0]) 126 | else: 127 | if len(sys.argv) == 2: 128 | in_file = sys.argv[1] 129 | out_stat = "" 130 | else: 131 | in_file, out_stat = sys.argv[1:] 132 | seq_stat(in_file, out_stat) 133 | 134 | -------------------------------------------------------------------------------- /bin/approximate_cnv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | import multiprocessing 4 | 5 | 6 | def help_message(): 7 | print("Usage: python "+sys.argv[0]+" -bam -g -l -bed -o [-t ]") 8 | 9 | 10 | def get_opts(ARGV): 11 | opts = {} 12 | if len(ARGV) < 3: 13 | help_message() 14 | sys.exit(0) 15 | 16 | for i in range(1, len(ARGV), 2): 17 | key = ARGV[i][1:] 18 | value = ARGV[i+1] 19 | if key not in opts: 20 | opts[key] = value 21 | return opts 22 | 23 | 24 | def calc_mapped_reads_count(in_bam): 25 | fn = in_bam + '.read_counts.txt' 26 | counts = 0 27 | res = os.popen("samtools view "+in_bam) 28 | for line in res: 29 | data = line.strip().split() 30 | if data[2] != '*': 31 | counts += 1 32 | with open(fn, 'w') as f_out: 33 | f_out.write(str(counts)) 34 | 35 | 36 | def calc_read_depth(in_bam, in_bed): 37 | fn = in_bam + '.read_depth.txt' 38 | res = os.popen("bedtools coverage -a "+in_bed+" -b "+in_bam+" 2>/dev/null") 39 | with open(fn, 'w') as f_out: 40 | for line in res: 41 | data = line.strip().split('\t') 42 | if len(data) < 7: 43 | continue 44 | f_out.write(line) 45 | 46 | 47 | def calc_pipeline(in_bams, in_bed): 48 | calc_mapped_reads_count(in_bams) 49 | calc_read_depth(in_bams, in_bed) 50 | 51 | 52 | def quick_CNV(opts): 53 | bam_list = [] 54 | name_list = [] 55 | bed_rows = 0 56 | 
print("Calculating read depth and counts") 57 | with open(opts['bam'], 'r') as f_in: 58 | for line in f_in: 59 | print("\tDealing %s"%line.strip()) 60 | bam_list.append(line.strip()) 61 | name_list.append(line.strip().split('/')[-1].split('\\')[-1].split('.')[0]) 62 | if 't' in opts: 63 | t_n = int(opts['t']) 64 | else: 65 | t_n = 1 66 | 67 | print("Creating processes pool") 68 | bed_file = opts['bed'] 69 | 70 | pool = multiprocessing.Pool(processes=t_n) 71 | for bam_file in bam_list: 72 | res = pool.apply_async(calc_pipeline, (bam_file, bed_file,)) 73 | pool.close() 74 | pool.join() 75 | 76 | genome_size = int(opts['g']) 77 | read_length = int(opts['l']) 78 | gene_length_db = {} 79 | 80 | print("Reading bed") 81 | with open(bed_file, 'r') as f_in: 82 | for line in f_in: 83 | data = line.strip().split() 84 | gene_name = data[-1] 85 | s_p = int(data[1]) 86 | e_p = int(data[2]) 87 | length = e_p - s_p 88 | if length < 0: 89 | length = -length 90 | gene_length_db[gene_name] = length 91 | 92 | print("Approximating CNV") 93 | bed_rows = len(data) 94 | mapped_rc = {} 95 | coverage_db = {} 96 | for i in range(0, len(bam_list)): 97 | fn = bam_list[i] + '.read_counts.txt' 98 | with open(fn, 'r') as f_in: 99 | for line in f_in: 100 | if line.strip() == '': 101 | continue 102 | mapped_rc[name_list[i]] = int(line.strip()) 103 | #os.remove(fn) 104 | 105 | fn = bam_list[i] + '.read_depth.txt' 106 | if name_list[i] not in coverage_db: 107 | coverage_db[name_list[i]] = {} 108 | with open(fn, 'r') as f_in: 109 | for line in f_in: 110 | if line.strip() == '': 111 | continue 112 | data = line.strip().split('\t') 113 | gene_name = data[bed_rows-1] 114 | if gene_name not in coverage_db[name_list[i]]: 115 | coverage_db[name_list[i]][gene_name] = int(data[bed_rows]) 116 | #os.remove(fn) 117 | 118 | print("Writing result") 119 | out_file = opts['o'] 120 | with open(out_file, 'w') as f_out: 121 | f_out.write("#Sample\\CopyNumber\t") 122 | f_out.write('\t'.join(list(sorted(gene_length_db.keys())))+'\n') 123 | for name in sorted(mapped_rc.keys()): 124 | f_out.write(name) 125 | seq_depth = mapped_rc[name]*1.0*read_length/genome_size 126 | for gene in sorted(gene_length_db.keys()): 127 | copy_number = 1.0*coverage_db[name][gene]*read_length/gene_length_db[gene]/seq_depth 128 | f_out.write('\t'+str(copy_number)) 129 | f_out.write('\n') 130 | 131 | print("Finished") 132 | 133 | 134 | if __name__ == "__main__": 135 | opts = get_opts(sys.argv) 136 | necessary_paras = ['bam', 'g', 'l', 'bed', 'o'] 137 | for key in necessary_paras: 138 | if key not in opts: 139 | help_message() 140 | sys.exit(0) 141 | quick_CNV(opts) 142 | -------------------------------------------------------------------------------- /bin/simple_ANGSD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def help_message(): 6 | print("Usage: python "+sys.argv[0]+" -l -anc -r [-out -p -ref ]") 7 | 8 | 9 | def parse_options(ARGV): 10 | opt_dict = {} 11 | for i in range(0, len(ARGV), 2): 12 | opt = ARGV[i] 13 | if opt == '-l': 14 | opt_dict['list'] = ARGV[i+1] 15 | elif opt == '-p': 16 | opt_dict['path'] = ARGV[i+1] 17 | elif opt == '-anc': 18 | opt_dict['anc'] = ARGV[i+1] 19 | elif opt == '-ref': 20 | opt_dict['ref'] = ARGV[i+1] 21 | elif opt == '-r': 22 | opt_dict['regions'] = ARGV[i+1] 23 | elif opt == '-out': 24 | opt_dict['out_name'] = ARGV[i+1] 25 | elif opt == '-h': 26 | help_message() 27 | exit(0) 28 | return opt_dict 29 | 30 | 31 | def run_abbababa(opts): 32 | if 
'path' not in opts: 33 | bam_path = './' 34 | else: 35 | bam_path = opts['path'] 36 | bam_files = [] 37 | for fn in os.listdir(bam_path): 38 | if fn[-4:] == '.bam': 39 | bam_files.append(os.path.join(bam_path, fn)) 40 | 41 | print("Indexing bams") 42 | for fn in bam_files: 43 | fn = fn.split('/')[-1] 44 | if os.path.isfile(os.path.join('./', fn+'.bai')) == False: 45 | cmd = 'samtools index '+fn 46 | print("Running command: "+cmd) 47 | os.system(cmd) 48 | 49 | print("Done\nReading list") 50 | list_db = {} 51 | with open(opts['list'], 'r') as f_in: 52 | for line in f_in: 53 | data = line.strip().split() 54 | if data[1] not in list_db: 55 | list_db[data[1]] = {} 56 | list_db[data[1]]['name'] = [] 57 | list_db[data[1]]['path'] = [] 58 | for fn in bam_files: 59 | if data[0] in fn: 60 | list_db[data[1]]['path'].append(fn) 61 | list_db[data[1]]['name'].append(data[0]) 62 | 63 | if 'out_name' not in opts: 64 | out_name = "Outgroup" 65 | else: 66 | out_name = opts['out_name'] 67 | 68 | print("Done\nGenerate bam.filelist sizeFile.size popNames.name bamWithErrors.filelist errorList.error") 69 | with open("bam.filelist", "w") as f_list: 70 | with open("sizeFile.size", "w") as f_size: 71 | with open("popNames.name", "w") as f_pop: 72 | with open("bamWithErrors.filelist", "w") as f_bwe: 73 | with open("errorList.error", "w") as f_err_list: 74 | i = 0 75 | for subgroup in list_db: 76 | if subgroup != out_name: 77 | f_pop.write(subgroup+'\n') 78 | if i < 2: 79 | f_bwe.write('\n'.join(list_db[subgroup]['path'])+"\n") 80 | i += 1 81 | f_err_list.write("./errorFile.ancError\n") 82 | else: 83 | f_err_list.write("NA\n") 84 | f_list.write('\n'.join(list_db[subgroup]['path'])+'\n') 85 | group_size = len(list_db[subgroup]['name']) 86 | f_size.write(str(group_size)+'\n') 87 | f_list.write(list_db[out_name]['path'][0]+'\n') 88 | f_pop.write(out_name+'\n') 89 | f_size.write('1\n') 90 | 91 | print("Done\nDo abbababa") 92 | anc_file = opts['anc'] 93 | if "regions" not in opts: 94 | regions_file = "regions.txt" 95 | else: 96 | regions_file = opts["regions"] 97 | 98 | cmd = "ANGSD -doAbbababa2 1 -bam bam.filelist -sizeFile sizeFile.size -doCounts 1 -out bam.Angsd -rf "+regions_file+" -useLast 1 -minQ 20 -minMapQ 30" 99 | print("Running command: "+cmd) 100 | os.system(cmd) 101 | 102 | print("Done\nIndex reference fasta") 103 | if 'ref' not in opts: 104 | os.system("ANGSD -i "+bam_files[-1]+" -doFasta 1 -doCounts 1 -out perfectSampleCEU") 105 | os.system("gunzip perfectSampleCEU.fa.gz") 106 | os.system("samtools faidx perfectSampleCEU.fa") 107 | ref_fasta = "perfectSampleCEU.fa" 108 | else: 109 | ref_fasta = opts['ref'] 110 | os.system("samtools faidx "+ref_fasta) 111 | 112 | print("Done\nDo Anc Error and Rscript") 113 | cmd = "ANGSD -doAncError 1 -anc "+anc_file+" -ref "+ref_fasta+" -out errorFile -bam bamWithErrors.filelist" 114 | print("Running command: "+cmd) 115 | os.system(cmd) 116 | 117 | cmd = "Rscript DSTAT angsdFile=\"bam.Angsd\" out=\"result\" sizeFile=sizeFile.size errFile=errorList.error nameFile=popNames.name" 118 | print("Running command: "+cmd) 119 | os.system(cmd) 120 | print("Done\nSuccess") 121 | 122 | 123 | if __name__ == "__main__": 124 | if len(sys.argv) == 1: 125 | help_message() 126 | else: 127 | opts = parse_options(sys.argv[1:]) 128 | run_abbababa(opts) 129 | -------------------------------------------------------------------------------- /bin/remove_region_by_blast_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import 
sys
3 | 
4 | 
5 | def reverse_region(ori_region, chr_len_db):
6 |     new_region = {}
7 |     for chrn in ori_region:
8 |         if chrn not in new_region:
9 |             new_region[chrn] = []
10 |         temp_region = []
11 |         temp_region.append(0)
12 |         for region in ori_region[chrn]:
13 |             temp_region.append(region[0]-1)
14 |             temp_region.append(region[1]+1)
15 |         temp_region.append(chr_len_db[chrn]-1)
16 |         i = 0
17 |         while i < len(temp_region):
18 |             new_region[chrn].append([temp_region[i], temp_region[i+1]])
19 |             i += 2
20 |     return new_region
21 | 
22 | 
23 | def update_region(ori_region, region):
24 |     temp_region = []
25 |     for chr_region in ori_region:
26 |         if region[0] <= chr_region[1] and chr_region[0] <= region[1]:
27 |             if chr_region not in temp_region:
28 |                 temp_region.append(chr_region)
29 |         if chr_region[0] <= region[1] <= chr_region[1]:
30 |             if chr_region not in temp_region:
31 |                 temp_region.append(chr_region)
32 |     for chr_region in temp_region:
33 |         if chr_region in ori_region:
34 |             ori_region.remove(chr_region)
35 |         if chr_region[0] < region[0] <= chr_region[1]:
36 |             ori_region.append([chr_region[0], region[0]-1])
37 |         if chr_region[0] <= region[1] < chr_region[1]:
38 |             ori_region.append([region[1]+1, chr_region[1]])
39 |     return ori_region
40 | 
41 | 
42 | def remove_region(blast_results, chr_len_file, out_bed):
43 |     chr_len_db = {}
44 |     with open(chr_len_file, 'r') as f_in:
45 |         for line in f_in:
46 |             if line.strip() != '':
47 |                 data = line.strip().split()
48 |                 chr_len_db[data[0]] = int(data[1])
49 |     out_region = {}
50 |     is_first = True
51 |     for blast_file in blast_results:
52 |         if is_first:
53 |             with open(blast_file, 'r') as f_in:
54 |                 blast_region = {}
55 |                 for line in f_in:
56 |                     data = line.strip().split()
57 |                     chrn = data[1]
58 |                     s = int(data[8])
59 |                     e = int(data[9])
60 |                     if s > e:
61 |                         temp = s
62 |                         s = e
63 |                         e = temp
64 |                     if chrn not in blast_region:
65 |                         blast_region[chrn] = []
66 |                     blast_region[chrn].append([s, e])
67 |                     temp_list = []
68 |                     for i in range(0, len(blast_region[chrn])):
69 |                         if s <= blast_region[chrn][i][0] and e >= blast_region[chrn][i][0]:
70 |                             blast_region[chrn][i][0] = s
71 |                         if e >= blast_region[chrn][i][1] and s <= blast_region[chrn][i][1]:
72 |                             blast_region[chrn][i][1] = e
73 |                         if s > blast_region[chrn][i][1] or e < blast_region[chrn][i][0]:
74 |                             temp_list.append([s, e])
75 |                     for region in temp_list:
76 |                         blast_region[chrn].append(region)
77 |                     blast_region[chrn] = sorted(blast_region[chrn])
78 |             for chrn in blast_region:
79 |                 temp_region = []
80 |                 last_e = 0
81 |                 for i in range(0, len(blast_region[chrn])):
82 |                     s = blast_region[chrn][i][0]
83 |                     e = blast_region[chrn][i][1]
84 |                     if i == 0:
85 |                         temp_region.append(s)
86 |                         last_e = e
87 |                     else:
88 |                         if last_e < s:
89 |                             temp_region.append(last_e)
90 |                             temp_region.append(s)
91 |                             last_e = e
92 |                         else:
93 |                             if last_e < e:
94 |                                 last_e = e
95 |                 temp_region.append(last_e)
96 |                 i = 0
97 |                 blast_region[chrn] = []
98 |                 while i < len(temp_region):
99 |                     blast_region[chrn].append([temp_region[i], temp_region[i+1]])
100 |                     i += 2
101 |             out_region = reverse_region(blast_region, chr_len_db)
102 |             is_first = False
103 |         else:
104 |             with open(blast_file, 'r') as f_in:
105 |                 for line in f_in:
106 |                     data = line.strip().split()
107 |                     chrn = data[1]
108 |                     s = int(data[8])
109 |                     e = int(data[9])
110 |                     if s > e:
111 |                         temp = s
112 |                         s = e
113 |                         e = temp
114 |                     out_region[chrn] = update_region(out_region[chrn], [s, e])
115 | 
116 |     with open(out_bed, 'w') as f_out:
117 |         for chrn in out_region:
118 |             out_region[chrn] = sorted(out_region[chrn])
119 |             for region in out_region[chrn]:
120 |                 f_out.write(chrn+'\t'+str(region[0])+'\t'+str(region[1])+'\n')
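    # Worked example of reverse_region() (editor's illustration): with
    # chr_len_db = {'Chr01': 100} and hit regions [[20, 30], [50, 60]], the
    # kept complement is [[0, 19], [31, 49], [61, 99]] (0-based, inclusive).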
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     if len(sys.argv) < 4:
125 |         print("Notice: this script removes regions hit by BLAST results from chromosomes")
126 |         print("Usage: python "+sys.argv[0]+" <blast_results> <chr_len_file> <out_bed>")
127 |         print("\t<blast_results> is a comma-separated list of tabular BLAST result files")
128 |     else:
129 |         blast_results = sys.argv[1].split(",")
130 |         chr_len_file = sys.argv[2]
131 |         out_bed = sys.argv[3]
132 |         remove_region(blast_results, chr_len_file, out_bed)
133 | 
134 | 
--------------------------------------------------------------------------------
/bin/convert_anchorwave.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | 
4 | 
5 | def get_opt():
6 |     group = argparse.ArgumentParser()
7 |     group.add_argument('-i', '--input', help='Input maf file', required=True)
8 |     group.add_argument('-o', '--output', help='Output file', required=True)
9 |     return group.parse_args()
10 | 
11 | 
12 | def convert_anchorwave(in_file, out_file):
13 |     print("Converting")
14 |     with open(in_file, 'r') as fin:
15 |         with open(out_file, 'w') as fout:
16 |             tmp = []
17 |             chrs = []
18 |             fout.write("#Ref\tStart\tEnd\tQuery\tStart\tEnd\tType\n")
19 |             for line in fin:
20 |                 if line.strip() == '' or line[0] == '#' or line[0] == 'a':
21 |                     continue
22 |                 data = line.strip().split()
23 |                 chrs.append(data[1])
24 |                 tmp.append(data[6])
25 |                 if len(tmp) == 2:
26 |                     seq_len = len(tmp[0])
27 |                     print("\tConverting pair: %s, length: %d"%(','.join(chrs), seq_len))
28 |                     ref_pos = 0
29 |                     qry_pos = 0
30 |                     var_info = []
31 |                     per_cnt = max(1, int(seq_len / 10))
32 |                     print("\t", end="")
33 |                     for _ in range(seq_len):
34 |                         if (_+1)%per_cnt == 0:
35 |                             print("%d%%"%(int((_+1)/per_cnt)*10), end='\t', flush=True)
36 |                         ref_base = tmp[0][_]
37 |                         qry_base = tmp[1][_]
38 |                         if ref_base == '-':
39 |                             var_type = 'INS'
40 |                             var_info.append([ref_pos, qry_pos, var_type])
41 |                             qry_pos += 1
42 |                         elif qry_base == '-':
43 |                             var_type = 'DEL'
44 |                             var_info.append([ref_pos, qry_pos, var_type])
45 |                             ref_pos += 1
46 |                         elif ref_base != qry_base:
47 |                             var_type = 'SNP'
48 |                             var_info.append([ref_pos, qry_pos, var_type])
49 |                             ref_pos += 1
50 |                             qry_pos += 1
51 |                         else:
52 |                             ref_pos += 1
53 |                             qry_pos += 1
54 |                     print()
55 |                     if len(var_info) == 0:
56 |                         tmp = []
57 |                         chrs = []
58 |                         continue
59 |                     print("\tMerging pair: %s, length: %d"%(','.join(chrs), seq_len))
60 |                     merge_info = [[var_info[0][0], var_info[0][0],
61 |                                    var_info[0][1], var_info[0][1],
62 |                                    var_info[0][2]]]
63 |                     for _ in range(1, len(var_info)):
64 |                         cur_info = var_info[_]
65 |                         if cur_info[-1] == merge_info[-1][-1]:
66 |                             is_continue = False
67 |                             if cur_info[0] == merge_info[-1][1] + 1:
68 |                                 merge_info[-1][1] = cur_info[0]
69 |                                 is_continue = True
70 |                             if cur_info[1] == merge_info[-1][3] + 1:
71 |                                 merge_info[-1][3] = cur_info[1]
72 |                                 is_continue = True
73 |                             if not is_continue:
74 |                                 merge_info.append([cur_info[0], cur_info[0],
75 |                                                    cur_info[1], cur_info[1],
76 |                                                    cur_info[2]])
77 |                         else:
78 |                             merge_info.append([cur_info[0], cur_info[0],
79 |                                                cur_info[1], cur_info[1],
80 |                                                cur_info[2]])
81 |                     print("\tWriting pair: %s, length: %d"%(','.join(chrs), seq_len))
82 |                     for rsp, rep, qsp, qep, var_type in merge_info:
83 |                         fout.write("%s\t%d\t%d\t%s\t%d\t%d\t%s\n"%(chrs[0], rsp+1, rep+1,
84 |                                                                    chrs[1], qsp+1, qep+1,
85 |                                                                    var_type))
86 |                     chrs = []
87 |                     tmp = []
88 |     print("Finished")
89 | 
90 | 
91 | if __name__ == "__main__":
92 |     opts = get_opt()
93 |     in_file = opts.input
94 |     out_file = opts.output
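    # convert_anchorwave() above assumes each MAF alignment block carries
    # exactly two 's' records (reference first, then query): sequences are
    # buffered in 'tmp' and a block is flushed as soon as two have
    # accumulated.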
95 |     convert_anchorwave(in_file, out_file)
96 | 
97 | 
--------------------------------------------------------------------------------
/bin/blast2heatmap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys, os
3 | 
4 | 
5 | def filter_blast(blast_file, out_file, t_i, t_m):
6 |     blast_db = {}
7 |     with open(blast_file, 'r') as f_in:
8 |         for line in f_in:
9 |             data = line.strip().split()
10 |             chrn = data[1]
11 |             iden = float(data[2])
12 |             s_pos_1 = int(data[6])
13 |             e_pos_1 = int(data[7])
14 |             s_pos_2 = int(data[8])
15 |             e_pos_2 = int(data[9])
16 |             if iden < t_i:
17 |                 continue
18 |             if s_pos_1 == s_pos_2 and e_pos_1 == e_pos_2:
19 |                 continue
20 |             if s_pos_2 > e_pos_2:
21 |                 tmp = s_pos_2
22 |                 s_pos_2 = e_pos_2
23 |                 e_pos_2 = tmp
24 |             if e_pos_2 - s_pos_2 < t_m:
25 |                 continue
26 |             if chrn not in blast_db:
27 |                 blast_db[chrn] = []
28 |             if [s_pos_2, e_pos_2] not in blast_db[chrn]:
29 |                 blast_db[chrn].append([s_pos_2, e_pos_2])
30 |     with open(out_file, 'w') as f_out:
31 |         for chrn in sorted(blast_db.keys()):
32 |             for region in sorted(blast_db[chrn]):
33 |                 f_out.write(chrn+'\t'+str(region[0])+'\t'+str(region[1])+'\n')
34 | 
35 | 
36 | def reshape(in_data, out_data):
37 |     data_db = {}
38 |     max_length = 0
39 |     max_chr = ''
40 |     with open(in_data, 'r') as f_in:
41 |         for line in f_in:
42 |             data = line.strip().split()
43 |             chrn = data[0]
44 |             pos = int(data[1])
45 |             value = data[-1]
46 |             if chrn[:3].lower() != 'chr':
47 |                 continue
48 |             if len(chrn) == 4:
49 |                 chrn = chrn[:3]+'0'+chrn[-1]
50 |             if chrn not in data_db:
51 |                 data_db[chrn] = []
52 |             data_db[chrn].append([pos, value])
53 | 
54 |     for chrn in data_db:
55 |         if len(data_db[chrn]) > max_length:
56 |             max_length = len(data_db[chrn])
57 |             max_chr = chrn
58 | 
59 |     for chrn in data_db:
60 |         curr_len = len(data_db[chrn])
61 |         if curr_len < max_length:
62 |             for i in range(curr_len, max_length):
63 |                 data_db[chrn].append([data_db[max_chr][i][0], 'nan'])
64 | 
65 |     new_data = {}
66 |     for chrn in data_db:
67 |         for value in data_db[chrn]:
68 |             if value[0] not in new_data:
69 |                 new_data[value[0]] = {}
70 |             new_data[value[0]][chrn] = value[1]
71 | 
72 |     head = []
73 |     for pos in new_data:
74 |         for chrn in sorted(new_data[pos].keys()):
75 |             head.append(chrn)
76 |         break
77 | 
78 |     with open(out_data, 'w') as f_out:
79 |         f_out.write('\t'+'\t'.join(head)+'\n')
80 |         for pos in sorted(new_data.keys()):
81 |             f_out.write(str(pos))
82 |             for chrn in sorted(new_data[pos].keys()):
83 |                 f_out.write('\t'+new_data[pos][chrn])
84 |             f_out.write('\n')
85 | 
86 | 
87 | def draw_heatmap_R(in_data, out_name, rs):
88 |     current_path = os.getcwd()
89 |     script = os.path.join(current_path, out_name+'_draw.R')
90 |     with open(script, 'w') as f_out:
91 |         f_out.write("setwd(\""+current_path+"\")\n")
92 |         f_out.write("data<-read.table(\""+in_data+"\", header = TRUE)\n")
93 |         f_out.write("cr<-c(0:length(colnames(data))-1)\n")
94 |         f_out.write("library(\"pheatmap\")\n")
95 |         f_out.write("pheatmap(data, cluster_cols = FALSE, cluster_rows = FALSE, show_rownames = FALSE, gaps_col = cr, filename = \""+out_name+".pdf\")\n")
96 |     os.system("Rscript "+out_name+"_draw.R")
97 | 
98 |     script = os.path.join(current_path, out_name+'_draw_with_label.R')
99 |     with open(script, 'w') as f_out:
100 |         f_out.write("setwd(\""+current_path+"\")\n")
101 |         f_out.write("data<-read.table(\""+in_data+"\", header = TRUE)\n")
102 |         f_out.write("cr<-c(0:length(colnames(data))-1)\n")
103 |         f_out.write("row_name<-array()\n")
104 |         f_out.write("for(i in 1:length(rownames(data))){if(i%%"+str(rs)+"==1){row_name[i]<-rownames(data)[i]}else{row_name[i]<-\"\"}}\n")
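        # The generated R loop above keeps every rs-th row name and blanks
        # the rest, so the labelled heatmap stays legible when a genome has
        # thousands of windows; 'rs' as the label stride is an editorial
        # assumption (the distributed copy left the modulus blank).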
105 |         f_out.write("library(\"pheatmap\")\n")
106 |         f_out.write("pheatmap(data, cluster_cols = FALSE, cluster_rows = FALSE, labels_row = row_name, gaps_col = cr, filename = \""+out_name+"_label.pdf\")\n")
107 |     os.system("Rscript "+out_name+"_draw_with_label.R")
108 | 
109 | 
110 | def blast2heatmap(ref_fasta, blast_file, ws, out_name, t_i, t_m):
111 |     print("Filter blast result")
112 |     filter_blast(blast_file, "01_"+out_name+'_filter.bed', t_i, t_m)
113 |     print("Generate windows")
114 |     os.system("bedtools makewindows -g "+ref_fasta+" -w "+ws+" > 02_"+out_name+"_win.bed")
115 |     print("Coverage")
116 |     os.system("bedtools coverage -a "+"02_"+out_name+"_win.bed -b "+"01_"+out_name+"_filter.bed > "+"03_"+out_name+"_cover.txt")
117 |     print("Reshape data")
118 |     reshape("03_"+out_name+"_cover.txt", "04_"+out_name+"_result.txt")
119 |     print("Draw heatmap")
120 |     draw_heatmap_R("04_"+out_name+"_result.txt", "05_"+out_name, 10)  # label stride of 10 is an assumed default
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     if len(sys.argv) < 7:
125 |         print("Usage: python "+sys.argv[0]+" <ref_genome> <blast_file> <window_size> <out_name> <identity> <min_length>")
126 |         print("Notice: bedtools and R are required")
127 |     else:
128 |         proc, ref_fasta, blast_file, ws, out_name, t_i, t_m = sys.argv
129 |         blast2heatmap(ref_fasta, blast_file, ws, out_name, float(t_i), float(t_m))
130 | 
--------------------------------------------------------------------------------
/bin/SimContigs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import argparse
4 | import random
5 | import copy
6 | 
7 | 
8 | def GetOpts():
9 |     group = argparse.ArgumentParser()
10 |     group.add_argument('--min', help='minimum length of contig, default: 15k, accepts a plain number or a value ending with k or m', default="15k")
11 |     group.add_argument('--max', help='maximum length of contig, default: 5m, accepts a plain number or a value ending with k or m', default="5m")
12 |     group.add_argument('-n', '--n50', help='size of N50, default: 500k, accepts a plain number or a value ending with k or m', default="500k")
13 |     group.add_argument('-i', '--input', help='origin fasta file of genome', required=True)
14 |     group.add_argument('-o', '--output', help='filename of simulated data', required=True)
15 | 
16 |     return group.parse_args()
17 | 
18 | 
19 | def ReadFasta(inFasta):
20 |     fastaDB = {}
21 |     with open(inFasta, 'r') as fIn:
22 |         id = ''
23 |         seq = ''
24 |         for line in fIn:
25 |             if line[0] == '>':
26 |                 if seq != '':
27 |                     fastaDB[id] = seq
28 |                 id = line.strip()[1:]
29 |                 seq = ''
30 |             else:
31 |                 seq += line.strip()
32 |         fastaDB[id] = seq
33 |     return fastaDB
34 | 
35 | 
36 | def GenCtgLen(fastaLenDB, cntLowerDB, cntHigherDB, n50, minLen, maxLen):
37 |     ctgLenDB = {}
38 |     for chrn in cntLowerDB:
39 |         ctgLenDB[chrn] = []
40 |         totalLower = 0
41 |         totalHigher = 0
42 |         totalLen = 0
43 |         for i in range(0, cntLowerDB[chrn]):
44 |             tmpLen = random.randint(minLen, n50)
45 |             totalLower += tmpLen
46 |             if totalLower > fastaLenDB[chrn]/2:
47 |                 break
48 |             ctgLenDB[chrn].append(tmpLen)
49 |             totalLen += tmpLen
50 |         for i in range(0, cntHigherDB[chrn]):
51 |             tmpLen = random.randint(n50, maxLen)
52 |             totalHigher += tmpLen
53 |             if totalHigher > fastaLenDB[chrn]/2:
54 |                 break
55 |             ctgLenDB[chrn].append(tmpLen)
56 |             totalLen += tmpLen
57 |         cntN50 = int((fastaLenDB[chrn]-totalLen)/n50)
58 |         for i in range(0, cntN50):
59 |             tmpLen = random.randint(int(n50-n50*0.1), int(n50+n50*0.1))
60 |             totalLen += tmpLen
61 |             if totalLen > fastaLenDB[chrn]:
62 | 
                totalLen -= tmpLen
63 |                 break
64 |             ctgLenDB[chrn].append(tmpLen)
65 |         ctgLenDB[chrn].append(fastaLenDB[chrn]-totalLen)
66 |     return ctgLenDB
67 | 
68 | 
69 | def GenCtgRegions(fastaLenDB, ctgLenDB):
70 |     ctgRegionsDB = {}
71 |     for chrn in fastaLenDB:
72 |         ctgRegionsDB[chrn] = []
73 |         totalCtgLen = 0
74 |         for ctgLen in ctgLenDB[chrn]:
75 |             totalCtgLen += ctgLen
76 |         cntCtg = len(ctgLenDB[chrn])
77 |         lastPos = 0
78 |         ctgLenList = copy.deepcopy(ctgLenDB[chrn])
79 |         for i in range(0, cntCtg):
80 |             index = random.randint(0, cntCtg-1)
81 |             ctgRegionsDB[chrn].append([lastPos, lastPos+ctgLenList[index]])
82 |             lastPos += ctgLenList[index]
83 |             del ctgLenList[index]
84 |             cntCtg -= 1
85 |     return ctgRegionsDB
86 | 
87 | 
88 | def SimGenomeCtg(inFasta, outFasta, n50, minLen, maxLen):
89 |     random.seed()
90 |     print("Reading fasta")
91 |     fastaDB = ReadFasta(inFasta)
92 |     fastaLenDB = {}
93 |     cntLowerDB = {}
94 |     cntHigherDB = {}
95 |     for chrn in fastaDB:
96 |         fastaLenDB[chrn] = len(fastaDB[chrn])
97 |         cntLowerDB[chrn] = int(fastaLenDB[chrn]/(minLen+n50))
98 |         cntHigherDB[chrn] = int(fastaLenDB[chrn]/(maxLen+n50))
99 | 
100 |     print("\nGenerating contigs")
101 |     ctgLenDB = GenCtgLen(fastaLenDB, cntLowerDB, cntHigherDB, n50, minLen, maxLen)
102 |     ctgRegionsDB = GenCtgRegions(fastaLenDB, ctgLenDB)
103 | 
104 |     print("\nStatistics")
105 |     for chrn in sorted(fastaDB):
106 |         print("\tChromosome:\t%s"%(chrn))
107 |         print("\tChromosome size:\t%d"%(fastaLenDB[chrn]))
108 |         print("\tContig counts:\t%d"%(len(ctgLenDB[chrn])))
109 |         tmpLen = 0
110 |         n50Len = 0
111 |         for ctgLen in sorted(ctgLenDB[chrn], reverse=True):
112 |             tmpLen += ctgLen
113 |             if tmpLen >= fastaLenDB[chrn]/2 and n50Len == 0:
114 |                 n50Len = ctgLen
115 |         print("\tContig total size:\t%d"%(tmpLen))
116 |         print("\tN50 size:\t%d\n"%(n50Len))
117 | 
118 |     print("\nWriting contigs")
119 |     with open(outFasta, 'w') as fOut:
120 |         base = 100
121 |         for chrn in sorted(fastaDB):
122 |             for region in ctgRegionsDB[chrn]:
123 |                 s = region[0]
124 |                 e = region[1]
125 |                 ctgName = "tig%07d"%(base)
126 |                 fOut.write(">%s %s %d:%d length=%d\n%s\n"%(ctgName, chrn, s+1, e, e-s, fastaDB[chrn][s: e]))  # 1-based inclusive region; the slice [s:e] has length e-s
127 |                 base += 100
128 |     print("\nFinished")
129 | 
130 | 
131 | if __name__ == "__main__":
132 |     opts = GetOpts()
133 |     inFasta = opts.input
134 |     outFasta = opts.output
135 |     n50 = opts.n50
136 |     n50 = n50.lower()
137 |     n50 = n50.replace('m', '000000')
138 |     n50 = n50.replace('k', '000')
139 |     n50 = int(n50)
140 |     minLen = opts.min
141 |     minLen = minLen.lower()
142 |     minLen = minLen.replace('m', '000000')
143 |     minLen = minLen.replace('k', '000')
144 |     minLen = int(minLen)
145 |     maxLen = opts.max
146 |     maxLen = maxLen.lower()
147 |     maxLen = maxLen.replace('m', '000000')
148 |     maxLen = maxLen.replace('k', '000')
149 |     maxLen = int(maxLen)
150 |     SimGenomeCtg(inFasta, outFasta, n50, minLen, maxLen)
151 | 
--------------------------------------------------------------------------------
/bin/SimCollapse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import argparse
4 | import random
5 | 
6 | 
7 | def GetOpts():
8 |     group = argparse.ArgumentParser()
9 |     group.add_argument('-a', '--a_contigs', help='first fasta file containing contigs generated by SimContigs.py', required=True)
10 |     group.add_argument('-b', '--b_contigs', help='second fasta file containing contigs generated by SimContigs.py', required=True)
11 |     group.add_argument('-p', '--prefix', help='prefix of contig file a and contig file b, divided by comma, like:
HA,HB', required=True)
12 |     group.add_argument('-o', '--output', help='filename of simulated data', required=True)
13 |     group.add_argument('-s', '--blast', help='blast file with format 6, must use first file of input as query and second file as database', required=True)
14 |     group.add_argument('-c', '--collapse', type=float, help='percentage of collapse region size, like 5 means 5%%, default: 10', default=10)
15 | 
16 |     return group.parse_args()
17 | 
18 | 
19 | def ReadFasta(inFasta):
20 |     fastaDB = {}
21 |     with open(inFasta, 'r') as fIn:
22 |         id = ''
23 |         seq = ''
24 |         totalLen = 0
25 |         for line in fIn:
26 |             if line[0] == '>':
27 |                 if seq != '':
28 |                     fastaDB[id] = seq
29 |                 data = line.strip()[1:].split()
30 |                 id = data[0]
31 |                 seq = ''
32 |             else:
33 |                 seq += line.strip()
34 |                 totalLen += len(line.strip())
35 |         fastaDB[id] = seq
36 |     return fastaDB, totalLen
37 | 
38 | 
39 | def ReadBlast(inBlast, prefixList):
40 |     blastDB = {}
41 |     with open(inBlast, 'r') as fBlast:
42 |         for line in fBlast:
43 |             data = line.strip().split()
44 |             queryID = data[0]
45 |             targetID = data[1]
46 |             identity = float(data[2])
47 |             queryRegion = list(map(int, [data[6], data[7]]))
48 |             targetRegion = list(map(int, [data[8], data[9]]))
49 |             if queryID not in blastDB:
50 |                 blastDB[queryID] = [targetID, identity, queryRegion, targetRegion]
51 |             else:
52 |                 if identity > blastDB[queryID][1]:
53 |                     blastDB[queryID] = [targetID, identity, queryRegion, targetRegion]
54 | 
55 |     mapping = {}
56 |     allContigList = []
57 |     for queryID in blastDB:
58 |         targetID = blastDB[queryID][0]
59 |         mapping[prefixList[0]+'-'+queryID] = prefixList[1]+"-"+targetID
60 |         mapping[prefixList[1]+"-"+targetID] = prefixList[0]+'-'+queryID
61 |         allContigList.append(prefixList[0]+'-'+queryID)
62 |         allContigList.append(prefixList[1]+"-"+targetID)
63 | 
64 |     return mapping, allContigList
65 | 
66 | 
67 | def SimCollapse(aFasta, bFasta, outFa, blastFile, prefixList, collapse):
68 |     print("Reading first contig file")
69 |     fastaDBA, lenA = ReadFasta(aFasta)
70 | 
71 |     print("Reading second contig file")
72 |     fastaDBB, lenB = ReadFasta(bFasta)
73 | 
74 |     lenDB = {}
75 |     for id in fastaDBA:
76 |         lenDB[prefixList[0]+"-"+id] = len(fastaDBA[id])
77 |     for id in fastaDBB:
78 |         lenDB[prefixList[1]+"-"+id] = len(fastaDBB[id])
79 | 
80 |     print("Reading blast file")
81 |     mapping, allContigList = ReadBlast(blastFile, prefixList)
82 |     collapseLen = int((lenA+lenB)*collapse)
83 | 
84 |     print("Total collapse size expected: %d"%(collapseLen))
85 |     removeList = {}
86 |     print("Removing collapse regions")
87 |     removeLen = 0
88 |     while collapseLen > 0:
89 |         index = random.randint(0, len(allContigList)-1)
90 |         name = allContigList[index]
91 | 
92 |         while mapping[name] not in allContigList:
93 |             index = random.randint(0, len(allContigList)-1)
94 |             name = allContigList[index]
95 | 
96 |         repeatCnt = 0
97 |         isLast = False
98 |         while mapping[name] not in allContigList or lenDB[name] > collapseLen:
99 |             if repeatCnt > 50:
100 |                 isLast = True
101 |                 break
102 |             # count every retry so this loop is guaranteed to terminate
103 |             repeatCnt += 1
104 |             index = random.randint(0, len(allContigList)-1)
105 |             name = allContigList[index]
106 | 
107 |         if isLast:
108 |             break
109 |         pre, ctg = name.split('-')
110 |         collapseLen -= lenDB[name]
111 |         removeLen += lenDB[name]
112 | 
113 |         if pre not in removeList:
114 |             removeList[pre] = []
115 |         removeList[pre].append(ctg)
116 | 
117 |         allContigList.remove(name)
118 |         allContigList.remove(mapping[name])
119 |     print("Total collapse size removed: %d"%(removeLen))
120 | 
121 |     print("Writing result")
122
 |     with open(outFa, 'w') as fOut:
123 |         # write both haplotypes, skipping contigs chosen for collapse; a prefix
124 |         # may be absent from removeList if none of its contigs were removed
125 |         for id in fastaDBA:
126 |             if id not in removeList.get(prefixList[0], []):
127 |                 fOut.write(">%s_%s\n%s\n"%(prefixList[0], id, fastaDBA[id]))
128 |         for id in fastaDBB:
129 |             if id not in removeList.get(prefixList[1], []):
130 |                 fOut.write(">%s_%s\n%s\n"%(prefixList[1], id, fastaDBB[id]))
131 | 
132 | 
133 |     print("Success")
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     opts = GetOpts()
138 |     aFasta = opts.a_contigs
139 |     bFasta = opts.b_contigs
140 |     outFa = opts.output
141 |     collapse = opts.collapse/100.0
142 |     blastFile = opts.blast
143 |     prefixList = opts.prefix.split(',')
144 |     print("Arguments")
145 |     print("\tInput files: %s, %s"%(aFasta, bFasta))
146 |     print("\tOutput file: %s"%(outFa))
147 |     print("\tBlast file: %s"%(blastFile))
148 |     print("\tCollapse ratio: %.2f%%"%(collapse*100))
149 |     SimCollapse(aFasta, bFasta, outFa, blastFile, prefixList, collapse)
150 | 
--------------------------------------------------------------------------------
/bin/simple_JBrowser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | 
5 | 
6 | def get_config(f_conf, section, key):
7 |     if sys.version[0] == '2':
8 |         import ConfigParser
9 |         config = ConfigParser.ConfigParser()
10 |     else:
11 |         import configparser
12 |         config = configparser.ConfigParser()
13 |     config.read(f_conf)
14 |     return config.get(section, key)
15 | 
16 | 
17 | def get_options(Argvs):
18 |     i = 0
19 |     opts = {}
20 |     fasta_file = ''
21 |     gff_files = []
22 |     bed_files = []
23 |     bam_files = []
24 |     bw_files = []
25 |     vcf_files = []
26 |     conf_file = ''
27 |     chrs_file = ''
28 |     tn = '1'
29 |     while i < len(Argvs):
30 |         if Argvs[i] == '-f':
31 |             fasta_file = Argvs[i+1]
32 |         elif Argvs[i] == '--gff':
33 |             gff_files.append(Argvs[i+1])
34 |         elif Argvs[i] == '--bed':
35 |             bed_files.append(Argvs[i+1])
36 |         elif Argvs[i] == '--bam':
37 |             bam_files.append(Argvs[i+1])
38 |         elif Argvs[i] == '--bw':
39 |             bw_files.append(Argvs[i+1])
40 |         elif Argvs[i] == '--vcf':
41 |             vcf_files.append(Argvs[i+1])
42 |         elif Argvs[i] == '--conf':
43 |             conf_file = Argvs[i+1]
44 |         elif Argvs[i] == '--bam2bw':
45 |             chrs_file = Argvs[i+1]
46 |         elif Argvs[i] == '-t':
47 |             tn = Argvs[i+1]
48 |         else:
49 |             print("Usage: python "+sys.argv[0]+" -f <fasta_file> [--gff <gff_file> --bed <bed_file> --bam <bam_file> --bw <bw_file> --conf <conf_file>]")
50 |             exit(0)
51 |         i += 2
52 |     opts['fasta_file'] = fasta_file
53 |     opts['gff_files'] = gff_files
54 |     opts['bed_files'] = bed_files
55 |     opts['bam_files'] = bam_files
56 |     opts['vcf_files'] = vcf_files
57 |     opts['bw_files'] = bw_files
58 |     opts['bam2bw'] = chrs_file
59 |     opts['conf_file'] = conf_file
60 |     opts['tn'] = tn
61 |     return opts
62 | 
63 | 
64 | def simple_jbrowser(opts):
65 |     samtools = ''
66 |     jbrowser = ''
67 |     bam2wig = wig2bw = ''  # wig2bw must be initialized even when no conf file is given
68 |     if opts['conf_file'] != '':
69 |         conf_file = opts['conf_file']
70 |         if os.path.isfile(conf_file):
71 |             samtools = get_config(conf_file, "path", "samtools")
72 |             bam2wig = get_config(conf_file, "path", "bam2wig")
73 |             jbrowser = get_config(conf_file, "path", "JBrowser")
74 |             wig2bw = get_config(conf_file, "path", "wig2bw")
75 |         else:
76 |             print("No configure file")
77 |             exit(0)
78 | 
79 |     if samtools != '' and samtools[-1] != '/':
80 |         samtools += '/'
81 |     if jbrowser != '' and jbrowser[-1] != '/':
82 |         jbrowser += '/'
83 |     if bam2wig != '' and bam2wig[-1] != '/':
84 |         bam2wig += '/'
85 |     if wig2bw != '':
86 |         os.environ["PATH"] = wig2bw+":"+os.environ.get("PATH", "")  # exporting via os.system would only affect a subshell
87 | 
88 |     print("Preparing reference sequences")
89 |     if opts['fasta_file'] == '':
90 |         print("No reference sequences")
91 |         exit(0)
92 |     os.system(jbrowser+"prepare-refseqs.pl --fasta "+opts['fasta_file'])
93 | 
94 |     print("Preparing gffs")
95 |     for gff in opts['gff_files']:
96 |         if gff != '':
97 |             os.system(jbrowser+"flatfile-to-json.pl --gff "+gff+" --trackType CanvasFeatures --trackLabel "+gff.split('.')[0])
98 | 
99 |     print("Preparing beds")
100 |     for bed in opts['bed_files']:
101 |         if bed != '':
102 |             os.system(jbrowser+"flatfile-to-json.pl --bed "+bed+" --trackType CanvasFeatures --trackLabel "+bed)
103 | 
104 |     print("Preparing vcf")
105 |     for vcf in opts['vcf_files']:
106 |         if
 vcf[-3:].lower() != '.gz':
107 |             os.system("bgzip "+vcf)
108 |             vcf = vcf+".gz"
109 |         if not os.path.exists(vcf+".tbi"):
110 |             os.system("tabix -p vcf "+vcf)
111 |         with open("data/tracks.conf", "a") as f_track:
112 |             f_track.write("[tracks."+vcf.replace('.', '_')+"]\nstoreClass = JBrowse/Store/SeqFeature/VCFTabix\nurlTemplate = ../"+vcf+"\ncategory = VCF\ntype = JBrowse/View/Track/CanvasVariants\nkey = "+vcf.replace('.', '_')+"\n")
113 | 
114 |     print("Preparing bam")
115 |     for bam in opts['bam_files']:
116 |         if not bam.endswith('sorted.bam'):
117 |             sorted_bam = bam+".sorted.bam"
118 |             indexed_bam = sorted_bam+".bai"
119 |             if not os.path.exists(sorted_bam):
120 |                 os.system(samtools+"samtools sort -@ "+opts['tn']+" -o "+sorted_bam+" "+bam)
121 |             if not os.path.exists(indexed_bam):
122 |                 os.system(samtools+"samtools index "+sorted_bam)
123 |         else:
124 |             sorted_bam = bam
125 |             indexed_bam = sorted_bam+".bai"
126 |             if not os.path.exists(indexed_bam):
127 |                 os.system(samtools+"samtools index "+sorted_bam)
128 |         with open("data/tracks.conf", "a") as f_track:
129 |             f_track.write("[tracks."+bam.replace('.', '_')+"]\nstoreClass = JBrowse/Store/SeqFeature/BAM\nurlTemplate = ../"+sorted_bam+"\nbaiUrlTemplate = ../"+indexed_bam+"\ncategory = NGS\ntype = JBrowse/View/Track/Alignments2\nkey = "+bam.replace('.', '_')+"\n")
130 |         if opts['bam2bw'] != '':
131 |             os.system(bam2wig+"bam2wig.py -i "+sorted_bam+" -s "+opts['bam2bw']+" -o "+bam)
132 |             opts['bw_files'].append(bam+".bw")
133 |             os.remove(bam+".wig")
134 | 
135 |     print("Preparing bigwig")
136 |     for bw in opts['bw_files']:
137 |         with open("data/tracks.conf", "a") as f_track:
138 |             f_track.write("[tracks."+bw.replace('.', '_')+"]\nstoreClass = JBrowse/Store/SeqFeature/BigWig\nurlTemplate = ../"+bw+"\ncategory = Quantitative\ntype = JBrowse/View/Track/Wiggle/XYPlot\nkey = "+bw.replace('.', '_')+"\n")
139 |     print("Finished")
140 | 
141 | 
142 | if __name__ == "__main__":
143 |     if len(sys.argv) == 1:
144 |         print("Usage: python "+sys.argv[0]+" -f <fasta_file> --conf <conf_file> [--gff <gff_file> --bed <bed_file> --vcf <vcf_file> --bam <bam_file> --bw <bw_file> --bam2bw <chrom_sizes_file> -t <threads>]")
145 |     else:
146 |         opts = get_options(sys.argv[1:])
147 |         simple_jbrowser(opts)
148 | 
--------------------------------------------------------------------------------
/bin/SentieonSNP_filter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import matplotlib as mpl
3 | mpl.use('Agg')
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import gzip
7 | import argparse
8 | import re
9 | import time
10 | 
11 | 
12 | def get_opts():
13 |     group = argparse.ArgumentParser()
14 |     group.add_argument("-b", "--base", help="Input vcf file as base, .gz supported", required=True)
15 |     group.add_argument("-v", "--validation", help="Input vcf file as validation, .gz supported", required=True)
16 |     group.add_argument("-r", "--repeat", help="Repeat regions file, gff format", default="")
17 |     group.add_argument("-o", "--output", help="Output vcf file based on base vcf file, compressed with gzip", required=True)
18 |     group.add_argument("-m", "--missing_rate", type=float, help="Missing rate threshold, percentage, default: 40", default=40)
19 |     group.add_argument("-d", "--min_distance", type=int, help="Minimum distance between two snp sites, default: 0", default=0)
20 |     return group.parse_args()
21 | 
22 | 
23 | def read_vcf(in_vcf, method):
24 |     if in_vcf[-3:].lower() == '.gz':
25 |         f_in = gzip.open(in_vcf, 'rt')
26 |     else:
27 |         f_in = open(in_vcf, 'r')
28 | 
29 |     header = []
30 |     vcf_infos = {}
31 |     dp_db = {}
32
 |     for line in f_in:
33 |         if line[0] == '#':
34 |             header.append(line)
35 |         else:
36 |             data = line.strip().split()
37 |             chrn = data[0]
38 |             pos = int(data[1])
39 |             ref = data[3]
40 |             alt = data[4]
41 |             filt = data[6]  # avoid shadowing the built-in filter()
42 | 
43 |             # filter indels
44 |             if len(ref) > 1 or len(alt) > 1:
45 |                 continue
46 |             if method.lower() == 'full':
47 |                 # filter LowQual marker
48 |                 if filt.lower() == 'lowqual':
49 |                     continue
50 |             if chrn not in vcf_infos:
51 |                 vcf_infos[chrn] = {}
52 |             if method.lower() == 'full':
53 |                 # calc missing rate
54 |                 m_c = 0
55 |                 for i in range(9, len(data)):
56 |                     if data[i].split(':')[0] == './.':
57 |                         m_c += 1
58 |                 m_r = m_c*1.0/(len(data)-9)  # 9 fixed VCF columns precede the sample columns
59 |                 vcf_infos[chrn][pos] = {'alt': alt, 'line': line, 'mr': m_r}
60 |                 dp = int(data[7].split('DP=')[1].split(';')[0])
61 |                 if dp not in dp_db:
62 |                     dp_db[dp] = 0
63 |                 dp_db[dp] += 1
64 |             else:
65 |                 vcf_infos[chrn][pos] = {'alt': alt}
66 |     f_in.close()
67 |     if method.lower() == 'full':
68 |         return header, vcf_infos, dp_db
69 |     else:
70 |         return vcf_infos
71 | 
72 | 
73 | def read_gff(in_gff):
74 |     regions_db = {}
75 |     with open(in_gff, 'r') as fin:
76 |         for line in fin:
77 |             if line[0] == '#':
78 |                 continue
79 |             data = line.strip().split()
80 |             chrn = data[0]
81 |             sr = int(data[3])
82 |             er = int(data[4])
83 |             if sr > er:
84 |                 tmp = er
85 |                 er = sr
86 |                 sr = tmp
87 |             if chrn not in regions_db:
88 |                 regions_db[chrn] = []
89 |             regions_db[chrn].append([sr, er])
90 | 
91 |     for chrn in regions_db:
92 |         regions_db[chrn] = sorted(regions_db[chrn])
93 | 
94 |     return regions_db
95 | 
96 | 
97 | def is_repeat(repeats, pos):
98 |     s = 0
99 |     e = len(repeats)-1
100 |     while s<=e:
101 |         mid = int((s+e)/2)
102 |         if repeats[mid][0] > pos:
103 |             e = mid-1
104 |         elif repeats[mid][0] < pos:
105 |             s = mid+1
106 |         else:
107 |             return True
108 |     if e >= 0 and repeats[e][1] >= pos:  # e < 0 means pos lies before the first region
109 |         return True
110 |     else:
111 |         return False
112 | 
113 | 
114 | 
115 | def snp_filter(in_base, in_valid, in_rep, mr, md, out_file):
116 |     print("\033[32m%s\033[0m Reading valid file"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
117 |     valid_snps = read_vcf(in_valid, 'simple')
118 | 
119 |     print("\033[32m%s\033[0m Reading base file"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
120 |     base_header, base_snps, base_dps = read_vcf(in_base, 'full')
121 | 
122 |     print("\033[32m%s\033[0m Reading repeat file"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
123 |     if in_rep != '':
124 |         repeat_db = read_gff(in_rep)
125 |     else:
126 |         repeat_db = {}
127 |     '''
128 |     dp_cnt = []
129 |     for dp in sorted(base_dps):
130 |         dp_cnt.append(base_dps[dp])
131 |     dp_cnt_th = int(max(dp_cnt)*0.05)
132 | 
133 |     dp_min = -1
134 |     for dp in sorted(base_dps):
135 |         if base_dps[dp] > dp_cnt_th:
136 |             if dp_min == -1:
137 |                 dp_min = dp
138 |             dp_max = dp
139 |     '''
140 |     # Plot dist
141 |     plt.figure(figsize=(10, 8), dpi=100)
142 |     dp_x = []
143 |     dp_y = []
144 |     dp_total = []
145 |     for dp in sorted(base_dps):
146 |         plt.bar(x=dp, height=base_dps[dp], width=1, edgecolor='white', facecolor='blue', align='center', linewidth=0.01)
147 |         dp_x.append(dp)
148 |         dp_y.append(base_dps[dp])
149 |         for i in range(0, base_dps[dp]):
150 |             dp_total.append(dp)
151 |     plt.plot(dp_x, dp_y, linewidth=0.05, linestyle='-', markersize=0, marker=',')
152 | 
153 |     dp_min = 5 #mean-std*1.96
154 |     dp_sum = np.sum(dp_y)  # avoid shadowing the built-in sum()
155 |     top_sum = dp_sum*0.95  # dp_max = smallest DP whose cumulative count reaches 95%
156 |     cnt = 0
157 | 
158 |     for dp in sorted(base_dps):
159 |         if dp < dp_min:
160 |             continue
161 |         cnt += base_dps[dp]
162 |         if cnt >= top_sum:
163 |             dp_max = dp
164 |             break
165 |     plt.xlim(dp_x[0]-1,
 dp_max+1)
166 |     plt.savefig('dist.pdf', format='pdf', bbox_inches='tight')
167 |     print("\033[32m%s\033[0m range=[%d, %d]"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())), dp_min, dp_max))
168 | 
169 |     print("\033[32m%s\033[0m Filtering and writing results"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
170 |     if out_file[-3:].lower() != '.gz':
171 |         out_file += '.gz'
172 |     with gzip.open(out_file, 'wt') as fout:
173 |         fout.write(''.join(base_header))
174 |         for chrn in sorted(base_snps):
175 |             if chrn not in valid_snps:
176 |                 continue
177 |             last_pos = -1
178 |             for pos in sorted(base_snps[chrn]):
179 |                 # filter repeat regions
180 |                 if chrn in repeat_db and is_repeat(repeat_db[chrn], pos):
181 |                     continue
182 |                 # filter base snps with valid snps
183 |                 if pos not in valid_snps[chrn]:
184 |                     continue
185 |                 if base_snps[chrn][pos]['alt'] != valid_snps[chrn][pos]['alt']:
186 |                     continue
187 |                 # filter missing rate
188 |                 if base_snps[chrn][pos]['mr'] > mr:
189 |                     continue
190 | 
191 |                 # filter dp
192 |                 data = base_snps[chrn][pos]['line'].strip().split()
193 |                 dp = int(data[7].split('DP=')[1].split(';')[0])
194 |                 if dp < dp_min or dp > dp_max:
195 |                     continue
196 |                 if last_pos != -1 and pos - last_pos <= md:
197 |                     last_pos = pos
198 |                     continue
199 |                 else:
200 |                     last_pos = pos
201 |                 fout.write(base_snps[chrn][pos]['line'])
202 | 
203 |     print("\033[32m%s\033[0m Finished"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
204 | 
205 | 
206 | if __name__ == "__main__":
207 |     opts = get_opts()
208 |     in_base = opts.base
209 |     in_valid = opts.validation
210 |     in_rep = opts.repeat
211 |     mr = opts.missing_rate/100.0
212 |     md = opts.min_distance
213 |     out_file = opts.output
214 |     snp_filter(in_base, in_valid, in_rep, mr, md, out_file)
215 | 
--------------------------------------------------------------------------------
/bin/SimSID.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | import argparse
5 | import random
6 | 
7 | 
8 | def ArgParser():
9 |     group = argparse.ArgumentParser()
10 |     group.add_argument('-s', '--snp', type=float, help='snp ratio of whole genome, percentage, default: 0.01', default=0.01)
11 |     group.add_argument('-i', '--insertion', type=float, help='insertion ratio of whole genome, percentage, default: 0.01', default=0.01)
12 |     group.add_argument('--insert_length', type=int, help='max length of insertion, default: 10', default=10)
13 |     group.add_argument('-d', '--deletion', type=float, help='deletion ratio of whole genome, percentage, default: 0.01', default=0.01)
14 |     group.add_argument('--delete_length', type=int, help='max length of deletion, default: 10', default=10)
15 |     group.add_argument('--random_length', action="store_true", help='use this argument to generate random lengths for indels', default=False)
16 |     group.add_argument('-v', '--verbose', action="store_true", help='print detailed information', default=False)
17 |     group.add_argument('-r', '--ref', help='origin fasta file of genome', required=True)
18 |     group.add_argument('-o', '--out', help='prefix of simulated data', required=True)
19 | 
20 |     return group.parse_args()
21 | 
22 | 
23 | def ReadFASTA(inputFASTA):
24 |     fastaDB = {}
25 |     posDB = {}
26 |     with open(inputFASTA, 'r') as fIN:
27 |         id = ''
28 |         seq = ''
29 |         for line in fIN:
30 |             if line[0] == '>':
31 |                 if seq != '':
32 |                     fastaDB[id] = seq
33 |                 id = line.strip()[1:]
34 |                 seq = ''
35 |             else:
36 |                 seq += line.strip()
37 |         fastaDB[id] = seq
38 |     for chrn in fastaDB:
39 |         posDB[chrn] = [1]*len(fastaDB[chrn])  # 1 = position still available, 0 = already used by a simulated variant
40 |     return fastaDB, posDB
41 | 
42 | 
43 | def IsInRegions(regionDB, queryPos):
44 |     s = 0
45 |     e = len(regionDB)-1
46 |     refRegions = sorted(regionDB)
47 |     if len(refRegions) == 0:
48 |         return False
49 |     while s<=e:
50 |         mid = int((s+e)/2)
51 |         if refRegions[mid][0] < queryPos:
52 |             s = mid+1
53 |         elif refRegions[mid][0] > queryPos:
54 |             e = mid-1
55 |         else:
56 |             return True
57 |     if e >= 0 and refRegions[e][1] >= queryPos:  # e < 0 means queryPos lies before the first region
58 |         return True
59 |     else:
60 |         return False
61 | 
62 | 
63 | def GenDelRegions(fastaDB, posDB, delRatio, delLength, isRandom, isVerbose):
64 |     delRegions = {}
65 |     for chrn in fastaDB:
66 |         chrLen = len(fastaDB[chrn])
67 |         if isRandom:
68 |             avgDelLen = int(delLength/2)
69 |         else:
70 |             avgDelLen = delLength
71 |         cntDel = int(chrLen*delRatio/avgDelLen)
72 |         print("%s\tdeletions count: %d"%(chrn, cntDel))
73 |         delRegions[chrn] = []
74 |         for i in range(0, cntDel):
75 |             if isVerbose:
76 |                 print("Generating: %d"%(i+1))
77 |             if isRandom:
78 |                 curDelLen = random.randint(1, delLength)
79 |             else:
80 |                 curDelLen = delLength
81 |             sp = random.randint(0, chrLen-curDelLen)
82 |             ep = sp+curDelLen
83 |             while posDB[chrn][sp] == 0 or posDB[chrn][ep-1] == 0:
84 |                 sp = random.randint(0, chrLen-curDelLen)
85 |                 ep = sp+curDelLen
86 |             delRegions[chrn].append([sp, ep])
87 |             for i in range(sp, ep):
88 |                 posDB[chrn][i] = 0
89 |     for chrn in delRegions:
90 |         delRegions[chrn] = sorted(delRegions[chrn])
91 | 
92 |     return delRegions
93 | 
94 | 
95 | def GenSeq(seqLen):
96 |     nucType = ['A', 'T', 'G', 'C']
97 |     seq = ''
98 |     for i in range(0, seqLen):
99 |         seq += nucType[random.randint(0, 3)]
100 |     return seq
101 | 
102 | 
103 | def GenInsPosSeqs(delRegions, fastaDB, posDB, insRatio, insLength, isRandom, isVerbose):
104 |     insList = {}
105 |     insSeqs = {}
106 |     for chrn in fastaDB:
107 |         chrLen = len(fastaDB[chrn])
108 |         if isRandom:
109 |             avgInsLen = int(insLength/2)
110 |         else:
111 |             avgInsLen = insLength
112 |         cntIns = int(chrLen*insRatio/avgInsLen)
113 |         print("%s\tinsertions count: %d"%(chrn, cntIns))
114 |         insList[chrn] = []
115 |         insSeqs[chrn] = []
116 |         for i in range(0, cntIns):
117 |             if isVerbose:
118 |                 print("Generating: %d"%(i+1))
119 |             if isRandom:
120 |                 curInsLen = random.randint(1, insLength)
121 |             else:
122 |                 curInsLen = insLength
123 |             pos = random.randint(0, chrLen-1)
124 |             while posDB[chrn][pos] == 0:
125 |                 pos = random.randint(0, chrLen-1)
126 |             insList[chrn].append(pos)
127 |             posDB[chrn][pos] = 0
128 |             insSeqs[chrn].append(GenSeq(curInsLen))
129 |     return insList, insSeqs
130 | 
131 | 
132 | def GenSNPPos(delRegions, fastaDB, posDB, snpRatio, insPos, isVerbose):
133 |     snpSeq = {}
134 |     snpPos = {}
135 |     nucType = ['A', 'T', 'G', 'C']
136 |     for chrn in fastaDB:
137 |         snpSeq[chrn] = []
138 |         snpPos[chrn] = []
139 |         chrLen = len(fastaDB[chrn])
140 |         cntSNP = int(chrLen*snpRatio)
141 |         print("%s\tSNPs count: %d"%(chrn, cntSNP))
142 |         for i in range(cntSNP):
143 |             if isVerbose:
144 |                 print("Generating: %d"%(i+1))
145 |             pos = random.randint(0, chrLen-1)
146 |             while posDB[chrn][pos] == 0:
147 |                 pos = random.randint(0, chrLen-1)
148 |             SNP = nucType[random.randint(0, 3)]
149 |             while SNP == fastaDB[chrn][pos]:
150 |                 SNP = nucType[random.randint(0, 3)]
151 |             snpSeq[chrn].append(SNP)
152 |             snpPos[chrn].append(pos)
153 |             posDB[chrn][pos] = 0
154 |     return snpPos, snpSeq
155 | 
156 | 
157 | def SimSID(snpRatio, insRatio, delRatio, insLength, delLength, isRandom, isVerbose, inputFASTA, outPrefix):
158 |     print("SNP Ratio = %.2f%%\nINS Ratio = %.2f%%\nDEL Ratio = %.2f%%\nINS Length
 = %d\nDEL Length = %d\nRandom: %s\nVerbose: %s\nInput file: %s\nOut prefix: %s"%(snpRatio*100, insRatio*100, delRatio*100, insLength, delLength, isRandom, isVerbose, inputFASTA, outPrefix))
159 |     random.seed()
160 |     print("Reading fasta")
161 |     fastaDB, posDB = ReadFASTA(inputFASTA)
162 |     print("Generating deletions")
163 |     delRegions = GenDelRegions(fastaDB, posDB, delRatio, delLength, isRandom, isVerbose)
164 | 
165 |     print("Generating insertions")
166 |     insPos, insSeq = GenInsPosSeqs(delRegions, fastaDB, posDB, insRatio, insLength, isRandom, isVerbose)
167 | 
168 |     print("Generating SNPs")
169 |     snpPos, snpSeq = GenSNPPos(delRegions, fastaDB, posDB, snpRatio, insPos, isVerbose)
170 | 
171 |     print("Writing information")
172 |     with open(outPrefix+"_snps.txt", 'w') as fSNP:
173 |         fSNP.write("Chromosome\tPosition\tOrigin\tNew\n")
174 |         writeStrings = []
175 |         for chrn in sorted(fastaDB):
176 |             for i in range(0, len(snpPos[chrn])):
177 |                 pos = snpPos[chrn][i]
178 |                 writeStrings.append([chrn, pos+1, fastaDB[chrn][pos], snpSeq[chrn][i]])
179 |         for wString in sorted(writeStrings):
180 |             fSNP.write('\t'.join(list(map(str, wString)))+'\n')
181 | 
182 |     with open(outPrefix+"_indel.txt", 'w') as fIndel:
183 |         fIndel.write("Chromosome\tPosition\tOrigin\tNew\n")
184 |         writeStrings = []
185 |         for chrn in sorted(fastaDB):
186 |             for i in range(0, len(delRegions[chrn])):
187 |                 sp = delRegions[chrn][i][0]
188 |                 ep = delRegions[chrn][i][1]
189 |                 writeStrings.append([chrn, sp+1, fastaDB[chrn][sp: ep], '[]'])
190 |         for chrn in sorted(fastaDB):
191 |             for i in range(0, len(insPos[chrn])):
192 |                 pos = insPos[chrn][i]
193 |                 writeStrings.append([chrn, pos+1, '[]', insSeq[chrn][i]])
194 |         for wString in sorted(writeStrings):
195 |             fIndel.write('\t'.join(list(map(str, wString)))+'\n')
196 | 
197 |     print("Writing fasta")
198 |     with open(outPrefix+"_sim.fasta", 'w') as fSim:
199 |         for chrn in sorted(fastaDB):
200 |             newSeq = list(fastaDB[chrn])
201 |             for i in range(0, len(snpPos[chrn])):
202 |                 newSeq[snpPos[chrn][i]] = snpSeq[chrn][i]
203 |             for i in range(0, len(insPos[chrn])):
204 |                 newSeq[insPos[chrn][i]] = insSeq[chrn][i] + newSeq[insPos[chrn][i]]
205 |             for i in range(0, len(delRegions[chrn])):
206 |                 for j in range(delRegions[chrn][i][0], delRegions[chrn][i][1]):
207 |                     newSeq[j] = ''
208 |             fSim.write(">%s\n%s\n"%(chrn, ''.join(newSeq)))
209 |     print("Success")
210 | 
211 | 
212 | if __name__ == "__main__":
213 |     opts = ArgParser()
214 |     snpRatio = opts.snp/100.0
215 |     insRatio = opts.insertion/100.0
216 |     delRatio = opts.deletion/100.0
217 |     inputFASTA = opts.ref
218 |     outPrefix = opts.out
219 |     insLength = opts.insert_length
220 |     delLength = opts.delete_length
221 |     isRandom = opts.random_length
222 |     isVerbose = opts.verbose
223 |     SimSID(snpRatio, insRatio, delRatio, insLength, delLength, isRandom, isVerbose, inputFASTA, outPrefix)
224 | 
--------------------------------------------------------------------------------
/bin/easyGoKegg.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | library(optparse)
3 | library(clusterProfiler)
4 | library(KEGGREST)
5 | library(dplyr)
6 | library(stringr)
7 | library(AnnotationForge)
8 | library(jsonlite)
9 | library(purrr)
10 | library(RCurl)
11 | library(ggplot2)
12 | 
13 | download_plant_path <- function(db_file) {
14 |   ### Download plant pathway
15 |   org <- data.frame(keggList("organism"))
16 |   plants <- org[grep("Plants", org$phylogeny), ]
17 |   plant_pathways_total <- vector()
18 | 
19 |   for (i in
 seq_along(plants$organism)) {
20 |     try({
21 |       pathways <- keggLink("pathway", plants[i, 2])
22 |       pathways <- sub(paste(".*", plants[i, 2], sep = ""), "", pathways)
23 |       pathways <- unique(pathways)
24 |       plant_pathways_total <- append(plant_pathways_total, pathways)
25 |       plant_pathways_total <- unique(plant_pathways_total)
26 |     })
27 |   }
28 | 
29 |   plant_pathways_total <- paste0("ko", plant_pathways_total)
30 |   write.table(plant_pathways_total, file = db_file, quote = FALSE, row.names = FALSE, col.names = FALSE)
31 | }
32 | 
33 | load_plant_path <- function(db_file) {
34 |   ### Load plant pathway
35 |   plant_pathways_total <- read.table(db_file, sep = "", header = FALSE)
36 |   colnames(plant_pathways_total) <- "Pathway"
37 |   plant_pathways_total
38 | }
39 | 
40 | generate_go_db <- function(annotation_file, db_path, genus, species, tax_id) {
41 |   cat("Reading annotation file\n")
42 |   egg <- read.table(annotation_file, header = TRUE, sep = "\t", quote = "")
43 |   egg[egg == ""] <- NA
44 | 
45 |   gterms <- egg %>%
46 |     dplyr::select("query", "GOs") %>%
47 |     na.omit()
48 | 
49 |   gene2go <- data.frame(
50 |     GID = character(),
51 |     GO = character(),
52 |     EVIDENCE = character()
53 |   )
54 | 
55 |   gene_ids <- egg$query
56 |   eggnog_lines_with_go <- egg$GOs != "-" & egg$GOs != ""
57 |   eggnog_annoations_go <- strsplit(egg$GOs[eggnog_lines_with_go], ",")
58 |   gene2go <- data.frame(
59 |     GID = rep(gene_ids[eggnog_lines_with_go],
60 |       times = sapply(eggnog_annoations_go, length)
61 |     ),
62 |     GO = unlist(eggnog_annoations_go),
63 |     EVIDENCE = "IEA"
64 |   )
65 | 
66 |   gene_info <- egg %>%
67 |     dplyr::select(GID = "query", GENENAME = "Preferred_name") %>%
68 |     na.omit()
69 | 
70 |   gene2ko <- egg %>%
71 |     dplyr::select(GID = "query", Ko = "KEGG_ko") %>%
72 |     na.omit()
73 | 
74 |   gene2ko$Ko <- gsub("ko:", "", gene2ko$Ko)
75 | 
76 |   cat("Saving GO database\n")
77 |   # genus, species and tax_id come from the command-line options, so the
78 |   # package created here matches the go_db_name built in main_pipe()
79 |   # (org.<first letter of genus><species>.eg.db)
80 |   makeOrgPackage(
81 |     gene_info = gene_info,
82 |     go = gene2go,
83 |     ko = gene2ko,
84 |     version = "0.1",
85 |     maintainer = "maintainer <maintainer@example.com>",
86 |     author = "author <author@example.com>",
87 |     outputDir = db_path,
88 |     tax_id = tax_id,
89 |     genus = genus,
90 |     species = species,
91 |     goTable = "go"
92 |   )
93 | 
94 |   anno_db <- list(go = gene2go, ko = gene2ko)
95 |   return(anno_db)
96 | }
97 | 
98 | update_kegg_db <- function(db_path, kegg_json, kegg_db_file) {
99 |   url <- "https://www.kegg.jp/kegg-bin/download_htext?htext=ko00001&format=json&filedir="
100 |   if (!file.exists(kegg_json)) {
101 |     json <- paste(db_path, "ko00001.json", sep = "/")
102 |     download.file(url, json)
103 |   } else {
104 |     json <- kegg_json
105 |   }
106 |   pathway2name <- tibble(Pathway = character(), Name = character())
107 |   ko2pathway <- tibble(Ko = character(), Pathway = character())
108 |   kegg <- fromJSON(json)
109 |   for (a in seq_along(kegg[["children"]][["children"]])) {
110 |     A <- kegg[["children"]][["name"]][[a]]
111 |     for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
112 |       B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
113 |       for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
114 |         pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
115 |         pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
116 |         pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
117 |         pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
118 |         kos_info <-
kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]] 119 | kos <- str_match(kos_info, "K[0-9]*")[, 1] 120 | ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos)))) 121 | } 122 | } 123 | } 124 | save(pathway2name, ko2pathway, file = kegg_db_file) 125 | } 126 | 127 | load_kegg_db <- function(kegg_db_file, gene2ko) { 128 | kegg_db <- new.env() 129 | load(file = kegg_db_file, envir = kegg_db) 130 | gene2pathway <- gene2ko %>% 131 | left_join(kegg_db$ko2pathway, by = "Ko", relationship = "many-to-many") %>% 132 | dplyr::select("GID", "Pathway") %>% 133 | na.omit() 134 | kegg_db <- list(pathway2name = kegg_db$pathway2name, gene2pathway = gene2pathway) 135 | return(kegg_db) 136 | } 137 | 138 | keep_plant_path_only <- function(pathway2name, gene2pathway, plant_pathways_total) { 139 | pathway2name <- pathway2name %>% filter(pathway2name$Pathway %in% plant_pathways_total$Pathway) 140 | gene2pathway <- gene2pathway %>% filter(gene2pathway$Pathway %in% pathway2name$Pathway) 141 | filtered_kegg_db <- list(pathway2name = pathway2name, gene2pathway = gene2pathway) 142 | return(filtered_kegg_db) 143 | } 144 | 145 | run_go_kegg <- function(db_path, gene2pathway, pathway2name, gene_file, 146 | pvalue_cutoff, qvalue_cutoff, padjust_method, ontology) { 147 | gene <- read.table(gene_file, header = FALSE) 148 | gene_list <- gene[, 1] 149 | GO <- enrichGO( 150 | gene = gene_list, 151 | OrgDb = "org.CCUSTOM.eg.db", 152 | keyType = "GID", 153 | ont = "ALL", 154 | pAdjustMethod = padjust_method, 155 | pvalueCutoff = pvalue_cutoff, 156 | qvalueCutoff = qvalue_cutoff 157 | ) 158 | 159 | cat("Saving GO result\n") 160 | GO_df <- as.data.frame(GO) 161 | write.table(GO_df, file = "GO.results.tsv", sep = "\t", quote = FALSE) 162 | 163 | cat("Saving GO barplot\n") 164 | pdf(file = "GO_barplot.pdf", width = 15, height = 20) 165 | print(barplot(GO, drop = TRUE, showCategory = 10, split = "ONTOLOGY") + facet_grid(ONTOLOGY ~ ., scale = "free")) 166 | dev.off() 167 | 168 | cat("Saving GO bubble\n") 169 | pdf(file = "GO_bubble.pdf", width = 15, height = 20) 170 | print(dotplot(GO, showCategory = 10, split = "ONTOLOGY") + facet_grid(ONTOLOGY ~ ., scale = "free")) 171 | dev.off() 172 | 173 | KEGG <- enricher( 174 | gene = gene_list, 175 | TERM2GENE = gene2pathway[c("Pathway", "GID")], 176 | TERM2NAME = pathway2name[c("Pathway", "Name")], 177 | pAdjustMethod = padjust_method, 178 | pvalueCutoff = pvalue_cutoff, 179 | qvalueCutoff = qvalue_cutoff 180 | ) 181 | 182 | 183 | cat("Saving KEGG result\n") 184 | KEGG_df <- as.data.frame(KEGG) 185 | write.table(KEGG_df, file = "KEGG.results.tsv", sep = "\t", quote = FALSE) 186 | 187 | cat("Saving KEGG barplot\n") 188 | pdf(file = "KEGG_barplot.pdf", width = 15, height = 20) 189 | print(barplot(KEGG, drop = TRUE, showCategory = 10)) 190 | dev.off() 191 | 192 | cat("Saving KEGG bubble\n") 193 | pdf(file = "KEGG_bubble.pdf", width = 15, height = 20) 194 | print(dotplot(KEGG)) 195 | dev.off() 196 | } 197 | 198 | main_pipe <- function() { 199 | opt <- parse_args(opt_parser) 200 | opt_names <- names(opt) 201 | if ("input" %in% opt_names && "anno" %in% opt_names && "db" %in% opt_names) { 202 | anno_file <- opt$anno 203 | db_path <- opt$db 204 | if (!dir.exists(db_path)) { 205 | dir.create(db_path) 206 | } 207 | go_db_name <- paste("org.", substr(opt$genus, 1, 1), opt$species, ".eg.db", sep = "") 208 | db_pack <- paste(db_path, go_db_name, sep = "/") 209 | 210 | pvalue_cutoff <- opt$pvalue 211 | qvalue_cutoff <- opt$qvalue 212 | 
     padjust_method <- opt$padjust
213 |     ontology <- opt$ontology
214 |     cat("Generating GO database\n")
215 |     if (!file.exists(anno_file)) {
216 |       cat("Fatal: annotation file does not exist\n")
217 |       return()
218 |     }
219 | 
220 |     if (file.exists(db_pack)) {
221 |       unlink(db_pack, recursive = TRUE)
222 |     }
223 |     anno_db <- generate_go_db(anno_file, db_path, opt$genus, opt$species, opt$tax_id)
224 | 
225 |     cat("Loading GO database\n")
226 |     install.packages(db_pack, repos = NULL, type = "source")
227 |     do.call(library, list(go_db_name))
228 |     kegg_json <- ""
229 |     if ("kegg_json" %in% opt_names) {
230 |       kegg_json <- opt$kegg_json
231 |     }
232 |     kegg_db_file <- paste(db_path, "KEGG_db.RData", sep = "/")
233 |     if (!file.exists(kegg_db_file) || "update" %in% opt_names) {
234 |       cat("Generating KEGG database\n")
235 |       update_kegg_db(db_path, kegg_json, kegg_db_file)
236 |     }
237 | 
238 |     cat("Loading KEGG database\n")
239 |     kegg_db <- load_kegg_db(kegg_db_file, anno_db$ko)
240 | 
241 |     if ("plant" %in% opt_names) {
242 |       cat("Keeping plants pathway\n")
243 |       plants_kegg_file <- opt$plant_kegg
244 |       if (is.null(plants_kegg_file) || !file.exists(plants_kegg_file)) {
245 |         plants_kegg <- "plants.kegg.txt"
246 |         plants_kegg_file <- paste(db_path, plants_kegg, sep = "/")
247 |         download_plant_path(plants_kegg_file)
248 |       }
249 |       plant_pathways_total <- load_plant_path(plants_kegg_file)
250 |       kegg_db <- keep_plant_path_only(kegg_db$pathway2name, kegg_db$gene2pathway, plant_pathways_total)
251 |     }
252 | 
253 |     gene_file <- opt$input
254 |     cat("Running GO and KEGG\n")
255 |     run_go_kegg(
256 |       go_db_name, kegg_db$gene2pathway, kegg_db$pathway2name, gene_file,
257 |       pvalue_cutoff, qvalue_cutoff, padjust_method, ontology
258 |     )
259 | 
260 |     cat("Finished\n")
261 |   } else {
262 |     print_help(opt_parser)
263 |   }
264 | }
265 | 
266 | opt_list <- list(
267 |   make_option(c("-i", "--input"), type = "character", help = "Input gene list file"),
268 |   make_option(c("-a", "--anno"), type = "character", help = "Functional annotation file"),
269 |   make_option(c("-d", "--db"), type = "character", help = "Database path"),
270 |   make_option(c("--kegg_json"), type = "character", help = "Pre-downloaded kegg json file"),
271 |   make_option(c("--genus"),
272 |     type = "character", help = "Genus name for creating GO database, default=\"Custom genus\"",
273 |     default = "Custom genus"
274 |   ),
275 |   make_option(c("--pvalue"), type = "numeric", help = "P value cutoff for GO and KEGG, default=0.05", default = 0.05),
276 |   make_option(c("--qvalue"), type = "numeric", help = "Q value cutoff for GO and KEGG, default=0.05", default = 0.05),
277 |   make_option(c("--padjust"),
278 |     type = "character", help = "P adjust method for GO and KEGG, default=\"BH\"",
279 |     default = "BH"
280 |   ),
281 |   make_option(c("--ontology"),
282 |     type = "character", help = "Ontology for GO, default=\"ALL\"",
283 |     default = "ALL"
284 |   ),
285 |   make_option(c("--species"),
286 |     type = "character", help = "Species name for creating GO database, default=\"CUSTOM\"",
287 |     default = "CUSTOM"
288 |   ),
289 |   make_option(c("--tax_id"),
290 |     type = "character", help = "Tax id for creating GO database, default=\"0000\"",
291 |     default = "0000"
292 |   ),
293 |   make_option(c("--update"), action = "store_true", help = "Update databases"),
294 |   make_option(c("--plant"), action = "store_true", help = "enrich with plant pathway only"),
295 |   make_option(c("--plant_kegg"), type = "character", help = "Pre-generated plant kegg db file")
296 | )
297 | 
298 | opt_parser <- OptionParser(option_list = opt_list, usage = "This
 script is used for running GO and KEGG")
299 | main_pipe()
300 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | 
3 | This repository contains several scripts for bioinformatics.
4 | 
5 | ## Installation
6 | 
7 | ```shell
8 | git clone https://github.com/sc-zhang/bioscripts.git
9 | cd bin
10 | chmod +x *
11 | # Optional, add the following line to your ~/.bash_profile
12 | export PATH=/path/to/bioscripts/bin:$PATH
13 | ```
14 | 
15 | ## Usage
16 | 
17 | 1. approximate_cnv.py is a script for approximating CNV (Copy Number Variation) with read depth.
18 | 
19 |     ```shell
20 |     approximate_cnv.py -bam <bam_list_file> -g <genome_size> -l <read_length> -bed <bed_file> -o <result_file> [-t <threads>]
21 | 
22 |     Usage:
23 |     -bam: a list file, each line is the full path of a bam file
24 |     -g: the size of genome, integer
25 |     -l: the length of read, integer
26 |     -bed: bed file containing 4 columns: chromosome, start position, end position, gene name, separated by tabs
27 |     -o: result file
28 |     -t: threads, integer
29 |     ```
30 | 
31 | 2. average_fpkm.py is a script for calculating the average of fpkm values.
32 | 
33 |     ```shell
34 |     # Dependencies
35 |     # Python modules: numpy
36 |     average_fpkm.py
37 |     ```
38 | 
39 | 3. blast2heatmap.py is a script for drawing a heatmap with a blast file of format 6.
40 | 
41 |     ```shell
42 |     # Dependencies
43 |     # Software: R, bedtools
44 |     # R modules: pheatmap
45 |     blast2heatmap.py <ref_fasta> <blast_file> <window_size> <out_name> <identity_threshold> <match_length_threshold>
46 |     ```
47 | 
48 | 4. calc_gap_cnt.py is a script for calculating the gap count of all sequences.
49 | 
50 |     ```shell
51 |     calc_gap_cnt.py <fasta_file>
52 |     ```
53 | 
54 | 5. calc_gene_ovlp_te.py is a script for calculating the overlap ratio of genes with TE regions.
55 | 
56 |     ```shell
57 |     calc_gene_ovlp_te.py
58 |     Usage:
59 |     ovlp_stat: the output file
60 |     ```
61 | 
62 | 6. convert_collinearity_from_MCScanX_to_Circos.py is a script for converting a collinearity file from an MCScanX result to
63 |     a link file for Circos
64 | 
65 |     ```shell
66 |     convert_collinearity_from_MCScanX_to_Circos.py
67 |     ```
68 | 
69 | 7. convert_gbff_to_fasta.py is a script for converting an NCBI GBFF file to a fasta file.
70 | 
71 |     ```shell
72 |     convert_gbff_to_fasta.py
73 |     ```
74 | 
75 | 8. convert_QTL_info.py is a script for converting contig-level QTL information to chromosome-level with an agp file.
76 | 
77 |     ```shell
78 |     convert_QTL_info.py
79 |     ```
80 | 
81 | 9. convert_simple_for_circos.py is a script for converting a JCVI simple file to a link file for circos.
82 | 
83 |     ```shell
84 |     convert_simple_for_circos.py
85 |     ```
86 | 
87 | 10. dup_dotplot.pl is a script for plotting a dotplot with monoploid and polyploid.
88 | 
89 |     ```shell
90 |     dup_dotplot.pl -g reference_genome -r ref_id -q query_id -n number_of_dup -t threads
91 |     Usage:
92 |     ref_id: reference cds and bed name, like: Sb, Sb.cds and Sb.bed must exist
93 |     query_id: query cds and bed name, like: Os
94 |     number_of_dup: number of duplications
95 |     threads: default 1
96 |     ```
97 | 
98 | 11. eval_filled_gaps.py is a script for evaluating whether gaps have been filled
99 | 
100 |     ```shell
101 |     eval_filled_gaps.py
102 |     ```
103 | 
104 | 12. extract_all_sv_from_nucmer_delta.py is a script for extracting SV from a delta file generated by nucmer.
105 | 
106 |     ```shell
107 |     extract_all_sv_from_nucmer_delta.py
108 |     ```
109 | 
110 | 13. extract_gene_from_gff.py is a script for extracting genes from a gff3 file with a gene id list and generating a bed
111 |     file (a sketch of the idea follows this entry).
112 | 
113 |     ```shell
114 |     extract_gene_from_gff.py <gene_list> <in_gff3> <out_bed>
115 |     ```
116 |
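    A minimal sketch of the idea behind this kind of extraction (the `ID=` attribute convention and the argument order are assumptions for illustration, not the script's actual code):

    ```python
    import sys

    def extract_genes(gene_list_file, gff3_file, bed_file):
        # collect the wanted gene IDs
        with open(gene_list_file) as fin:
            wanted = set(line.strip() for line in fin if line.strip())
        with open(gff3_file) as fin, open(bed_file, 'w') as fout:
            for line in fin:
                if line.startswith('#') or not line.strip():
                    continue
                cols = line.rstrip('\n').split('\t')
                if len(cols) < 9 or cols[2] != 'gene':
                    continue
                # assume attributes look like "ID=gene0001;Name=..."
                attrs = dict(kv.split('=', 1) for kv in cols[8].split(';') if '=' in kv)
                gid = attrs.get('ID', '')
                if gid in wanted:
                    # GFF3 is 1-based inclusive, BED is 0-based half-open
                    fout.write("%s\t%d\t%s\t%s\n" % (cols[0], int(cols[3]) - 1, cols[4], gid))

    if __name__ == "__main__":
        extract_genes(sys.argv[1], sys.argv[2], sys.argv[3])
    ```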
117 | 14. extract_vcf.py is a script for extracting vcf with a bed file
118 | 
119 |     ```shell
120 |     extract_vcf.py
121 |     ```
122 | 
123 | 15. filter_cds.py is a script for removing invalid CDS sequences.
124 | 
125 |     ```shell
126 |     filter_cds.py
127 |     ```
128 | 
129 | 16. find_gff_ovlp_regions.py is a script for getting overlap regions from a gff3 file.
130 | 
131 |     ```shell
132 |     find_gff_ovlp_regions.py
133 |     ```
134 | 
135 | 17. get_chr_len.py is a script for calculating the length of chromosomes in a fasta file
136 | 
137 |     ```shell
138 |     get_chr_len.py
139 |     ```
140 | 
141 | 18. get_genes_from_range.py is a script for getting genes with a bed file.
142 | 
143 |     ```shell
144 |     get_genes_from_range.py
145 |     ```
146 | 
147 | 19. get_genes_region_from_gff.py is a script for getting gene regions from a gff3 file.
148 | 
149 |     ```shell
150 |     get_genes_region_from_gff.py <gene_list> <in_gff> <out_bed>
151 |     ```
152 | 
153 | 20. get_gff_with_list.py is a script for extracting a gff3 file with gene IDs.
154 | 
155 |     ```shell
156 |     get_gff_with_list.py
157 |     ```
158 | 
159 | 21. get_seq_from_range.py is a script for extracting sequence fragments with a bed file.
160 | 
161 |     ```shell
162 |     get_seq_from_range.py
163 |     ```
164 | 
165 | 22. group_exon_and_intron.py is a script for classifying vcf positions into exon and intron.
166 | 
167 |     ```shell
168 |     group_exon_and_intron.py
169 |     ```
170 | 
171 | 23. group_SNP_exon_and_intron.py is a script for classifying SNP positions into exon and intron.
172 | 
173 |     ```shell
174 |     group_SNP_exon_and_intron.py
175 |     ```
176 | 
177 | 24. merge_bed_regions.py is a script for merging bed files based on distance
178 | 
179 |     ```shell
180 |     merge_bed_regions.py
181 |     ```
182 | 
183 | 25. modify_geno_with_snp_mummer.py is a script for modifying columns in a geno file with the snp result generated by show-snps
184 |     of MUMmer
185 | 
186 |     ```shell
187 |     modify_geno_with_snp_mummer.py
188 |     ```
189 | 
190 | 26. nucmer_extract_all_sv.py is a script for running nucmer and extracting all SV.
191 | 
192 |     ```
193 |     # Dependencies
194 |     # Software: nucmer
195 |     nucmer_extract_all_sv.py
196 |     ```
197 | 
198 | 27. nucmer_statistics.py & nucmer_statistics_all_sv.py are scripts for running nucmer and generating statistics.
199 | 
200 |     ```shell
201 |     nucmer_statistics.py
202 |     nucmer_statistics_all_sv.py
203 |     ```
204 | 
205 | 28. quick_extract_fastx.py is a script for extracting a fasta or fastq file with a list.
206 | 
207 |     ```shell
208 |     quick_extract_fastx.py
209 |     ```
210 | 
211 | 29. quick_mask_genome.py is a script for masking a genome with a bed file.
212 | 
213 |     ```shell
214 |     quick_mask_genome.py
215 |     ```
216 | 
217 | 30. remove_region_by_blast_result.py is a script for removing regions in chromosomes with blast results.
218 | 
219 |     ```shell
220 |     remove_region_by_blast_result.py
221 |     Usage:
222 |     <blast_files> is a list of blast files separated by commas
223 |     ```
224 | 
225 | 31. rename_ID.py is a script for sorting and renaming IDs with the in_gff file, and renaming IDs in fasta files.
226 | 
227 |     ```shell
228 |     rename_ID.py
229 |     ```
230 | 
231 | 32. SentieonSNP_filter.py is a script for filtering the vcf result generated by Sentieon (the decision logic is sketched after the usage below).
232 | 
233 |     ```shell
234 |     usage: SentieonSNP_filter.py [-h] -b BASE -v VALIDATION [-r REPEAT] -o OUTPUT [-m MISSING_RATE] [-d MIN_DISTANCE]
235 | 
236 |     options:
237 |       -h, --help            show this help message and exit
238 |       -b BASE, --base BASE  Input vcf file as base
239 |       -v VALIDATION, --validation VALIDATION
240 |                             Input vcf file as validation
241 |       -r REPEAT, --repeat REPEAT
242 |                             Repeat regions file, gff format
243 |       -o OUTPUT, --output OUTPUT
244 |                             Output vcf file based on base vcf file, compressed with gzip
245 |       -m MISSING_RATE, --missing_rate MISSING_RATE
246 |                             Missing rate threshold, percentage, default: 40
247 |       -d MIN_DISTANCE, --min_distance MIN_DISTANCE
248 |                             Minimum distance between two snp sites, default: 0
249 |     ```
250 |
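    In outline, a site from the base vcf is kept only if it survives every check below; a compact sketch of the decision logic (function and field names are hypothetical, thresholds follow the options above):

    ```python
    def keep_site(site, valid_alts, dp_min, dp_max, last_pos, max_missing=0.4, min_dist=0):
        """Return True if a biallelic SNP passes all of the filters."""
        if len(site['ref']) > 1 or len(site['alt']) > 1:        # indels are dropped
            return False
        alt = valid_alts.get((site['chrom'], site['pos']))
        if alt is None or alt != site['alt']:                   # must be confirmed by the validation vcf
            return False
        if site['missing_rate'] > max_missing:                  # too many ./. genotypes
            return False
        if not dp_min <= site['dp'] <= dp_max:                  # total depth outside the accepted range
            return False
        if last_pos is not None and site['pos'] - last_pos <= min_dist:
            return False                                        # too close to the preceding site
        return True
    ```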
251 | 33. SeqStat.py is a script for generating statistics with a fasta|fastq|bam file.
252 | 
253 |     ```shell
254 |     SeqStat.py <in_fasta|fastq|bam> [out_stat]
255 |     ```
256 | 
257 | 34. SimContigs.py & SimCollapse.py are scripts for simulating collapsed contigs.
258 | 
259 |     ```shell
260 |     usage: SimContigs.py [-h] [--min MIN] [--max MAX] [-n N50] -i INPUT -o OUTPUT
261 | 
262 |     options:
263 |       -h, --help            show this help message and exit
264 |       --min MIN             minimum length of contig, default: 15k, you can use a number or a string ending with k,m
265 |       --max MAX             maximum length of contig, default: 5m, you can use a number or a string ending with k,m
266 |       -n N50, --n50 N50     size of N50, default: 500k, you can use a number or a string ending with k,m
267 |       -i INPUT, --input INPUT
268 |                             origin fasta file of genome
269 |       -o OUTPUT, --output OUTPUT
270 |                             filename of simulated data
271 | 
272 |     ```
273 | 
274 | 35. SimCollapse.py
275 | 
276 |     ```shell
277 |     usage: SimCollapse.py [-h] -a A_CONTIGS -b B_CONTIGS -p PREFIX -o OUTPUT -s BLAST [-c COLLAPSE]
278 | 
279 |     options:
280 |       -h, --help            show this help message and exit
281 |       -a A_CONTIGS, --a_contigs A_CONTIGS
282 |                             first fasta file containing contigs generated by SimContigs.py
283 |       -b B_CONTIGS, --b_contigs B_CONTIGS
284 |                             second fasta file containing contigs generated by SimContigs.py
285 |       -p PREFIX, --prefix PREFIX
286 |                             prefix of contig file a and contig file b, divided by comma, like: HA,HB
287 |       -o OUTPUT, --output OUTPUT
288 |                             filename of simulated data
289 |       -s BLAST, --blast BLAST
290 |                             blast file with format 6, must use first file of input as query and second file as database
291 |       -c COLLAPSE, --collapse COLLAPSE
292 |                             percentage of collapse region size, like 5 means 5%, default: 10
293 |     ```
294 | 
295 | 36. simple_ANGSD.py & simple_ANGSD_without_errorCorrect.py are scripts for running ANGSD.
296 | 
297 |     ```shell
298 |     simple_ANGSD.py -l <bam_list> -anc <ancestral_fasta> -r <region> [-out <outgroup_name> -p <bam_path> -ref <ref_fasta>]
299 |     simple_ANGSD_without_errorCorrect.py -l <bam_list> -r <region> [-out <outgroup_name> -p <bam_path>]
300 |     Notice:
301 |     -p: path of bam files, default is current path
302 |     -out: name of outgroup, default is "Outgroup"
303 |     ```
304 | 
305 | 37. simple_JBrowser.py is a script for generating files for JBrowser (the track format is sketched after this entry)
306 | 
307 |     ```shell
308 |     # etc/SimpleJBrowser.conf is a template config file for simple_JBrowser.py
309 |     simple_JBrowser.py -f <fasta_file> [--gff <gff_file> --bed <bed_file> --bam <bam_file> --bw <bw_file> --conf <conf_file>]
310 |     ```
311 |
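    Each track simple_JBrowser.py registers ends up as an INI-style stanza appended to `data/tracks.conf`; a sketch of the BigWig case, mirroring the stanza the script itself writes (the file name is only an example):

    ```python
    def add_bigwig_track(bw, conf_path="data/tracks.conf"):
        # one stanza per track, keyed by the file name with dots replaced
        key = bw.replace('.', '_')
        stanza = ("[tracks." + key + "]\n"
                  "storeClass = JBrowse/Store/SeqFeature/BigWig\n"
                  "urlTemplate = ../" + bw + "\n"
                  "category = Quantitative\n"
                  "type = JBrowse/View/Track/Wiggle/XYPlot\n"
                  "key = " + key + "\n")
        with open(conf_path, "a") as f_track:
            f_track.write(stanza)

    add_bigwig_track("sample1.bam.bw")  # example file name
    ```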
312 | 38. SimSID.py is a script for simulating SNPs, insertions and deletions.
313 | 
314 |     ```shell
315 |     usage: SimSID.py [-h] [-s SNP] [-i INSERTION] [--insert_length INSERT_LENGTH] [-d DELETION] [--delete_length DELETE_LENGTH] [--random_length] [-v] -r REF -o OUT
316 | 
317 |     options:
318 |       -h, --help            show this help message and exit
319 |       -s SNP, --snp SNP     snp ratio of whole genome, percentage, default: 0.01
320 |       -i INSERTION, --insertion INSERTION
321 |                             insertion ratio of whole genome, percentage, default: 0.01
322 |       --insert_length INSERT_LENGTH
323 |                             max length of insertion, default: 10
324 |       -d DELETION, --deletion DELETION
325 |                             deletion ratio of whole genome, percentage, default: 0.01
326 |       --delete_length DELETE_LENGTH
327 |                             max length of deletion, default: 10
328 |       --random_length       use this argument to generate random lengths for indels
329 |       -v, --verbose         print detailed information
330 |       -r REF, --ref REF     origin fasta file of genome
331 |       -o OUT, --out OUT     prefix of simulated data
332 |     ```
333 | 
334 | 39. split_cmd_with_parts.py is a script for splitting a cmd file.
335 | 
336 |     ```shell
337 |     split_cmd_with_parts.py
338 |     ```
339 | 
340 | 40. split_ctg_with_agp.py is a script for splitting a contig fasta file into chromosome groups with an agp file.
341 | 
342 |     ```shell
343 |     split_ctg_with_agp.py
344 |     ```
345 | 
346 | 41. split_fasta_by_chr.py is a script for splitting a fasta into several files, each containing a single chromosome.
347 | 
348 |     ```shell
349 |     split_fasta_by_chr.py
350 |     ```
351 | 
352 | 42. split_fasta_by_count.py is a script for splitting a fasta into several files by file size or sequence count.
353 | 
354 |     ```shell
355 |     split_fasta_by_count.py
356 |     ```
357 | 
358 | 43. split_fasta_by_id.py is a script for splitting a fasta by id.
359 | 
360 |     ```shell
361 |     split_fasta_by_id.py
362 |     ```
363 | 
364 | 44. StatAgp.py & StatAgpDetail.py are scripts for generating statistics with an agp file.
365 | 
366 |     ```shell
367 |     StatAgp.py
368 |     StatAgpDetail.py
369 |     ```
370 | 
371 | 45. subVCF.py is a script for extracting a vcf file with a list file, default missing rate 0.4.
372 | 
373 |     ```shell
374 |     subVCF.py []
375 |     ```
376 | 
377 | 46. transfer_gff3_with_agp.py is a script for transferring positions with old agp and new agp files.
378 | 
379 |     ```shell
380 |     transfer_gff3_with_agp.py
381 |     ```
382 | 
383 | 47. eval_synteny.py is a script for evaluating the assembly consistency between a query genome and a reference genome by
384 |     mapping the cds of the reference genome to both genomes with gmap, then extracting bed files with jcvi; be
385 |     sure that the query bed file only contains the chromosomes and/or contigs you want to evaluate.
386 | 
387 |     ```shell
388 |     usage: eval_synteny.py [-h] -r REF -q QRY -p PAIR
389 | 
390 |     options:
391 |       -h, --help         show this help message and exit
392 |       -r REF, --ref REF  ref.bed
393 |       -q QRY, --qry QRY  qry.bed
394 |     ```
395 | 
396 | 48. get_seq_with_bed.py is a script for extracting sequences from a fasta file with a bed file; the bed file can contain
397 |     4 or 5 fields: [seq_id, start_pos, end_pos, out_id] or [seq_id, start_pos, end_pos, direct, out_id]
398 |     (see the sketch after this entry).
399 | 
400 |     ```shell
401 |     Usage: python get_seq_with_bed.py <in_fasta> <in_bed> <out_fasta>
402 |     ```
403 |
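    Reading that 4- or 5-field layout boils down to treating the direction column as optional; a small sketch (names are illustrative, not the script's actual code):

    ```python
    def parse_bed_line(line):
        """Split one bed line into (seq_id, start, end, direct, out_id)."""
        fields = line.rstrip('\n').split('\t')
        if len(fields) == 4:                    # seq_id, start, end, out_id
            seq_id, start, end, out_id = fields
            direct = '+'
        elif len(fields) == 5:                  # seq_id, start, end, direct, out_id
            seq_id, start, end, direct, out_id = fields
        else:
            raise ValueError("expected 4 or 5 tab-separated fields, got %d" % len(fields))
        return seq_id, int(start), int(end), direct, out_id

    print(parse_bed_line("Chr01\t100\t200\tgeneA"))
    print(parse_bed_line("Chr01\t300\t400\t-\tgeneB"))
    ```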
404 | 49. convert_anchorwave.py is a script for converting an anchorwave maf file to a table file, which contains 7 columns:
405 |     "Ref id, start position, end position, query id, start position, end position,
406 |     variant type"
407 | 
408 |     ```shell
409 |     usage: convert_anchorwave.py [-h] -i INPUT -o OUTPUT
410 | 
411 |     options:
412 |       -h, --help            show this help message and exit
413 |       -i INPUT, --input INPUT
414 |                             Input maf file
415 |       -o OUTPUT, --output OUTPUT
416 |                             Output file
417 |     ```
418 | 
419 | 50. extract_fasta_with_bed.py is a script for extracting sequences with a bed file containing 5 columns:
420 |     "ID, start, end, direction, id",
421 |     positions should be 1-based
422 | 
423 |     ```shell
424 |     Usage: python bin/extract_fasta_with_bed.py <in_fasta> <in_bed> <out_fasta>
425 |     Notice: bed should be 5 columns: "ID, start, end, direction, id", positions should be 1-based
426 |     ```
427 | 
428 | 51. convert_chr_to_ctg_with_agp.py is a script for converting chromosomes to contigs with an AGP file.
429 | 
430 |     ```shell
431 |     Usage: python ./bin/convert_chr_to_ctg_with_agp.py
432 |     ```
433 | 
434 | 52. bam_cov.py is a script for calculating the genome coverage ratio from a bam file.
435 | 
436 |     ```shell
437 |     usage: bam_cov.py [-h] -b BAM -o OUTPUT [-t THREADS]
438 | 
439 |     options:
440 |       -h, --help            show this help message and exit
441 |       -b BAM, --bam BAM     Input bam file, must be indexed
442 |       -o OUTPUT, --output OUTPUT
443 |                             Output statistic
444 |       -t THREADS, --threads THREADS
445 |                             Threads, default=10
446 |     ```
447 | 
448 | 53. sort_gff3.py is a script for sorting genes by chromosomes and positions, and generating new IDs.
449 | 
450 |     ```shell
451 |     Usage: python ./bin/sort_gff3.py <prefix> <in_gff3> <out_gff3>
452 |     Notice: sort and rename id with in_gff by coordinate, the chromosome ID should be like: Chr01 for mono assembly, Chr01A for phased assembly.
453 |     Example: python ./bin/sort_gff3.py CB5 in.gff3 out.gff3
454 |     ```
455 | 
456 | 54. easyGoKegg.R is a simple script for running GO and KEGG with custom emapper annotation
457 | 
458 |     > ### Dependencies
459 |     >**Software**
460 |     > - R
461 |     >
462 |     >**R modules**
463 |     > - optparse
464 |     > - KEGGREST
465 |     > - clusterProfiler
466 |     > - dplyr
467 |     > - stringr
468 |     > - AnnotationForge
469 |     > - jsonlite
470 |     > - purrr
471 |     > - RCurl
472 |     > - ggplot2
473 |     >
474 |     >**Install R packages**
475 |     > - Install with R
476 |     > ```bash
477 |     > install.packages("BiocManager")
478 |     > BiocManager::install(c("optparse","dplyr","stringr", "jsonlite","purrr","ggplot2", "RCurl", "KEGGREST", "clusterProfiler", "AnnotationForge"))
479 |     > ```
480 |     > - Install with conda/mamba
481 |     > ```bash
482 |     > conda create -n GoKegg -c conda-forge -c bioconda bioconductor-annotationforge bioconductor-clusterprofiler bioconductor-keggrest r-dplyr r-ggplot2 r-jsonlite r-optparse r-purrr r-rcurl r-stringr
483 |     > # or
484 |     > mamba create -n GoKegg -c conda-forge -c bioconda bioconductor-annotationforge bioconductor-clusterprofiler bioconductor-keggrest r-dplyr r-ggplot2 r-jsonlite r-optparse r-purrr r-rcurl r-stringr
485 |     > ```
486 |     >### Data preparation
487 |     >- Prepare eggnog result
488 |     > Drop lines starting with "##", remove the "#" at the beginning of the "#query ..." line, and make sure the first line of
489 |     the annotation file looks like the one below (a Python sketch of this clean-up follows this section):
490 |     >```text
491 |     >query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs
492 |     >```
493 |     >
494 |     >- Prepare target gene list
495 |     > A text file with one gene ID per line; the IDs must match those in the annotation file.
496 |     >
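    >A minimal sketch of that clean-up step in Python (file names are examples):
    >```python
    ># strip eggnog-mapper comment lines and the leading '#' of the header row
    >with open("emapper.annotations") as fin, open("emapper.annotations.tsv", "w") as fout:
    >    for line in fin:
    >        if line.startswith("##"):       # drop comment lines
    >            continue
    >        if line.startswith("#query"):   # keep the header, minus the leading '#'
    >            line = line[1:]
    >        fout.write(line)
    >```
    >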
497 |     >### Usage
498 |     >- Run a command like the one below
499 |     >```bash
500 |     >Rscript easyGoKegg.R -i gene.txt -a emapper.annotations -d database
501 |     >```
502 |     >
503 |     >- Detail of parameters
504 |     >```bash
505 |     >Usage: This script is used for running GO and KEGG
506 |     >Options:
507 |     >        -i INPUT, --input=INPUT
508 |     >                Input gene list file
509 |     >        -a ANNO, --anno=ANNO
510 |     >                Functional annotation file
511 |     >        -d DB, --db=DB
512 |     >                Database path
513 |     >        --kegg_json=KEGG_JSON
514 |     >                Pre-downloaded kegg json file
515 |     >        --genus=GENUS
516 |     >                Genus name for creating GO database, default="Custom genus"
517 |     >        --pvalue=PVALUE
518 |     >                P value cutoff for GO and KEGG, default=0.05
519 |     >        --qvalue=QVALUE
520 |     >                Q value cutoff for GO and KEGG, default=0.05
521 |     >        --padjust=PADJUST
522 |     >                P adjust method for GO and KEGG, default="BH"
523 |     >        --ontology=ONTOLOGY
524 |     >                Ontology for GO, default="ALL"
525 |     >        --species=SPECIES
526 |     >                Species name for creating GO database, default="CUSTOM"
527 |     >        --tax_id=TAX_ID
528 |     >                Tax id for creating GO database, default="0000"
529 |     >        --update
530 |     >                Update databases
531 |     >        --plant
532 |     >                enrich with plant pathway only
533 |     >        --plant_kegg=PLANT_KEGG
534 |     >                Pre-generated plant kegg db file
535 |     >        -h, --help
536 |     >                Show this help message and exit
537 |     >```
538 |     >> **Notice** there should be no space in species
539 |     >
540 |     >## Result
541 |     >1. text file of GO and KEGG results
542 |     >2. bubble plot of GO and KEGG results
543 |     >3. bar plot of GO and KEGG results
544 | 
545 | 55. check_cds.py is a script for checking whether CDS sequences are valid.
546 | 547 | ```shell 548 | usage: check_cds.py [-h] -i INPUT [--detail] [-o OUTPUT] 549 | 550 | options: 551 | -h, --help show this help message and exit 552 | -i INPUT, --input INPUT 553 | Input CDS file 554 | --detail If set, output detail information 555 | -o OUTPUT, --output OUTPUT 556 | Output summary file, if not set, output to stdout 557 | ``` 558 | --------------------------------------------------------------------------------