├── .gitignore ├── etc └── SimpleJBrowser.conf ├── bin ├── calc_gap_cnt.py ├── get_genes_region_from_gff.py ├── split_fasta_by_id.py ├── extract_gene_from_gff.py ├── get_seq_from_range.py ├── split_fasta_by_chr.py ├── average_fpkm.py ├── filter_cds.py ├── convert_collinearity_from_MCScanX_to_Circos.py ├── get_gff_with_list.py ├── convert_gbff_to_fasta.py ├── split_cmd_with_parts.py ├── get_chr_len.py ├── quick_extract_fastx.py ├── split_fasta_by_count.py ├── find_gff_ovlp_regions.py ├── modify_geno_with_snp_mummer.py ├── StatAgp.py ├── split_ctg_with_agp.py ├── convert_simple_for_circos.py ├── subVCF.py ├── extract_vcf.py ├── bam_cov.py ├── extract_fasta_with_bed.py ├── convert_QTL_info.py ├── convert_chr_to_ctg_with_agp.py ├── StatAgpDetail.py ├── extract_all_sv_from_nucmer_delta.py ├── dup_dotplot.pl ├── merge_bed_regions.py ├── get_seq_with_bed.py ├── get_genes_from_range.py ├── eval_filled_gaps.py ├── nucmer_extract_all_sv.py ├── calc_gene_ovlp_te.py ├── group_SNP_exon_and_intron.py ├── group_exon_and_intron.py ├── nucmer_statistics.py ├── eval_synteny.py ├── rename_ID.py ├── simple_ANGSD_without_errorCorrect.py ├── check_cds.py ├── nucmer_statistics_all_sv.py ├── transfer_gff3_with_agp.py ├── sort_gff3.py ├── quick_mask_genome.py ├── SeqStat.py ├── approximate_cnv.py ├── simple_ANGSD.py ├── remove_region_by_blast_result.py ├── convert_anchorwave.py ├── blast2heatmap.py ├── SimContigs.py ├── SimCollapse.py ├── simple_JBrowser.py ├── SentieonSNP_filter.py ├── SimSID.py └── easyGoKegg.R ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode 3 | .idea 4 | *.pyc 5 | __pycache__ -------------------------------------------------------------------------------- /etc/SimpleJBrowser.conf: -------------------------------------------------------------------------------- 1 | [path] 2 | samtools= 3 | bam2wig= 4 | wig2bw= 5 | JBrowser= 6 | -------------------------------------------------------------------------------- /bin/calc_gap_cnt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def calc_gap_cnt(in_fa): 6 | total_cnt = 0 7 | with open(in_fa, 'r') as fin: 8 | id = '' 9 | gap_cnt = 0 10 | for line in fin: 11 | if line[0] == '>': 12 | if id != '': 13 | print("%s\t%d"%(id, gap_cnt)) 14 | total_cnt += gap_cnt 15 | id = line.strip()[1:] 16 | gap_cnt = 0 17 | last_base ='' 18 | else: 19 | for i in range(len(line.strip())): 20 | if line[i].lower() == 'n' and last_base != 'n': 21 | gap_cnt += 1 22 | last_base = line[i].lower() 23 | print("%s\t%d"%(id, gap_cnt)) 24 | total_cnt += gap_cnt 25 | print("Total\t%d"%total_cnt) 26 | 27 | 28 | if __name__ == "__main__": 29 | if len(sys.argv) < 2: 30 | print("Usage: python "+sys.argv[0]+" ") 31 | else: 32 | in_fa = sys.argv[1] 33 | calc_gap_cnt(in_fa) 34 | -------------------------------------------------------------------------------- /bin/get_genes_region_from_gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_genes_region_from_gff(gene_list, in_gff, out_bed): 6 | genes = [] 7 | with open(gene_list, 'r') as f_in: 8 | for line in f_in: 9 | genes.append(line.strip()) 10 | 11 | with open(in_gff, 'r') as f_in: 12 | with open(out_bed, 'w') as f_out: 13 | for line in f_in: 14 | if line[0] == '#' or line.strip() == '': 15 | continue 16 | data = line.strip().split() 17 | if data[2] != 'gene': 18 
| continue 19 | id = data[8].split(';')[1].split('=')[1] 20 | if id in genes: 21 | f_out.write(data[0]+'\t'+data[3]+'\t'+data[4]+'\t'+id+'\n') 22 | 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) < 4: 26 | print("Usage: python "+sys.argv[0]+" <gene_list> <in_gff> <out_bed>") 27 | else: 28 | proc, gene_list, in_gff, out_bed = sys.argv 29 | get_genes_region_from_gff(gene_list, in_gff, out_bed) 30 | 31 | -------------------------------------------------------------------------------- /bin/split_fasta_by_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def split_fasta_by_id(fasta_file, out_folder): 6 | if os.path.exists(out_folder) == False: 7 | os.mkdir(out_folder) 8 | seq_db = {} 9 | with open(fasta_file, 'r') as f_fasta: 10 | seq = '' 11 | seq_id = '' 12 | for line in f_fasta: 13 | if line[0] == ">": 14 | if seq != '': 15 | seq_db[seq_id] = seq 16 | seq_id = line.strip().split()[0] 17 | seq = '' 18 | else: 19 | seq += line.strip() 20 | seq_db[seq_id] = seq 21 | 22 | for seq_id in seq_db: 23 | f_out = open(out_folder+"/"+seq_id[1:]+".fasta", 'w') 24 | f_out.write(seq_id+"\n"+seq_db[seq_id]+"\n") 25 | f_out.close() 26 | 27 | 28 | if __name__ == "__main__": 29 | if len(sys.argv) < 3: 30 | print("Notice: script for splitting fasta into several files, each containing a single sequence") 31 | print("Usage: python " + sys.argv[0] + " <in_fasta> <out_dir>") 32 | else: 33 | in_fasta = sys.argv[1] 34 | out_dir = sys.argv[2] 35 | split_fasta_by_id(in_fasta, out_dir) 36 | -------------------------------------------------------------------------------- /bin/extract_gene_from_gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def extract_gene_from_gff(in_list, in_gff, out_bed): 6 | id_list = [] 7 | with open(in_list, 'r') as f_in: 8 | for line in f_in: 9 | id_list.append(line.strip()) 10 | 11 | with open(in_gff, 'r') as f_in: 12 | with open(out_bed, 'w') as f_out: 13 | for line in f_in: 14 | if line[0] == '#' or line.strip() == '': 15 | continue 16 | data = line.strip().split() 17 | if data[2] != 'gene': 18 | continue 19 | id = data[8].split(';')[0].split("=")[1] 20 | if id in id_list: 21 | f_out.write(data[0]+'\t'+data[3]+'\t'+data[4]+'\t'+id+'\n') 22 | 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) < 4: 26 | print("Notice: this script is used to extract regions of genes listed in the list file from the gff file") 27 | print("Usage: python "+sys.argv[0]+" <in_list> <in_gff> <out_bed>") 28 | else: 29 | in_list = sys.argv[1] 30 | in_gff = sys.argv[2] 31 | out_bed = sys.argv[3] 32 | extract_gene_from_gff(in_list, in_gff, out_bed) 33 | 34 | -------------------------------------------------------------------------------- /bin/get_seq_from_range.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_seq(in_fasta, in_bed, out_fasta): 6 | seq_db = {} 7 | with open(in_fasta, 'r') as f_in: 8 | id = '' 9 | seq = '' 10 | for line in f_in: 11 | if line[0] == '>': 12 | if seq != '': 13 | seq_db[id] = seq 14 | id = line.strip()[1:] 15 | seq = '' 16 | else: 17 | seq += line.strip() 18 | seq_db[id] = seq 19 | 20 | with open(in_bed, 'r') as f_in: 21 | with open(out_fasta, 'w') as f_out: 22 | for line in f_in: 23 | data = line.strip().split() 24 | chrn = data[0] 25 | s = int(data[1]) 26 | e = int(data[2]) 27 | f_out.write(">"+chrn+"["+str(s)+":"+str(e)+"]\n"+seq_db[chrn][s:e+1]+"\n") 28 | 29 | 30 | if __name__ == "__main__": 31 
| if len(sys.argv) < 4: 32 | print("Notice: extract sequences with bed file") 33 | print("Usage: python "+sys.argv[0]+" <in_fasta> <in_bed> <out_fasta>") 34 | else: 35 | in_fasta = sys.argv[1] 36 | in_bed = sys.argv[2] 37 | out_fasta = sys.argv[3] 38 | get_seq(in_fasta, in_bed, out_fasta) 39 | 40 | -------------------------------------------------------------------------------- /bin/split_fasta_by_chr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def split_fasta_by_chr(fasta_file, out_folder): 6 | if os.path.exists(out_folder) == False: 7 | os.mkdir(out_folder) 8 | seq_db = {} 9 | with open(fasta_file, 'r') as f_fasta: 10 | seq = '' 11 | seq_id = '' 12 | for line in f_fasta: 13 | if line[0] == ">": 14 | if seq != '': 15 | seq_db[seq_id] = seq 16 | seq_id = line.strip() 17 | seq = '' 18 | else: 19 | seq += line.strip() 20 | seq_db[seq_id] = seq 21 | 22 | for seq_id in seq_db: 23 | if seq_id[:4].lower() != '>chr': 24 | continue 25 | f_out = open(out_folder+"/"+seq_id[1:]+".fasta", 'w') 26 | f_out.write(seq_id+"\n"+seq_db[seq_id]+"\n") 27 | f_out.close() 28 | 29 | 30 | if __name__ == "__main__": 31 | if len(sys.argv) < 3: 32 | print("Notice: script for splitting fasta into several files, each containing a single chromosome") 33 | print("Usage: python " + sys.argv[0] + " <in_fasta> <out_dir>") 34 | else: 35 | in_fasta = sys.argv[1] 36 | out_dir = sys.argv[2] 37 | split_fasta_by_chr(in_fasta, out_dir) 38 | -------------------------------------------------------------------------------- /bin/average_fpkm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import numpy as np 4 | 5 | 6 | def average_fpkm(in_fpkm, out_avg): 7 | group_list = [] 8 | last_smp = '' 9 | with open(in_fpkm, 'r') as fin: 10 | with open(out_avg, 'w') as fout: 11 | for line in fin: 12 | data = line.strip().split() 13 | if data[0] == 'gene_id': 14 | fout.write("gene_id") 15 | for i in range(1, len(data)): 16 | smp = data[i][:-1] 17 | if smp != last_smp: 18 | last_smp = smp 19 | group_list.append([]) 20 | fout.write("\t%s"%smp) 21 | group_list[-1].append(i) 22 | fout.write("\n") 23 | else: 24 | fout.write("%s"%data[0]) 25 | for idxs in group_list: 26 | vals = [] 27 | for idx in idxs: 28 | vals.append(float(data[idx])) 29 | fout.write("\t%.2f"%np.average(vals)) 30 | fout.write("\n") 31 | 32 | 33 | if __name__ == "__main__": 34 | if len(sys.argv) < 3: 35 | print("Usage: python %s <in_fpkm> <out_avg>"%sys.argv[0]) 36 | else: 37 | in_fpkm, out_avg = sys.argv[1:] 38 | average_fpkm(in_fpkm, out_avg) 39 | -------------------------------------------------------------------------------- /bin/filter_cds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def filter_cds(in_cds, out_cds): 6 | print("Loading cds") 7 | cds_db = {} 8 | with open(in_cds, 'r') as fin: 9 | for line in fin: 10 | if line[0] == '>': 11 | id = line.strip().split()[0][1:] 12 | cds_db[id] = [] 13 | else: 14 | cds_db[id].append(line.strip().upper()) 15 | 16 | for id in cds_db: 17 | cds_db[id] = ''.join(cds_db[id]) 18 | print("Filtering cds") 19 | start_codon = set(["ATG"]) 20 | stop_codon = set(["TAG", "TAA", "TGA"]) 21 | 22 | with open(out_cds, 'w') as fout: 23 | for id in sorted(cds_db): 24 | cds_len = len(cds_db[id]) 25 | cds_start = cds_db[id][:3] 26 | cds_stop = cds_db[id][-3:] 27 | if (cds_len%3 != 0) or (cds_start not in start_codon) or (cds_stop not in stop_codon): 28 | 
fout.write(">%s\n%s\n"%(id, cds_db[id])) 29 | 30 | print("Finished") 31 | 32 | 33 | if __name__ == "__main__": 34 | if len(sys.argv) < 3: 35 | print("Usage: python %s "%sys.argv[0]) 36 | else: 37 | in_cds, out_cds = sys.argv[1:] 38 | filter_cds(in_cds, out_cds) 39 | 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Shengcheng Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /bin/convert_collinearity_from_MCScanX_to_Circos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_col(in_col, in_gff, out_txt): 6 | id_db = {} 7 | with open(in_gff, 'r') as f_gff: 8 | for line in f_gff: 9 | data = line.strip().split() 10 | id_db[data[1]] = data[0][:2]+"Chr"+data[0][2:]+"\t"+data[2]+"\t"+data[3]+"\n" 11 | i = 0 12 | with open(in_col, 'r') as f_col: 13 | with open(out_txt, 'w') as f_out: 14 | for line in f_col: 15 | if line[0] == "#": 16 | continue 17 | data = line.strip().split('\t') 18 | id_1 = data[1] 19 | id_2 = data[2] 20 | if id_1 not in id_db or id_2 not in id_db: 21 | continue 22 | f_out.write("link"+str(i)+"\t"+id_db[id_1]) 23 | f_out.write("link"+str(i)+"\t"+id_db[id_2]) 24 | i += 1 25 | 26 | 27 | if __name__ == "__main__": 28 | if len(sys.argv) < 4: 29 | print("Notice: script for converting collinearity file from MCScanX result to link file for Circos") 30 | print("Usage: python " + sys.argv[0] + " ") 31 | else: 32 | in_col = sys.argv[1] 33 | in_gff = sys.argv[2] 34 | out_txt = sys.argv[3] 35 | get_col(in_col, in_gff, out_txt) 36 | 37 | -------------------------------------------------------------------------------- /bin/get_gff_with_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | 5 | 6 | def get_gff_with_list(in_gff, in_list, out_gff): 7 | print("Loading list") 8 | use_id = {} 9 | with open(in_list, 'r') as fin: 10 | for line in fin: 11 | use_id[line.strip()] = 1 12 | 13 | print("Filter gff3") 14 | with open(in_gff, 'r') as fin: 15 | with open(out_gff, 'w') as fout: 16 | fout.write("#gff-version 3\n") 17 | is_write = False 18 | for line in fin: 19 | if line.strip() == '' or line[0] == '#': 20 | continue 21 
| data = line.strip().split() 22 | if data[2] == 'gene': 23 | if "Name" in data[8]: 24 | regexp = r'Name=(.*)' 25 | else: 26 | regexp = r'ID=(.*)' 27 | id = re.findall(regexp, data[8])[0].split(';')[0] 28 | if id in use_id: 29 | is_write = True 30 | else: 31 | is_write = False 32 | if is_write: 33 | fout.write(line) 34 | 35 | print("Finished") 36 | 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) < 4: 40 | print("Usage: python %s <in_gff> <in_list> <out_gff>"%sys.argv[0]) 41 | else: 42 | in_gff, in_list, out_gff = sys.argv[1:] 43 | get_gff_with_list(in_gff, in_list, out_gff) 44 | -------------------------------------------------------------------------------- /bin/convert_gbff_to_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def convert_gbff_to_fasta(in_gbff, out_fa): 6 | print("Converting") 7 | with open(in_gbff, 'r') as fin: 8 | with open(out_fa, 'w') as fout: 9 | cnt = 0 10 | err_cnt = 0 11 | for line in fin: 12 | data = line.strip().split() 13 | if len(data) == 0: 14 | continue 15 | if data[0] == 'LOCUS': 16 | cnt += 1 17 | gn = data[1] 18 | gn_len = int(data[2]) 19 | seq_len = 0 20 | fout.write(">%s\n"%gn) 21 | is_write = False 22 | elif data[0] == 'ORIGIN': 23 | is_write = True 24 | elif data[0] == '//': 25 | if gn_len != seq_len: 26 | err_cnt += 1 27 | print("\tERROR: %s Declared length: %sbp, current length: %dbp"%(gn, gn_len, seq_len)) 28 | is_write = False 29 | else: 30 | if is_write: 31 | seq = ''.join(data[1:]) 32 | seq_len += len(seq) 33 | fout.write(seq+'\n') 34 | print("Total converted %d, error count %d"%(cnt, err_cnt)) 35 | print("Finished") 36 | 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) < 3: 40 | print("Usage: python %s <in_gbff> <out_fasta>"%sys.argv[0]) 41 | else: 42 | in_gbff, out_fa = sys.argv[1:] 43 | convert_gbff_to_fasta(in_gbff, out_fa) 44 | -------------------------------------------------------------------------------- /bin/split_cmd_with_parts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import multiprocessing 4 | 5 | 6 | def write_cmd(fn, cmd_list): 7 | print("\tWriting %s"%fn) 8 | with open(fn, 'w') as fout: 9 | fout.write("".join(cmd_list)) 10 | 11 | 12 | def split_cmd(in_cmd, np, out_str, ts): 13 | print("Loading cmds") 14 | cmd_list = [] 15 | with open(in_cmd, 'r') as fin: 16 | for line in fin: 17 | cmd_list.append(line) 18 | 19 | print("Splitting commands") 20 | pool = multiprocessing.Pool(processes=ts) 21 | cmd_per_file = int(round(len(cmd_list)/np, 0)) 22 | for i in range(0, np): 23 | fn = out_str%(i+1) 24 | if i < np-1: 25 | pool.apply_async(write_cmd, (fn, cmd_list[i*cmd_per_file: (i+1)*cmd_per_file],)) 26 | else: 27 | pool.apply_async(write_cmd, (fn, cmd_list[i*cmd_per_file:],)) 28 | pool.close() 29 | pool.join() 30 | print("Finished") 31 | 32 | 33 | if __name__ == "__main__": 34 | if len(sys.argv) < 5: 35 | print("Usage: python "+sys.argv[0]+" <in_cmd> <parts> <out_str> <threads>") 36 | print("\t<out_str> is a string containing %d as file index, like run_%d.sh") 37 | else: 38 | in_cmd, np, out_str, ts = sys.argv[1:] 39 | split_cmd(in_cmd, int(np), out_str, int(ts)) 40 | 41 | -------------------------------------------------------------------------------- /bin/get_chr_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | 5 | 6 | def get_chr_len(in_fasta, out_file, is_chr_only): 7 | if is_chr_only != "T" and is_chr_only != "F": 8 | print("Error: is_chr_only must be T or F") 9 | exit(0) 10 | dict_len = {} 
11 | if in_fasta[-3:].lower() == '.gz': 12 | f_in = gzip.open(in_fasta, 'rt') 13 | else: 14 | f_in = open(in_fasta, 'r') 15 | chrn = '' 16 | seq = '' 17 | for line in f_in: 18 | if line[0] == '>': 19 | if seq != '': 20 | dict_len[chrn] = len(seq) 21 | chrn = line.strip().split()[0][1:] 22 | seq = '' 23 | else: 24 | seq += line.strip() 25 | dict_len[chrn] = len(seq) 26 | f_in.close() 27 | 28 | with open(out_file, 'w') as f_out: 29 | chr_list = sorted(dict_len.keys()) 30 | for chrn in chr_list: 31 | if is_chr_only == "T" and chrn[:3].lower() != 'chr': 32 | continue 33 | else: 34 | f_out.write(chrn+"\t"+str(dict_len[chrn])+"\n") 35 | 36 | 37 | if __name__ == "__main__": 38 | if len(sys.argv) < 4: 39 | print("Notice: script for calculating length of chromosomes in fasta file") 40 | print("Usage: python "+sys.argv[0]+" <in_fasta> <out_file> <is_chr_only(T/F)>") 41 | else: 42 | f_fasta = sys.argv[1] 43 | f_out = sys.argv[2] 44 | is_chr_only = sys.argv[3] 45 | get_chr_len(f_fasta, f_out, is_chr_only) 46 | -------------------------------------------------------------------------------- /bin/quick_extract_fastx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | import time 5 | 6 | 7 | def quick_extract_reads(in_fx, in_li, out_fx): 8 | print("\033[32m%s\033[0m Starting"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())))) 9 | read_db = {} 10 | with open(in_li, 'r') as fin: 11 | for line in fin: 12 | data = line.strip().split() 13 | read_db[data[0]] = '' 14 | 15 | fn = in_fx.split('.') 16 | if fn[-1].lower() == 'gz': 17 | fin = gzip.open(in_fx, 'rt') 18 | else: 19 | fin = open(in_fx, 'r') 20 | 21 | fn = out_fx.split('.') 22 | if fn[-1].lower() == 'gz': 23 | fout = gzip.open(out_fx, 'wt') 24 | else: 25 | fout = open(out_fx, 'w') 26 | 27 | is_write = False 28 | cnt = 0 29 | for line in fin: 30 | if cnt%2==0 and (line[0] == '>' or line[0] == '@'): 31 | id = line.strip().split()[0][1:] 32 | if id in read_db: 33 | is_write = True 34 | fout.write(line) 35 | else: 36 | is_write = False 37 | else: 38 | if is_write: 39 | fout.write(line) 40 | cnt += 1 41 | fin.close() 42 | fout.close() 43 | print("\033[32m%s\033[0m Finished"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())))) 44 | 45 | 46 | if __name__ == "__main__": 47 | if len(sys.argv) < 4: 48 | print("Usage: python "+sys.argv[0]+" <in_fastx> <in_list> <out_fastx>") 49 | else: 50 | in_fx, in_li, out_fx = sys.argv[1:] 51 | quick_extract_reads(in_fx, in_li, out_fx) 52 | 53 | -------------------------------------------------------------------------------- /bin/split_fasta_by_count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def split_fasta_by_count(in_fa, is_seq, cnt, out_dir): 6 | if not os.path.isdir(out_dir): 7 | os.mkdir(out_dir) 8 | 9 | seq_db = {} 10 | id = '' 11 | seq = '' 12 | with open(in_fa, 'r') as f_in: 13 | for line in f_in: 14 | if line[0] == '>': 15 | if seq != '': 16 | seq_db[id] = seq 17 | id = line.strip()[1:] 18 | seq = '' 19 | else: 20 | seq += line 21 | seq_db[id] = seq 22 | 23 | total_seq_cnt = len(seq_db) 24 | tmp_cnt = int(round(total_seq_cnt*1.0/cnt+0.5)) 25 | if is_seq: 26 | file_cnt = tmp_cnt 27 | seq_cnt = cnt 28 | else: 29 | file_cnt = cnt 30 | seq_cnt = tmp_cnt 31 | fn = os.path.basename(in_fa).replace('.fasta', '').replace('.fa', '') 32 | id_list = list(seq_db.keys()) 33 | for i in range(0, file_cnt): 34 | with open(os.path.join(out_dir, fn+"_"+str(i)+".fa"), 'w') as f_out: 35 | for j in range(0, seq_cnt): 36 | index = 
i*seq_cnt+j 37 | if index < len(id_list): 38 | f_out.write(">%s\n%s"%(id_list[index], seq_db[id_list[index]])) 39 | 40 | 41 | if __name__ == "__main__": 42 | if len(sys.argv) < 5: 43 | print("Usage: python "+sys.argv[0]+" <in_fasta> <s/f> <count> <out_dir>") 44 | else: 45 | in_fa = sys.argv[1] 46 | if sys.argv[2].lower() == 's': 47 | is_seq = True 48 | else: 49 | is_seq = False 50 | cnt = int(sys.argv[3]) 51 | out_dir = sys.argv[4] 52 | split_fasta_by_count(in_fa, is_seq, cnt, out_dir) 53 | 54 | -------------------------------------------------------------------------------- /bin/find_gff_ovlp_regions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def find_ovlp(in_gff3, out_bed): 6 | gff3_db = {} 7 | with open(in_gff3, 'r') as fin: 8 | for line in fin: 9 | if line[0] == '#' or line.strip() == '': 10 | continue 11 | data = line.strip().split() 12 | if data[2] != 'gene': 13 | continue 14 | chrn = data[0] 15 | sp = int(data[3]) 16 | ep = int(data[4]) 17 | dir = data[6] 18 | gn = data[8].split(";")[1].split("=")[1] 19 | if chrn not in gff3_db: 20 | gff3_db[chrn] = {} 21 | if dir not in gff3_db[chrn]: 22 | gff3_db[chrn][dir] = [] 23 | gff3_db[chrn][dir].append([sp, ep, gn]) 24 | 25 | with open(out_bed, 'w') as fout: 26 | for chrn in sorted(gff3_db): 27 | for dir in sorted(gff3_db[chrn]): 28 | tmp_list = [] 29 | last_e = 0 30 | pos_list = sorted(gff3_db[chrn][dir]) 31 | for i in range(0, len(pos_list)): 32 | s, e, gn = pos_list[i] 33 | if len(tmp_list) == 0: 34 | tmp_list.append(pos_list[i]) 35 | last_e = e 36 | elif s <= last_e: 37 | tmp_list.append(pos_list[i]) 38 | last_e = max(last_e, e) 39 | else: 40 | if len(tmp_list) > 1: 41 | for ts, te, tgn in tmp_list: 42 | fout.write("%s\t%d\t%d\t%s\t%s\n"%(chrn, ts, te, dir, tgn)) 43 | fout.write("###\n") 44 | tmp_list = [pos_list[i]] 45 | last_e = e 46 | if len(tmp_list) > 1: 47 | for ts, te, tgn in tmp_list: 48 | fout.write("%s\t%d\t%d\t%s\t%s\n"%(chrn, ts, te, dir, tgn)) 49 | fout.write("###\n") 50 | 51 | 52 | if __name__ == "__main__": 53 | if len(sys.argv) < 3: 54 | print("Usage: python "+sys.argv[0]+" <in_gff3> <out_bed>") 55 | else: 56 | in_gff3, out_bed = sys.argv[1:] 57 | find_ovlp(in_gff3, out_bed) 58 | -------------------------------------------------------------------------------- /bin/modify_geno_with_snp_mummer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def modify_geno(in_geno, in_snp, col, out_geno): 6 | snp_db = {} 7 | with open(in_snp, 'r') as f_in: 8 | for line in f_in: 9 | data = line.strip().split() 10 | if data == []: 11 | continue 12 | if data[0].isdigit() == False: 13 | continue 14 | chrn = data[-1] 15 | if chrn not in snp_db: 16 | snp_db[chrn] = {} 17 | pos = int(data[3]) 18 | snp_db[chrn][pos] = data[1].replace('.', 'N') + '/'+data[1].replace('.', 'N') 19 | 20 | cnt_ovlp_snp = 0 21 | with open(in_geno, 'r') as f_in: 22 | with open(out_geno, 'w') as f_out: 23 | for line in f_in: 24 | if line[0] == "#": 25 | f_out.write(line) 26 | continue 27 | data = line.strip().split() 28 | for i in range(0, len(data)): 29 | if i == col: 30 | if data[0] in snp_db and int(data[1]) in snp_db[data[0]]: 31 | cnt_ovlp_snp += 1 32 | f_out.write(snp_db[data[0]][int(data[1])]) 33 | else: 34 | f_out.write(data[i]) 35 | else: 36 | f_out.write(data[i]) 37 | if i < len(data)-1: 38 | f_out.write('\t') 39 | f_out.write('\n') 40 | print(cnt_ovlp_snp) 41 | 42 | 43 | if __name__ == "__main__": 44 | if len(sys.argv) < 5: 45 | print("Notice: modify column in geno file with snp result generated by show-snps of mummer") 46 | print("Usage: python "+sys.argv[0]+" <in_geno> <in_snp> <col> <out_geno>") 47 | else: 48 | in_geno = sys.argv[1] 49 | in_snp 
= sys.argv[2] 50 | col = int(sys.argv[3]) 51 | out_geno = sys.argv[4] 52 | modify_geno(in_geno, in_snp, col, out_geno) 53 | 54 | -------------------------------------------------------------------------------- /bin/StatAgp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | 5 | def stat_agp(in_agp): 6 | asm_db = {} 7 | total_tig = 0 8 | unchor_tig = 0 9 | unchor_tig_size = 0 10 | asm_size = 0 11 | with open(in_agp, 'r') as fin: 12 | for line in fin: 13 | data = line.strip().split() 14 | if data[4] == 'U': 15 | continue 16 | chrn = data[0] 17 | total_tig += 1 18 | ep = int(data[2]) 19 | if data[0] != data[5] and chrn[:3] != 'tig' and chrn[:3] != 'utg' and chrn[:3] != 'ctg': 20 | allele = chrn[-1] 21 | chrn = chrn[:-1] 22 | if chrn not in asm_db: 23 | asm_db[chrn] = {} 24 | if allele not in asm_db[chrn]: 25 | asm_db[chrn][allele] = ep 26 | asm_db[chrn][allele] = ep 27 | else: 28 | unchor_tig += 1 29 | unchor_tig_size += ep 30 | for chrn in asm_db: 31 | for allele in asm_db[chrn]: 32 | asm_size += asm_db[chrn][allele] 33 | print("\t%s"%('\t'.join(sorted(asm_db[chrn])))) 34 | 35 | for chrn in sorted(asm_db): 36 | print("%s"%chrn, end='') 37 | for allele in sorted(asm_db[chrn]): 38 | print("\t%s"%("{:,}".format(asm_db[chrn][allele])), end='') 39 | print("") 40 | print("No. of unanchored contigs\t%s"%("{:,}".format(unchor_tig))) 41 | print("Unanchored sequences (Mb)\t%s"%("{:,}".format(unchor_tig_size*1.0/1e6))) 42 | print("Total no. of contigs\t%s"%("{:,}".format(total_tig))) 43 | print("Total assembled size (Mb)\t%s"%("{:,}".format(asm_size*1.0/1e6))) 44 | 45 | 46 | if __name__ == "__main__": 47 | if len(sys.argv) < 2: 48 | print("Usage: python %s "%sys.argv[0]) 49 | else: 50 | in_agp = sys.argv[1] 51 | stat_agp(in_agp) 52 | -------------------------------------------------------------------------------- /bin/split_ctg_with_agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | 5 | 6 | def split_fa(in_fa, in_agp, out_dir): 7 | if not os.path.exists(out_dir): 8 | os.makedirs(out_dir) 9 | 10 | fa_db = {} 11 | with open(in_fa, 'r') as fin: 12 | for line in fin: 13 | if line[0] == '>': 14 | id = line.strip().split()[0][1:] 15 | fa_db[id] = [] 16 | else: 17 | fa_db[id].append(line.strip()) 18 | 19 | for id in fa_db: 20 | fa_db[id] = ''.join(fa_db[id]) 21 | 22 | chr_ctgs = {} 23 | with open(in_agp, 'r') as fin: 24 | for line in fin: 25 | if line.strip() == "" or line[0] == '#': 26 | continue 27 | data = line.strip().split() 28 | if data[4] != 'W': 29 | continue 30 | chrn = data[0] 31 | ctg = data[5] 32 | if chrn==ctg: 33 | chrn = 'Unanchored' 34 | if chrn not in chr_ctgs: 35 | chr_ctgs[chrn] = [] 36 | chr_ctgs[chrn].append(ctg) 37 | 38 | for chrn in chr_ctgs: 39 | out_fn = os.path.join(out_dir, "%s.fasta"%chrn) 40 | with open(out_fn, 'w') as fout: 41 | for id in chr_ctgs[chrn]: 42 | fout.write(">%s\n%s\n"%(id, fa_db[id])) 43 | 44 | 45 | if __name__ == "__main__": 46 | if len(sys.argv) < 4: 47 | print("Usage: python %s "%sys.argv[0]) 48 | else: 49 | in_fa, in_agp, out_dir = sys.argv[1:] 50 | split_fa(in_fa, in_agp, out_dir) 51 | 52 | -------------------------------------------------------------------------------- /bin/convert_simple_for_circos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | 5 | 6 | def 
convert_simple_for_circos(in_simple, in_gff3_files, out_link): 7 | print("Loading gff3") 8 | id_db = {} 9 | for in_gff3 in in_gff3_files.split(','): 10 | with open(in_gff3, 'r') as fin: 11 | for line in fin: 12 | if line.strip() == '' or line[0] == '#': 13 | continue 14 | data = line.strip().split() 15 | if data[2]!='gene': 16 | continue 17 | chrn = data[0] 18 | sp = int(data[3]) 19 | ep = int(data[4]) 20 | if 'Name' in data[8]: 21 | id = re.findall(r'Name=(.*)', data[8])[0].split(';')[0] 22 | else: 23 | id = re.findall(r'ID=(.*)', data[8])[0].split(';')[0] 24 | id_db[id] = [chrn, sp, ep] 25 | 26 | print("Loading and writing link") 27 | with open(in_simple, 'r') as fin: 28 | with open(out_link, 'w') as fout: 29 | for line in fin: 30 | data = line.strip().split() 31 | achrn = id_db[data[0]][0] 32 | asp = min(id_db[data[0]][1], id_db[data[1]][1]) 33 | aep = max(id_db[data[0]][2], id_db[data[1]][2]) 34 | bchrn = id_db[data[2]][0] 35 | bsp = min(id_db[data[2]][1], id_db[data[3]][1]) 36 | bep = max(id_db[data[2]][2], id_db[data[3]][2]) 37 | fout.write("%s\t%d\t%d\t%s\t%d\t%d\n"%(achrn, asp, aep, bchrn, bsp, bep)) 38 | 39 | print("Finished") 40 | 41 | 42 | if __name__ == "__main__": 43 | if len(sys.argv) < 4: 44 | print("Usage: python %s "%sys.argv[0]) 45 | else: 46 | in_simple, in_gff3_files, out_link = sys.argv[1:] 47 | convert_simple_for_circos(in_simple, in_gff3_files, out_link) 48 | -------------------------------------------------------------------------------- /bin/subVCF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | 5 | 6 | def subVCF(in_vcf, in_list, out_vcf, missing_rate): 7 | sp_list = [] 8 | 9 | with open(in_list, 'r') as f_in: 10 | for line in f_in: 11 | if line.strip() != '': 12 | sp_list.append(line.strip()) 13 | 14 | if in_vcf.split('.')[-1] == 'gz': 15 | f_in = gzip.open(in_vcf, 'rt') 16 | else: 17 | f_in = open(in_vcf, 'r') 18 | 19 | 20 | if out_vcf.split('.')[-1] == 'gz': 21 | f_out = gzip.open(out_vcf, 'wt') 22 | else: 23 | f_out = open(out_vcf, 'w') 24 | 25 | col_db = [] 26 | for line in f_in: 27 | data = line.strip().split() 28 | pub_info = data[0:9] 29 | if data[0][0] == "#": 30 | for i in range(9, len(data)): 31 | if data[i] in sp_list: 32 | col_db.append(i) 33 | 34 | cnt_mis = 0 35 | out_str = '' 36 | for i in col_db: 37 | out_str += "\t"+data[i] 38 | if data[i] == './.' 
or data[i] == '.|.': 39 | cnt_mis += 1 40 | if len(col_db) > 0 and cnt_mis*1.0/len(col_db) > missing_rate: 41 | continue 42 | f_out.write("\t".join(pub_info)) 43 | f_out.write(out_str+"\n") 44 | 45 | f_in.close() 46 | f_out.close() 47 | 48 | 49 | if __name__ == "__main__": 50 | if len(sys.argv) < 4: 51 | print("Notice: script to extract vcf file with list file, default missing rate 0.4") 52 | print("Usage: python "+sys.argv[0]+" []") 53 | else: 54 | in_vcf = sys.argv[1] 55 | in_list = sys.argv[2] 56 | out_vcf = sys.argv[3] 57 | if len(sys.argv) == 5: 58 | missing_rate = float(sys.argv[4]) 59 | else: 60 | missing_rate = 0.4 61 | subVCF(in_vcf, in_list, out_vcf, missing_rate) 62 | 63 | -------------------------------------------------------------------------------- /bin/extract_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def bin_search(bed_list, pos): 6 | s = 0 7 | e = len(bed_list)-1 8 | while s <= e: 9 | mid = (s+e)//2 10 | if bed_list[mid][0] < pos: 11 | s = mid+1 12 | elif bed_list[mid][0] > pos: 13 | e = mid-1 14 | else: 15 | return True 16 | if bed_list[e][0] <= pos and bed_list[e][1] >= pos: 17 | return True 18 | else: 19 | return False 20 | 21 | 22 | def extract_vcf(in_vcf, in_bed, out_vcf): 23 | print("Loading BED file") 24 | bed_db = {} 25 | with open(in_bed, 'r') as f_in: 26 | for line in f_in: 27 | if line.strip() == "": 28 | continue 29 | data = line.strip().split() 30 | chrn = data[0] 31 | sr = int(data[1]) 32 | er = int(data[2]) 33 | if chrn not in bed_db: 34 | bed_db[chrn] = [] 35 | bed_db[chrn].append([sr, er]) 36 | 37 | print("Extracting VCF") 38 | last_chrn = "" 39 | with open(in_vcf, 'r') as f_in: 40 | with open(out_vcf, 'w') as f_out: 41 | for line in f_in: 42 | if line.strip() == "": 43 | continue 44 | if line[0] == '#': 45 | f_out.write(line) 46 | else: 47 | data = line.strip().split() 48 | chrn = data[0] 49 | if chrn != last_chrn: 50 | print("\tExtracting %s"%chrn) 51 | last_chrn = chrn 52 | if chrn not in bed_db: 53 | continue 54 | pos = int(data[1]) 55 | if bin_search(bed_db[chrn], pos): 56 | f_out.write(line) 57 | 58 | print("Finished") 59 | 60 | 61 | if __name__ == "__main__": 62 | if len(sys.argv) < 4: 63 | print("Usage: python "+sys.argv[0]+" ") 64 | else: 65 | proc, in_vcf, in_bed, out_vcf = sys.argv 66 | extract_vcf(in_vcf, in_bed, out_vcf) 67 | 68 | -------------------------------------------------------------------------------- /bin/bam_cov.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pysam 3 | import multiprocessing 4 | import argparse 5 | 6 | 7 | def get_opts(): 8 | group = argparse.ArgumentParser() 9 | group.add_argument("-b", "--bam", help="Input bam file, must be indexed", required=True) 10 | group.add_argument("-o", "--output", help="Output statistic", required=True) 11 | group.add_argument("-t", "--threads", help="Threads, default=10", type=int, default=10) 12 | return group.parse_args() 13 | 14 | 15 | def sub_cov(in_bam, chrn, chrl): 16 | bins = [0 for _ in range(chrl)] 17 | with pysam.AlignmentFile(in_bam, 'rb') as fin: 18 | for read in fin.fetch(chrn): 19 | for pos in read.get_reference_positions(): 20 | bins[pos] = 1 21 | return chrn, chrl, sum(bins) 22 | 23 | 24 | def main(): 25 | opts = get_opts() 26 | in_bam = opts.bam 27 | out_stat = opts.output 28 | threads = opts.threads 29 | chr_name = [] 30 | chr_len = [] 31 | with pysam.AlignmentFile(in_bam, 'rb') as fin: 32 | 
chr_name = fin.references 33 | chr_len = fin.lengths 34 | 35 | res = [] 36 | pool = multiprocessing.Pool(processes=threads) 37 | for _ in range(len(chr_name)): 38 | r = pool.apply_async(sub_cov, (in_bam, chr_name[_], chr_len[_], )) 39 | res.append(r) 40 | pool.close() 41 | pool.join() 42 | 43 | with open(out_stat, 'w') as fout: 44 | total_covl = 0 45 | total_chrl = 0 46 | for r in res: 47 | chrn, chrl, covl = r.get() 48 | fout.write("%s\t%d\t%d\t%f\n"%(chrn, chrl, covl, covl*1./chrl)) 49 | total_covl += covl 50 | total_chrl += chrl 51 | fout.write("Total\t%d\t%d\t%f\n"%(total_chrl, total_covl, total_covl*1./total_chrl)) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /bin/extract_fasta_with_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def rev_seq(seq): 6 | rseq = "" 7 | base_db = {"A": "T", "T": "A", "C": "G", "G": "C"} 8 | for base in seq[::-1]: 9 | if base in base_db: 10 | rseq += base_db[base] 11 | else: 12 | rseq += base 13 | return rseq 14 | 15 | 16 | def extract_fa_with_bed(in_fa, in_bed, out_fa): 17 | print("Loading fasta") 18 | fa_db = {} 19 | with open(in_fa, 'r') as fin: 20 | seq = "" 21 | id = "" 22 | for line in fin: 23 | if line[0] == '>': 24 | if seq: 25 | fa_db[id] = seq 26 | id = line.strip().split()[0][1:] 27 | seq = "" 28 | else: 29 | seq += line.strip().upper() 30 | if seq: 31 | fa_db[id] = seq 32 | 33 | print("Loading bed and writing fasta") 34 | with open(in_bed, 'r') as fin: 35 | with open(out_fa, 'w') as fout: 36 | for line in fin: 37 | data = line.strip().split() 38 | chrn = data[0] 39 | sp = int(data[1])-1 40 | ep = int(data[2]) 41 | direct = data[3] 42 | id = data[4] 43 | seq = fa_db[chrn][sp: ep] 44 | if direct == '-': 45 | seq = rev_seq(seq) 46 | fout.write(">%s\n%s\n"%(id, seq)) 47 | print("Finished") 48 | 49 | 50 | if __name__ == "__main__": 51 | if len(sys.argv) < 4: 52 | print("Usage: python %s <in_fa> <in_bed> <out_fa>"%sys.argv[0]) 53 | print("Notice: bed should be 5 columns: \"chrom, start, end, strand, id\", positions should be 1-based") 54 | else: 55 | in_fa, in_bed, out_fa = sys.argv[1:] 56 | extract_fa_with_bed(in_fa, in_bed, out_fa) 57 | -------------------------------------------------------------------------------- /bin/convert_QTL_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_tig_pos_of_chr(in_file): 6 | pos_db = {} 7 | with open(in_file, 'r') as f_in: 8 | for line in f_in: 9 | if line.strip() == '': 10 | continue 11 | data = line.strip().split() 12 | if data[4] == 'U': 13 | continue 14 | chrn = data[0] 15 | spos = int(data[1]) 16 | epos = int(data[2]) 17 | tig = data[5] 18 | direct = data[-1] 19 | pos_db[tig] = [chrn, spos, epos, direct] 20 | return pos_db 21 | 22 | 23 | def convert_QTL_info(in_QTL, in_agp, out_QTL): 24 | tig_on_chr = get_tig_pos_of_chr(in_agp) 25 | with open(in_QTL, 'r') as fin: 26 | with open(out_QTL, 'w') as fout: 27 | for line in fin: 28 | data = line.strip().split() 29 | if data[0] == 'Pop': 30 | data.extend(['ActChr', 'Left_Pos', 'Right_Pos', 'Direct']) 31 | else: 32 | ltig, lpos = data[4].split('_') 33 | rtig, rpos = data[5].split('_') 34 | lpos = int(lpos) 35 | rpos = int(rpos) 36 | lchr, lsp, lep, ld = tig_on_chr[ltig] 37 | rchr, rsp, rep, rd = tig_on_chr[rtig] 38 | if lchr != rchr: 39 | print(data[1]+"\t"+lchr+"\t"+rchr) 40 | continue 41 | if ld == '-': 42 
| lpos = lep-lpos+1 43 | else: 44 | lpos = lsp+lpos-1 45 | if rd == '-': 46 | rpos = rep-rpos+1 47 | else: 48 | rpos = rsp+rpos-1 49 | if lpos > rpos: 50 | tmp = lpos 51 | lpos = rpos 52 | rpos = tmp 53 | direct = "-" 54 | else: 55 | direct = "+" 56 | data.extend([lchr, str(lpos), str(rpos), direct]) 57 | fout.write("%s\n"%'\t'.join(data)) 58 | 59 | 60 | if __name__ == "__main__": 61 | if len(sys.argv) < 4: 62 | print("Usage: python "+sys.argv[0]+" ") 63 | else: 64 | in_QTL, in_agp, out_QTL = sys.argv[1:] 65 | convert_QTL_info(in_QTL, in_agp, out_QTL) 66 | -------------------------------------------------------------------------------- /bin/convert_chr_to_ctg_with_agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def reverse_seq(seq): 6 | base_db = {"A": "T", "T": "A", "G": "C", "C": "G"} 7 | rev_seq = ''.join([base_db[_] if _ in base_db else _ for _ in seq[::-1]]) 8 | return rev_seq 9 | 10 | 11 | def convert_chr_to_ctg(in_fa, in_agp, out_fa): 12 | print("Loading genome file") 13 | fa_db = {} 14 | with open(in_fa, 'r') as fin: 15 | for line in fin: 16 | if line[0] == '>': 17 | id = line.strip().split()[0][1:] 18 | fa_db[id] = [] 19 | else: 20 | fa_db[id].append(line.strip().upper()) 21 | 22 | for id in fa_db: 23 | fa_db[id] = ''.join(fa_db[id]) 24 | 25 | print("Loading AGP file") 26 | ctg_db = {} 27 | with open(in_agp, 'r') as fin: 28 | for line in fin: 29 | if line.strip() == "" or line[0] == '#': 30 | continue 31 | data = line.strip().split() 32 | if data[4] != 'W': 33 | continue 34 | chrn = data[0] 35 | sp = int(data[1]) - 1 36 | ep = int(data[2]) 37 | ctg = data[5] 38 | direct = data[-1] 39 | ctg_db[ctg] = [chrn, sp, ep, direct] 40 | 41 | print("Writing contig file") 42 | with open(out_fa, 'w') as fout: 43 | for ctg in sorted(ctg_db): 44 | chrn, sp, ep, direct = ctg_db[ctg] 45 | seq = fa_db[chrn][sp: ep] 46 | if direct == '-': 47 | seq = reverse_seq(seq) 48 | fout.write(">%s\n%s\n" % (ctg, seq)) 49 | 50 | print("Finished") 51 | 52 | 53 | if __name__ == "__main__": 54 | if len(sys.argv) < 4: 55 | print("Usage: python %s " % sys.argv[0]) 56 | else: 57 | in_fa, in_agp, out_fa = sys.argv[1:] 58 | convert_chr_to_ctg(in_fa, in_agp, out_fa) 59 | -------------------------------------------------------------------------------- /bin/StatAgpDetail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | 5 | def stat_agp(in_agp, out_csv): 6 | asm_db = {} 7 | unanc_cnt = 0 8 | unanc_len = 0 9 | anc_cnt = 0 10 | anc_len = 0 11 | gap_cnt = 0 12 | gap_len = 0 13 | with open(in_agp, 'r') as fin: 14 | for line in fin: 15 | data = line.strip().split() 16 | if data[4] == 'U': 17 | gap_cnt += 1 18 | gap_len += 100 19 | else: 20 | chrn = data[0] 21 | if chrn[:3] != 'tig' and chrn[:3] != 'utg' and chrn[:3] != 'ctg' and data[0] != data[5]: 22 | allele = chrn[-1] 23 | chrn = chrn[:-1] 24 | if chrn not in asm_db: 25 | asm_db[chrn] = {} 26 | if allele not in asm_db[chrn]: 27 | asm_db[chrn][allele] = {'cnt': 0, 'len': 0} 28 | asm_db[chrn][allele]['cnt'] += 1 29 | asm_db[chrn][allele]['len'] = int(data[2]) 30 | anc_cnt += 1 31 | anc_len += int(data[7]) 32 | else: 33 | unanc_cnt += 1 34 | unanc_len += int(data[2]) 35 | 36 | for chrn in asm_db: 37 | break 38 | with open(out_csv, 'w') as fout: 39 | fout.write(",%s\n"%(',,'.join(sorted(asm_db[chrn])))) 40 | for chrn in sorted(asm_db): 41 | info = [chrn] 42 | for allele in sorted(asm_db[chrn]): 43 | 
info.append("\"%s\""%("{:,}".format(asm_db[chrn][allele]['cnt']))) 44 | info.append("\"%s\""%("{:,}".format(asm_db[chrn][allele]['len']))) 45 | fout.write("%s\n"%(','.join(info))) 46 | fout.write("Anchored contigs,\"%s\",\"%s\"\n"%("{:,}".format(anc_cnt), "{:,}".format(anc_len/1e6))) 47 | fout.write("Unanchored contigs,\"%s\",\"%s\"\n"%("{:,}".format(unanc_cnt), "{:,}".format(unanc_len/1e6))) 48 | fout.write("Gaps,\"%s\",\"%s\"\n"%("{:,}".format(gap_cnt), "{:,}".format(gap_len/1e6))) 49 | fout.write("Total,\"%s\",\"%s\"\n"%("{:,}".format(anc_cnt+unanc_cnt), "{:,}".format((anc_len+unanc_len)/1e6))) 50 | 51 | 52 | if __name__ == "__main__": 53 | if len(sys.argv) < 3: 54 | print("Usage: python %s <in_agp> <out_csv>"%sys.argv[0]) 55 | else: 56 | in_agp, out_csv = sys.argv[1:] 57 | stat_agp(in_agp, out_csv) 58 | -------------------------------------------------------------------------------- /bin/extract_all_sv_from_nucmer_delta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def nucmer_extract(in_delta, out_pre): 6 | print("Running delta-filter") 7 | cmd = "delta-filter -gqr "+in_delta+" > "+in_delta+".filtered" 8 | print("Running command: "+cmd) 9 | os.system(cmd) 10 | print("Extracting") 11 | data_db = {} 12 | last_INDEL_pos = {} 13 | r_chr_len_db = {} 14 | q_chr_len_db = {} 15 | #sv_list = ['SNP', 'INDEL', 'JMP', 'INV', 'DUP', 'BRK'] 16 | with os.popen("show-diff -H "+in_delta+".filtered", 'r') as f_in: 17 | for line in f_in: 18 | data = line.strip().split() 19 | if len(data) < 5: 20 | continue 21 | chrn = data[0] 22 | sv = data[1] 23 | if sv not in data_db: 24 | data_db[sv] = {} 25 | if chrn not in data_db[sv]: 26 | data_db[sv][chrn] = [] 27 | sp = data[2] 28 | ep = data[3] 29 | data_db[sv][chrn].append([sp, ep]) 30 | 31 | data_db['SNP'] = {} 32 | data_db['INDEL'] = {} 33 | with os.popen("show-snps -ClrT "+in_delta+".filtered", 'r') as f_in: 34 | for line in f_in: 35 | data = line.strip().split() 36 | if len(data) == 0 or data[0].isdigit() == False: 37 | continue 38 | 39 | pos = int(data[0]) 40 | r_chrn = data[-2] 41 | if r_chrn not in data_db['SNP']: 42 | data_db['SNP'][r_chrn] = [] 43 | if r_chrn not in data_db['INDEL']: 44 | data_db['INDEL'][r_chrn] = [] 45 | if data[1] != '.' 
and data[2] != '.': 46 | data_db['SNP'][r_chrn].append([data[0], data[1], data[2]]) 47 | else: 48 | data_db['INDEL'][r_chrn].append([data[0], data[1], data[2]]) 49 | 50 | print("Writing data") 51 | for sv in data_db: 52 | with open(out_pre+"."+sv+".txt", 'w') as fout: 53 | for chrn in sorted(data_db[sv]): 54 | for data in data_db[sv][chrn]: 55 | fout.write("%s\t%s\n"%(chrn, '\t'.join(data))) 56 | print("Success") 57 | 58 | 59 | if __name__ == "__main__": 60 | if len(sys.argv) < 3: 61 | print("Usage: python "+sys.argv[0]+" <in_delta> <out_prefix>") 62 | else: 63 | in_delta, out_pre = sys.argv[1:] 64 | nucmer_extract(in_delta, out_pre) 65 | -------------------------------------------------------------------------------- /bin/dup_dotplot.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use Getopt::Std; 3 | getopts "g:r:q:n:t:"; 4 | 5 | if ((!defined $opt_g)or(!defined $opt_r)or(!defined $opt_q)or(!defined $opt_n)){ 6 | die"****************************************************************************************** 7 | Usage: perl $0 -g reference_genome -r ref_id -q query_id -n number_of_dup -t threads 8 | ref_id: reference cds and bed name, like: Sb, Sb.cds and Sb.bed must exist 9 | query_id: query cds and bed name, like: Os 10 | number_of_dup: number of duplications 11 | threads: default 1 12 | ******************************************************************************************\n"; 13 | } 14 | 15 | my $genome = $opt_g; 16 | my $ref_name = $opt_r; 17 | my $qry_name = $opt_q; 18 | my $dup_n = $opt_n; 19 | if (!defined $opt_t){ 20 | $threads = "1"; 21 | } 22 | else{ 23 | $threads = $opt_t; 24 | } 25 | 26 | my %sbcdsdb; 27 | open(IN, $ref_name.".cds") or die""; 28 | while(<IN>){ 29 | chomp; 30 | if(/>/){ 31 | $gene = $_; 32 | $gene =~ s/\s.*//g; 33 | $gene =~ s/>//g; 34 | }else{ 35 | $sbcdsdb{$gene} .= $_; 36 | } 37 | } 38 | close IN; 39 | 40 | my $size=(-s $genome); 41 | my $gmap = "gmap"; 42 | if($size>2**32){ 43 | $gmap = "gmapl"; 44 | } 45 | system("gmap_build -D . -d DB ".$genome); 46 | system($gmap." -D . -d DB -t ".$threads." -f 2 -n ".$dup_n." ".$ref_name.".cds"." > ".$qry_name.".gff3"); 47 | 48 | open(CDS, "> ".$qry_name.".cds") or die""; 49 | open(BED, "> ".$qry_name.".bed") or die""; 50 | my $count = 0; 51 | open(IN, "grep 'gene' ".$qry_name.".gff3|") or die""; 52 | while(<IN>){ 53 | chomp; 54 | $count++; 55 | my @data = split(/\s+/,$_); 56 | my $gid = $data[0]; 57 | my $a = $data[3]; 58 | my $b = $data[4]; 59 | my $gene = $1 if(/Name=(\S+)/); 60 | $gene =~ s/;.*//g; 61 | my $cds = $sbcdsdb{$gene}; 62 | $gene = $gene."_".$count; 63 | print CDS ">$gene\n$cds\n"; 64 | print BED "$gid $a $b $gene 0 $data[6]\n"; 65 | } 66 | close IN; 67 | close CDS; 68 | close BED; 69 | 70 | system(" python -m jcvi.compara.catalog ortholog ".$qry_name." 
".$ref_name); 71 | 72 | -------------------------------------------------------------------------------- /bin/merge_bed_regions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | def merge_reions(region_db, md): 5 | new_regions = {} 6 | for chrn in region_db: 7 | new_regions[chrn] = [] 8 | tmp_list = [] 9 | tmp_gns = [[]] 10 | for region in sorted(region_db[chrn]): 11 | s = region[0] 12 | e = region[1] 13 | gn = region[2] 14 | if len(tmp_list) == 0: 15 | tmp_list.append(s) 16 | laste = e 17 | else: 18 | if s > laste+md: 19 | tmp_list.append(laste) 20 | tmp_list.append(s) 21 | tmp_gns.append([]) 22 | laste = e 23 | else: 24 | if e > laste: 25 | laste = e 26 | tmp_gns[-1].extend(gn) 27 | tmp_list.append(laste) 28 | for i in range(0, len(tmp_list), 2): 29 | s = tmp_list[i] 30 | e = tmp_list[i+1] 31 | gns = ''.join(tmp_gns[int(i/2)]) 32 | gns = list(set(gns.split(','))) 33 | new_gns = [] 34 | for gn in gns: 35 | if gn != '': 36 | new_gns.append(gn) 37 | gns = ','.join(new_gns) 38 | new_regions[chrn].append([s, e, gns]) 39 | return new_regions 40 | 41 | 42 | def read_bed(in_bed): 43 | bed_db = {} 44 | with open(in_bed, 'r') as f_in: 45 | for line in f_in: 46 | if line.strip() == '': 47 | continue 48 | data = line.strip().split() 49 | chrn = data[0] 50 | sp = int(data[1]) 51 | ep = int(data[2]) 52 | if len(data) > 3: 53 | gn = data[-1] 54 | else: 55 | gn = '' 56 | if sp > ep: 57 | temp = sp 58 | sp = ep 59 | ep = temp 60 | if chrn not in bed_db: 61 | bed_db[chrn] = [] 62 | bed_db[chrn].append([sp, ep, gn]) 63 | return bed_db 64 | 65 | 66 | def merge_regions_in_bed(in_bed, out_bed, md): 67 | ori_regions = read_bed(in_bed) 68 | new_regions = merge_reions(ori_regions, md) 69 | with open(out_bed, 'w') as f_out: 70 | for chrn in sorted(new_regions): 71 | for region in new_regions[chrn]: 72 | f_out.write("%s\t%d\t%d\t%s\n"%(chrn, region[0], region[1], region[2])) 73 | 74 | 75 | if __name__ == "__main__": 76 | if len(sys.argv) < 4: 77 | print("Usage: python "+sys.argv[0]+" ") 78 | else: 79 | in_bed, out_bed, md = sys.argv[1:] 80 | md = int(md) 81 | merge_regions_in_bed(in_bed, out_bed, md) 82 | -------------------------------------------------------------------------------- /bin/get_seq_with_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | 5 | 6 | def reverse_seq(seq): 7 | base_db = {"A": "T", "T": "A", "C": "G", "G": "C"} 8 | rev_seq = "".join([base_db[_] if _ in base_db else _ for _ in seq.upper()[::-1]]) 9 | return rev_seq 10 | 11 | 12 | def get_seq_with_bed(in_fa, in_bed, out_fa): 13 | print("Loading bed") 14 | bed_db = {} 15 | with open(in_bed, "r") as fin: 16 | for line in fin: 17 | data = line.strip().split() 18 | chrn = data[0] 19 | sp = int(data[1]) - 1 20 | ep = int(data[2]) 21 | if len(data) > 4: 22 | direct = data[3] 23 | else: 24 | direct = "+" 25 | gid = data[-1] 26 | if chrn not in bed_db: 27 | bed_db[chrn] = [] 28 | bed_db[chrn].append([sp, ep, direct, gid]) 29 | 30 | print("Extracting") 31 | if in_fa.endswith(".gz"): 32 | fin = gzip.open(in_fa, "rt") 33 | else: 34 | fin = open(in_fa, "r") 35 | 36 | fa_db = {} 37 | for line in fin: 38 | if line[0] == ">": 39 | cid = line.strip()[1:] 40 | fa_db[cid] = [] 41 | else: 42 | fa_db[cid].append(line.strip()) 43 | 44 | fin.close() 45 | for _ in fa_db: 46 | fa_db[_] = "".join(fa_db[_]) 47 | 48 | with open(out_fa, "w") as fout: 49 | for cid in bed_db: 50 | for sp, 
ep, direct, gid in bed_db[cid]: 51 | fout.write( 52 | ">%s\n%s\n" 53 | % ( 54 | gid, 55 | ( 56 | fa_db[cid][sp:ep] 57 | if direct == "+" 58 | else reverse_seq(fa_db[cid][sp:ep]) 59 | ), 60 | ) 61 | ) 62 | 63 | print("Finished") 64 | 65 | 66 | if __name__ == "__main__": 67 | if len(sys.argv) < 4: 68 | print("Usage: python %s <in_fa> <in_bed> <out_fa>" % sys.argv[0]) 69 | else: 70 | in_fa, in_bed, out_fa = sys.argv[1:] 71 | get_seq_with_bed(in_fa, in_bed, out_fa) 72 | -------------------------------------------------------------------------------- /bin/get_genes_from_range.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def search(listGene, x, y, td): 6 | res = "" 7 | for line in listGene: 8 | data = line.split('\t') 9 | if (x >= int(data[0]) and x <= int(data[1])) or (y >= int(data[0]) and y <= int(data[1])) or (x <= int(data[0]) and y >= int(data[1])): 10 | if x >= int(data[0]): 11 | start_pos = x 12 | else: 13 | start_pos = int(data[0]) 14 | if y <= int(data[1]): 15 | end_pos = y 16 | else: 17 | end_pos = int(data[1]) 18 | if (end_pos-start_pos+1)*1.0/(y-x+1) >= td: 19 | res = res+data[2]+'\n' 20 | return res 21 | 22 | 23 | def get_genes_from_range(f_gff, f_bed, f_out, td): 24 | gff = open(f_gff,'r') 25 | dictGene = {} 26 | for line in gff: 27 | data = line.strip().split('\t') 28 | if(len(data) > 3): 29 | if(data[2] == "gene"): 30 | if data[0] not in dictGene: 31 | dictGene[data[0]] = [] 32 | dictGene[data[0]].append(data[3]+'\t'+data[4]+'\t'+data[8].split(';')[0][3:]) 33 | gff.close() 34 | 35 | for key in dictGene: 36 | dictGene[key].sort(key=lambda rec: int(rec.split('\t')[0])) 37 | bed = open(f_bed,'r') 38 | out = open(f_out,'w') 39 | for line in bed: 40 | data = line.strip().split('\t') 41 | if len(data) < 3 or line[0] == '#': 42 | continue 43 | ss = int(data[1]) 44 | se = int(data[2]) 45 | if data[0] in dictGene: 46 | res = search(dictGene[data[0]], ss, se, td) 47 | if res != "": 48 | rs = res.split('\n') 49 | out.write(line.strip()) 50 | for r in rs: 51 | if r != "": 52 | out.write('\t'+r) 53 | out.write('\n') 54 | out.close() 55 | bed.close() 56 | 57 | 58 | if __name__ == "__main__": 59 | if len(sys.argv) < 5: 60 | print("Notice: script for getting genes overlapping ranges in a bed file") 61 | print("Usage: python "+sys.argv[0]+" <in_gff> <in_bed> <out_file> <min_ovlp_percent>") 62 | else: 63 | f_gff = sys.argv[1] 64 | f_bed = sys.argv[2] 65 | f_out = sys.argv[3] 66 | td = float(sys.argv[4])/100.0 67 | get_genes_from_range(f_gff, f_bed, f_out, td) 68 | -------------------------------------------------------------------------------- /bin/eval_filled_gaps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def search_gaps(seq): 6 | gaps_db = [] 7 | cnt_n = 0 8 | for i in range(0, len(seq)): 9 | if seq[i].lower() == 'n': 10 | if cnt_n == 0: 11 | s = i 12 | cnt_n += 1 13 | else: 14 | if cnt_n != 0: 15 | e = i 16 | gaps_db.append([s, e-1]) 17 | cnt_n = 0 18 | if cnt_n != 0: 19 | gaps_db.append([s, len(seq)-1]) 20 | cnt_n = 0 21 | for region in gaps_db: 22 | cnt_n += region[1]-region[0]+1 23 | return gaps_db, cnt_n 24 | 25 | 26 | def calc_gaps(seq): 27 | cnt_n = 0 28 | for i in range(0, len(seq)): 29 | if seq[i].lower() == 'n': 30 | cnt_n += 1 31 | return cnt_n 32 | 33 | 34 | def make_seq_db(in_fasta): 35 | seq_db = {} 36 | with open(in_fasta, 
'r') as f_in: 37 | id = '' 38 | seq = '' 39 | for line in f_in: 40 | if line[0] == ">": 41 | if seq != '': 42 | seq_db[id] = seq 43 | id = line.strip() 44 | seq = '' 45 | else: 46 | seq += line.strip() 47 | seq_db[id] = seq 48 | return seq_db 49 | 50 | 51 | def eval_filled_gaps(ref_fasta, query_fasta, result_file): 52 | print("Reading reference fasta") 53 | ref_seq_db = make_seq_db(ref_fasta) 54 | 55 | print("Reading query fasta") 56 | query_seq_db = make_seq_db(query_fasta) 57 | 58 | print("Evaluating") 59 | with open(result_file, 'w') as f_out: 60 | for id in ref_seq_db: 61 | ref_gaps_db, ref_gaps_cnt = search_gaps(ref_seq_db[id]) 62 | query_gaps_cnt = calc_gaps(query_seq_db[id]) 63 | f_out.write(id[1:]+"\n") 64 | if ref_gaps_cnt != 0: 65 | f_out.write("Filled %0.2f%%\n"%((ref_gaps_cnt-query_gaps_cnt)*1.0/ref_gaps_cnt*100.0)) 66 | else: 67 | f_out.write("No gaps\n") 68 | if len(ref_gaps_db) != 0: 69 | for region in ref_gaps_db: 70 | s = region[0] 71 | e = region[1] 72 | f_out.write("Region %d-%d:\n%s\n"%(s, e, query_seq_db[id][s:e+1])) 73 | f_out.write("\n") 74 | print("Success") 75 | 76 | 77 | if __name__ == "__main__": 78 | if len(sys.argv) < 4: 79 | print("Notice: this script is used to evaluate how well gaps have been filled") 80 | print("Usage: python "+sys.argv[0]+" <ref_fasta> <query_fasta> <result_file>") 81 | else: 82 | ref_fasta = sys.argv[1] 83 | query_fasta = sys.argv[2] 84 | result_file = sys.argv[3] 85 | eval_filled_gaps(ref_fasta, query_fasta, result_file) 86 | 87 | -------------------------------------------------------------------------------- /bin/nucmer_extract_all_sv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def nucmer_extract(r_fa, q_fa, out_pre, ts): 6 | print("Running nucmer") 7 | cmd = "nucmer -p "+out_pre+" "+r_fa+" "+q_fa+" -t "+ts 8 | print("Running command: "+cmd) 9 | os.system(cmd) 10 | 11 | print("Running delta-filter") 12 | cmd = "delta-filter -gqr "+out_pre+".delta > "+out_pre+".filtered" 13 | print("Running command: "+cmd) 14 | os.system(cmd) 15 | 16 | print("Extracting") 17 | data_db = {} 18 | last_INDEL_pos = {} 19 | r_chr_len_db = {} 20 | q_chr_len_db = {} 21 | #sv_list = ['SNP', 'INDEL', 'JMP', 'INV', 'DUP', 'BRK'] 22 | with os.popen("show-diff -H "+out_pre+".filtered", 'r') as f_in: 23 | for line in f_in: 24 | data = line.strip().split() 25 | if len(data) < 5: 26 | continue 27 | chrn = data[0] 28 | sv = data[1] 29 | if sv not in data_db: 30 | data_db[sv] = {} 31 | if chrn not in data_db[sv]: 32 | data_db[sv][chrn] = [] 33 | sp = data[2] 34 | ep = data[3] 35 | data_db[sv][chrn].append([sp, ep]) 36 | 37 | data_db['SNP'] = {} 38 | data_db['INDEL'] = {} 39 | with os.popen("show-snps -ClrT "+out_pre+".filtered", 'r') as f_in: 40 | for line in f_in: 41 | data = line.strip().split() 42 | if len(data) == 0 or data[0].isdigit() == False: 43 | continue 44 | 45 | pos = int(data[0]) 46 | r_chrn = data[-2] 47 | if r_chrn not in data_db['SNP']: 48 | data_db['SNP'][r_chrn] = [] 49 | if r_chrn not in data_db['INDEL']: 50 | data_db['INDEL'][r_chrn] = [] 51 | if data[1] != '.' 
and data[2] != '.': 52 | data_db['SNP'][r_chrn].append([data[0], data[1], data[2]]) 53 | else: 54 | data_db['INDEL'][r_chrn].append([data[0], data[1], data[2]]) 55 | 56 | print("Writing data") 57 | with open(out_pre+".sv.txt", 'w') as fall: 58 | for sv in data_db: 59 | with open(out_pre+"."+sv+".txt", 'w') as fout: 60 | for chrn in sorted(data_db[sv]): 61 | for data in data_db[sv][chrn]: 62 | if sv != "SNP" and sv != "INDEL": 63 | fall.write("%s\t%s\t%s\n"%(chrn, '\t'.join(data), sv)) 64 | fout.write("%s\t%s\n"%(chrn, '\t'.join(data))) 65 | print("Success") 66 | 67 | 68 | if __name__ == "__main__": 69 | if len(sys.argv) < 5: 70 | print("Usage: python "+sys.argv[0]+" ") 71 | else: 72 | r_fa, q_fa, out_pre, ts = sys.argv[1:] 73 | nucmer_extract(r_fa, q_fa, out_pre, ts) 74 | -------------------------------------------------------------------------------- /bin/calc_gene_ovlp_te.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def merge_regions(regions): 6 | tmp_regions = [] 7 | last_ep = 0 8 | for sp, ep in sorted(regions): 9 | if tmp_regions == []: 10 | tmp_regions.append(sp) 11 | last_ep = ep 12 | else: 13 | if sp > last_ep: 14 | tmp_regions.append(last_ep) 15 | tmp_regions.append(sp) 16 | last_ep = ep 17 | else: 18 | if ep > last_ep: 19 | last_ep = ep 20 | tmp_regions.append(last_ep) 21 | new_regions = [] 22 | for i in range(0, len(tmp_regions)-1, 2): 23 | new_regions.append([tmp_regions[i], tmp_regions[i+1]]) 24 | return new_regions 25 | 26 | 27 | def calc_ovlp_ratio(regions, sp, ep): 28 | tmp_tes = [] 29 | for rsp, rep in regions: 30 | ovlp = min(ep, rep)-max(sp, rsp)+1 31 | if ovlp <= 0: 32 | continue 33 | tmp_tes.append([max(sp, rsp), min(ep, rep)]) 34 | 35 | ovlp_len = 0 36 | if len(tmp_tes) != 0: 37 | for msp, mep in merge_regions(tmp_tes): 38 | ovlp_len += mep-msp+1 39 | 40 | return ovlp_len*1.0/(ep-sp+1) 41 | 42 | 43 | def calc_gene_ovlp_te(gene_gff3, TE_gffs, ovlp_stat): 44 | print("Loading TEs") 45 | TE_db = {} 46 | for te in TE_gffs.split(','): 47 | print("\tLoading: %s"%te) 48 | with open(te, 'r') as fin: 49 | for line in fin: 50 | if line[0] == '#': 51 | continue 52 | data = line.strip().split() 53 | tig = data[0] 54 | sp = int(data[3]) 55 | ep = int(data[4]) 56 | if sp > ep: 57 | sp, ep = ep, sp 58 | if tig not in TE_db: 59 | TE_db[tig] = [] 60 | TE_db[tig].append([sp, ep]) 61 | 62 | for tig in TE_db: 63 | TE_db[tig] = sorted(TE_db[tig]) 64 | 65 | print("Reading gene gff3 and calculating overlaps") 66 | with open(gene_gff3, 'r') as fin: 67 | with open(ovlp_stat, 'w') as fout: 68 | for line in fin: 69 | if line[0] == '#': 70 | continue 71 | data = line.strip().split() 72 | if data[2] != 'gene': 73 | continue 74 | tig = data[0] 75 | gid = data[8].split(';')[0].split('=')[1] 76 | sp = int(data[3]) 77 | ep = int(data[4]) 78 | if sp > ep: 79 | sp, ep = ep, sp 80 | if tig not in TE_db: 81 | continue 82 | fout.write("%s\t%f\n"%(gid, 100*calc_ovlp_ratio(TE_db[tig], sp, ep))) 83 | print("Finished") 84 | 85 | 86 | if __name__ == "__main__": 87 | if len(sys.argv) < 4: 88 | print("Usage: python "+sys.argv[0]+" ") 89 | else: 90 | gene_gff3, TE_gffs, ovlp_stat = sys.argv[1:] 91 | calc_gene_ovlp_te(gene_gff3, TE_gffs, ovlp_stat) 92 | -------------------------------------------------------------------------------- /bin/group_SNP_exon_and_intron.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import time 4 | import gzip 5 | 6 
7 | def search_pos(in_gff, in_snp, out_file):
8 |     pos_db = {}
9 |     if in_gff.split('.')[-1] == 'gz':
10 |         f_gff = gzip.open(in_gff, 'rt')
11 |     else:
12 |         f_gff = open(in_gff, 'r')
13 | 
14 |     for line in f_gff:
15 |         if line[0] == '#' or line.strip() == '':
16 |             continue
17 |         data = line.strip().split()
18 |         chrn = data[0]
19 |         s_pos = int(data[3])
20 |         e_pos = int(data[4])
21 |         name = data[8].split(';')[0].split('=')[1]
22 |         if "G" in name:
23 |             name = '.'.join(name.split('.')[:2])
24 |         else:
25 |             name = name.split('.')[0]
26 |         type = data[2]
27 |         if chrn not in pos_db:
28 |             pos_db[chrn] = {}
29 |         if name not in pos_db[chrn]:
30 |             pos_db[chrn][name] = {}
31 |             pos_db[chrn][name]['gene'] = ()
32 |             pos_db[chrn][name]['exon'] = []
33 |         if type == 'gene':
34 |             pos_db[chrn][name]['gene'] = (s_pos, e_pos)
35 |         elif type == 'exon':
36 |             pos_db[chrn][name]['exon'].append((s_pos, e_pos))
37 |         else:
38 |             continue
39 |     f_gff.close()
40 | 
41 |     if in_snp.split('.')[-1] == 'gz':
42 |         f_snp = gzip.open(in_snp, 'rt')
43 |     else:
44 |         f_snp = open(in_snp, 'r')
45 | 
46 |     if out_file.split('.')[-1] == 'gz':
47 |         f_out = gzip.open(out_file, 'wt')
48 |     else:
49 |         f_out = open(out_file, 'w')
50 | 
51 |     for line in f_snp:
52 |         if line[0] == '#' or line.strip() == '':
53 |             continue
54 |         data = line.strip().split()
55 |         chrn = data[0]
56 |         pos = int(data[1])
57 |         if chrn not in pos_db:
58 |             continue
59 |         is_found = False
60 |         for name in pos_db[chrn]:
61 |             s_pos, e_pos = pos_db[chrn][name]['gene']
62 |             if s_pos <= pos and pos <= e_pos:
63 |                 is_found = True
64 |                 break
65 |         if is_found:
66 |             is_exon = False
67 |             for (s_pos, e_pos) in pos_db[chrn][name]['exon']:
68 |                 if s_pos <= pos and pos <= e_pos:
69 |                     is_exon = True
70 |                     break
71 |             if is_exon:
72 |                 f_out.write(line.strip()+"\t"+name+"\t"+"exon\n")
73 |             else:
74 |                 f_out.write(line.strip()+"\t"+name+"\t"+"intron\n")
75 |     f_snp.close()
76 |     f_out.close()
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     if len(sys.argv) < 4:
81 |         print("Notice: classify each position in a SNP file as exonic or intronic based on a GFF file")
82 |         print("Usage: python "+sys.argv[0]+" <in_gff> <in_snp> <out_file>")
83 |     else:
84 |         s_time = time.time()
85 |         in_gff = sys.argv[1]
86 |         in_snp = sys.argv[2]
87 |         out_file = sys.argv[3]
88 |         search_pos(in_gff, in_snp, out_file)
89 |         e_time = time.time()
90 |         print("cost time " + str(e_time-s_time))
91 | 
92 | 
--------------------------------------------------------------------------------
/bin/group_exon_and_intron.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import time
4 | import gzip
5 | 
6 | 
7 | def search_pos(in_gff, in_vcf, out_file):
8 |     pos_db = {}
9 |     if in_gff.split('.')[-1] == 'gz':
10 |         f_gff = gzip.open(in_gff, 'rt')
11 |     else:
12 |         f_gff = open(in_gff, 'r')
13 | 
14 |     for line in f_gff:
15 |         if line[0] == '#' or line.strip() == '':
16 |             continue
17 |         data = line.strip().split()
18 |         chrn = data[0]
19 |         s_pos = int(data[3])
20 |         e_pos = int(data[4])
21 |         name = data[8].split(';')[0].split('=')[1]
22 |         if "G" in name:
23 |             name = '.'.join(name.split('.')[:2])
24 |         else:
25 |             name = name.split('.')[0]
26 |         type = data[2]
27 |         if chrn not in pos_db:
28 |             pos_db[chrn] = {}
29 |         if name not in pos_db[chrn]:
30 |             pos_db[chrn][name] = {}
31 |             pos_db[chrn][name]['gene'] = ()
32 |             pos_db[chrn][name]['exon'] = []
33 |         if type == 'gene':
34 |             pos_db[chrn][name]['gene'] = (s_pos, e_pos)
35 |         elif type == 'exon':
36 |             pos_db[chrn][name]['exon'].append((s_pos, e_pos))
37 |         else:
38 |             continue
39 |     f_gff.close()
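    # pos_db layout built above (shared with group_SNP_exon_and_intron.py):
    #   pos_db[chrn][name] == {'gene': (start, end), 'exon': [(start, end), ...]}
    # e.g. pos_db['Chr01']['Gene01'] == {'gene': (100, 900),
    #                                    'exon': [(100, 250), (600, 900)]}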
40 | 
41 |     if in_vcf.split('.')[-1] == 'gz':
42 |         f_vcf = gzip.open(in_vcf, 'rt')
43 |     else:
44 |         f_vcf = open(in_vcf, 'r')
45 | 
46 |     if out_file.split('.')[-1] == 'gz':
47 |         f_out = gzip.open(out_file, 'wt')
48 |     else:
49 |         f_out = open(out_file, 'w')
50 | 
51 |     for line in f_vcf:
52 |         if line[0] == '#' or line.strip() == '':
53 |             continue
54 |         data = line.strip().split()
55 |         chrn = data[0]
56 |         pos = int(data[1])
57 |         if chrn not in pos_db:
58 |             continue
59 |         is_found = False
60 |         for name in pos_db[chrn]:
61 |             s_pos, e_pos = pos_db[chrn][name]['gene']
62 |             if s_pos <= pos and pos <= e_pos:
63 |                 is_found = True
64 |                 break
65 |         if is_found:
66 |             is_exon = False
67 |             for (s_pos, e_pos) in pos_db[chrn][name]['exon']:
68 |                 if s_pos <= pos and pos <= e_pos:
69 |                     is_exon = True
70 |                     break
71 |             if is_exon:
72 |                 f_out.write(chrn+"\t"+name+"\t"+str(pos)+"\t"+"exon\n")
73 |             else:
74 |                 f_out.write(chrn+"\t"+name+"\t"+str(pos)+"\t"+"intron\n")
75 |     f_vcf.close()
76 |     f_out.close()
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     if len(sys.argv) < 4:
81 |         print("Notice: classify each position in a VCF file as exonic or intronic based on a GFF file")
82 |         print("Usage: python "+sys.argv[0]+" <in_gff> <in_vcf> <out_file>")
83 |     else:
84 |         s_time = time.time()
85 |         in_gff = sys.argv[1]
86 |         in_vcf = sys.argv[2]
87 |         out_file = sys.argv[3]
88 |         search_pos(in_gff, in_vcf, out_file)
89 |         e_time = time.time()
90 |         print("cost time " + str(e_time-s_time))
91 | 
92 | 
--------------------------------------------------------------------------------
/bin/nucmer_statistics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys, os
3 | 
4 | 
5 | def nucmer_statistics(r_fa, q_fa, out_pre, ts):
6 |     print("Running nucmer")
7 |     cmd = "nucmer -p "+out_pre+" "+r_fa+" "+q_fa+" -t "+ts
8 |     print("Running command: "+cmd)
9 |     os.system(cmd)
10 | 
11 |     print("Running delta-filter")
12 |     cmd = "delta-filter -gqr "+out_pre+".delta > "+out_pre+".filtered"
13 |     print("Running command: "+cmd)
14 |     os.system(cmd)
15 |     print("Statistics")
16 |     data_db = {}
17 |     last_INDEL_pos = {}
18 |     r_chr_len_db = {}
19 |     q_chr_len_db = {}
20 |     sv_list = ['SNP', 'INDEL']
21 |     with os.popen("show-snps -ClrT "+out_pre+".filtered", 'r') as f_in:
22 |         for line in f_in:
23 |             data = line.strip().split()
24 |             if len(data) == 0 or data[0].isdigit() == False:
25 |                 continue
26 | 
27 |             pos = int(data[0])
28 |             r_chrn = data[-2]
29 |             q_chrn = data[-1]
30 | 
31 |             if r_chrn not in data_db:
32 |                 data_db[r_chrn] = {'SNP': {'count': 0, 'size': 0}, 'INDEL': {'count': 0, 'size': 0}}
33 |                 last_INDEL_pos[r_chrn] = 0
34 |                 r_chr_len_db[r_chrn] = int(data[6])
35 |             if q_chrn not in q_chr_len_db:
36 |                 q_chr_len_db[q_chrn] = int(data[7])
37 | 
38 |             if data[1] != '.' and data[2] != '.':
39 |                 data_db[r_chrn]['SNP']['count'] += 1
40 |                 data_db[r_chrn]['SNP']['size'] += 1
41 |                 last_INDEL_pos[r_chrn] = 0
42 |             else:
43 |                 if pos - last_INDEL_pos[r_chrn] > 1:
44 |                     data_db[r_chrn]['INDEL']['count'] += 1
45 |                 data_db[r_chrn]['INDEL']['size'] += 1
46 |                 last_INDEL_pos[r_chrn] = pos
47 | 
48 |     total_size = {}
49 |     total_count = {}
50 |     for chrn in data_db:
51 |         total_size[chrn] = 0
52 |         total_count[chrn] = 0
53 |         for type in data_db[chrn]:
54 |             total_size[chrn] += data_db[chrn][type]['size']
55 |             total_count[chrn] += data_db[chrn][type]['count']
56 | 
57 |     with open(out_pre+".statistics", 'w') as f_out:
58 |         f_out.write("Type\tNumber\tSize\tSize/TotalVar\tSize/ChrSize\n")
59 |         for chrn in data_db:
60 |             for type in sv_list:
61 |                 if type not in data_db[chrn]:
62 |                     continue
63 |                 f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%(type, data_db[chrn][type]['count'], data_db[chrn][type]['size'], data_db[chrn][type]['size']*1.0/total_size[chrn], data_db[chrn][type]['size']*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn])))
64 |             f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%("Total", total_count[chrn], total_size[chrn], total_size[chrn]*1.0/total_size[chrn], total_size[chrn]*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn])))
65 |     print("Success")
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     if len(sys.argv) < 5:
70 |         print("Usage: python "+sys.argv[0]+" <ref_fasta> <qry_fasta> <out_prefix> <threads>")
71 |     else:
72 |         r_fa, q_fa, out_pre, ts = sys.argv[1:]
73 |         nucmer_statistics(r_fa, q_fa, out_pre, ts)
74 | 
--------------------------------------------------------------------------------
/bin/eval_synteny.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import re
4 | import numpy as np
5 | import bisect
6 | 
7 | 
8 | def get_opts():
9 |     group = argparse.ArgumentParser()
10 |     group.add_argument('-r', '--ref', help='ref.bed', required=True)
11 |     group.add_argument('-q', '--qry', help='qry.bed', required=True)
12 | 
13 |     return group.parse_args()
14 | 
15 | # Longest increasing subsequence
16 | def LIS(arr):
17 |     min_num = [-1]
18 |     for n in arr:
19 |         k = bisect.bisect_left(min_num, n)
20 |         if len(min_num) == k:
21 |             min_num.append(n)
22 |         else:
23 |             min_num[k] = n
24 |     return len(min_num)-1
25 | 
26 | 
27 | def eval_synteny(ref_file, qry_file):
28 |     print("Loading bed files")
29 |     ref_db = {}
30 |     qry_db = {}
31 |     ref_list = []
32 |     qry_list = []
33 |     with open(ref_file, 'r') as fin:
34 |         for line in fin:
35 |             data = line.strip().split()
36 |             chrn = data[0]
37 |             gid = data[3]
38 |             if chrn not in ref_db:
39 |                 ref_db[chrn] = []
40 |                 ref_list.append(chrn)
41 |             ref_db[chrn].append(gid)
42 | 
43 |     with open(qry_file, 'r') as fin:
44 |         for line in fin:
45 |             data = line.strip().split()
46 |             chrn = data[0]
47 |             gid = data[3]
48 |             if chrn not in qry_db:
49 |                 qry_db[chrn] = []
50 |                 qry_list.append(chrn)
51 |             qry_db[chrn].append(gid)
52 | 
53 |     print("Comparing")
54 |     total_lis_values = 0
55 |     total_gene_cnt = 0
56 | 
57 |     print("Result")
58 |     for qchr in qry_list:
59 |         max_score = 0
60 |         max_lis_val = 0
61 |         max_gene_cnt = 0
62 |         max_chr = ""
63 |         for rchr in ref_list:
64 |             ref_idx_db = {ref_db[rchr][idx]: idx for idx in range(len(ref_db[rchr]))}
65 |             tmp_order_list = [ref_idx_db[qry_db[qchr][idx]]
66 |                               if qry_db[qchr][idx] in ref_idx_db
67 |                               else -1
68 |                               for idx in range(len(qry_db[qchr]))]
69 |             region_order_list = []
70 |             for _ in tmp_order_list:
71 |                 if _ != -1:
72 |                     region_order_list.append(_)
73 |             chr_gene_cnt = (len(ref_db[rchr])+len(qry_db[qchr]))/2.
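            # LIS() above computes the length of the longest strictly
            # increasing subsequence by patience sorting (O(n log n) via
            # bisect); e.g. LIS([3, 1, 2, 5, 4]) == 3 for 1, 2, 4. Taking the
            # max with the reversed list on the next line also credits
            # chromosomes that are syntenic in the opposite orientation.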
74 | lis_value = max(LIS(region_order_list), LIS(region_order_list[::-1])) 75 | score = lis_value*100./chr_gene_cnt 76 | if score > max_score: 77 | max_lis_val = lis_value 78 | max_gene_cnt = chr_gene_cnt 79 | max_score = score 80 | max_chr = rchr 81 | print("%s match %s: %.4f%%"%(qchr, max_chr, max_score)) 82 | total_lis_values += max_lis_val 83 | total_gene_cnt += max_gene_cnt 84 | print("Total: %.4f%%"%(total_lis_values*100./total_gene_cnt)) 85 | 86 | 87 | if __name__ == "__main__": 88 | opts = get_opts() 89 | ref_file = opts.ref 90 | qry_file = opts.qry 91 | eval_synteny(ref_file, qry_file) 92 | -------------------------------------------------------------------------------- /bin/rename_ID.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def generate_new_id_by_gff(chr_pre, in_gff): 6 | id_db = {} 7 | gff_db = {} 8 | is_first_id = True 9 | with open(in_gff, 'r') as f_in: 10 | for line in f_in: 11 | data = line.strip().split() 12 | if line[0] == '#' or len(data) < 9: 13 | continue 14 | if data[2] == 'gene': 15 | chrn = data[0] 16 | #if chrn[:3].lower() != 'chr': 17 | # continue 18 | s_p = int(data[3]) 19 | id = data[8].split(";")[0].split("=")[1] 20 | if is_first_id: 21 | print("Check ID: %s"%id) 22 | is_first_id = False 23 | if chrn not in id_db: 24 | id_db[chrn] = [] 25 | id_db[chrn].append([s_p, id]) 26 | gff_db[id] = [] 27 | gff_db[id].append(line) 28 | 29 | new_id_db = {} 30 | chr_base = {} 31 | ordered_id = [] 32 | with open("rename_list.txt", 'w') as f_out: 33 | tig_base = 10 34 | for chrn in sorted(id_db): 35 | base = 10 36 | for info in sorted(id_db[chrn]): 37 | if chrn[:3].lower() == 'chr': 38 | idx = int(chrn[3:]) 39 | new_id = chr_pre+".%02dG%07d"%(idx, base) 40 | base += 10 41 | else: 42 | new_id = chr_pre+".%08d"%(tig_base) 43 | tig_base += 10 44 | # new_id = info[1].replace('.', '').replace('G', '') 45 | f_out.write("%s\t%d\t%s\t%s\n"%(chrn, info[0], info[1], new_id)) 46 | ordered_id.append(info[1]) 47 | new_id_db[info[1]] = new_id 48 | return new_id_db, gff_db, ordered_id 49 | 50 | 51 | def rename_id(chr_pre, in_gff, out_gff, in_fastas, out_fastas): 52 | print("Generating rename list") 53 | rename_id_db, gff_db, ordered_id = generate_new_id_by_gff(chr_pre, in_gff) 54 | 55 | print("Dealing gff") 56 | with open(out_gff, 'w') as fout: 57 | fout.write("###gff version 3\n") 58 | for id in ordered_id: 59 | for line in gff_db[id]: 60 | fout.write(line.replace(id, rename_id_db[id])) 61 | fout.write("\n") 62 | 63 | print("Dealing fasta") 64 | in_fasta_list = in_fastas.split(',') 65 | out_fasta_list = out_fastas.split(',') 66 | for i in range(0, len(in_fasta_list)): 67 | print("\tDealing %s"%in_fasta_list[i]) 68 | with open(in_fasta_list[i], 'r') as f_in: 69 | with open(out_fasta_list[i], 'w') as f_out: 70 | suf = in_fasta_list[i].split('.')[-1].lower() 71 | if suf == "fasta" or suf == "fa": 72 | for line in f_in: 73 | if line[0] == '>': 74 | id = line.strip()[1:] 75 | if id in rename_id_db: 76 | line = line.replace(id, rename_id_db[id]) 77 | f_out.write(line) 78 | else: 79 | for line in f_in: 80 | id = line.strip().split()[0] 81 | if id in rename_id_db: 82 | f_out.write(line.replace(id, rename_id_db[id])) 83 | else: 84 | f_out.write(line) 85 | print("Finished") 86 | 87 | 88 | if __name__ == "__main__": 89 | if len(sys.argv) < 5: 90 | print("Usage: python "+sys.argv[0]+" ") 91 | print("Notice: sort and rename id with in_gff, and rename them in fasta files") 92 | print("Example: python 
"+sys.argv[0]+" CB5 in.gff out.gff 1.fasta,2.fasta 1.new.fasta,2.new.fasta") 93 | else: 94 | chr_pre, in_gff, out_gff, in_fastas, out_fastas = sys.argv[1:] 95 | rename_id(chr_pre, in_gff, out_gff, in_fastas, out_fastas) 96 | -------------------------------------------------------------------------------- /bin/simple_ANGSD_without_errorCorrect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def help_message(): 6 | print("Usage: python "+sys.argv[0]+" -l -r [-out -p ]") 7 | 8 | 9 | def parse_options(ARGV): 10 | opt_dict = {} 11 | for i in range(0, len(ARGV), 2): 12 | opt = ARGV[i] 13 | if opt == '-l': 14 | opt_dict['list'] = ARGV[i+1] 15 | elif opt == '-p': 16 | opt_dict['path'] = ARGV[i+1] 17 | elif opt == '-r': 18 | opt_dict['regions'] = ARGV[i+1] 19 | elif opt == '-out': 20 | opt_dict['out_name'] = ARGV[i+1] 21 | elif opt == '-h': 22 | help_message() 23 | exit(0) 24 | return opt_dict 25 | 26 | 27 | def run_abbababa(opts): 28 | if 'path' not in opts: 29 | bam_path = './' 30 | else: 31 | bam_path = opts['path'] 32 | bam_files = [] 33 | for fn in os.listdir(bam_path): 34 | if fn[-4:] == '.bam': 35 | bam_files.append(os.path.join(bam_path, fn)) 36 | 37 | print("Indexing bams") 38 | for fn in bam_files: 39 | fn = fn.split('/')[-1] 40 | if os.path.isfile(os.path.join('./', fn+'.bai')) == False: 41 | cmd = 'samtools index '+fn 42 | print("Running command: "+cmd) 43 | os.system(cmd) 44 | 45 | print("Done\nReading list") 46 | list_db = {} 47 | with open(opts['list'], 'r') as f_in: 48 | for line in f_in: 49 | data = line.strip().split() 50 | if data[1] not in list_db: 51 | list_db[data[1]] = {} 52 | list_db[data[1]]['name'] = [] 53 | list_db[data[1]]['path'] = [] 54 | for fn in bam_files: 55 | if data[0] in fn: 56 | list_db[data[1]]['path'].append(fn) 57 | list_db[data[1]]['name'].append(data[0]) 58 | 59 | if 'out_name' not in opts: 60 | out_name = "Outgroup" 61 | else: 62 | out_name = opts['out_name'] 63 | 64 | print("Done\nGenerate bam.filelist sizeFile.size popNames.name bamWithErrors.filelist errorList.error") 65 | with open("bam.filelist", "w") as f_list: 66 | with open("sizeFile.size", "w") as f_size: 67 | with open("popNames.name", "w") as f_pop: 68 | for subgroup in list_db: 69 | if subgroup != out_name: 70 | f_pop.write(subgroup+'\n') 71 | f_list.write('\n'.join(list_db[subgroup]['path'])+'\n') 72 | group_size = len(list_db[subgroup]['name']) 73 | f_size.write(str(group_size)+'\n') 74 | f_list.write(list_db[out_name]['path'][0]+'\n') 75 | f_pop.write(out_name+'\n') 76 | f_size.write('1\n') 77 | 78 | print("Done\nDo abbababa") 79 | if "regions" not in opts: 80 | regions_file = "regions.txt" 81 | else: 82 | regions_file = opts["regions"] 83 | 84 | cmd = "ANGSD -doAbbababa2 1 -bam bam.filelist -sizeFile sizeFile.size -doCounts 1 -out bam.Angsd -rf "+regions_file+" -useLast 1 -minQ 20 -minMapQ 30" 85 | print("Running command: "+cmd) 86 | os.system(cmd) 87 | 88 | print("Done\n") 89 | 90 | cmd = "Rscript DSTAT angsdFile=\"bam.Angsd\" out=\"result\" sizeFile=sizeFile.size nameFile=popNames.name" 91 | print("Running command: "+cmd) 92 | os.system(cmd) 93 | print("Done\nSuccess") 94 | 95 | 96 | if __name__ == "__main__": 97 | if len(sys.argv) == 1: 98 | help_message() 99 | else: 100 | opts = parse_options(sys.argv[1:]) 101 | run_abbababa(opts) 102 | -------------------------------------------------------------------------------- /bin/check_cds.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from enum import Enum 4 | 5 | 6 | class CDS_Type(Enum): 7 | VALID = 1 8 | LENGTH_ERROR = 2 9 | MISSING_START_CODON = 3 10 | MISSING_STOP_CODON = 4 11 | EARLY_STOP_CODON = 5 12 | 13 | 14 | def get_opts(): 15 | group = argparse.ArgumentParser() 16 | group.add_argument("-i", "--input", help="Input CDS file", required=True) 17 | group.add_argument( 18 | "--detail", help="If set, output detail information", action="store_true" 19 | ) 20 | group.add_argument( 21 | "-o", 22 | "--output", 23 | help="Output summary file, if not set, output to stdout", 24 | default="", 25 | ) 26 | return group.parse_args() 27 | 28 | 29 | def check_cds(in_cds, is_detail, out_summary): 30 | start_codon = set(["ATG"]) 31 | stop_codon = set(["TAG", "TAA", "TGA"]) 32 | cds_db = {} 33 | with open(in_cds, "r") as fin: 34 | for line in fin: 35 | if line.strip() == "": 36 | continue 37 | if line[0] == ">": 38 | gid = line.strip().split()[0][1:] 39 | cds_db[gid] = [] 40 | else: 41 | cds_db[gid].append(line.strip().upper()) 42 | 43 | for gid in cds_db: 44 | cds_db[gid] = "".join(cds_db[gid]) 45 | 46 | detail_db = {} 47 | for gid in cds_db: 48 | detail_db[gid] = CDS_Type.VALID 49 | if len(cds_db[gid]) % 3 != 0: 50 | detail_db[gid] = CDS_Type.LENGTH_ERROR 51 | else: 52 | if cds_db[gid][:3] not in start_codon: 53 | detail_db[gid] = CDS_Type.MISSING_START_CODON 54 | elif cds_db[gid][-3:] not in stop_codon: 55 | detail_db[gid] = CDS_Type.MISSING_STOP_CODON 56 | else: 57 | for _ in range(3, len(cds_db[gid]) - 3, 3): 58 | if cds_db[gid][_ : _ + 3] in stop_codon: 59 | detail_db[gid] = CDS_Type.EARLY_STOP_CODON 60 | 61 | # Valid, length error, missing start codon, missing stop codon, early stop codon 62 | summary_info = [0, 0, 0, 0, 0] 63 | for gid in detail_db: 64 | summary_info[detail_db[gid].value - 1] += 1 65 | 66 | out_info = [] 67 | out_info.append("# Summary") 68 | out_info.append("Valid: %d" % summary_info[0]) 69 | out_info.append("Length error: %d" % summary_info[1]) 70 | out_info.append("Missing start codon: %d" % summary_info[2]) 71 | out_info.append("Missing stop codon: %d" % summary_info[3]) 72 | out_info.append("Early stop codon: %d" % summary_info[4]) 73 | 74 | if is_detail: 75 | out_info.append("") 76 | out_info.append("# Error detail") 77 | for gid in sorted(detail_db): 78 | if detail_db[gid] != CDS_Type.VALID: 79 | out_info.append("%s: %s" % (gid, detail_db[gid].name)) 80 | 81 | if out_summary: 82 | with open(out_summary, "w") as fout: 83 | fout.write("%s\n" % ("\n".join(out_info))) 84 | else: 85 | print("%s" % ("\n".join(out_info))) 86 | 87 | 88 | if __name__ == "__main__": 89 | opts = get_opts() 90 | in_cds = opts.input 91 | is_detail = True if opts.detail else False 92 | out_summary = opts.output 93 | check_cds(in_cds, is_detail, out_summary) 94 | -------------------------------------------------------------------------------- /bin/nucmer_statistics_all_sv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def nucmer_statistics(r_fa, q_fa, out_pre, ts): 6 | print("Running nucmer") 7 | cmd = "nucmer -p "+out_pre+" "+r_fa+" "+q_fa+" -t "+ts 8 | print("Running command: "+cmd) 9 | os.system(cmd) 10 | 11 | print("Running delta-filter") 12 | cmd = "delta-filter -gqr "+out_pre+".delta > "+out_pre+".filtered" 13 | print("Running command: "+cmd) 14 | os.system(cmd) 15 | print("Statisitcs") 16 | data_db = 
{} 17 | last_INDEL_pos = {} 18 | r_chr_len_db = {} 19 | q_chr_len_db = {} 20 | sv_list = ['SNP', 'INDEL', 'JMP', 'INV', 'DUP', 'BRK'] 21 | with os.popen("show-diff -H "+out_pre+".filtered", 'r') as f_in: 22 | for line in f_in: 23 | data = line.strip().split() 24 | if len(data) < 5: 25 | continue 26 | chrn = data[0] 27 | sv = data[1] 28 | if chrn not in data_db: 29 | data_db[chrn] = {} 30 | if sv not in sv_list: 31 | continue 32 | if sv not in data_db[chrn]: 33 | data_db[chrn][sv] = {'count': 0, 'size': 0} 34 | sv_size = abs(int(data[-1])) 35 | data_db[chrn][sv]['count'] += 1 36 | data_db[chrn][sv]['size'] += sv_size 37 | 38 | 39 | with os.popen("show-snps -ClrT "+out_pre+".filtered", 'r') as f_in: 40 | for line in f_in: 41 | data = line.strip().split() 42 | if len(data) == 0 or data[0].isdigit() == False: 43 | continue 44 | 45 | pos = int(data[0]) 46 | r_chrn = data[-2] 47 | q_chrn = data[-1] 48 | if 'SNP' not in data_db[r_chrn]: 49 | data_db[r_chrn]['SNP'] = {'count': 0, 'size':0} 50 | if 'INDEL' not in data_db[r_chrn]: 51 | data_db[r_chrn]['INDEL'] = {'count': 0, 'size':0} 52 | if r_chrn not in r_chr_len_db: 53 | last_INDEL_pos[r_chrn] = 0 54 | r_chr_len_db[r_chrn] = int(data[6]) 55 | if q_chrn not in q_chr_len_db: 56 | q_chr_len_db[q_chrn] = int(data[7]) 57 | 58 | if data[1] != '.' and data[2] != '.': 59 | data_db[r_chrn]['SNP']['count'] += 1 60 | data_db[r_chrn]['SNP']['size'] += 1 61 | last_INDEL_pos[r_chrn] = 0 62 | else: 63 | if pos - last_INDEL_pos[r_chrn] > 1: 64 | data_db[r_chrn]['INDEL']['count'] += 1 65 | data_db[r_chrn]['INDEL']['size'] += 1 66 | last_INDEL_pos[r_chrn] = pos 67 | 68 | total_size = {} 69 | total_count = {} 70 | for chrn in data_db: 71 | total_size[chrn] = 0 72 | total_count[chrn] = 0 73 | for type in data_db[chrn]: 74 | total_size[chrn] += data_db[chrn][type]['size'] 75 | total_count[chrn] += data_db[chrn][type]['count'] 76 | 77 | with open(out_pre+".statistics", 'w') as f_out: 78 | f_out.write("Type\tNumber\tSize\tSize/TotalVar\tSize/ChrSize\n") 79 | for chrn in data_db: 80 | for type in sv_list: 81 | if type not in data_db[chrn]: 82 | continue 83 | f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%(type, data_db[chrn][type]['count'], data_db[chrn][type]['size'], data_db[chrn][type]['size']*1.0/total_size[chrn], data_db[chrn][type]['size']*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn]))) 84 | f_out.write("%s\t%d\t%d\t%.4f\t%.4f\n"%("Total", total_count[chrn], total_size[chrn], total_size[chrn]*1.0/total_size[chrn], total_size[chrn]*2.0/(r_chr_len_db[chrn]+q_chr_len_db[chrn]))) 85 | print("Success") 86 | 87 | 88 | if __name__ == "__main__": 89 | if len(sys.argv) < 5: 90 | print("Usage: python "+sys.argv[0]+" ") 91 | else: 92 | r_fa, q_fa, out_pre, ts = sys.argv[1:] 93 | nucmer_statistics(r_fa, q_fa, out_pre, ts) 94 | -------------------------------------------------------------------------------- /bin/transfer_gff3_with_agp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | 5 | def get_gene_ctg(chr_list, sp, ep): 6 | infos = [] 7 | for info in chr_list: 8 | chsp = info[0] 9 | chep = info[1] 10 | ovlp = min(chep, ep)-max(chsp, sp) 11 | if ovlp>=0: 12 | infos.append([ovlp, info]) 13 | if infos != []: 14 | return sorted(infos, reverse=True)[0][1] 15 | else: 16 | return [] 17 | 18 | 19 | def trans_anno(in_gff3, in_old_agp, in_new_agp, out_gff3): 20 | print("Reading gff3") 21 | gff3_db = {} 22 | with open(in_gff3, 'r') as fin: 23 | for line in fin: 24 | if line[0] == '#' or line.strip() == '': 25 | 
continue 26 | data = line.strip().split() 27 | if data[2] == 'gene': 28 | ID = data[8].split(';')[0].split('=')[-1] 29 | if ID not in gff3_db: 30 | gff3_db[ID] = [] 31 | gff3_db[ID].append(line) 32 | 33 | print("Reading old agp") 34 | old_agp_db = {} 35 | with open(in_old_agp, 'r') as fin: 36 | for line in fin: 37 | data = line.strip().split() 38 | if data[4] == 'U': 39 | continue 40 | chrn = data[0] 41 | sp = int(data[1]) 42 | ep = int(data[2]) 43 | direct = data[-1] 44 | tig = data[5] 45 | if chrn not in old_agp_db: 46 | old_agp_db[chrn] = [] 47 | old_agp_db[chrn].append([sp, ep, tig, direct]) 48 | 49 | for chrn in old_agp_db: 50 | old_agp_db[chrn] = sorted(old_agp_db[chrn]) 51 | 52 | print("Reading new agp") 53 | new_agp_db = {} 54 | with open(in_new_agp, 'r') as fin: 55 | for line in fin: 56 | data = line.strip().split() 57 | if data[4] == 'U': 58 | continue 59 | tig = data[5] 60 | chrn = data[0] 61 | sp = int(data[1]) 62 | ep = int(data[2]) 63 | direct = data[-1] 64 | new_agp_db[tig] = [chrn, sp, ep, direct] 65 | 66 | print("Writing new gff3") 67 | with open(out_gff3, 'w') as fout: 68 | fout.write("###gff version 3\n") 69 | for id in sorted(gff3_db): 70 | for i in range(0, len(gff3_db[id])): 71 | data = gff3_db[id][i].split() 72 | if data[2] == 'gene': 73 | break 74 | chrn = data[0] 75 | #if chrn not in old_agp_db: 76 | # continue 77 | sp = int(data[3]) 78 | ep = int(data[4]) 79 | match_ctg = get_gene_ctg(old_agp_db[chrn], sp, ep) 80 | if match_ctg == []: 81 | print(id, data) 82 | else: 83 | csp, cep, tig, tdir = match_ctg 84 | nchrn, nsp, nep, ndir = new_agp_db[tig] 85 | for line in gff3_db[id]: 86 | data = line.strip().split() 87 | gsp = int(data[3]) 88 | gep = int(data[4]) 89 | gdir = data[6] 90 | if tdir == '+': 91 | gts = gsp-csp+1 92 | gte = gep-csp+1 93 | else: 94 | gts = cep-gep+1 95 | gte = cep-gsp+1 96 | if gdir == tdir: 97 | gtd = '+' 98 | else: 99 | gtd = '-' 100 | if ndir == '+': 101 | gns = nsp+gts-1 102 | gne = nsp+gte-1 103 | else: 104 | gns = nep-gte+1 105 | gne = nep-gts+1 106 | if gtd == ndir: 107 | gnd = '+' 108 | else: 109 | gnd = '-' 110 | if gns <= 0 or gne <= 0: 111 | continue 112 | data[0] = nchrn 113 | data[3] = str(gns) 114 | data[4] = str(gne) 115 | data[6] = gnd 116 | fout.write('\t'.join(data)+'\n') 117 | fout.write('\n') 118 | print("Finished") 119 | 120 | 121 | if __name__ == "__main__": 122 | if len(sys.argv) < 5: 123 | print("Usage: python "+sys.argv[0]+" ") 124 | else: 125 | in_gff3, in_old_agp, in_new_agp, out_gff3 = sys.argv[1:] 126 | trans_anno(in_gff3, in_old_agp, in_new_agp, out_gff3) 127 | -------------------------------------------------------------------------------- /bin/sort_gff3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import re 4 | 5 | 6 | def generate_new_id_by_gff(chr_pre, in_gff): 7 | id_db = {} 8 | gff_db = {} 9 | is_first_id = True 10 | with open(in_gff, 'r') as f_in: 11 | for line in f_in: 12 | data = line.strip().split() 13 | if line[0] == '#' or len(data) < 9: 14 | continue 15 | if data[2] == 'gene': 16 | chrn = data[0] 17 | s_p = int(data[3]) 18 | id = data[8].split(";")[0].split("=")[1] 19 | if is_first_id: 20 | print("Check ID: %s" % id) 21 | is_first_id = False 22 | if chrn not in id_db: 23 | id_db[chrn] = [] 24 | id_db[chrn].append([s_p, id]) 25 | gff_db[id] = [] 26 | gff_db[id].append(line) 27 | 28 | new_id_db = {} 29 | ordered_id = [] 30 | tig_base = 10 31 | for chrn in sorted(id_db, key=lambda x: int(re.findall(r'\d+', x)[0]) if 
len(re.findall(r'\d+', x)) > 0 else 1000): 32 | base = 10 33 | for info in sorted(id_db[chrn]): 34 | if chrn[:3].lower() == 'chr': 35 | idx, hap = re.findall(r"(\d+)([A-Z]*)", chrn)[0] 36 | idx = int(idx) 37 | if not hap: 38 | hap = "G" 39 | new_id = chr_pre + ".%02d%s%07d" % (idx, hap, base) 40 | base += 10 41 | else: 42 | new_id = chr_pre + ".%08d" % tig_base 43 | tig_base += 10 44 | ordered_id.append(info[1]) 45 | new_id_db[info[1]] = new_id 46 | return new_id_db, gff_db, ordered_id 47 | 48 | 49 | def rename_id(chr_pre, in_gff, out_gff): 50 | print("Generating rename list") 51 | rename_id_db, gff_db, ordered_id = generate_new_id_by_gff(chr_pre, in_gff) 52 | 53 | print("Dealing gff3") 54 | with open(out_gff, 'w') as fout: 55 | fout.write("###gff version 3\n") 56 | for ori_id in ordered_id: 57 | mrna_idx = 1 58 | for line in gff_db[ori_id]: 59 | data = line.strip().split() 60 | if data[2] == 'gene': 61 | gid = rename_id_db[ori_id] 62 | data[8] = "ID=%s;Name=%s" % (gid, gid) 63 | elif data[2] == 'mRNA': 64 | mrid = "%s.t%d" % (gid, mrna_idx) 65 | mrna_idx += 1 66 | other_idx_db = {} 67 | data[8] = "ID=%s;Name=%s;Parent=%s" % (mrid, mrid, gid) 68 | else: 69 | feature = data[2] 70 | if feature not in other_idx_db: 71 | other_idx_db[feature] = 1 72 | other_id = "%s.%s%d" % (mrid, feature, other_idx_db[feature]) 73 | other_idx_db[feature] += 1 74 | data[8] = "ID=%s;Name=%s;Parent=%s" % (other_id, other_id, mrid) 75 | fout.write("%s\n" % ("\t".join(data))) 76 | fout.write("###\n") 77 | print("Finished") 78 | 79 | 80 | if __name__ == "__main__": 81 | if len(sys.argv) < 3: 82 | print("Usage: python " + sys.argv[0] + " ") 83 | print("Notice: sort and rename id with in_gff by coordinate, the chromosome ID should be like: Chr01 for mono " 84 | "assembly, Chr01A for phased assembly.") 85 | print("Example: python " + sys.argv[0] + " CB5 in.gff3 out.gff3") 86 | else: 87 | chr_pre, in_gff3, out_gff3 = sys.argv[1:] 88 | rename_id(chr_pre, in_gff3, out_gff3) 89 | -------------------------------------------------------------------------------- /bin/quick_mask_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | import multiprocessing 4 | 5 | 6 | def read_bed(in_bed, th): 7 | win_regions = {} 8 | with open(in_bed, 'r') as f_in: 9 | for line in f_in: 10 | if line.strip() == '': 11 | continue 12 | data = line.strip().split() 13 | if int(data[3]) > 150: 14 | chrn = data[0] 15 | sp = int(data[1])-1 16 | ep = int(data[2])-1 17 | if chrn not in win_regions: 18 | win_regions[chrn] = [] 19 | win_regions[chrn].append([sp, ep]) 20 | return win_regions 21 | 22 | 23 | def merge_regions(ori_regions): 24 | new_regions = {} 25 | for chrn in ori_regions: 26 | new_regions[chrn] = [] 27 | tmp_regions = [] 28 | last_e = 0 29 | for region in sorted(ori_regions[chrn]): 30 | sr = region[0] 31 | er = region[1] 32 | if last_e == 0: 33 | tmp_regions.append(sr) 34 | last_e = er 35 | if sr > last_e: 36 | tmp_regions.append(last_e) 37 | tmp_regions.append(sr) 38 | last_e = er 39 | else: 40 | if er > last_e: 41 | last_e = er 42 | tmp_regions.append(last_e) 43 | for i in range(0, len(tmp_regions), 2): 44 | new_regions[chrn].append([tmp_regions[i], tmp_regions[i+1]]) 45 | return new_regions 46 | 47 | 48 | def read_fasta(in_fa): 49 | seq_db = {} 50 | seq_id_list = [] 51 | with open(in_fa, 'r') as f_in: 52 | id = '' 53 | seq = '' 54 | for line in f_in: 55 | if line[0] == '>': 56 | if seq != '': 57 | seq_db[id] = seq 58 | id = line.strip()[1:] 59 | 
seq_id_list.append(id) 60 | seq = '' 61 | else: 62 | seq += line.strip() 63 | seq_db[id] = seq 64 | return seq_db, seq_id_list 65 | 66 | 67 | def mask_fasta(id_list, seq_db, win_regions): 68 | for id in id_list: 69 | new_seq = '' 70 | sp = 0 71 | if id in win_regions: 72 | for region in win_regions[id]: 73 | if region[0] > sp: 74 | new_seq += seq_db[id][sp: region[0]] 75 | new_seq += 'N'*(region[1]-region[0]+1) 76 | sp = region[1]+1 77 | if sp < len(seq_db[id]): 78 | new_seq += seq_db[id][sp: len(seq_db[id])] 79 | else: 80 | new_seq = seq_db[id] 81 | 82 | with open(id+'.tmp', 'w') as f_out: 83 | f_out.write(">%s\n%s\n"%(id, new_seq)) 84 | 85 | 86 | def quick_mask_genome(in_fa, in_bed, out_fa, th, ts): 87 | print("Reading fasta") 88 | seq_db, seq_id_list = read_fasta(in_fa) 89 | 90 | print("Reading bed") 91 | win_regions = merge_regions(read_bed(in_bed, th)) 92 | 93 | task_per_thread = int(len(seq_id_list)/ts) 94 | 95 | task_list = [] 96 | 97 | print("Masking genome") 98 | for i in range(0, ts): 99 | if i < ts-1: 100 | t = multiprocessing.Process(target=mask_fasta, args=(seq_id_list[i*task_per_thread: (i+1)*task_per_thread], seq_db, win_regions)) 101 | else: 102 | t = multiprocessing.Process(target=mask_fasta, args=(seq_id_list[i*task_per_thread:], seq_db, win_regions)) 103 | task_list.append(t) 104 | 105 | for t in task_list: 106 | t.start() 107 | 108 | for t in task_list: 109 | t.join() 110 | 111 | print("Merging") 112 | if os.path.exists(out_fa): 113 | os.remove(out_fa) 114 | for id in seq_id_list: 115 | os.system("cat "+id+".tmp >> "+out_fa) 116 | os.remove(id+".tmp") 117 | print("Success") 118 | 119 | 120 | if __name__ == "__main__": 121 | if len(sys.argv) < 6: 122 | print("Usage: python "+sys.argv[0]+" ") 123 | else: 124 | in_fa = sys.argv[1] 125 | in_bed = sys.argv[2] 126 | out_fa = sys.argv[3] 127 | th = int(sys.argv[4]) 128 | ts = int(sys.argv[5]) 129 | quick_mask_genome(in_fa, in_bed, out_fa, th, ts) 130 | -------------------------------------------------------------------------------- /bin/SeqStat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import pysam 5 | import gzip 6 | 7 | 8 | def read_fasta(in_file): 9 | if in_file.split('.')[-1] == 'gz': 10 | fin = gzip.open(in_file, 'rt') 11 | else: 12 | fin = open(in_file, 'r') 13 | seq_len = [] 14 | for line in fin: 15 | if line[0] == '>': 16 | seq_len.append(0) 17 | else: 18 | seq_len[-1] += len(line.strip()) 19 | fin.close() 20 | return sorted(seq_len, reverse=True) 21 | 22 | 23 | def read_fastq(in_file): 24 | if in_file.split('.')[-1] == 'gz': 25 | fin = gzip.open(in_file, 'rt') 26 | else: 27 | fin = open(in_file) 28 | seq_len = [] 29 | cnt = 0 30 | for line in fin: 31 | if cnt%4 == 1: 32 | seq_len.append(len(line.strip())) 33 | cnt += 1 34 | fin.close() 35 | return sorted(seq_len, reverse=True) 36 | 37 | 38 | def read_bam(in_file): 39 | seq_len = [] 40 | ''' 41 | with os.popen("samtools view %s"%in_file, 'r') as fin: 42 | for line in fin: 43 | seq_len.append(len(line.strip().split()[9])) 44 | ''' 45 | with pysam.AlignmentFile(in_file, 'rb', check_sq=False) as fin: 46 | for line in fin: 47 | seq_len.append(line.query_length) 48 | return sorted(seq_len, reverse=True) 49 | 50 | 51 | def check_file_type(in_file): 52 | data = in_file.split('.') 53 | if data[-1] == 'gz': 54 | with gzip.open(in_file, 'rt') as fin: 55 | for line in fin: 56 | break 57 | if line[0] == '>': 58 | return "fa" 59 | elif line[0] == '@': 60 | return "fq" 61 | else: 62 
| return "" 63 | elif data[-1] == 'bam': 64 | return "bam" 65 | else: 66 | with open(in_file, 'r') as fin: 67 | for line in fin: 68 | break 69 | if line[0] == '>': 70 | return "fa" 71 | elif line[0] == '@': 72 | return "fq" 73 | else: 74 | return "" 75 | 76 | 77 | def seq_stat(in_file, out_stat): 78 | file_type = check_file_type(in_file) 79 | if file_type == 'fa': 80 | seq_len = read_fasta(in_file) 81 | elif file_type == 'fq': 82 | seq_len = read_fastq(in_file) 83 | elif file_type == 'bam': 84 | seq_len = read_bam(in_file) 85 | else: 86 | print("Unsupport file type") 87 | sys.exit() 88 | seq_cnt = len(seq_len) 89 | min_len = seq_len[-1] 90 | max_len = seq_len[0] 91 | total_size = sum(seq_len) 92 | ave_len = total_size*1.0/seq_cnt 93 | n_threshold = [] 94 | n_values = [] 95 | n_labels = [] 96 | for i in range(90, 40, -10): 97 | n_threshold.append(i/100.0*total_size) 98 | n_values.append(0) 99 | n_labels.append("N%d:\t"%i) 100 | cur_size = 0 101 | cnt_500 = 0 102 | cnt_2k = 0 103 | for i in range(0, seq_cnt): 104 | cur_size += seq_len[i] 105 | for j in range(0, len(n_values)): 106 | if n_values[j] == 0 and cur_size >= n_threshold[j]: 107 | n_values[j] = seq_len[i] 108 | if seq_len[i] > 500: 109 | cnt_500 += 1 110 | if seq_len[i] > 2000: 111 | cnt_2k += 1 112 | n_info = [] 113 | for i in range(0, len(n_values)): 114 | n_info.append("%s%d\n"%(n_labels[i], n_values[i])) 115 | info = "number of seq:\t%d\nmin length:\t%d\nmax length:\t%d\ntotal size:\t%d\n%sAverage length:\t%d\nTotal number (>500bp):\t%d\nTotal number (>2000bp):\t%d"%(seq_cnt, min_len, max_len, total_size, ''.join(n_info), ave_len, cnt_500, cnt_2k) 116 | if out_stat == "": 117 | print(info) 118 | else: 119 | with open(out_stat, 'w') as fout: 120 | fout.write("%s\n"%info) 121 | 122 | 123 | if __name__ == "__main__": 124 | if len(sys.argv) < 2: 125 | print("Usage: python %s [out_stat]"%sys.argv[0]) 126 | else: 127 | if len(sys.argv) == 2: 128 | in_file = sys.argv[1] 129 | out_stat = "" 130 | else: 131 | in_file, out_stat = sys.argv[1:] 132 | seq_stat(in_file, out_stat) 133 | 134 | -------------------------------------------------------------------------------- /bin/approximate_cnv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | import multiprocessing 4 | 5 | 6 | def help_message(): 7 | print("Usage: python "+sys.argv[0]+" -bam -g -l -bed -o [-t ]") 8 | 9 | 10 | def get_opts(ARGV): 11 | opts = {} 12 | if len(ARGV) < 3: 13 | help_message() 14 | sys.exit(0) 15 | 16 | for i in range(1, len(ARGV), 2): 17 | key = ARGV[i][1:] 18 | value = ARGV[i+1] 19 | if key not in opts: 20 | opts[key] = value 21 | return opts 22 | 23 | 24 | def calc_mapped_reads_count(in_bam): 25 | fn = in_bam + '.read_counts.txt' 26 | counts = 0 27 | res = os.popen("samtools view "+in_bam) 28 | for line in res: 29 | data = line.strip().split() 30 | if data[2] != '*': 31 | counts += 1 32 | with open(fn, 'w') as f_out: 33 | f_out.write(str(counts)) 34 | 35 | 36 | def calc_read_depth(in_bam, in_bed): 37 | fn = in_bam + '.read_depth.txt' 38 | res = os.popen("bedtools coverage -a "+in_bed+" -b "+in_bam+" 2>/dev/null") 39 | with open(fn, 'w') as f_out: 40 | for line in res: 41 | data = line.strip().split('\t') 42 | if len(data) < 7: 43 | continue 44 | f_out.write(line) 45 | 46 | 47 | def calc_pipeline(in_bams, in_bed): 48 | calc_mapped_reads_count(in_bams) 49 | calc_read_depth(in_bams, in_bed) 50 | 51 | 52 | def quick_CNV(opts): 53 | bam_list = [] 54 | name_list = [] 55 | bed_rows = 0 56 | 
print("Calculating read depth and counts") 57 | with open(opts['bam'], 'r') as f_in: 58 | for line in f_in: 59 | print("\tDealing %s"%line.strip()) 60 | bam_list.append(line.strip()) 61 | name_list.append(line.strip().split('/')[-1].split('\\')[-1].split('.')[0]) 62 | if 't' in opts: 63 | t_n = int(opts['t']) 64 | else: 65 | t_n = 1 66 | 67 | print("Creating processes pool") 68 | bed_file = opts['bed'] 69 | 70 | pool = multiprocessing.Pool(processes=t_n) 71 | for bam_file in bam_list: 72 | res = pool.apply_async(calc_pipeline, (bam_file, bed_file,)) 73 | pool.close() 74 | pool.join() 75 | 76 | genome_size = int(opts['g']) 77 | read_length = int(opts['l']) 78 | gene_length_db = {} 79 | 80 | print("Reading bed") 81 | with open(bed_file, 'r') as f_in: 82 | for line in f_in: 83 | data = line.strip().split() 84 | gene_name = data[-1] 85 | s_p = int(data[1]) 86 | e_p = int(data[2]) 87 | length = e_p - s_p 88 | if length < 0: 89 | length = -length 90 | gene_length_db[gene_name] = length 91 | 92 | print("Approximating CNV") 93 | bed_rows = len(data) 94 | mapped_rc = {} 95 | coverage_db = {} 96 | for i in range(0, len(bam_list)): 97 | fn = bam_list[i] + '.read_counts.txt' 98 | with open(fn, 'r') as f_in: 99 | for line in f_in: 100 | if line.strip() == '': 101 | continue 102 | mapped_rc[name_list[i]] = int(line.strip()) 103 | #os.remove(fn) 104 | 105 | fn = bam_list[i] + '.read_depth.txt' 106 | if name_list[i] not in coverage_db: 107 | coverage_db[name_list[i]] = {} 108 | with open(fn, 'r') as f_in: 109 | for line in f_in: 110 | if line.strip() == '': 111 | continue 112 | data = line.strip().split('\t') 113 | gene_name = data[bed_rows-1] 114 | if gene_name not in coverage_db[name_list[i]]: 115 | coverage_db[name_list[i]][gene_name] = int(data[bed_rows]) 116 | #os.remove(fn) 117 | 118 | print("Writing result") 119 | out_file = opts['o'] 120 | with open(out_file, 'w') as f_out: 121 | f_out.write("#Sample\\CopyNumber\t") 122 | f_out.write('\t'.join(list(sorted(gene_length_db.keys())))+'\n') 123 | for name in sorted(mapped_rc.keys()): 124 | f_out.write(name) 125 | seq_depth = mapped_rc[name]*1.0*read_length/genome_size 126 | for gene in sorted(gene_length_db.keys()): 127 | copy_number = 1.0*coverage_db[name][gene]*read_length/gene_length_db[gene]/seq_depth 128 | f_out.write('\t'+str(copy_number)) 129 | f_out.write('\n') 130 | 131 | print("Finished") 132 | 133 | 134 | if __name__ == "__main__": 135 | opts = get_opts(sys.argv) 136 | necessary_paras = ['bam', 'g', 'l', 'bed', 'o'] 137 | for key in necessary_paras: 138 | if key not in opts: 139 | help_message() 140 | sys.exit(0) 141 | quick_CNV(opts) 142 | -------------------------------------------------------------------------------- /bin/simple_ANGSD.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | 5 | def help_message(): 6 | print("Usage: python "+sys.argv[0]+" -l -anc -r [-out -p -ref ]") 7 | 8 | 9 | def parse_options(ARGV): 10 | opt_dict = {} 11 | for i in range(0, len(ARGV), 2): 12 | opt = ARGV[i] 13 | if opt == '-l': 14 | opt_dict['list'] = ARGV[i+1] 15 | elif opt == '-p': 16 | opt_dict['path'] = ARGV[i+1] 17 | elif opt == '-anc': 18 | opt_dict['anc'] = ARGV[i+1] 19 | elif opt == '-ref': 20 | opt_dict['ref'] = ARGV[i+1] 21 | elif opt == '-r': 22 | opt_dict['regions'] = ARGV[i+1] 23 | elif opt == '-out': 24 | opt_dict['out_name'] = ARGV[i+1] 25 | elif opt == '-h': 26 | help_message() 27 | exit(0) 28 | return opt_dict 29 | 30 | 31 | def run_abbababa(opts): 32 | if 
'path' not in opts: 33 | bam_path = './' 34 | else: 35 | bam_path = opts['path'] 36 | bam_files = [] 37 | for fn in os.listdir(bam_path): 38 | if fn[-4:] == '.bam': 39 | bam_files.append(os.path.join(bam_path, fn)) 40 | 41 | print("Indexing bams") 42 | for fn in bam_files: 43 | fn = fn.split('/')[-1] 44 | if os.path.isfile(os.path.join('./', fn+'.bai')) == False: 45 | cmd = 'samtools index '+fn 46 | print("Running command: "+cmd) 47 | os.system(cmd) 48 | 49 | print("Done\nReading list") 50 | list_db = {} 51 | with open(opts['list'], 'r') as f_in: 52 | for line in f_in: 53 | data = line.strip().split() 54 | if data[1] not in list_db: 55 | list_db[data[1]] = {} 56 | list_db[data[1]]['name'] = [] 57 | list_db[data[1]]['path'] = [] 58 | for fn in bam_files: 59 | if data[0] in fn: 60 | list_db[data[1]]['path'].append(fn) 61 | list_db[data[1]]['name'].append(data[0]) 62 | 63 | if 'out_name' not in opts: 64 | out_name = "Outgroup" 65 | else: 66 | out_name = opts['out_name'] 67 | 68 | print("Done\nGenerate bam.filelist sizeFile.size popNames.name bamWithErrors.filelist errorList.error") 69 | with open("bam.filelist", "w") as f_list: 70 | with open("sizeFile.size", "w") as f_size: 71 | with open("popNames.name", "w") as f_pop: 72 | with open("bamWithErrors.filelist", "w") as f_bwe: 73 | with open("errorList.error", "w") as f_err_list: 74 | i = 0 75 | for subgroup in list_db: 76 | if subgroup != out_name: 77 | f_pop.write(subgroup+'\n') 78 | if i < 2: 79 | f_bwe.write('\n'.join(list_db[subgroup]['path'])+"\n") 80 | i += 1 81 | f_err_list.write("./errorFile.ancError\n") 82 | else: 83 | f_err_list.write("NA\n") 84 | f_list.write('\n'.join(list_db[subgroup]['path'])+'\n') 85 | group_size = len(list_db[subgroup]['name']) 86 | f_size.write(str(group_size)+'\n') 87 | f_list.write(list_db[out_name]['path'][0]+'\n') 88 | f_pop.write(out_name+'\n') 89 | f_size.write('1\n') 90 | 91 | print("Done\nDo abbababa") 92 | anc_file = opts['anc'] 93 | if "regions" not in opts: 94 | regions_file = "regions.txt" 95 | else: 96 | regions_file = opts["regions"] 97 | 98 | cmd = "ANGSD -doAbbababa2 1 -bam bam.filelist -sizeFile sizeFile.size -doCounts 1 -out bam.Angsd -rf "+regions_file+" -useLast 1 -minQ 20 -minMapQ 30" 99 | print("Running command: "+cmd) 100 | os.system(cmd) 101 | 102 | print("Done\nIndex reference fasta") 103 | if 'ref' not in opts: 104 | os.system("ANGSD -i "+bam_files[-1]+" -doFasta 1 -doCounts 1 -out perfectSampleCEU") 105 | os.system("gunzip perfectSampleCEU.fa.gz") 106 | os.system("samtools faidx perfectSampleCEU.fa") 107 | ref_fasta = "perfectSampleCEU.fa" 108 | else: 109 | ref_fasta = opts['ref'] 110 | os.system("samtools faidx "+ref_fasta) 111 | 112 | print("Done\nDo Anc Error and Rscript") 113 | cmd = "ANGSD -doAncError 1 -anc "+anc_file+" -ref "+ref_fasta+" -out errorFile -bam bamWithErrors.filelist" 114 | print("Running command: "+cmd) 115 | os.system(cmd) 116 | 117 | cmd = "Rscript DSTAT angsdFile=\"bam.Angsd\" out=\"result\" sizeFile=sizeFile.size errFile=errorList.error nameFile=popNames.name" 118 | print("Running command: "+cmd) 119 | os.system(cmd) 120 | print("Done\nSuccess") 121 | 122 | 123 | if __name__ == "__main__": 124 | if len(sys.argv) == 1: 125 | help_message() 126 | else: 127 | opts = parse_options(sys.argv[1:]) 128 | run_abbababa(opts) 129 | -------------------------------------------------------------------------------- /bin/remove_region_by_blast_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import 
sys
3 | 
4 | 
5 | def reverse_region(ori_region, chr_len_db):
6 |     new_region = {}
7 |     for chrn in ori_region:
8 |         if chrn not in new_region:
9 |             new_region[chrn] = []
10 |         temp_region = []
11 |         temp_region.append(0)
12 |         for region in ori_region[chrn]:
13 |             temp_region.append(region[0]-1)
14 |             temp_region.append(region[1]+1)
15 |         temp_region.append(chr_len_db[chrn]-1)
16 |         i = 0
17 |         while i < len(temp_region):
18 |             new_region[chrn].append([temp_region[i], temp_region[i+1]])
19 |             i += 2
20 |     return new_region
21 | 
22 | 
23 | def update_region(ori_region, region):
24 |     temp_region = []
25 |     for chr_region in ori_region:
26 |         if region[0] <= chr_region[1] and chr_region[0] <= region[1]:
27 |             if chr_region not in temp_region:
28 |                 temp_region.append(chr_region)
29 |         if chr_region[0] <= region[1] <= chr_region[1]:
30 |             if chr_region not in temp_region:
31 |                 temp_region.append(chr_region)
32 |     for chr_region in temp_region:
33 |         if chr_region in ori_region:
34 |             ori_region.remove(chr_region)
35 |         if chr_region[0] < region[0] <= chr_region[1]:
36 |             ori_region.append([chr_region[0], region[0]-1])
37 |         if chr_region[0] <= region[1] < chr_region[1]:
38 |             ori_region.append([region[1]+1, chr_region[1]])
39 |     return ori_region
40 | 
41 | 
42 | def remove_region(blast_results, chr_len_file, out_bed):
43 |     chr_len_db = {}
44 |     with open(chr_len_file, 'r') as f_in:
45 |         for line in f_in:
46 |             if line.strip() != '':
47 |                 data = line.strip().split()
48 |                 chr_len_db[data[0]] = int(data[1])
49 |     out_region = {}
50 |     is_first = True
51 |     for blast_file in blast_results:
52 |         if is_first:
53 |             with open(blast_file, 'r') as f_in:
54 |                 blast_region = {}
55 |                 for line in f_in:
56 |                     data = line.strip().split()
57 |                     chrn = data[1]
58 |                     s = int(data[8])
59 |                     e = int(data[9])
60 |                     if s > e:
61 |                         temp = s
62 |                         s = e
63 |                         e = temp
64 |                     if chrn not in blast_region:
65 |                         blast_region[chrn] = []
66 |                     blast_region[chrn].append([s, e])
67 |                     temp_list = []
68 |                     for i in range(0, len(blast_region[chrn])):
69 |                         if s <= blast_region[chrn][i][0] and e >= blast_region[chrn][i][0]:
70 |                             blast_region[chrn][i][0] = s
71 |                         if e >= blast_region[chrn][i][1] and s <= blast_region[chrn][i][1]:
72 |                             blast_region[chrn][i][1] = e
73 |                         if s > blast_region[chrn][i][1] or e < blast_region[chrn][i][0]:
74 |                             temp_list.append([s, e])
75 |                     for region in temp_list:
76 |                         blast_region[chrn].append(region)
77 |                     blast_region[chrn] = sorted(blast_region[chrn])
78 |             for chrn in blast_region:
79 |                 temp_region = []
80 |                 last_e = 0
81 |                 for i in range(0, len(blast_region[chrn])):
82 |                     s = blast_region[chrn][i][0]
83 |                     e = blast_region[chrn][i][1]
84 |                     if i == 0:
85 |                         temp_region.append(s)
86 |                         last_e = e
87 |                     else:
88 |                         if last_e < s:
89 |                             temp_region.append(last_e)
90 |                             temp_region.append(s)
91 |                             last_e = e
92 |                         else:
93 |                             if last_e < e:
94 |                                 last_e = e
95 |                 temp_region.append(last_e)
96 |                 i = 0
97 |                 blast_region[chrn] = []
98 |                 while i < len(temp_region):
99 |                     blast_region[chrn].append([temp_region[i], temp_region[i+1]])
100 |                     i += 2
101 |             out_region = reverse_region(blast_region, chr_len_db)
102 |             is_first = False
103 |         else:
104 |             with open(blast_file, 'r') as f_in:
105 |                 for line in f_in:
106 |                     data = line.strip().split()
107 |                     chrn = data[1]
108 |                     s = int(data[8])
109 |                     e = int(data[9])
110 |                     if s > e:
111 |                         temp = s
112 |                         s = e
113 |                         e = temp
114 |                     out_region[chrn] = update_region(out_region[chrn], [s, e])
115 | 
116 |     with open(out_bed, 'w') as f_out:
117 |         for chrn in out_region:
118 |             out_region[chrn] = sorted(out_region[chrn])
119 |             for region in out_region[chrn]:
120 |                 f_out.write(chrn+'\t'+str(region[0])+'\t'+str(region[1])+'\n')
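    # Worked example of reverse_region() (editor's illustration): with
    # chr_len_db = {'Chr01': 100} and hit regions [[20, 30], [50, 60]], the
    # kept complement is [[0, 19], [31, 49], [61, 99]] (0-based, inclusive).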
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     if len(sys.argv) < 4:
125 |         print("Notice: this script removes regions hit by BLAST results from chromosomes")
126 |         print("Usage: python "+sys.argv[0]+" <blast_results> <chr_len_file> <out_bed>")
127 |         print("\t<blast_results> is a comma-separated list of tabular BLAST result files")
128 |     else:
129 |         blast_results = sys.argv[1].split(",")
130 |         chr_len_file = sys.argv[2]
131 |         out_bed = sys.argv[3]
132 |         remove_region(blast_results, chr_len_file, out_bed)
133 | 
134 | 
--------------------------------------------------------------------------------
/bin/convert_anchorwave.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | 
4 | 
5 | def get_opt():
6 |     group = argparse.ArgumentParser()
7 |     group.add_argument('-i', '--input', help='Input maf file', required=True)
8 |     group.add_argument('-o', '--output', help='Output file', required=True)
9 |     return group.parse_args()
10 | 
11 | 
12 | def convert_anchorwave(in_file, out_file):
13 |     print("Converting")
14 |     with open(in_file, 'r') as fin:
15 |         with open(out_file, 'w') as fout:
16 |             tmp = []
17 |             chrs = []
18 |             fout.write("#Ref\tStart\tEnd\tQuery\tStart\tEnd\tType\n")
19 |             for line in fin:
20 |                 if line.strip() == '' or line[0] == '#' or line[0] == 'a':
21 |                     continue
22 |                 data = line.strip().split()
23 |                 chrs.append(data[1])
24 |                 tmp.append(data[6])
25 |                 if len(tmp) == 2:
26 |                     seq_len = len(tmp[0])
27 |                     print("\tConverting pair: %s, length: %d"%(','.join(chrs), seq_len))
28 |                     ref_pos = 0
29 |                     qry_pos = 0
30 |                     var_info = []
31 |                     per_cnt = max(1, int(seq_len / 10))
32 |                     print("\t", end="")
33 |                     for _ in range(seq_len):
34 |                         if (_+1)%per_cnt == 0:
35 |                             print("%d%%"%(int((_+1)/per_cnt)*10), end='\t', flush=True)
36 |                         ref_base = tmp[0][_]
37 |                         qry_base = tmp[1][_]
38 |                         if ref_base == '-':
39 |                             var_type = 'INS'
40 |                             var_info.append([ref_pos, qry_pos, var_type])
41 |                             qry_pos += 1
42 |                         elif qry_base == '-':
43 |                             var_type = 'DEL'
44 |                             var_info.append([ref_pos, qry_pos, var_type])
45 |                             ref_pos += 1
46 |                         elif ref_base != qry_base:
47 |                             var_type = 'SNP'
48 |                             var_info.append([ref_pos, qry_pos, var_type])
49 |                             ref_pos += 1
50 |                             qry_pos += 1
51 |                         else:
52 |                             ref_pos += 1
53 |                             qry_pos += 1
54 |                     print()
55 |                     if len(var_info) == 0:
56 |                         tmp = []
57 |                         chrs = []
58 |                         continue
59 |                     print("\tMerging pair: %s, length: %d"%(','.join(chrs), seq_len))
60 |                     merge_info = [[var_info[0][0], var_info[0][0],
61 |                                    var_info[0][1], var_info[0][1],
62 |                                    var_info[0][2]]]
63 |                     for _ in range(1, len(var_info)):
64 |                         cur_info = var_info[_]
65 |                         if cur_info[-1] == merge_info[-1][-1]:
66 |                             is_continue = False
67 |                             if cur_info[0] == merge_info[-1][1] + 1:
68 |                                 merge_info[-1][1] = cur_info[0]
69 |                                 is_continue = True
70 |                             if cur_info[1] == merge_info[-1][3] + 1:
71 |                                 merge_info[-1][3] = cur_info[1]
72 |                                 is_continue = True
73 |                             if not is_continue:
74 |                                 merge_info.append([cur_info[0], cur_info[0],
75 |                                                    cur_info[1], cur_info[1],
76 |                                                    cur_info[2]])
77 |                         else:
78 |                             merge_info.append([cur_info[0], cur_info[0],
79 |                                                cur_info[1], cur_info[1],
80 |                                                cur_info[2]])
81 |                     print("\tWriting pair: %s, length: %d"%(','.join(chrs), seq_len))
82 |                     for rsp, rep, qsp, qep, var_type in merge_info:
83 |                         fout.write("%s\t%d\t%d\t%s\t%d\t%d\t%s\n"%(chrs[0], rsp+1, rep+1,
84 |                                                                    chrs[1], qsp+1, qep+1,
85 |                                                                    var_type))
86 |                     chrs = []
87 |                     tmp = []
88 |     print("Finished")
89 | 
90 | 
91 | if __name__ == "__main__":
92 |     opts = get_opt()
93 |     in_file = opts.input
94 |     out_file = opts.output
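    # convert_anchorwave() above assumes each MAF alignment block carries
    # exactly two 's' records (reference first, then query): sequences are
    # buffered in 'tmp' and a block is flushed as soon as two have
    # accumulated.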
95 |     convert_anchorwave(in_file, out_file)
96 | 
97 | 
--------------------------------------------------------------------------------
/bin/blast2heatmap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys, os
3 | 
4 | 
5 | def filter_blast(blast_file, out_file, t_i, t_m):
6 |     blast_db = {}
7 |     with open(blast_file, 'r') as f_in:
8 |         for line in f_in:
9 |             data = line.strip().split()
10 |             chrn = data[1]
11 |             iden = float(data[2])
12 |             s_pos_1 = int(data[6])
13 |             e_pos_1 = int(data[7])
14 |             s_pos_2 = int(data[8])
15 |             e_pos_2 = int(data[9])
16 |             if iden < t_i:
17 |                 continue
18 |             if s_pos_1 == s_pos_2 and e_pos_1 == e_pos_2:
19 |                 continue
20 |             if s_pos_2 > e_pos_2:
21 |                 tmp = s_pos_2
22 |                 s_pos_2 = e_pos_2
23 |                 e_pos_2 = tmp
24 |             if e_pos_2 - s_pos_2 < t_m:
25 |                 continue
26 |             if chrn not in blast_db:
27 |                 blast_db[chrn] = []
28 |             if [s_pos_2, e_pos_2] not in blast_db[chrn]:
29 |                 blast_db[chrn].append([s_pos_2, e_pos_2])
30 |     with open(out_file, 'w') as f_out:
31 |         for chrn in sorted(blast_db.keys()):
32 |             for region in sorted(blast_db[chrn]):
33 |                 f_out.write(chrn+'\t'+str(region[0])+'\t'+str(region[1])+'\n')
34 | 
35 | 
36 | def reshape(in_data, out_data):
37 |     data_db = {}
38 |     max_length = 0
39 |     max_chr = ''
40 |     with open(in_data, 'r') as f_in:
41 |         for line in f_in:
42 |             data = line.strip().split()
43 |             chrn = data[0]
44 |             pos = int(data[1])
45 |             value = data[-1]
46 |             if chrn[:3].lower() != 'chr':
47 |                 continue
48 |             if len(chrn) == 4:
49 |                 chrn = chrn[:3]+'0'+chrn[-1]
50 |             if chrn not in data_db:
51 |                 data_db[chrn] = []
52 |             data_db[chrn].append([pos, value])
53 | 
54 |     for chrn in data_db:
55 |         if len(data_db[chrn]) > max_length:
56 |             max_length = len(data_db[chrn])
57 |             max_chr = chrn
58 | 
59 |     for chrn in data_db:
60 |         curr_len = len(data_db[chrn])
61 |         if curr_len < max_length:
62 |             for i in range(curr_len, max_length):
63 |                 data_db[chrn].append([data_db[max_chr][i][0], 'nan'])
64 | 
65 |     new_data = {}
66 |     for chrn in data_db:
67 |         for value in data_db[chrn]:
68 |             if value[0] not in new_data:
69 |                 new_data[value[0]] = {}
70 |             new_data[value[0]][chrn] = value[1]
71 | 
72 |     head = []
73 |     for pos in new_data:
74 |         for chrn in sorted(new_data[pos].keys()):
75 |             head.append(chrn)
76 |         break
77 | 
78 |     with open(out_data, 'w') as f_out:
79 |         f_out.write('\t'+'\t'.join(head)+'\n')
80 |         for pos in sorted(new_data.keys()):
81 |             f_out.write(str(pos))
82 |             for chrn in sorted(new_data[pos].keys()):
83 |                 f_out.write('\t'+new_data[pos][chrn])
84 |             f_out.write('\n')
85 | 
86 | 
87 | def draw_heatmap_R(in_data, out_name, rs):
88 |     current_path = os.getcwd()
89 |     script = os.path.join(current_path, out_name+'_draw.R')
90 |     with open(script, 'w') as f_out:
91 |         f_out.write("setwd(\""+current_path+"\")\n")
92 |         f_out.write("data<-read.table(\""+in_data+"\", header = TRUE)\n")
93 |         f_out.write("cr<-c(0:length(colnames(data))-1)\n")
94 |         f_out.write("library(\"pheatmap\")\n")
95 |         f_out.write("pheatmap(data, cluster_cols = FALSE, cluster_rows = FALSE, show_rownames = FALSE, gaps_col = cr, filename = \""+out_name+".pdf\")\n")
96 |     os.system("Rscript "+out_name+"_draw.R")
97 | 
98 |     script = os.path.join(current_path, out_name+'_draw_with_label.R')
99 |     with open(script, 'w') as f_out:
100 |         f_out.write("setwd(\""+current_path+"\")\n")
101 |         f_out.write("data<-read.table(\""+in_data+"\", header = TRUE)\n")
102 |         f_out.write("cr<-c(0:length(colnames(data))-1)\n")
103 |         f_out.write("row_name<-array()\n")
104 |         f_out.write("for(i in 1:length(rownames(data))){if(i%%"+str(rs)+"==1){row_name[i]<-rownames(data)[i]}else{row_name[i]<-\"\"}}\n")
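        # The generated R loop above keeps every rs-th row name and blanks
        # the rest, so the labelled heatmap stays legible when a genome has
        # thousands of windows; 'rs' as the label stride is an editorial
        # assumption (the distributed copy left the modulus blank).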
105 |         f_out.write("library(\"pheatmap\")\n")
106 |         f_out.write("pheatmap(data, cluster_cols = FALSE, cluster_rows = FALSE, labels_row = row_name, gaps_col = cr, filename = \""+out_name+"_label.pdf\")\n")
107 |     os.system("Rscript "+out_name+"_draw_with_label.R")
108 | 
109 | 
110 | def blast2heatmap(ref_fasta, blast_file, ws, out_name, t_i, t_m):
111 |     print("Filter blast result")
112 |     filter_blast(blast_file, "01_"+out_name+'_filter.bed', t_i, t_m)
113 |     print("Generate windows")
114 |     os.system("bedtools makewindows -g "+ref_fasta+" -w "+ws+" > 02_"+out_name+"_win.bed")
115 |     print("Coverage")
116 |     os.system("bedtools coverage -a "+"02_"+out_name+"_win.bed -b "+"01_"+out_name+"_filter.bed > "+"03_"+out_name+"_cover.txt")
117 |     print("Reshape data")
118 |     reshape("03_"+out_name+"_cover.txt", "04_"+out_name+"_result.txt")
119 |     print("Draw heatmap")
120 |     draw_heatmap_R("04_"+out_name+"_result.txt", "05_"+out_name, 10)  # label stride of 10 is an assumed default
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     if len(sys.argv) < 7:
125 |         print("Usage: python "+sys.argv[0]+" <ref_genome> <blast_file> <window_size> <out_name> <identity> <min_length>")
126 |         print("Notice: bedtools and R are required")
127 |     else:
128 |         proc, ref_fasta, blast_file, ws, out_name, t_i, t_m = sys.argv
129 |         blast2heatmap(ref_fasta, blast_file, ws, out_name, float(t_i), float(t_m))
130 | 
--------------------------------------------------------------------------------
/bin/SimContigs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import argparse
4 | import random
5 | import copy
6 | 
7 | 
8 | def GetOpts():
9 |     group = argparse.ArgumentParser()
10 |     group.add_argument('--min', help='minimum length of contig, default: 15k, accepts a plain number or a value ending with k or m', default="15k")
11 |     group.add_argument('--max', help='maximum length of contig, default: 5m, accepts a plain number or a value ending with k or m', default="5m")
12 |     group.add_argument('-n', '--n50', help='size of N50, default: 500k, accepts a plain number or a value ending with k or m', default="500k")
13 |     group.add_argument('-i', '--input', help='origin fasta file of genome', required=True)
14 |     group.add_argument('-o', '--output', help='filename of simulated data', required=True)
15 | 
16 |     return group.parse_args()
17 | 
18 | 
19 | def ReadFasta(inFasta):
20 |     fastaDB = {}
21 |     with open(inFasta, 'r') as fIn:
22 |         id = ''
23 |         seq = ''
24 |         for line in fIn:
25 |             if line[0] == '>':
26 |                 if seq != '':
27 |                     fastaDB[id] = seq
28 |                 id = line.strip()[1:]
29 |                 seq = ''
30 |             else:
31 |                 seq += line.strip()
32 |         fastaDB[id] = seq
33 |     return fastaDB
34 | 
35 | 
36 | def GenCtgLen(fastaLenDB, cntLowerDB, cntHigherDB, n50, minLen, maxLen):
37 |     ctgLenDB = {}
38 |     for chrn in cntLowerDB:
39 |         ctgLenDB[chrn] = []
40 |         totalLower = 0
41 |         totalHigher = 0
42 |         totalLen = 0
43 |         for i in range(0, cntLowerDB[chrn]):
44 |             tmpLen = random.randint(minLen, n50)
45 |             totalLower += tmpLen
46 |             if totalLower > fastaLenDB[chrn]/2:
47 |                 break
48 |             ctgLenDB[chrn].append(tmpLen)
49 |             totalLen += tmpLen
50 |         for i in range(0, cntHigherDB[chrn]):
51 |             tmpLen = random.randint(n50, maxLen)
52 |             totalHigher += tmpLen
53 |             if totalHigher > fastaLenDB[chrn]/2:
54 |                 break
55 |             ctgLenDB[chrn].append(tmpLen)
56 |             totalLen += tmpLen
57 |         cntN50 = int((fastaLenDB[chrn]-totalLen)/n50)
58 |         for i in range(0, cntN50):
59 |             tmpLen = random.randint(int(n50-n50*0.1), int(n50+n50*0.1))
60 |             totalLen += tmpLen
61 |             if totalLen > fastaLenDB[chrn]:
62 | 
                totalLen -= tmpLen
63 |                 break
64 |             ctgLenDB[chrn].append(tmpLen)
65 |         ctgLenDB[chrn].append(fastaLenDB[chrn]-totalLen)
66 |     return ctgLenDB
67 | 
68 | 
69 | def GenCtgRegions(fastaLenDB, ctgLenDB):
70 |     ctgRegionsDB = {}
71 |     for chrn in fastaLenDB:
72 |         ctgRegionsDB[chrn] = []
73 |         totalCtgLen = 0
74 |         for ctgLen in ctgLenDB[chrn]:
75 |             totalCtgLen += ctgLen
76 |         cntCtg = len(ctgLenDB[chrn])
77 |         lastPos = 0
78 |         ctgLenList = copy.deepcopy(ctgLenDB[chrn])
79 |         for i in range(0, cntCtg):
80 |             index = random.randint(0, cntCtg-1)
81 |             ctgRegionsDB[chrn].append([lastPos, lastPos+ctgLenList[index]])
82 |             lastPos += ctgLenList[index]
83 |             del ctgLenList[index]
84 |             cntCtg -= 1
85 |     return ctgRegionsDB
86 | 
87 | 
88 | def SimGenomeCtg(inFasta, outFasta, n50, minLen, maxLen):
89 |     random.seed()
90 |     print("Reading fasta")
91 |     fastaDB = ReadFasta(inFasta)
92 |     fastaLenDB = {}
93 |     cntLowerDB = {}
94 |     cntHigherDB = {}
95 |     for chrn in fastaDB:
96 |         fastaLenDB[chrn] = len(fastaDB[chrn])
97 |         cntLowerDB[chrn] = int(fastaLenDB[chrn]/(minLen+n50))
98 |         cntHigherDB[chrn] = int(fastaLenDB[chrn]/(maxLen+n50))
99 | 
100 |     print("\nGenerating contigs")
101 |     ctgLenDB = GenCtgLen(fastaLenDB, cntLowerDB, cntHigherDB, n50, minLen, maxLen)
102 |     ctgRegionsDB = GenCtgRegions(fastaLenDB, ctgLenDB)
103 | 
104 |     print("\nStatistics")
105 |     for chrn in sorted(fastaDB):
106 |         print("\tChromosome:\t%s"%(chrn))
107 |         print("\tChromosome size:\t%d"%(fastaLenDB[chrn]))
108 |         print("\tContig counts:\t%d"%(len(ctgLenDB[chrn])))
109 |         tmpLen = 0
110 |         n50Len = 0
111 |         for ctgLen in sorted(ctgLenDB[chrn], reverse=True):
112 |             tmpLen += ctgLen
113 |             if tmpLen >= fastaLenDB[chrn]/2 and n50Len == 0:
114 |                 n50Len = ctgLen
115 |         print("\tContig total size:\t%d"%(tmpLen))
116 |         print("\tN50 size:\t%d\n"%(n50Len))
117 | 
118 |     print("\nWriting contigs")
119 |     with open(outFasta, 'w') as fOut:
120 |         base = 100
121 |         for chrn in sorted(fastaDB):
122 |             for region in ctgRegionsDB[chrn]:
123 |                 s = region[0]
124 |                 e = region[1]
125 |                 ctgName = "tig%07d"%(base)
126 |                 fOut.write(">%s %s %d:%d length=%d\n%s\n"%(ctgName, chrn, s+1, e, e-s, fastaDB[chrn][s: e]))  # 1-based inclusive region; the slice [s:e] has length e-s
127 |                 base += 100
128 |     print("\nFinished")
129 | 
130 | 
131 | if __name__ == "__main__":
132 |     opts = GetOpts()
133 |     inFasta = opts.input
134 |     outFasta = opts.output
135 |     n50 = opts.n50
136 |     n50 = n50.lower()
137 |     n50 = n50.replace('m', '000000')
138 |     n50 = n50.replace('k', '000')
139 |     n50 = int(n50)
140 |     minLen = opts.min
141 |     minLen = minLen.lower()
142 |     minLen = minLen.replace('m', '000000')
143 |     minLen = minLen.replace('k', '000')
144 |     minLen = int(minLen)
145 |     maxLen = opts.max
146 |     maxLen = maxLen.lower()
147 |     maxLen = maxLen.replace('m', '000000')
148 |     maxLen = maxLen.replace('k', '000')
149 |     maxLen = int(maxLen)
150 |     SimGenomeCtg(inFasta, outFasta, n50, minLen, maxLen)
151 | 
--------------------------------------------------------------------------------
/bin/SimCollapse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import argparse
4 | import random
5 | 
6 | 
7 | def GetOpts():
8 |     group = argparse.ArgumentParser()
9 |     group.add_argument('-a', '--a_contigs', help='first fasta file containing contigs generated by SimContigs.py', required=True)
10 |     group.add_argument('-b', '--b_contigs', help='second fasta file containing contigs generated by SimContigs.py', required=True)
11 |     group.add_argument('-p', '--prefix', help='prefix of contig file a and contig file b, divided by comma, like:
HA,HB', required=True)
12 |     group.add_argument('-o', '--output', help='filename of simulated data', required=True)
13 |     group.add_argument('-s', '--blast', help='blast file with format 6, must use first file of input as query and second file as database', required=True)
14 |     group.add_argument('-c', '--collapse', type=float, help='percentage of collapse region size, like 5 means 5%%, default: 10', default=10)
15 | 
16 |     return group.parse_args()
17 | 
18 | 
19 | def ReadFasta(inFasta):
20 |     fastaDB = {}
21 |     with open(inFasta, 'r') as fIn:
22 |         id = ''
23 |         seq = ''
24 |         totalLen = 0
25 |         for line in fIn:
26 |             if line[0] == '>':
27 |                 if seq != '':
28 |                     fastaDB[id] = seq
29 |                 data = line.strip()[1:].split()
30 |                 id = data[0]
31 |                 seq = ''
32 |             else:
33 |                 seq += line.strip()
34 |                 totalLen += len(line.strip())
35 |         fastaDB[id] = seq
36 |     return fastaDB, totalLen
37 | 
38 | 
39 | def ReadBlast(inBlast, prefixList):
40 |     blastDB = {}
41 |     with open(inBlast, 'r') as fBlast:
42 |         for line in fBlast:
43 |             data = line.strip().split()
44 |             queryID = data[0]
45 |             targetID = data[1]
46 |             identity = float(data[2])
47 |             queryRegion = list(map(int, [data[6], data[7]]))
48 |             targetRegion = list(map(int, [data[8], data[9]]))
49 |             if queryID not in blastDB:
50 |                 blastDB[queryID] = [targetID, identity, queryRegion, targetRegion]
51 |             else:
52 |                 if identity > blastDB[queryID][1]:
53 |                     blastDB[queryID] = [targetID, identity, queryRegion, targetRegion]
54 | 
55 |     mapping = {}
56 |     allContigList = []
57 |     for queryID in blastDB:
58 |         targetID = blastDB[queryID][0]
59 |         mapping[prefixList[0]+'-'+queryID] = prefixList[1]+"-"+targetID
60 |         mapping[prefixList[1]+"-"+targetID] = prefixList[0]+'-'+queryID
61 |         allContigList.append(prefixList[0]+'-'+queryID)
62 |         allContigList.append(prefixList[1]+"-"+targetID)
63 | 
64 |     return mapping, allContigList
65 | 
66 | 
67 | def SimCollapse(aFasta, bFasta, outFa, blastFile, prefixList, collapse):
68 |     print("Reading first contig file")
69 |     fastaDBA, lenA = ReadFasta(aFasta)
70 | 
71 |     print("Reading second contig file")
72 |     fastaDBB, lenB = ReadFasta(bFasta)
73 | 
74 |     lenDB = {}
75 |     for id in fastaDBA:
76 |         lenDB[prefixList[0]+"-"+id] = len(fastaDBA[id])
77 |     for id in fastaDBB:
78 |         lenDB[prefixList[1]+"-"+id] = len(fastaDBB[id])
79 | 
80 |     print("Reading blast file")
81 |     mapping, allContigList = ReadBlast(blastFile, prefixList)
82 |     collapseLen = int((lenA+lenB)*collapse)
83 | 
84 |     print("Total collapse size expected: %d"%(collapseLen))
85 |     removeList = {}
86 |     print("Removing collapse regions")
87 |     removeLen = 0
88 |     while collapseLen > 0:
89 |         index = random.randint(0, len(allContigList)-1)
90 |         name = allContigList[index]
91 | 
92 |         while mapping[name] not in allContigList:
93 |             index = random.randint(0, len(allContigList)-1)
94 |             name = allContigList[index]
95 | 
96 |         repeatCnt = 0
97 |         isLast = False
98 |         while mapping[name] not in allContigList or lenDB[name] > collapseLen:
99 |             if repeatCnt > 50:
100 |                 isLast = True
101 |                 break
102 |             # count every retry so this loop is guaranteed to terminate
103 |             repeatCnt += 1
104 |             index = random.randint(0, len(allContigList)-1)
105 |             name = allContigList[index]
106 | 
107 |         if isLast:
108 |             break
109 |         pre, ctg = name.split('-')
110 |         collapseLen -= lenDB[name]
111 |         removeLen += lenDB[name]
112 | 
113 |         if pre not in removeList:
114 |             removeList[pre] = []
115 |         removeList[pre].append(ctg)
116 | 
117 |         allContigList.remove(name)
118 |         allContigList.remove(mapping[name])
119 |     print("Total collapse size removed: %d"%(removeLen))
120 | 
121 |     print("Writing result")
122
 |     with open(outFa, 'w') as fOut:
123 |         # write both haplotypes, skipping contigs chosen for collapse; a prefix
124 |         # may be absent from removeList if none of its contigs were removed
125 |         for id in fastaDBA:
126 |             if id not in removeList.get(prefixList[0], []):
127 |                 fOut.write(">%s_%s\n%s\n"%(prefixList[0], id, fastaDBA[id]))
128 |         for id in fastaDBB:
129 |             if id not in removeList.get(prefixList[1], []):
130 |                 fOut.write(">%s_%s\n%s\n"%(prefixList[1], id, fastaDBB[id]))
131 | 
132 | 
133 |     print("Success")
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     opts = GetOpts()
138 |     aFasta = opts.a_contigs
139 |     bFasta = opts.b_contigs
140 |     outFa = opts.output
141 |     collapse = opts.collapse/100.0
142 |     blastFile = opts.blast
143 |     prefixList = opts.prefix.split(',')
144 |     print("Arguments")
145 |     print("\tInput files: %s, %s"%(aFasta, bFasta))
146 |     print("\tOutput file: %s"%(outFa))
147 |     print("\tBlast file: %s"%(blastFile))
148 |     print("\tCollapse ratio: %.2f%%"%(collapse*100))
149 |     SimCollapse(aFasta, bFasta, outFa, blastFile, prefixList, collapse)
150 | 
--------------------------------------------------------------------------------
/bin/simple_JBrowser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | 
5 | 
6 | def get_config(f_conf, section, key):
7 |     if sys.version[0] == '2':
8 |         import ConfigParser
9 |         config = ConfigParser.ConfigParser()
10 |     else:
11 |         import configparser
12 |         config = configparser.ConfigParser()
13 |     config.read(f_conf)
14 |     return config.get(section, key)
15 | 
16 | 
17 | def get_options(Argvs):
18 |     i = 0
19 |     opts = {}
20 |     fasta_file = ''
21 |     gff_files = []
22 |     bed_files = []
23 |     bam_files = []
24 |     bw_files = []
25 |     vcf_files = []
26 |     conf_file = ''
27 |     chrs_file = ''
28 |     tn = '1'
29 |     while i < len(Argvs):
30 |         if Argvs[i] == '-f':
31 |             fasta_file = Argvs[i+1]
32 |         elif Argvs[i] == '--gff':
33 |             gff_files.append(Argvs[i+1])
34 |         elif Argvs[i] == '--bed':
35 |             bed_files.append(Argvs[i+1])
36 |         elif Argvs[i] == '--bam':
37 |             bam_files.append(Argvs[i+1])
38 |         elif Argvs[i] == '--bw':
39 |             bw_files.append(Argvs[i+1])
40 |         elif Argvs[i] == '--vcf':
41 |             vcf_files.append(Argvs[i+1])
42 |         elif Argvs[i] == '--conf':
43 |             conf_file = Argvs[i+1]
44 |         elif Argvs[i] == '--bam2bw':
45 |             chrs_file = Argvs[i+1]
46 |         elif Argvs[i] == '-t':
47 |             tn = Argvs[i+1]
48 |         else:
49 |             print("Usage: python "+sys.argv[0]+" -f <fasta_file> [--gff <gff_file> --bed <bed_file> --bam <bam_file> --bw <bw_file> --conf <conf_file>]")
50 |             exit(0)
51 |         i += 2
52 |     opts['fasta_file'] = fasta_file
53 |     opts['gff_files'] = gff_files
54 |     opts['bed_files'] = bed_files
55 |     opts['bam_files'] = bam_files
56 |     opts['vcf_files'] = vcf_files
57 |     opts['bw_files'] = bw_files
58 |     opts['bam2bw'] = chrs_file
59 |     opts['conf_file'] = conf_file
60 |     opts['tn'] = tn
61 |     return opts
62 | 
63 | 
64 | def simple_jbrowser(opts):
65 |     samtools = ''
66 |     jbrowser = ''
67 |     bam2wig = wig2bw = ''  # wig2bw must be initialized even when no conf file is given
68 |     if opts['conf_file'] != '':
69 |         conf_file = opts['conf_file']
70 |         if os.path.isfile(conf_file):
71 |             samtools = get_config(conf_file, "path", "samtools")
72 |             bam2wig = get_config(conf_file, "path", "bam2wig")
73 |             jbrowser = get_config(conf_file, "path", "JBrowser")
74 |             wig2bw = get_config(conf_file, "path", "wig2bw")
75 |         else:
76 |             print("No configure file")
77 |             exit(0)
78 | 
79 |     if samtools != '' and samtools[-1] != '/':
80 |         samtools += '/'
81 |     if jbrowser != '' and jbrowser[-1] != '/':
82 |         jbrowser += '/'
83 |     if bam2wig != '' and bam2wig[-1] != '/':
84 |         bam2wig += '/'
85 |     if wig2bw != '':
86 |         os.environ["PATH"] = wig2bw+":"+os.environ.get("PATH", "")  # exporting via os.system would only affect a subshell
87 | 
88 |     print("Preparing reference sequences")
89 |     if opts['fasta_file'] == '':
90 |         print("No reference sequences")
91 |         exit(0)
92 |     os.system(jbrowser+"prepare-refseqs.pl --fasta "+opts['fasta_file'])
93 | 
94 |     print("Preparing gffs")
95 |     for gff in opts['gff_files']:
96 |         if gff != '':
97 |             os.system(jbrowser+"flatfile-to-json.pl --gff "+gff+" --trackType CanvasFeatures --trackLabel "+gff.split('.')[0])
98 | 
99 |     print("Preparing beds")
100 |     for bed in opts['bed_files']:
101 |         if bed != '':
102 |             os.system(jbrowser+"flatfile-to-json.pl --bed "+bed+" --trackType CanvasFeatures --trackLabel "+bed)
103 | 
104 |     print("Preparing vcf")
105 |     for vcf in opts['vcf_files']:
106 |         if
 vcf[-3:].lower() != '.gz':
107 |             os.system("bgzip "+vcf)
108 |             vcf = vcf+".gz"
109 |         if not os.path.exists(vcf+".tbi"):
110 |             os.system("tabix -p vcf "+vcf)
111 |         with open("data/tracks.conf", "a") as f_track:
112 |             f_track.write("[tracks."+vcf.replace('.', '_')+"]\nstoreClass = JBrowse/Store/SeqFeature/VCFTabix\nurlTemplate = ../"+vcf+"\ncategory = VCF\ntype = JBrowse/View/Track/CanvasVariants\nkey = "+vcf.replace('.', '_')+"\n")
113 | 
114 |     print("Preparing bam")
115 |     for bam in opts['bam_files']:
116 |         if not bam.endswith('sorted.bam'):
117 |             sorted_bam = bam+".sorted.bam"
118 |             indexed_bam = sorted_bam+".bai"
119 |             if not os.path.exists(sorted_bam):
120 |                 os.system(samtools+"samtools sort -@ "+opts['tn']+" -o "+sorted_bam+" "+bam)
121 |             if not os.path.exists(indexed_bam):
122 |                 os.system(samtools+"samtools index "+sorted_bam)
123 |         else:
124 |             sorted_bam = bam
125 |             indexed_bam = sorted_bam+".bai"
126 |             if not os.path.exists(indexed_bam):
127 |                 os.system(samtools+"samtools index "+sorted_bam)
128 |         with open("data/tracks.conf", "a") as f_track:
129 |             f_track.write("[tracks."+bam.replace('.', '_')+"]\nstoreClass = JBrowse/Store/SeqFeature/BAM\nurlTemplate = ../"+sorted_bam+"\nbaiUrlTemplate = ../"+indexed_bam+"\ncategory = NGS\ntype = JBrowse/View/Track/Alignments2\nkey = "+bam.replace('.', '_')+"\n")
130 |         if opts['bam2bw'] != '':
131 |             os.system(bam2wig+"bam2wig.py -i "+sorted_bam+" -s "+opts['bam2bw']+" -o "+bam)
132 |             opts['bw_files'].append(bam+".bw")
133 |             os.remove(bam+".wig")
134 | 
135 |     print("Preparing bigwig")
136 |     for bw in opts['bw_files']:
137 |         with open("data/tracks.conf", "a") as f_track:
138 |             f_track.write("[tracks."+bw.replace('.', '_')+"]\nstoreClass = JBrowse/Store/SeqFeature/BigWig\nurlTemplate = ../"+bw+"\ncategory = Quantitative\ntype = JBrowse/View/Track/Wiggle/XYPlot\nkey = "+bw.replace('.', '_')+"\n")
139 |     print("Finished")
140 | 
141 | 
142 | if __name__ == "__main__":
143 |     if len(sys.argv) == 1:
144 |         print("Usage: python "+sys.argv[0]+" -f <fasta_file> --conf <conf_file> [--gff <gff_file> --bed <bed_file> --vcf <vcf_file> --bam <bam_file> --bw <bw_file> --bam2bw <chrom_sizes_file> -t <threads>]")
145 |     else:
146 |         opts = get_options(sys.argv[1:])
147 |         simple_jbrowser(opts)
148 | 
--------------------------------------------------------------------------------
/bin/SentieonSNP_filter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import matplotlib as mpl
3 | mpl.use('Agg')
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import gzip
7 | import argparse
8 | import re
9 | import time
10 | 
11 | 
12 | def get_opts():
13 |     group = argparse.ArgumentParser()
14 |     group.add_argument("-b", "--base", help="Input vcf file as base, .gz supported", required=True)
15 |     group.add_argument("-v", "--validation", help="Input vcf file as validation, .gz supported", required=True)
16 |     group.add_argument("-r", "--repeat", help="Repeat regions file, gff format", default="")
17 |     group.add_argument("-o", "--output", help="Output vcf file based on base vcf file, compressed with gzip", required=True)
18 |     group.add_argument("-m", "--missing_rate", type=float, help="Missing rate threshold, percentage, default: 40", default=40)
19 |     group.add_argument("-d", "--min_distance", type=int, help="Minimum distance between two snp sites, default: 0", default=0)
20 |     return group.parse_args()
21 | 
22 | 
23 | def read_vcf(in_vcf, method):
24 |     if in_vcf[-3:].lower() == '.gz':
25 |         f_in = gzip.open(in_vcf, 'rt')
26 |     else:
27 |         f_in = open(in_vcf, 'r')
28 | 
29 |     header = []
30 |     vcf_infos = {}
31 |     dp_db = {}
32
 |     for line in f_in:
33 |         if line[0] == '#':
34 |             header.append(line)
35 |         else:
36 |             data = line.strip().split()
37 |             chrn = data[0]
38 |             pos = int(data[1])
39 |             ref = data[3]
40 |             alt = data[4]
41 |             filt = data[6]  # avoid shadowing the built-in filter()
42 | 
43 |             # filter indels
44 |             if len(ref) > 1 or len(alt) > 1:
45 |                 continue
46 |             if method.lower() == 'full':
47 |                 # filter LowQual marker
48 |                 if filt.lower() == 'lowqual':
49 |                     continue
50 |             if chrn not in vcf_infos:
51 |                 vcf_infos[chrn] = {}
52 |             if method.lower() == 'full':
53 |                 # calc missing rate
54 |                 m_c = 0
55 |                 for i in range(9, len(data)):
56 |                     if data[i].split(':')[0] == './.':
57 |                         m_c += 1
58 |                 m_r = m_c*1.0/(len(data)-9)  # 9 fixed VCF columns precede the sample columns
59 |                 vcf_infos[chrn][pos] = {'alt': alt, 'line': line, 'mr': m_r}
60 |                 dp = int(data[7].split('DP=')[1].split(';')[0])
61 |                 if dp not in dp_db:
62 |                     dp_db[dp] = 0
63 |                 dp_db[dp] += 1
64 |             else:
65 |                 vcf_infos[chrn][pos] = {'alt': alt}
66 |     f_in.close()
67 |     if method.lower() == 'full':
68 |         return header, vcf_infos, dp_db
69 |     else:
70 |         return vcf_infos
71 | 
72 | 
73 | def read_gff(in_gff):
74 |     regions_db = {}
75 |     with open(in_gff, 'r') as fin:
76 |         for line in fin:
77 |             if line[0] == '#':
78 |                 continue
79 |             data = line.strip().split()
80 |             chrn = data[0]
81 |             sr = int(data[3])
82 |             er = int(data[4])
83 |             if sr > er:
84 |                 tmp = er
85 |                 er = sr
86 |                 sr = tmp
87 |             if chrn not in regions_db:
88 |                 regions_db[chrn] = []
89 |             regions_db[chrn].append([sr, er])
90 | 
91 |     for chrn in regions_db:
92 |         regions_db[chrn] = sorted(regions_db[chrn])
93 | 
94 |     return regions_db
95 | 
96 | 
97 | def is_repeat(repeats, pos):
98 |     s = 0
99 |     e = len(repeats)-1
100 |     while s<=e:
101 |         mid = int((s+e)/2)
102 |         if repeats[mid][0] > pos:
103 |             e = mid-1
104 |         elif repeats[mid][0] < pos:
105 |             s = mid+1
106 |         else:
107 |             return True
108 |     if e >= 0 and repeats[e][1] >= pos:  # e < 0 means pos lies before the first region
109 |         return True
110 |     else:
111 |         return False
112 | 
113 | 
114 | 
115 | def snp_filter(in_base, in_valid, in_rep, mr, md, out_file):
116 |     print("\033[32m%s\033[0m Reading valid file"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
117 |     valid_snps = read_vcf(in_valid, 'simple')
118 | 
119 |     print("\033[32m%s\033[0m Reading base file"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
120 |     base_header, base_snps, base_dps = read_vcf(in_base, 'full')
121 | 
122 |     print("\033[32m%s\033[0m Reading repeat file"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
123 |     if in_rep != '':
124 |         repeat_db = read_gff(in_rep)
125 |     else:
126 |         repeat_db = {}
127 |     '''
128 |     dp_cnt = []
129 |     for dp in sorted(base_dps):
130 |         dp_cnt.append(base_dps[dp])
131 |     dp_cnt_th = int(max(dp_cnt)*0.05)
132 | 
133 |     dp_min = -1
134 |     for dp in sorted(base_dps):
135 |         if base_dps[dp] > dp_cnt_th:
136 |             if dp_min == -1:
137 |                 dp_min = dp
138 |             dp_max = dp
139 |     '''
140 |     # Plot dist
141 |     plt.figure(figsize=(10, 8), dpi=100)
142 |     dp_x = []
143 |     dp_y = []
144 |     dp_total = []
145 |     for dp in sorted(base_dps):
146 |         plt.bar(x=dp, height=base_dps[dp], width=1, edgecolor='white', facecolor='blue', align='center', linewidth=0.01)
147 |         dp_x.append(dp)
148 |         dp_y.append(base_dps[dp])
149 |         for i in range(0, base_dps[dp]):
150 |             dp_total.append(dp)
151 |     plt.plot(dp_x, dp_y, linewidth=0.05, linestyle='-', markersize=0, marker=',')
152 | 
153 |     dp_min = 5 #mean-std*1.96
154 |     dp_sum = np.sum(dp_y)  # avoid shadowing the built-in sum()
155 |     top_sum = dp_sum*0.95  # dp_max = smallest DP whose cumulative count reaches 95%
156 |     cnt = 0
157 | 
158 |     for dp in sorted(base_dps):
159 |         if dp < dp_min:
160 |             continue
161 |         cnt += base_dps[dp]
162 |         if cnt >= top_sum:
163 |             dp_max = dp
164 |             break
165 |     plt.xlim(dp_x[0]-1,
 dp_max+1)
166 |     plt.savefig('dist.pdf', format='pdf', bbox_inches='tight')
167 |     print("\033[32m%s\033[0m range=[%d, %d]"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())), dp_min, dp_max))
168 | 
169 |     print("\033[32m%s\033[0m Filtering and writing results"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
170 |     if out_file[-3:].lower() != '.gz':
171 |         out_file += '.gz'
172 |     with gzip.open(out_file, 'wt') as fout:
173 |         fout.write(''.join(base_header))
174 |         for chrn in sorted(base_snps):
175 |             if chrn not in valid_snps:
176 |                 continue
177 |             last_pos = -1
178 |             for pos in sorted(base_snps[chrn]):
179 |                 # filter repeat regions
180 |                 if chrn in repeat_db and is_repeat(repeat_db[chrn], pos):
181 |                     continue
182 |                 # filter base snps with valid snps
183 |                 if pos not in valid_snps[chrn]:
184 |                     continue
185 |                 if base_snps[chrn][pos]['alt'] != valid_snps[chrn][pos]['alt']:
186 |                     continue
187 |                 # filter missing rate
188 |                 if base_snps[chrn][pos]['mr'] > mr:
189 |                     continue
190 | 
191 |                 # filter dp
192 |                 data = base_snps[chrn][pos]['line'].strip().split()
193 |                 dp = int(data[7].split('DP=')[1].split(';')[0])
194 |                 if dp < dp_min or dp > dp_max:
195 |                     continue
196 |                 if last_pos != -1 and pos - last_pos <= md:
197 |                     last_pos = pos
198 |                     continue
199 |                 else:
200 |                     last_pos = pos
201 |                 fout.write(base_snps[chrn][pos]['line'])
202 | 
203 |     print("\033[32m%s\033[0m Finished"%(time.strftime('[%H:%M:%S]',time.localtime(time.time()))))
204 | 
205 | 
206 | if __name__ == "__main__":
207 |     opts = get_opts()
208 |     in_base = opts.base
209 |     in_valid = opts.validation
210 |     in_rep = opts.repeat
211 |     mr = opts.missing_rate/100.0
212 |     md = opts.min_distance
213 |     out_file = opts.output
214 |     snp_filter(in_base, in_valid, in_rep, mr, md, out_file)
215 | 
--------------------------------------------------------------------------------
/bin/SimSID.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | import argparse
5 | import random
6 | 
7 | 
8 | def ArgParser():
9 |     group = argparse.ArgumentParser()
10 |     group.add_argument('-s', '--snp', type=float, help='snp ratio of whole genome, percentage, default: 0.01', default=0.01)
11 |     group.add_argument('-i', '--insertion', type=float, help='insertion ratio of whole genome, percentage, default: 0.01', default=0.01)
12 |     group.add_argument('--insert_length', type=int, help='max length of insertion, default: 10', default=10)
13 |     group.add_argument('-d', '--deletion', type=float, help='deletion ratio of whole genome, percentage, default: 0.01', default=0.01)
14 |     group.add_argument('--delete_length', type=int, help='max length of deletion, default: 10', default=10)
15 |     group.add_argument('--random_length', action="store_true", help='use this argument to generate random lengths for indels', default=False)
16 |     group.add_argument('-v', '--verbose', action="store_true", help='print detailed information', default=False)
17 |     group.add_argument('-r', '--ref', help='origin fasta file of genome', required=True)
18 |     group.add_argument('-o', '--out', help='prefix of simulated data', required=True)
19 | 
20 |     return group.parse_args()
21 | 
22 | 
23 | def ReadFASTA(inputFASTA):
24 |     fastaDB = {}
25 |     posDB = {}
26 |     with open(inputFASTA, 'r') as fIN:
27 |         id = ''
28 |         seq = ''
29 |         for line in fIN:
30 |             if line[0] == '>':
31 |                 if seq != '':
32 |                     fastaDB[id] = seq
33 |                 id = line.strip()[1:]
34 |                 seq = ''
35 |             else:
36 |                 seq += line.strip()
37 |         fastaDB[id] = seq
38 |     for chrn in fastaDB:
39 |         posDB[chrn] = [1]*len(fastaDB[chrn])  # 1 = position still available, 0 = already used by a simulated variant
40 |     return fastaDB, posDB
41 | 
42 | 
43 | def IsInRegions(regionDB, queryPos):
44 |     s = 0
45 |     e = len(regionDB)-1
46 |     refRegions = sorted(regionDB)
47 |     if len(refRegions) == 0:
48 |         return False
49 |     while s<=e:
50 |         mid = int((s+e)/2)
51 |         if refRegions[mid][0] < queryPos:
52 |             s = mid+1
53 |         elif refRegions[mid][0] > queryPos:
54 |             e = mid-1
55 |         else:
56 |             return True
57 |     if e >= 0 and refRegions[e][1] >= queryPos:  # e < 0 means queryPos lies before the first region
58 |         return True
59 |     else:
60 |         return False
61 | 
62 | 
63 | def GenDelRegions(fastaDB, posDB, delRatio, delLength, isRandom, isVerbose):
64 |     delRegions = {}
65 |     for chrn in fastaDB:
66 |         chrLen = len(fastaDB[chrn])
67 |         if isRandom:
68 |             avgDelLen = int(delLength/2)
69 |         else:
70 |             avgDelLen = delLength
71 |         cntDel = int(chrLen*delRatio/avgDelLen)
72 |         print("%s\tdeletions count: %d"%(chrn, cntDel))
73 |         delRegions[chrn] = []
74 |         for i in range(0, cntDel):
75 |             if isVerbose:
76 |                 print("Generating: %d"%(i+1))
77 |             if isRandom:
78 |                 curDelLen = random.randint(1, delLength)
79 |             else:
80 |                 curDelLen = delLength
81 |             sp = random.randint(0, chrLen-curDelLen)
82 |             ep = sp+curDelLen
83 |             while posDB[chrn][sp] == 0 or posDB[chrn][ep-1] == 0:
84 |                 sp = random.randint(0, chrLen-curDelLen)
85 |                 ep = sp+curDelLen
86 |             delRegions[chrn].append([sp, ep])
87 |             for i in range(sp, ep):
88 |                 posDB[chrn][i] = 0
89 |     for chrn in delRegions:
90 |         delRegions[chrn] = sorted(delRegions[chrn])
91 | 
92 |     return delRegions
93 | 
94 | 
95 | def GenSeq(seqLen):
96 |     nucType = ['A', 'T', 'G', 'C']
97 |     seq = ''
98 |     for i in range(0, seqLen):
99 |         seq += nucType[random.randint(0, 3)]
100 |     return seq
101 | 
102 | 
103 | def GenInsPosSeqs(delRegions, fastaDB, posDB, insRatio, insLength, isRandom, isVerbose):
104 |     insList = {}
105 |     insSeqs = {}
106 |     for chrn in fastaDB:
107 |         chrLen = len(fastaDB[chrn])
108 |         if isRandom:
109 |             avgInsLen = int(insLength/2)
110 |         else:
111 |             avgInsLen = insLength
112 |         cntIns = int(chrLen*insRatio/avgInsLen)
113 |         print("%s\tinsertions count: %d"%(chrn, cntIns))
114 |         insList[chrn] = []
115 |         insSeqs[chrn] = []
116 |         for i in range(0, cntIns):
117 |             if isVerbose:
118 |                 print("Generating: %d"%(i+1))
119 |             if isRandom:
120 |                 curInsLen = random.randint(1, insLength)
121 |             else:
122 |                 curInsLen = insLength
123 |             pos = random.randint(0, chrLen-1)
124 |             while posDB[chrn][pos] == 0:
125 |                 pos = random.randint(0, chrLen-1)
126 |             insList[chrn].append(pos)
127 |             posDB[chrn][pos] = 0
128 |             insSeqs[chrn].append(GenSeq(curInsLen))
129 |     return insList, insSeqs
130 | 
131 | 
132 | def GenSNPPos(delRegions, fastaDB, posDB, snpRatio, insPos, isVerbose):
133 |     snpSeq = {}
134 |     snpPos = {}
135 |     nucType = ['A', 'T', 'G', 'C']
136 |     for chrn in fastaDB:
137 |         snpSeq[chrn] = []
138 |         snpPos[chrn] = []
139 |         chrLen = len(fastaDB[chrn])
140 |         cntSNP = int(chrLen*snpRatio)
141 |         print("%s\tSNPs count: %d"%(chrn, cntSNP))
142 |         for i in range(cntSNP):
143 |             if isVerbose:
144 |                 print("Generating: %d"%(i+1))
145 |             pos = random.randint(0, chrLen-1)
146 |             while posDB[chrn][pos] == 0:
147 |                 pos = random.randint(0, chrLen-1)
148 |             SNP = nucType[random.randint(0, 3)]
149 |             while SNP == fastaDB[chrn][pos]:
150 |                 SNP = nucType[random.randint(0, 3)]
151 |             snpSeq[chrn].append(SNP)
152 |             snpPos[chrn].append(pos)
153 |             posDB[chrn][pos] = 0
154 |     return snpPos, snpSeq
155 | 
156 | 
157 | def SimSID(snpRatio, insRatio, delRatio, insLength, delLength, isRandom, isVerbose, inputFASTA, outPrefix):
158 |     print("SNP Ratio = %.2f%%\nINS Ratio = %.2f%%\nDEL Ratio = %.2f%%\nINS Length
 = %d\nDEL Length = %d\nRandom: %s\nVerbose: %s\nInput file: %s\nOut prefix: %s"%(snpRatio*100, insRatio*100, delRatio*100, insLength, delLength, isRandom, isVerbose, inputFASTA, outPrefix))
159 |     random.seed()
160 |     print("Reading fasta")
161 |     fastaDB, posDB = ReadFASTA(inputFASTA)
162 |     print("Generating deletions")
163 |     delRegions = GenDelRegions(fastaDB, posDB, delRatio, delLength, isRandom, isVerbose)
164 | 
165 |     print("Generating insertions")
166 |     insPos, insSeq = GenInsPosSeqs(delRegions, fastaDB, posDB, insRatio, insLength, isRandom, isVerbose)
167 | 
168 |     print("Generating SNPs")
169 |     snpPos, snpSeq = GenSNPPos(delRegions, fastaDB, posDB, snpRatio, insPos, isVerbose)
170 | 
171 |     print("Writing information")
172 |     with open(outPrefix+"_snps.txt", 'w') as fSNP:
173 |         fSNP.write("Chromosome\tPosition\tOrigin\tNew\n")
174 |         writeStrings = []
175 |         for chrn in sorted(fastaDB):
176 |             for i in range(0, len(snpPos[chrn])):
177 |                 pos = snpPos[chrn][i]
178 |                 writeStrings.append([chrn, pos+1, fastaDB[chrn][pos], snpSeq[chrn][i]])
179 |         for wString in sorted(writeStrings):
180 |             fSNP.write('\t'.join(list(map(str, wString)))+'\n')
181 | 
182 |     with open(outPrefix+"_indel.txt", 'w') as fIndel:
183 |         fIndel.write("Chromosome\tPosition\tOrigin\tNew\n")
184 |         writeStrings = []
185 |         for chrn in sorted(fastaDB):
186 |             for i in range(0, len(delRegions[chrn])):
187 |                 sp = delRegions[chrn][i][0]
188 |                 ep = delRegions[chrn][i][1]
189 |                 writeStrings.append([chrn, sp+1, fastaDB[chrn][sp: ep], '[]'])
190 |         for chrn in sorted(fastaDB):
191 |             for i in range(0, len(insPos[chrn])):
192 |                 pos = insPos[chrn][i]
193 |                 writeStrings.append([chrn, pos+1, '[]', insSeq[chrn][i]])
194 |         for wString in sorted(writeStrings):
195 |             fIndel.write('\t'.join(list(map(str, wString)))+'\n')
196 | 
197 |     print("Writing fasta")
198 |     with open(outPrefix+"_sim.fasta", 'w') as fSim:
199 |         for chrn in sorted(fastaDB):
200 |             newSeq = list(fastaDB[chrn])
201 |             for i in range(0, len(snpPos[chrn])):
202 |                 newSeq[snpPos[chrn][i]] = snpSeq[chrn][i]
203 |             for i in range(0, len(insPos[chrn])):
204 |                 newSeq[insPos[chrn][i]] = insSeq[chrn][i] + newSeq[insPos[chrn][i]]
205 |             for i in range(0, len(delRegions[chrn])):
206 |                 for j in range(delRegions[chrn][i][0], delRegions[chrn][i][1]):
207 |                     newSeq[j] = ''
208 |             fSim.write(">%s\n%s\n"%(chrn, ''.join(newSeq)))
209 |     print("Success")
210 | 
211 | 
212 | if __name__ == "__main__":
213 |     opts = ArgParser()
214 |     snpRatio = opts.snp/100.0
215 |     insRatio = opts.insertion/100.0
216 |     delRatio = opts.deletion/100.0
217 |     inputFASTA = opts.ref
218 |     outPrefix = opts.out
219 |     insLength = opts.insert_length
220 |     delLength = opts.delete_length
221 |     isRandom = opts.random_length
222 |     isVerbose = opts.verbose
223 |     SimSID(snpRatio, insRatio, delRatio, insLength, delLength, isRandom, isVerbose, inputFASTA, outPrefix)
224 | 
--------------------------------------------------------------------------------
/bin/easyGoKegg.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | library(optparse)
3 | library(clusterProfiler)
4 | library(KEGGREST)
5 | library(dplyr)
6 | library(stringr)
7 | library(AnnotationForge)
8 | library(jsonlite)
9 | library(purrr)
10 | library(RCurl)
11 | library(ggplot2)
12 | 
13 | download_plant_path <- function(db_file) {
14 |   ### Download plant pathway
15 |   org <- data.frame(keggList("organism"))
16 |   plants <- org[grep("Plants", org$phylogeny), ]
17 |   plant_pathways_total <- vector()
18 | 
19 |   for (i in
 seq_along(plants$organism)) {
20 |     try({
21 |       pathways <- keggLink("pathway", plants[i, 2])
22 |       pathways <- sub(paste(".*", plants[i, 2], sep = ""), "", pathways)
23 |       pathways <- unique(pathways)
24 |       plant_pathways_total <- append(plant_pathways_total, pathways)
25 |       plant_pathways_total <- unique(plant_pathways_total)
26 |     })
27 |   }
28 | 
29 |   plant_pathways_total <- paste0("ko", plant_pathways_total)
30 |   write.table(plant_pathways_total, file = db_file, quote = FALSE, row.names = FALSE, col.names = FALSE)
31 | }
32 | 
33 | load_plant_path <- function(db_file) {
34 |   ### Load plant pathway
35 |   plant_pathways_total <- read.table(db_file, sep = "", header = FALSE)
36 |   colnames(plant_pathways_total) <- "Pathway"
37 |   plant_pathways_total
38 | }
39 | 
40 | generate_go_db <- function(annotation_file, db_path, genus, species, tax_id) {
41 |   cat("Reading annotation file\n")
42 |   egg <- read.table(annotation_file, header = TRUE, sep = "\t", quote = "")
43 |   egg[egg == ""] <- NA
44 | 
45 |   gterms <- egg %>%
46 |     dplyr::select("query", "GOs") %>%
47 |     na.omit()
48 | 
49 |   gene2go <- data.frame(
50 |     GID = character(),
51 |     GO = character(),
52 |     EVIDENCE = character()
53 |   )
54 | 
55 |   gene_ids <- egg$query
56 |   eggnog_lines_with_go <- egg$GOs != "-" & egg$GOs != ""
57 |   eggnog_annoations_go <- strsplit(egg$GOs[eggnog_lines_with_go], ",")
58 |   gene2go <- data.frame(
59 |     GID = rep(gene_ids[eggnog_lines_with_go],
60 |       times = sapply(eggnog_annoations_go, length)
61 |     ),
62 |     GO = unlist(eggnog_annoations_go),
63 |     EVIDENCE = "IEA"
64 |   )
65 | 
66 |   gene_info <- egg %>%
67 |     dplyr::select(GID = "query", GENENAME = "Preferred_name") %>%
68 |     na.omit()
69 | 
70 |   gene2ko <- egg %>%
71 |     dplyr::select(GID = "query", Ko = "KEGG_ko") %>%
72 |     na.omit()
73 | 
74 |   gene2ko$Ko <- gsub("ko:", "", gene2ko$Ko)
75 | 
76 |   cat("Saving GO database\n")
77 |   # genus, species and tax_id come from the command-line options, so the
78 |   # package created here matches the go_db_name built in main_pipe()
79 |   # (org.<first letter of genus><species>.eg.db)
80 |   makeOrgPackage(
81 |     gene_info = gene_info,
82 |     go = gene2go,
83 |     ko = gene2ko,
84 |     version = "0.1",
85 |     maintainer = "maintainer <maintainer@example.com>",
86 |     author = "author <author@example.com>",
87 |     outputDir = db_path,
88 |     tax_id = tax_id,
89 |     genus = genus,
90 |     species = species,
91 |     goTable = "go"
92 |   )
93 | 
94 |   anno_db <- list(go = gene2go, ko = gene2ko)
95 |   return(anno_db)
96 | }
97 | 
98 | update_kegg_db <- function(db_path, kegg_json, kegg_db_file) {
99 |   url <- "https://www.kegg.jp/kegg-bin/download_htext?htext=ko00001&format=json&filedir="
100 |   if (!file.exists(kegg_json)) {
101 |     json <- paste(db_path, "ko00001.json", sep = "/")
102 |     download.file(url, json)
103 |   } else {
104 |     json <- kegg_json
105 |   }
106 |   pathway2name <- tibble(Pathway = character(), Name = character())
107 |   ko2pathway <- tibble(Ko = character(), Pathway = character())
108 |   kegg <- fromJSON(json)
109 |   for (a in seq_along(kegg[["children"]][["children"]])) {
110 |     A <- kegg[["children"]][["name"]][[a]]
111 |     for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
112 |       B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
113 |       for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
114 |         pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
115 |         pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
116 |         pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
117 |         pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
118 |         kos_info <-
kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]] 119 | kos <- str_match(kos_info, "K[0-9]*")[, 1] 120 | ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos)))) 121 | } 122 | } 123 | } 124 | save(pathway2name, ko2pathway, file = kegg_db_file) 125 | } 126 | 127 | load_kegg_db <- function(kegg_db_file, gene2ko) { 128 | kegg_db <- new.env() 129 | load(file = kegg_db_file, envir = kegg_db) 130 | gene2pathway <- gene2ko %>% 131 | left_join(kegg_db$ko2pathway, by = "Ko", relationship = "many-to-many") %>% 132 | dplyr::select("GID", "Pathway") %>% 133 | na.omit() 134 | kegg_db <- list(pathway2name = kegg_db$pathway2name, gene2pathway = gene2pathway) 135 | return(kegg_db) 136 | } 137 | 138 | keep_plant_path_only <- function(pathway2name, gene2pathway, plant_pathways_total) { 139 | pathway2name <- pathway2name %>% filter(pathway2name$Pathway %in% plant_pathways_total$Pathway) 140 | gene2pathway <- gene2pathway %>% filter(gene2pathway$Pathway %in% pathway2name$Pathway) 141 | filtered_kegg_db <- list(pathway2name = pathway2name, gene2pathway = gene2pathway) 142 | return(filtered_kegg_db) 143 | } 144 | 145 | run_go_kegg <- function(db_path, gene2pathway, pathway2name, gene_file, 146 | pvalue_cutoff, qvalue_cutoff, padjust_method, ontology) { 147 | gene <- read.table(gene_file, header = FALSE) 148 | gene_list <- gene[, 1] 149 | GO <- enrichGO( 150 | gene = gene_list, 151 | OrgDb = "org.CCUSTOM.eg.db", 152 | keyType = "GID", 153 | ont = "ALL", 154 | pAdjustMethod = padjust_method, 155 | pvalueCutoff = pvalue_cutoff, 156 | qvalueCutoff = qvalue_cutoff 157 | ) 158 | 159 | cat("Saving GO result\n") 160 | GO_df <- as.data.frame(GO) 161 | write.table(GO_df, file = "GO.results.tsv", sep = "\t", quote = FALSE) 162 | 163 | cat("Saving GO barplot\n") 164 | pdf(file = "GO_barplot.pdf", width = 15, height = 20) 165 | print(barplot(GO, drop = TRUE, showCategory = 10, split = "ONTOLOGY") + facet_grid(ONTOLOGY ~ ., scale = "free")) 166 | dev.off() 167 | 168 | cat("Saving GO bubble\n") 169 | pdf(file = "GO_bubble.pdf", width = 15, height = 20) 170 | print(dotplot(GO, showCategory = 10, split = "ONTOLOGY") + facet_grid(ONTOLOGY ~ ., scale = "free")) 171 | dev.off() 172 | 173 | KEGG <- enricher( 174 | gene = gene_list, 175 | TERM2GENE = gene2pathway[c("Pathway", "GID")], 176 | TERM2NAME = pathway2name[c("Pathway", "Name")], 177 | pAdjustMethod = padjust_method, 178 | pvalueCutoff = pvalue_cutoff, 179 | qvalueCutoff = qvalue_cutoff 180 | ) 181 | 182 | 183 | cat("Saving KEGG result\n") 184 | KEGG_df <- as.data.frame(KEGG) 185 | write.table(KEGG_df, file = "KEGG.results.tsv", sep = "\t", quote = FALSE) 186 | 187 | cat("Saving KEGG barplot\n") 188 | pdf(file = "KEGG_barplot.pdf", width = 15, height = 20) 189 | print(barplot(KEGG, drop = TRUE, showCategory = 10)) 190 | dev.off() 191 | 192 | cat("Saving KEGG bubble\n") 193 | pdf(file = "KEGG_bubble.pdf", width = 15, height = 20) 194 | print(dotplot(KEGG)) 195 | dev.off() 196 | } 197 | 198 | main_pipe <- function() { 199 | opt <- parse_args(opt_parser) 200 | opt_names <- names(opt) 201 | if ("input" %in% opt_names && "anno" %in% opt_names && "db" %in% opt_names) { 202 | anno_file <- opt$anno 203 | db_path <- opt$db 204 | if (!dir.exists(db_path)) { 205 | dir.create(db_path) 206 | } 207 | go_db_name <- paste("org.", substr(opt$genus, 1, 1), opt$species, ".eg.db", sep = "") 208 | db_pack <- paste(db_path, go_db_name, sep = "/") 209 | 210 | pvalue_cutoff <- opt$pvalue 211 | qvalue_cutoff <- opt$qvalue 212 | 
     padjust_method <- opt$padjust
213 |     ontology <- opt$ontology
214 |     cat("Generating GO database\n")
215 |     if (!file.exists(anno_file)) {
216 |       cat("Fatal: annotation file does not exist\n")
217 |       return()
218 |     }
219 | 
220 |     if (file.exists(db_pack)) {
221 |       unlink(db_pack, recursive = TRUE)
222 |     }
223 |     anno_db <- generate_go_db(anno_file, db_path, opt$genus, opt$species, opt$tax_id)
224 | 
225 |     cat("Loading GO database\n")
226 |     install.packages(db_pack, repos = NULL, type = "source")
227 |     do.call(library, list(go_db_name))
228 |     kegg_json <- ""
229 |     if ("kegg_json" %in% opt_names) {
230 |       kegg_json <- opt$kegg_json
231 |     }
232 |     kegg_db_file <- paste(db_path, "KEGG_db.RData", sep = "/")
233 |     if (!file.exists(kegg_db_file) || "update" %in% opt_names) {
234 |       cat("Generating KEGG database\n")
235 |       update_kegg_db(db_path, kegg_json, kegg_db_file)
236 |     }
237 | 
238 |     cat("Loading KEGG database\n")
239 |     kegg_db <- load_kegg_db(kegg_db_file, anno_db$ko)
240 | 
241 |     if ("plant" %in% opt_names) {
242 |       cat("Keeping plants pathway\n")
243 |       plants_kegg_file <- opt$plant_kegg
244 |       if (is.null(plants_kegg_file) || !file.exists(plants_kegg_file)) {
245 |         plants_kegg <- "plants.kegg.txt"
246 |         plants_kegg_file <- paste(db_path, plants_kegg, sep = "/")
247 |         download_plant_path(plants_kegg_file)
248 |       }
249 |       plant_pathways_total <- load_plant_path(plants_kegg_file)
250 |       kegg_db <- keep_plant_path_only(kegg_db$pathway2name, kegg_db$gene2pathway, plant_pathways_total)
251 |     }
252 | 
253 |     gene_file <- opt$input
254 |     cat("Running GO and KEGG\n")
255 |     run_go_kegg(
256 |       go_db_name, kegg_db$gene2pathway, kegg_db$pathway2name, gene_file,
257 |       pvalue_cutoff, qvalue_cutoff, padjust_method, ontology
258 |     )
259 | 
260 |     cat("Finished\n")
261 |   } else {
262 |     print_help(opt_parser)
263 |   }
264 | }
265 | 
266 | opt_list <- list(
267 |   make_option(c("-i", "--input"), type = "character", help = "Input gene list file"),
268 |   make_option(c("-a", "--anno"), type = "character", help = "Functional annotation file"),
269 |   make_option(c("-d", "--db"), type = "character", help = "Database path"),
270 |   make_option(c("--kegg_json"), type = "character", help = "Pre-downloaded kegg json file"),
271 |   make_option(c("--genus"),
272 |     type = "character", help = "Genus name for creating GO database, default=\"Custom genus\"",
273 |     default = "Custom genus"
274 |   ),
275 |   make_option(c("--pvalue"), type = "numeric", help = "P value cutoff for GO and KEGG, default=0.05", default = 0.05),
276 |   make_option(c("--qvalue"), type = "numeric", help = "Q value cutoff for GO and KEGG, default=0.05", default = 0.05),
277 |   make_option(c("--padjust"),
278 |     type = "character", help = "P adjust method for GO and KEGG, default=\"BH\"",
279 |     default = "BH"
280 |   ),
281 |   make_option(c("--ontology"),
282 |     type = "character", help = "Ontology for GO, default=\"ALL\"",
283 |     default = "ALL"
284 |   ),
285 |   make_option(c("--species"),
286 |     type = "character", help = "Species name for creating GO database, default=\"CUSTOM\"",
287 |     default = "CUSTOM"
288 |   ),
289 |   make_option(c("--tax_id"),
290 |     type = "character", help = "Tax id for creating GO database, default=\"0000\"",
291 |     default = "0000"
292 |   ),
293 |   make_option(c("--update"), action = "store_true", help = "Update databases"),
294 |   make_option(c("--plant"), action = "store_true", help = "enrich with plant pathway only"),
295 |   make_option(c("--plant_kegg"), type = "character", help = "Pre-generated plant kegg db file")
296 | )
297 | 
298 | opt_parser <- OptionParser(option_list = opt_list, usage = "This
 script is used for running GO and KEGG")
299 | main_pipe()
300 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | 
3 | This repository contains several scripts for bioinformatics.
4 | 
5 | ## Installation
6 | 
7 | ```shell
8 | git clone https://github.com/sc-zhang/bioscripts.git
9 | cd bin
10 | chmod +x *
11 | # Optional, add the following line to your ~/.bash_profile
12 | export PATH=/path/to/bioscripts/bin:$PATH
13 | ```
14 | 
15 | ## Usage
16 | 
17 | 1. approximate_cnv.py is a script for approximating CNV (Copy Number Variation) with read depth.
18 | 
19 |     ```shell
20 |     approximate_cnv.py -bam <bam_list_file> -g <genome_size> -l <read_length> -bed <bed_file> -o <result_file> [-t <threads>]
21 | 
22 |     Usage:
23 |     -bam: a list file, each line is the full path of a bam file
24 |     -g: the size of genome, integer
25 |     -l: the length of read, integer
26 |     -bed: bed file containing 4 columns: chromosome, start position, end position, gene name, separated by tabs
27 |     -o: result file
28 |     -t: threads, integer
29 |     ```
30 | 
31 | 2. average_fpkm.py is a script for calculating the average of fpkm values.
32 | 
33 |     ```shell
34 |     # Dependencies
35 |     # Python modules: numpy
36 |     average_fpkm.py
37 |     ```
38 | 
39 | 3. blast2heatmap.py is a script for drawing a heatmap with a blast file of format 6.
40 | 
41 |     ```shell
42 |     # Dependencies
43 |     # Software: R, bedtools
44 |     # R modules: pheatmap
45 |     blast2heatmap.py <ref_fasta> <blast_file> <window_size> <out_name> <identity_threshold> <match_length_threshold>
46 |     ```
47 | 
48 | 4. calc_gap_cnt.py is a script for calculating the gap count of all sequences.
49 | 
50 |     ```shell
51 |     calc_gap_cnt.py <fasta_file>
52 |     ```
53 | 
54 | 5. calc_gene_ovlp_te.py is a script for calculating the overlap ratio of genes with TE regions.
55 | 
56 |     ```shell
57 |     calc_gene_ovlp_te.py
58 |     Usage:
59 |     ovlp_stat: the output file
60 |     ```
61 | 
62 | 6. convert_collinearity_from_MCScanX_to_Circos.py is a script for converting a collinearity file from an MCScanX result to
63 |     a link file for Circos
64 | 
65 |     ```shell
66 |     convert_collinearity_from_MCScanX_to_Circos.py
67 |     ```
68 | 
69 | 7. convert_gbff_to_fasta.py is a script for converting an NCBI GBFF file to a fasta file.
70 | 
71 |     ```shell
72 |     convert_gbff_to_fasta.py
73 |     ```
74 | 
75 | 8. convert_QTL_info.py is a script for converting contig-level QTL information to chromosome-level with an agp file.
76 | 
77 |     ```shell
78 |     convert_QTL_info.py
79 |     ```
80 | 
81 | 9. convert_simple_for_circos.py is a script for converting a JCVI simple file to a link file for circos.
82 | 
83 |     ```shell
84 |     convert_simple_for_circos.py
85 |     ```
86 | 
87 | 10. dup_dotplot.pl is a script for plotting a dotplot with monoploid and polyploid.
88 | 
89 |     ```shell
90 |     dup_dotplot.pl -g reference_genome -r ref_id -q query_id -n number_of_dup -t threads
91 |     Usage:
92 |     ref_id: reference cds and bed name, like: Sb, Sb.cds and Sb.bed must exist
93 |     query_id: query cds and bed name, like: Os
94 |     number_of_dup: number of duplications
95 |     threads: default 1
96 |     ```
97 | 
98 | 11. eval_filled_gaps.py is a script for evaluating whether gaps have been filled
99 | 
100 |     ```shell
101 |     eval_filled_gaps.py
102 |     ```
103 | 
104 | 12. extract_all_sv_from_nucmer_delta.py is a script for extracting SV from a delta file generated by nucmer.
105 | 
106 |     ```shell
107 |     extract_all_sv_from_nucmer_delta.py
108 |     ```
109 | 
110 | 13. extract_gene_from_gff.py is a script for extracting genes from a gff3 file with a gene id list and generating a bed
111 |     file (a sketch of the idea follows this entry).
112 | 
113 |     ```shell
114 |     extract_gene_from_gff.py <gene_list> <in_gff3> <out_bed>
115 |     ```
116 |
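    A minimal sketch of the idea behind this kind of extraction (the `ID=` attribute convention and the argument order are assumptions for illustration, not the script's actual code):

    ```python
    import sys

    def extract_genes(gene_list_file, gff3_file, bed_file):
        # collect the wanted gene IDs
        with open(gene_list_file) as fin:
            wanted = set(line.strip() for line in fin if line.strip())
        with open(gff3_file) as fin, open(bed_file, 'w') as fout:
            for line in fin:
                if line.startswith('#') or not line.strip():
                    continue
                cols = line.rstrip('\n').split('\t')
                if len(cols) < 9 or cols[2] != 'gene':
                    continue
                # assume attributes look like "ID=gene0001;Name=..."
                attrs = dict(kv.split('=', 1) for kv in cols[8].split(';') if '=' in kv)
                gid = attrs.get('ID', '')
                if gid in wanted:
                    # GFF3 is 1-based inclusive, BED is 0-based half-open
                    fout.write("%s\t%d\t%s\t%s\n" % (cols[0], int(cols[3]) - 1, cols[4], gid))

    if __name__ == "__main__":
        extract_genes(sys.argv[1], sys.argv[2], sys.argv[3])
    ```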
117 | 14. extract_vcf.py is a script for extracting vcf with a bed file
118 | 
119 |     ```shell
120 |     extract_vcf.py
121 |     ```
122 | 
123 | 15. filter_cds.py is a script for removing invalid CDS sequences.
124 | 
125 |     ```shell
126 |     filter_cds.py
127 |     ```
128 | 
129 | 16. find_gff_ovlp_regions.py is a script for getting overlap regions from a gff3 file.
130 | 
131 |     ```shell
132 |     find_gff_ovlp_regions.py
133 |     ```
134 | 
135 | 17. get_chr_len.py is a script for calculating the length of chromosomes in a fasta file
136 | 
137 |     ```shell
138 |     get_chr_len.py
139 |     ```
140 | 
141 | 18. get_genes_from_range.py is a script for getting genes with a bed file.
142 | 
143 |     ```shell
144 |     get_genes_from_range.py
145 |     ```
146 | 
147 | 19. get_genes_region_from_gff.py is a script for getting gene regions from a gff3 file.
148 | 
149 |     ```shell
150 |     get_genes_region_from_gff.py <gene_list> <in_gff> <out_bed>
151 |     ```
152 | 
153 | 20. get_gff_with_list.py is a script for extracting a gff3 file with gene IDs.
154 | 
155 |     ```shell
156 |     get_gff_with_list.py
157 |     ```
158 | 
159 | 21. get_seq_from_range.py is a script for extracting sequence fragments with a bed file.
160 | 
161 |     ```shell
162 |     get_seq_from_range.py
163 |     ```
164 | 
165 | 22. group_exon_and_intron.py is a script for classifying vcf positions into exon and intron.
166 | 
167 |     ```shell
168 |     group_exon_and_intron.py
169 |     ```
170 | 
171 | 23. group_SNP_exon_and_intron.py is a script for classifying SNP positions into exon and intron.
172 | 
173 |     ```shell
174 |     group_SNP_exon_and_intron.py
175 |     ```
176 | 
177 | 24. merge_bed_regions.py is a script for merging bed files based on distance
178 | 
179 |     ```shell
180 |     merge_bed_regions.py
181 |     ```
182 | 
183 | 25. modify_geno_with_snp_mummer.py is a script for modifying columns in a geno file with the snp result generated by show-snps
184 |     of MUMmer
185 | 
186 |     ```shell
187 |     modify_geno_with_snp_mummer.py
188 |     ```
189 | 
190 | 26. nucmer_extract_all_sv.py is a script for running nucmer and extracting all SV.
191 | 
192 |     ```
193 |     # Dependencies
194 |     # Software: nucmer
195 |     nucmer_extract_all_sv.py
196 |     ```
197 | 
198 | 27. nucmer_statistics.py & nucmer_statistics_all_sv.py are scripts for running nucmer and generating statistics.
199 | 
200 |     ```shell
201 |     nucmer_statistics.py
202 |     nucmer_statistics_all_sv.py
203 |     ```
204 | 
205 | 28. quick_extract_fastx.py is a script for extracting a fasta or fastq file with a list.
206 | 
207 |     ```shell
208 |     quick_extract_fastx.py
209 |     ```
210 | 
211 | 29. quick_mask_genome.py is a script for masking a genome with a bed file.
212 | 
213 |     ```shell
214 |     quick_mask_genome.py
215 |     ```
216 | 
217 | 30. remove_region_by_blast_result.py is a script for removing regions in chromosomes with blast results.
218 | 
219 |     ```shell
220 |     remove_region_by_blast_result.py
221 |     Usage:
222 |     <blast_files> is a list of blast files separated by commas
223 |     ```
224 | 
225 | 31. rename_ID.py is a script for sorting and renaming IDs with the in_gff file, and renaming IDs in fasta files.
226 | 
227 |     ```shell
228 |     rename_ID.py
229 |     ```
230 | 
231 | 32. SentieonSNP_filter.py is a script for filtering the vcf result generated by Sentieon (the decision logic is sketched after the usage below).
232 | 
233 |     ```shell
234 |     usage: SentieonSNP_filter.py [-h] -b BASE -v VALIDATION [-r REPEAT] -o OUTPUT [-m MISSING_RATE] [-d MIN_DISTANCE]
235 | 
236 |     options:
237 |       -h, --help            show this help message and exit
238 |       -b BASE, --base BASE  Input vcf file as base
239 |       -v VALIDATION, --validation VALIDATION
240 |                             Input vcf file as validation
241 |       -r REPEAT, --repeat REPEAT
242 |                             Repeat regions file, gff format
243 |       -o OUTPUT, --output OUTPUT
244 |                             Output vcf file based on base vcf file, compressed with gzip
245 |       -m MISSING_RATE, --missing_rate MISSING_RATE
246 |                             Missing rate threshold, percentage, default: 40
247 |       -d MIN_DISTANCE, --min_distance MIN_DISTANCE
248 |                             Minimum distance between two snp sites, default: 0
249 |     ```
250 |
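    In outline, a site from the base vcf is kept only if it survives every check below; a compact sketch of the decision logic (function and field names are hypothetical, thresholds follow the options above):

    ```python
    def keep_site(site, valid_alts, dp_min, dp_max, last_pos, max_missing=0.4, min_dist=0):
        """Return True if a biallelic SNP passes all of the filters."""
        if len(site['ref']) > 1 or len(site['alt']) > 1:        # indels are dropped
            return False
        alt = valid_alts.get((site['chrom'], site['pos']))
        if alt is None or alt != site['alt']:                   # must be confirmed by the validation vcf
            return False
        if site['missing_rate'] > max_missing:                  # too many ./. genotypes
            return False
        if not dp_min <= site['dp'] <= dp_max:                  # total depth outside the accepted range
            return False
        if last_pos is not None and site['pos'] - last_pos <= min_dist:
            return False                                        # too close to the preceding site
        return True
    ```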
251 | 33. SeqStat.py is a script for generating statistics with a fasta|fastq|bam file.
252 | 
253 |     ```shell
254 |     SeqStat.py <in_fasta|fastq|bam> [out_stat]
255 |     ```
256 | 
257 | 34. SimContigs.py & SimCollapse.py are scripts for simulating collapsed contigs.
258 | 
259 |     ```shell
260 |     usage: SimContigs.py [-h] [--min MIN] [--max MAX] [-n N50] -i INPUT -o OUTPUT
261 | 
262 |     options:
263 |       -h, --help            show this help message and exit
264 |       --min MIN             minimum length of contig, default: 15k, you can use a number or a string ending with k,m
265 |       --max MAX             maximum length of contig, default: 5m, you can use a number or a string ending with k,m
266 |       -n N50, --n50 N50     size of N50, default: 500k, you can use a number or a string ending with k,m
267 |       -i INPUT, --input INPUT
268 |                             origin fasta file of genome
269 |       -o OUTPUT, --output OUTPUT
270 |                             filename of simulated data
271 | 
272 |     ```
273 | 
274 | 35. SimCollapse.py
275 | 
276 |     ```shell
277 |     usage: SimCollapse.py [-h] -a A_CONTIGS -b B_CONTIGS -p PREFIX -o OUTPUT -s BLAST [-c COLLAPSE]
278 | 
279 |     options:
280 |       -h, --help            show this help message and exit
281 |       -a A_CONTIGS, --a_contigs A_CONTIGS
282 |                             first fasta file containing contigs generated by SimContigs.py
283 |       -b B_CONTIGS, --b_contigs B_CONTIGS
284 |                             second fasta file containing contigs generated by SimContigs.py
285 |       -p PREFIX, --prefix PREFIX
286 |                             prefix of contig file a and contig file b, divided by comma, like: HA,HB
287 |       -o OUTPUT, --output OUTPUT
288 |                             filename of simulated data
289 |       -s BLAST, --blast BLAST
290 |                             blast file with format 6, must use first file of input as query and second file as database
291 |       -c COLLAPSE, --collapse COLLAPSE
292 |                             percentage of collapse region size, like 5 means 5%, default: 10
293 |     ```
294 | 
295 | 36. simple_ANGSD.py & simple_ANGSD_without_errorCorrect.py are scripts for running ANGSD.
296 | 
297 |     ```shell
298 |     simple_ANGSD.py -l <bam_list> -anc <ancestral_fasta> -r <region> [-out <outgroup_name> -p <bam_path> -ref <ref_fasta>]
299 |     simple_ANGSD_without_errorCorrect.py -l <bam_list> -r <region> [-out <outgroup_name> -p <bam_path>]
300 |     Notice:
301 |     -p: path of bam files, default is current path
302 |     -out: name of outgroup, default is "Outgroup"
303 |     ```
304 | 
305 | 37. simple_JBrowser.py is a script for generating files for JBrowser (the track format is sketched after this entry)
306 | 
307 |     ```shell
308 |     # etc/SimpleJBrowser.conf is a template config file for simple_JBrowser.py
309 |     simple_JBrowser.py -f <fasta_file> [--gff <gff_file> --bed <bed_file> --bam <bam_file> --bw <bw_file> --conf <conf_file>]
310 |     ```
311 |
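    Each track simple_JBrowser.py registers ends up as an INI-style stanza appended to `data/tracks.conf`; a sketch of the BigWig case, mirroring the stanza the script itself writes (the file name is only an example):

    ```python
    def add_bigwig_track(bw, conf_path="data/tracks.conf"):
        # one stanza per track, keyed by the file name with dots replaced
        key = bw.replace('.', '_')
        stanza = ("[tracks." + key + "]\n"
                  "storeClass = JBrowse/Store/SeqFeature/BigWig\n"
                  "urlTemplate = ../" + bw + "\n"
                  "category = Quantitative\n"
                  "type = JBrowse/View/Track/Wiggle/XYPlot\n"
                  "key = " + key + "\n")
        with open(conf_path, "a") as f_track:
            f_track.write(stanza)

    add_bigwig_track("sample1.bam.bw")  # example file name
    ```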
312 | 38. SimSID.py is a script for simulating SNPs, insertions and deletions.
313 | 
314 |     ```shell
315 |     usage: SimSID.py [-h] [-s SNP] [-i INSERTION] [--insert_length INSERT_LENGTH] [-d DELETION] [--delete_length DELETE_LENGTH] [--random_length] [-v] -r REF -o OUT
316 | 
317 |     options:
318 |       -h, --help            show this help message and exit
319 |       -s SNP, --snp SNP     snp ratio of whole genome, percentage, default: 0.01
320 |       -i INSERTION, --insertion INSERTION
321 |                             insertion ratio of whole genome, percentage, default: 0.01
322 |       --insert_length INSERT_LENGTH
323 |                             max length of insertion, default: 10
324 |       -d DELETION, --deletion DELETION
325 |                             deletion ratio of whole genome, percentage, default: 0.01
326 |       --delete_length DELETE_LENGTH
327 |                             max length of deletion, default: 10
328 |       --random_length       use this argument to generate random lengths for indels
329 |       -v, --verbose         print detailed information
330 |       -r REF, --ref REF     origin fasta file of genome
331 |       -o OUT, --out OUT     prefix of simulated data
332 |     ```
333 | 
334 | 39. split_cmd_with_parts.py is a script for splitting a cmd file.
335 | 
336 |     ```shell
337 |     split_cmd_with_parts.py
338 |     ```
339 | 
340 | 40. split_ctg_with_agp.py is a script for splitting a contig fasta file into chromosome groups with an agp file.
341 | 
342 |     ```shell
343 |     split_ctg_with_agp.py
344 |     ```
345 | 
346 | 41. split_fasta_by_chr.py is a script for splitting a fasta into several files, each containing a single chromosome.
347 | 
348 |     ```shell
349 |     split_fasta_by_chr.py
350 |     ```
351 | 
352 | 42. split_fasta_by_count.py is a script for splitting a fasta into several files by file size or sequence count.
353 | 
354 |     ```shell
355 |     split_fasta_by_count.py
356 |     ```
357 | 
358 | 43. split_fasta_by_id.py is a script for splitting a fasta by id.
359 | 
360 |     ```shell
361 |     split_fasta_by_id.py
362 |     ```
363 | 
364 | 44. StatAgp.py & StatAgpDetail.py are scripts for generating statistics with an agp file.
365 | 
366 |     ```shell
367 |     StatAgp.py
368 |     StatAgpDetail.py
369 |     ```
370 | 
371 | 45. subVCF.py is a script for extracting a vcf file with a list file, default missing rate 0.4.
372 | 
373 |     ```shell
374 |     subVCF.py []
375 |     ```
376 | 
377 | 46. transfer_gff3_with_agp.py is a script for transferring positions with old agp and new agp files.
378 | 
379 |     ```shell
380 |     transfer_gff3_with_agp.py
381 |     ```
382 | 
383 | 47. eval_synteny.py is a script for evaluating the assembly consistency between a query genome and a reference genome by
384 |     mapping the cds of the reference genome to both genomes with gmap, then extracting bed files with jcvi; be
385 |     sure that the query bed file only contains the chromosomes and/or contigs you want to evaluate.
386 | 
387 |     ```shell
388 |     usage: eval_synteny.py [-h] -r REF -q QRY -p PAIR
389 | 
390 |     options:
391 |       -h, --help         show this help message and exit
392 |       -r REF, --ref REF  ref.bed
393 |       -q QRY, --qry QRY  qry.bed
394 |     ```
395 | 
396 | 48. get_seq_with_bed.py is a script for extracting sequences from a fasta file with a bed file; the bed file can contain
397 |     4 or 5 fields: [seq_id, start_pos, end_pos, out_id] or [seq_id, start_pos, end_pos, direct, out_id]
398 |     (see the sketch after this entry).
399 | 
400 |     ```shell
401 |     Usage: python get_seq_with_bed.py <in_fasta> <in_bed> <out_fasta>
402 |     ```
403 |
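    Reading that 4- or 5-field layout boils down to treating the direction column as optional; a small sketch (names are illustrative, not the script's actual code):

    ```python
    def parse_bed_line(line):
        """Split one bed line into (seq_id, start, end, direct, out_id)."""
        fields = line.rstrip('\n').split('\t')
        if len(fields) == 4:                    # seq_id, start, end, out_id
            seq_id, start, end, out_id = fields
            direct = '+'
        elif len(fields) == 5:                  # seq_id, start, end, direct, out_id
            seq_id, start, end, direct, out_id = fields
        else:
            raise ValueError("expected 4 or 5 tab-separated fields, got %d" % len(fields))
        return seq_id, int(start), int(end), direct, out_id

    print(parse_bed_line("Chr01\t100\t200\tgeneA"))
    print(parse_bed_line("Chr01\t300\t400\t-\tgeneB"))
    ```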
404 | 49. convert_anchorwave.py is a script for converting an anchorwave maf file to a table file, which contains 7 columns:
405 |     "Ref id, start position, end position, query id, start position, end position,
406 |     variant type"
407 | 
408 |     ```shell
409 |     usage: convert_anchorwave.py [-h] -i INPUT -o OUTPUT
410 | 
411 |     options:
412 |       -h, --help            show this help message and exit
413 |       -i INPUT, --input INPUT
414 |                             Input maf file
415 |       -o OUTPUT, --output OUTPUT
416 |                             Output file
417 |     ```
418 | 
419 | 50. extract_fasta_with_bed.py is a script for extracting sequences with a bed file containing 5 columns:
420 |     "ID, start, end, direction, id",
421 |     positions should be 1-based
422 | 
423 |     ```shell
424 |     Usage: python bin/extract_fasta_with_bed.py <in_fasta> <in_bed> <out_fasta>
425 |     Notice: bed should be 5 columns: "ID, start, end, direction, id", positions should be 1-based
426 |     ```
427 | 
428 | 51. convert_chr_to_ctg_with_agp.py is a script for converting chromosomes to contigs with an AGP file.
429 | 
430 |     ```shell
431 |     Usage: python ./bin/convert_chr_to_ctg_with_agp.py
432 |     ```
433 | 
434 | 52. bam_cov.py is a script for calculating the genome coverage ratio from a bam file.
435 | 
436 |     ```shell
437 |     usage: bam_cov.py [-h] -b BAM -o OUTPUT [-t THREADS]
438 | 
439 |     options:
440 |       -h, --help            show this help message and exit
441 |       -b BAM, --bam BAM     Input bam file, must be indexed
442 |       -o OUTPUT, --output OUTPUT
443 |                             Output statistic
444 |       -t THREADS, --threads THREADS
445 |                             Threads, default=10
446 |     ```
447 | 
448 | 53. sort_gff3.py is a script for sorting genes by chromosomes and positions, and generating new IDs.
449 | 
450 |     ```shell
451 |     Usage: python ./bin/sort_gff3.py <prefix> <in_gff3> <out_gff3>
452 |     Notice: sort and rename id with in_gff by coordinate, the chromosome ID should be like: Chr01 for mono assembly, Chr01A for phased assembly.
453 |     Example: python ./bin/sort_gff3.py CB5 in.gff3 out.gff3
454 |     ```
455 | 
456 | 54. easyGoKegg.R is a simple script for running GO and KEGG with custom emapper annotation
457 | 
458 |     > ### Dependencies
459 |     >**Software**
460 |     > - R
461 |     >
462 |     >**R modules**
463 |     > - optparse
464 |     > - KEGGREST
465 |     > - clusterProfiler
466 |     > - dplyr
467 |     > - stringr
468 |     > - AnnotationForge
469 |     > - jsonlite
470 |     > - purrr
471 |     > - RCurl
472 |     > - ggplot2
473 |     >
474 |     >**Install R packages**
475 |     > - Install with R
476 |     > ```bash
477 |     > install.packages("BiocManager")
478 |     > BiocManager::install(c("optparse","dplyr","stringr", "jsonlite","purrr","ggplot2", "RCurl", "KEGGREST", "clusterProfiler", "AnnotationForge"))
479 |     > ```
480 |     > - Install with conda/mamba
481 |     > ```bash
482 |     > conda create -n GoKegg -c conda-forge -c bioconda bioconductor-annotationforge bioconductor-clusterprofiler bioconductor-keggrest r-dplyr r-ggplot2 r-jsonlite r-optparse r-purrr r-rcurl r-stringr
483 |     > # or
484 |     > mamba create -n GoKegg -c conda-forge -c bioconda bioconductor-annotationforge bioconductor-clusterprofiler bioconductor-keggrest r-dplyr r-ggplot2 r-jsonlite r-optparse r-purrr r-rcurl r-stringr
485 |     > ```
486 |     >### Data preparation
487 |     >- Prepare eggnog result
488 |     > Drop lines starting with "##", remove the "#" at the beginning of the "#query ..." line, and make sure the first line of
489 |     the annotation file looks like the one below (a Python sketch of this clean-up follows this section):
490 |     >```text
491 |     >query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs
492 |     >```
493 |     >
494 |     >- Prepare target gene list
495 |     > A text file with one gene ID per line; the IDs must match those in the annotation file.
496 |     >
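    >A minimal sketch of that clean-up step in Python (file names are examples):
    >```python
    ># strip eggnog-mapper comment lines and the leading '#' of the header row
    >with open("emapper.annotations") as fin, open("emapper.annotations.tsv", "w") as fout:
    >    for line in fin:
    >        if line.startswith("##"):       # drop comment lines
    >            continue
    >        if line.startswith("#query"):   # keep the header, minus the leading '#'
    >            line = line[1:]
    >        fout.write(line)
    >```
    >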
497 |     >### Usage
498 |     >- Run a command like the one below
499 |     >```bash
500 |     >Rscript easyGoKegg.R -i gene.txt -a emapper.annotations -d database
501 |     >```
502 |     >
503 |     >- Detail of parameters
504 |     >```bash
505 |     >Usage: This script is used for running GO and KEGG
506 |     >Options:
507 |     >        -i INPUT, --input=INPUT
508 |     >                Input gene list file
509 |     >        -a ANNO, --anno=ANNO
510 |     >                Functional annotation file
511 |     >        -d DB, --db=DB
512 |     >                Database path
513 |     >        --kegg_json=KEGG_JSON
514 |     >                Pre-downloaded kegg json file
515 |     >        --genus=GENUS
516 |     >                Genus name for creating GO database, default="Custom genus"
517 |     >        --pvalue=PVALUE
518 |     >                P value cutoff for GO and KEGG, default=0.05
519 |     >        --qvalue=QVALUE
520 |     >                Q value cutoff for GO and KEGG, default=0.05
521 |     >        --padjust=PADJUST
522 |     >                P adjust method for GO and KEGG, default="BH"
523 |     >        --ontology=ONTOLOGY
524 |     >                Ontology for GO, default="ALL"
525 |     >        --species=SPECIES
526 |     >                Species name for creating GO database, default="CUSTOM"
527 |     >        --tax_id=TAX_ID
528 |     >                Tax id for creating GO database, default="0000"
529 |     >        --update
530 |     >                Update databases
531 |     >        --plant
532 |     >                enrich with plant pathway only
533 |     >        --plant_kegg=PLANT_KEGG
534 |     >                Pre-generated plant kegg db file
535 |     >        -h, --help
536 |     >                Show this help message and exit
537 |     >```
538 |     >> **Notice** there should be no space in species
539 |     >
540 |     >## Result
541 |     >1. text file of GO and KEGG results
542 |     >2. bubble plot of GO and KEGG results
543 |     >3. bar plot of GO and KEGG results
544 | 
545 | 55. check_cds.py is a script for checking whether CDS sequences are valid.
546 | 547 | ```shell 548 | usage: check_cds.py [-h] -i INPUT [--detail] [-o OUTPUT] 549 | 550 | options: 551 | -h, --help show this help message and exit 552 | -i INPUT, --input INPUT 553 | Input CDS file 554 | --detail If set, output detail information 555 | -o OUTPUT, --output OUTPUT 556 | Output summary file, if not set, output to stdout 557 | ``` 558 | --------------------------------------------------------------------------------