├── src
    └── rMETL
    │   ├── __init__.py
    │   ├── rMETL_version.py
    │   ├── rMETL_genotype.py
    │   ├── rMETL
    │   ├── rMETL_utils.py
    │   ├── rMETL_concensus.py
    │   ├── rMETL_realign.py
    │   ├── rMETL_cmdRunner.py
    │   ├── rMETL_MEIcalling.py
    │   └── rMETL_extraction.py
├── .gitignore
├── LICENSE
├── setup.py
├── README.md
└── Concensus
    └── super_TE.fa


/src/rMETL/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ##############################
2 | ## Folders                  ##
3 | ##############################
4 | /dist/
5 | /src/rMETL.egg-info/
6 | /src/*.pyc
7 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL_version.py:
--------------------------------------------------------------------------------
1 | # * @author: Jiang Tao (tjiang@hit.edu.cn)
2 | 
3 | __version__ = '1.0.4'
4 | __author__ = 'Jiang Tao'
5 | __contact__ = 'tjiang@hit.edu.cn'
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 JiangTao
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | from setuptools import setup, find_packages
 4 | 
# Long-form project description handed to setup() as ``long_description``.
# The trailing backslashes continue each physical line, so the literal is one
# single paragraph without embedded newlines.
LONG_DESCRIPTION = '''Mobile element insertion (MEI) is a major category of structure variations (SVs). \
The rapid development of long read sequencing provides the opportunity to sensitively discover MEIs. \
However, the signals of MEIs implied by noisy long reads are highly complex, due to the repetitiveness \
of mobile elements as well as the serious sequencing errors. Herein, we propose Realignment-based \
Mobile Element insertion detection Tool for Long read (rMETL). rMETL takes advantage of \
its novel chimeric read re-alignment approach to well handle complex MEI signals. \
Benchmarking results on simulated and real datasets demonstrated that rMETL has the ability \
to more sensitivity discover MEIs as well as prevent false positives. \
It is suited to produce high quality MEI callsets in many genomics studies.'''

# Package metadata and installation layout for setuptools.
setup(
    name = "rMETL",
    # The same version string is also recorded in src/rMETL/rMETL_version.py.
    version = "1.0.4",
    description = "realignment-based Mobile Element insertion detection Tool for Long read",
    author = "Jiang Tao",
    author_email = "tjiang@hit.edu.cn",
    url = "https://github.com/tjiangHIT/rMETL",
    license = "MIT",
    # Packages are discovered under src/ and src/ is mapped to the root package.
    packages = find_packages("src"),
    package_dir = {"": "src"},
    data_files = [("", ["LICENSE"])],
    # The extension-less launcher script is installed as the ``rMETL`` command.
    scripts=['src/rMETL/rMETL'],
    long_description = LONG_DESCRIPTION,
    zip_safe = False,
    install_requires = ['pysam', 'Biopython', 'Cigar']
)
31 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL_genotype.py:
--------------------------------------------------------------------------------
 1 | # * @author: Jiang Tao (tjiang@hit.edu.cn)
 2 | 
 3 | GL_TAG = ['1/0', '0/1', '1/1']
 4 | 
 5 | def simple_call_genotype(Nalt, Ntotal, P_heterozygous, P_homozygous):
 6 | 	bound_low = Ntotal * P_heterozygous
 7 | 	bound_up = Ntotal * P_homozygous
 8 | 	if Nalt < bound_low:
 9 |         # reliability = 0
10 | 		return GL_TAG[0], "%d:%d"%(Nalt, Ntotal - Nalt), 0
11 | 	elif bound_low <= Nalt and Nalt < bound_up:
12 |         # reliability = 1
13 | 		return GL_TAG[1], "%d:%d"%(Nalt, Ntotal - Nalt), 1
14 | 	else:
15 |         # reliability = 1
16 | 		return GL_TAG[2], "%d:%d"%(Nalt, Ntotal - Nalt), 1
17 | 
def simple_filter_genotype(Nalt, Ntotal, P_heterozygous):
    """Return 1 when Nalt reaches Ntotal * P_heterozygous, otherwise 0."""
    threshold = Ntotal * P_heterozygous
    return 0 if Nalt < threshold else 1
24 | 
def count_coverage(chr, s, e, f):
    """Count the records yielded by ``f.fetch(chr, s, e)``.

    ``f`` only needs a ``fetch(contig, start, end)`` method returning an
    iterable.
    """
    return sum(1 for _ in f.fetch(chr, s, e))


def add_genotype(info_list, file, low_bandary):
    """Append a local coverage count to every record of every group.

    info_list   -- list of groups; each group is a list of records laid out
                   as [type, contig, position, length, ...].
    file        -- object with a ``fetch(contig, start, end)`` method
                   (presumably a pysam alignment file; confirm with caller).
    low_bandary -- half-width of the window around an 'INS' position.

    For a group whose first record has type 'INS', coverage is counted once
    over [position - low_bandary, position + low_bandary) of that first
    record and appended to every record of the group.  For any other group
    each record gets its own count over [position, position + length).

    The records are modified in place and ``info_list`` itself is returned.
    """
    for group in info_list:
        if not group:
            # Nothing to annotate; avoids an IndexError on group[0].
            continue

        if group[0][0] == 'INS':
            contig = group[0][1]
            position = group[0][2]
            # Clamp so a window near the contig start never yields a negative
            # coordinate, which fetch() implementations reject.
            start = max(0, position - low_bandary)
            end = position + low_bandary
            locus_cov = count_coverage(contig, start, end, file)
            for record in group:
                record.append(locus_cov)
        else:
            for record in group:
                start = record[2]
                end = start + record[3]
                record.append(count_coverage(record[1], start, end, file))
    return info_list
51 | 
# Running this module directly does nothing; the guard is a placeholder.
if __name__ == '__main__':
	pass
54 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | ''' 
 4 |  * All rights Reserved, Designed By HIT-Bioinformatics   
 5 |  * @Description: Control the rMETL pipeline
 6 |  * @author: Jiang Tao (tjiang@hit.edu.cn)
 7 |  * @date: Apr 24 2018
 8 |  * @version V1.0.4   
 9 | '''
10 | 
11 | import argparse
12 | import rMETL.rMETL_extraction as rMETL_extraction
13 | import rMETL.rMETL_realign as rMETL_realign
14 | import rMETL.rMETL_MEIcalling as rMETL_MEIcalling
15 | from rMETL.rMETL_version import __version__, __author__, __contact__
16 | 
# Dispatch table: maps each STAGE name accepted on the command line to the
# run() entry point of the module that implements it.
STAGES = {'detection': rMETL_extraction.run, \
          'realignment': rMETL_realign.run, \
          'calling': rMETL_MEIcalling.run}
20 | 
21 | USAGE = '''\
22 |            _  ___  _   _____   _______   _
23 |      _ _  | ^_   _^ | |  ___| |__   __| | |
24 |     | ^_| | | | | | | | |__      | |    | |
25 |     | |   | | | | | | |  __|     | |    | |
26 |     | |   | | | | | | | |___     | |    | |___
27 |     |_|   |_| |_| |_| |_____|    |_|    |_____|
28 | 
29 |     rMETL - realignment-based Mobile Element insertion detection Tool for Long read
30 | 
31 |   STAGE is one of
32 |     detection    Inference of putative MEI loci.
33 |     realignment  Realignment of chimeric read parts.
34 |     calling      Mobile Element Insertion/Deletion calling.
35 |     
36 |   See README.md for documentation or --help for details
37 |   Strongly recommend making output directory manually at first.
38 |   
39 |   rMETL V%s
40 |   Author: %s
41 |   Contact: %s
42 | '''%(__version__, __author__, __contact__)
43 | 
def parseArgs():
    """Read STAGE plus its trailing options from the command line and run it.

    The first positional argument selects an entry of STAGES; everything after
    it is passed untouched, as a list, to that stage's entry point.
    """
    cli = argparse.ArgumentParser(
        prog='rMETL',
        description=USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument(
        'stage', metavar='STAGE', choices=STAGES.keys(), type=str,
        help='Stage to execute')
    cli.add_argument(
        'options', metavar='OPTIONS', nargs=argparse.REMAINDER,
        help='Options to pass to the stage')

    parsed = cli.parse_args()
    stage_entry = STAGES[parsed.stage]
    stage_entry(parsed.options)


if __name__ == '__main__':
    parseArgs()
56 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*-coding:utf-8-*-
 3 | 
 4 | import logging
 5 | import os
 6 | from Bio import SeqIO
 7 | from rMETL.rMETL_cmdRunner import exe
 8 | 
 9 | def load_ref(ref_g):
10 | 	logging.info("Loading reference genome...")
11 | 	return SeqIO.to_dict(SeqIO.parse(ref_g, "fasta"))
12 | 
def check_bai(file, tempdir):
    """Make sure an indexed BAM is available for ``file``.

    If ``file + ".bai"`` exists nothing is done and "" is returned.
    Otherwise ``file`` is sorted into ``<file without extension>.sorted.bam``
    with ``samtools sort`` (``tempdir`` is passed as the ``-T`` prefix), the
    result is indexed with ``samtools index`` and its path is returned.

    If either samtools command returns a non-zero code, the return code and
    captured output are logged and the process exits with that code.
    """
    import sys  # local import: this module does not import sys at top level

    if os.path.exists(file + ".bai"):
        logging.info("The bam file is legal.")
        return ""

    logging.info("The bam.bai is missed.")
    # splitext instead of file[:-3]: the slice only works for 3-letter
    # extensions and would eat part of any other file name.
    bam_path = os.path.splitext(file)[0] + ".sorted.bam"
    steps = (
        ("sort", "samtools sort -@ 4 -O bam -T %s -o %s %s" % (tempdir,
            bam_path, file)),
        ("index", "samtools index %s" % (bam_path)),
    )
    for step, cmd in steps:
        logging.info("Running Samtools %s...", step)
        r, o, e = exe(cmd)
        if r != 0:
            logging.error("Samtools %s failed!", step)
            logging.error("RETCODE %d", r)
            logging.error("STDOUT %s", o)
            logging.error("STDERR %s", e)
            logging.error("Exiting")
            # sys.exit rather than the site-provided exit() helper.
            sys.exit(r)
        logging.info("Finished Samtools %s.", step)
    return bam_path
48 | 
def call_ngmlr(inFile, ref, presets, nproc, outFile):
    """Run ngmlr to align ``inFile`` against ``ref``.

    inFile  -- query reads passed to ``-q``.
    ref     -- reference passed to ``-r``.
    presets -- value for ``-x``.
    nproc   -- thread count for ``-t`` (formatted as an integer).
    outFile -- output prefix; "map.sam" is appended to it verbatim, so a
               directory prefix needs its trailing separator.

    Returns the path of the SAM file.  On a non-zero return code the details
    are logged and the process exits with that code.
    """
    import sys  # local import: this module does not import sys at top level

    outFile = outFile + "map.sam"
    logging.info("Running NGMLR...")
    cmd = ("ngmlr -r %s -q %s -o %s -t %d -x %s" % (ref, inFile, outFile,
        nproc, presets))
    r, o, e = exe(cmd)

    if r != 0:
        logging.error("NGMLR mapping failed!")
        logging.error("RETCODE %d", r)
        logging.error("STDOUT %s", o)
        logging.error("STDERR %s", e)
        logging.error("Exiting")
        # sys.exit rather than the site-provided exit() helper.
        sys.exit(r)
    logging.info("Finished NGMLR mapping.")
    return outFile
68 | 
def call_samtools(file, tempdir):
    """Convert ``file`` to a sorted, indexed BAM and return the BAM path.

    The command ``samtools view -Sb | samtools sort`` is run with ``tempdir``
    as the ``-T`` prefix, writing to ``<file without extension>.bam``, and is
    followed by ``samtools index``.

    If either command returns a non-zero code, the return code and captured
    output are logged and the process exits with that code.
    """
    import sys  # local import: this module does not import sys at top level

    # splitext instead of file[:-3]: the slice assumes a 3-letter extension.
    stem = os.path.splitext(file)[0]
    bam_path = stem + ".bam"
    if bam_path == file:
        # The shell redirect would truncate the input while it is being read.
        bam_path = stem + ".sorted.bam"

    steps = (
        ("sort", "samtools view -Sb %s | samtools sort -@ 4 -O bam -T %s - > %s" %
            (file, tempdir, bam_path)),
        ("index", "samtools index %s" % (bam_path)),
    )
    for step, cmd in steps:
        logging.info("Running Samtools %s...", step)
        r, o, e = exe(cmd)
        if r != 0:
            logging.error("Samtools %s failed!", step)
            logging.error("RETCODE %d", r)
            logging.error("STDOUT %s", o)
            logging.error("STDERR %s", e)
            logging.error("Exiting")
            # sys.exit rather than the site-provided exit() helper.
            sys.exit(r)
        logging.info("Finished Samtools %s.", step)
    return bam_path
99 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL_concensus.py:
--------------------------------------------------------------------------------
  1 | # * @author: Jiang Tao (tjiang@hit.edu.cn)
  2 | 
  3 | from collections import Counter
  4 | 
  5 | def acquire_count_max(_list_):
  6 | 	c = Counter(_list_)
  7 | 	return c.most_common(1)[0]
  8 | 	# this is a tuple
  9 | 
 10 | def construct_concensus_info(Ins_list, Clip_list, evidence_read, SV_size):
 11 | 	total_count = len(Ins_list) + len(Clip_list)
 12 | 	if total_count < evidence_read:
 13 | 		return 0
 14 | 	breakpoint = list()
 15 | 	insert_size = list()
 16 | 	boundary = list()
 17 | 	for i in Ins_list:
 18 | 		breakpoint.append(i[0])
 19 | 		insert_size.append(i[1])
 20 | 	for i in Clip_list:
 21 | 		if i[2] == 1:
 22 | 			breakpoint.append(i[0])
 23 | 
 24 | 	# ==============method_1=====================
 25 | 	Prob_pos_1 = Counter(breakpoint).most_common(1)[0][0]
 26 | 	# ==============method_2=====================
 27 | 	Prob_pos_2 = sum(breakpoint)/len(breakpoint)
 28 | 	Average_size = int(sum(insert_size)/len(insert_size))
 29 | 	if Average_size < SV_size:
 30 | 		return 0
 31 | 
 32 | 	local_info = list()
 33 | 	local_name = [Prob_pos_2, Average_size]
 34 | 	local_id = 0
 35 | 	for i in Ins_list:
 36 | 		info = local_name + [str(local_id), i[2]]
 37 | 		local_id += 1
 38 | 		local_info.append(info)
 39 | 	for i in Clip_list:
 40 | 		info = local_name + [str(local_id), i[1]]
 41 | 		local_id += 1
 42 | 		local_info.append(info)
 43 | 
 44 | 	return local_info
 45 | 
 46 | 
 47 | def construct_concensus_seq(Ins_list, Clip_list):
 48 | 	'''
 49 | 	Ins_list: 	start position on reference genome
 50 | 				Insertion size
 51 | 				Insertion sequence
 52 | 	Clip_list:	clip position on reference genome
 53 | 				clip sequence
 54 | 				clip type(0 for left and 1 for right)
 55 | 	'''
 56 | 	breakpoint = list()
 57 | 	insert_size = list()
 58 | 	for i in Ins_list:
 59 | 		breakpoint.append(i[0])
 60 | 		insert_size.append(i[1])
 61 | 	for i in Clip_list:
 62 | 		if i[2] == 1:
 63 | 			breakpoint.append(i[0])
 64 | 
 65 | 	# ==============method_1=====================
 66 | 	Prob_pos_1 = Counter(breakpoint).most_common(1)[0][0]
 67 | 	# ==============method_2=====================
 68 | 	Prob_pos_2 = sum(breakpoint)/len(breakpoint)
 69 | 	Max_size = max(insert_size)
 70 | 	Min_size = min(insert_size)
 71 | 	Average_size = int(sum(insert_size)/len(insert_size))
 72 | 
 73 | 	Seq = dict()
 74 | 	for i in Ins_list:
 75 | 		for j in xrange(i[1]):
 76 | 			pos = i[0] + j
 77 | 			ch = i[2][j]
 78 | 			if pos not in Seq:
 79 | 				Seq[pos] = list()
 80 | 			Seq[pos].append(ch)
 81 | 
 82 | 	for i in Clip_list:
 83 | 		if Average_size <= len(i[1]):
 84 | 			boundary = Average_size
 85 | 		else:
 86 | 			boundary = len(i[1])
 87 | 
 88 | 		if i[2] == 0:
 89 | 			local_clip_seq = i[1][len(i[1])-boundary:]
 90 | 			for j in xrange(boundary):
 91 | 				pos = i[0] + j
 92 | 				ch = local_clip_seq[j]
 93 | 				if pos not in Seq:
 94 | 					Seq[pos] = list()
 95 | 				Seq[pos].append(ch)
 96 | 		else:
 97 | 			for j in xrange(boundary):
 98 | 				pos = i[0] + j
 99 | 				ch = i[1][j]
100 | 				if pos not in Seq:
101 | 					Seq[pos] = list()
102 | 				Seq[pos].append(ch)
103 | 	Seq_trans = list()
104 | 	for key in Seq:
105 | 		if len(Seq[key]) < 5:
106 | 			continue
107 | 		Seq_trans.append([key, acquire_count_max(Seq[key])[0]])
108 | 	Seq_trans = sorted(Seq_trans, key = lambda x:x[0])
109 | 	final_consensus = str()
110 | 	for i in Seq_trans:
111 | 		if i[0] < Prob_pos_1:
112 | 			continue
113 | 		final_consensus += i[1]
114 | 		if len(final_consensus) > Average_size:
115 | 			break
116 | 	return final_consensus, Prob_pos_1
117 | 	


--------------------------------------------------------------------------------
/src/rMETL/rMETL_realign.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | ''' 
 4 |  * All rights Reserved, Designed By HIT-Bioinformatics   
 5 |  * @Description: Classify the ME types
 6 |   * @author: Jiang Tao (tjiang@hit.edu.cn)
 7 |  * @date: Apr 24 2018
 8 |  * @version V1.0.4
 9 | '''
10 | 
11 | import argparse
12 | import logging
13 | import sys
14 | import time
15 | 
16 | from rMETL.rMETL_version import __version__, __author__, __contact__
17 | from rMETL.rMETL_cmdRunner import setupLogging, exe
18 | 
19 | USAGE="""\
20 |            _  ___  _   _____   _______   _
21 |      _ _  | ^_   _^ | |  ___| |__   __| | |
22 |     | ^_| | | | | | | | |__      | |    | |
23 |     | |   | | | | | | |  __|     | |    | |
24 |     | |   | | | | | | | |___     | |    | |___
25 |     |_|   |_| |_| |_| |_____|    |_|    |_____|
26 | 
27 |     rMETL - realignment-based Mobile Element insertion detection Tool for Long read
28 | 
29 | 	Realignment of chimeric read parts.
30 | 
31 | 	Aligner: NGMLR version 0.2.6
32 | 	TE refs: Alu concensus
33 | 		 L1 concensus
34 | 		 SVA concensus
35 | 	The output is a sam format file called 'cluster.sam'.
36 | 
37 | 	rMETL V%s
38 | 	Author: %s
39 | 	Contact: %s
40 | """%(__version__, __author__, __contact__)
41 | 
42 | # **************************Call-NGMLR********************************
def call_ngmlr(inFile, ref, presets, nproc, outFile, SUBREAD_LENGTH, SUBREAD_CORRIDOR):
    """
    Run ngmlr to realign ``inFile`` against ``ref``.

    inFile           -- query file passed to ``-q``.
    ref              -- reference passed to ``-r``.
    presets          -- value for ``-x``.
    nproc            -- thread count for ``-t``.
    outFile          -- output prefix; "cluster.sam" is appended verbatim, so
                        a directory prefix needs its trailing separator.
    SUBREAD_LENGTH   -- value for ``--subread-length``.
    SUBREAD_CORRIDOR -- value for ``--subread-corridor``.

    Returns the path of the SAM file.  On a non-zero return code the details
    are logged and the process exits with that code.
    """
    outFile = outFile + "cluster.sam"
    logging.info("Running NGMLR...")
    cmd = ("ngmlr -r %s -q %s -o %s -t %d -x %s --subread-length %d --subread-corridor %d"
        % (ref, inFile, outFile, nproc, presets, SUBREAD_LENGTH, SUBREAD_CORRIDOR))
    r, o, e = exe(cmd)
    if r != 0:
        logging.error("NGMLR mapping failed!")
        logging.error("RETCODE %d", r)
        logging.error("STDOUT %s", o)
        logging.error("STDERR %s", e)
        logging.error("Exiting")
        # sys.exit rather than the site-provided exit() helper.
        sys.exit(r)
    logging.info("Finished NGMLR mapping.")
    return outFile
62 | # **************************Call-NGMLR********************************
63 | # 
64 | # 
65 | # 
66 | # ************************MAIN_FUNCTION*******************************
def parseArgs(argv):
    """Parse the realignment-stage options in ``argv`` and return the
    resulting argparse namespace."""
    cli = argparse.ArgumentParser(
        prog="rMETL realignment",
        description=USAGE,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Positional arguments.
    cli.add_argument(
        "input", metavar="FASTA", type=str,
        help="Input potential_ME.fa on STAGE detection.")
    cli.add_argument(
        "ME_Ref", type=str,
        help="The transposable element concensus in fasta format.")
    cli.add_argument(
        'output', type=str,
        help="Directory to output realignments.")

    # Optional tuning parameters.
    cli.add_argument(
        '-t', '--threads', default=8, type=int,
        help="Number of threads to use.[%(default)s]")
    cli.add_argument(
        '-x', '--presets', default="pacbio", type=str,
        help="The sequencing platform <pacbio,ont> of the reads.[%(default)s]")
    cli.add_argument(
        '--subread_length', default=128, type=int,
        help="Length of fragments reads are split into [%(default)s]")
    cli.add_argument(
        '--subread_corridor', default=20, type=int,
        help="Length of corridor sub-reads are aligned with [%(default)s]")

    return cli.parse_args(argv)
86 | 
def run(argv):
    """Entry point of the realignment stage.

    ``argv`` is the list of stage options (without the program name).  It is
    parsed, logging is set up at INFO level, ngmlr is run and the elapsed
    time is logged.
    """
    args = parseArgs(argv)
    setupLogging(False)
    starttime = time.time()
    call_ngmlr(args.input, args.ME_Ref, args.presets, args.threads, args.output,
        args.subread_length, args.subread_corridor)
    logging.info("Finished in %0.2f seconds."%(time.time() - starttime))


if __name__ == '__main__':
    # Pass everything after the script name.  The previous slice, argv[:1],
    # passed only the script name, so the required positionals were always
    # reported as missing.
    run(sys.argv[1:])
97 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 		        __        __   ______   _________   _
  2 | 		 _ __  |  \      /  | |  ____| |___   ___| | |
  3 | 		| ^__| |   \    /   | | |___       | |     | |
  4 | 		| |    | |\ \  / /| | |  ___|      | |     | |
  5 | 		| |    | | \ \/ / | | | |____      | |     | |____
  6 | 		|_|    |_|  \__/  |_| |______|     |_|     |______|
  7 |      
  8 | 	rMETL - realignment-based Mobile Element insertion detection Tool for Long read
  9 |  **NOTE: Community users have contributed an up-to-date installation procedure (2023 onward); see** [here](https://github.com/tjiangHIT/rMETL/issues/8).
 10 | 
 11 | [![PyPI version](https://badge.fury.io/py/rMETL.svg)](https://badge.fury.io/py/rMETL)
 12 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/version.svg)](https://anaconda.org/bioconda/rmetl)
 13 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/license.svg)](https://anaconda.org/bioconda/rmetl)
 14 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/platforms.svg)](https://anaconda.org/bioconda/rmetl)
 15 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/latest_release_date.svg)](https://anaconda.org/bioconda/rmetl)
 16 | 
 17 | ---	
 18 | ### Introduction
 19 | Mobile element insertion (MEI) is a significant category of structure variations (SVs). The rapid development of long-read sequencing technologies provides the opportunity to detect MEIs sensitively. However, the signals of MEI implied by noisy long reads are highly complex due to the repetitiveness of mobile elements and the high sequencing error rates. Herein, we propose the Realignment-based Mobile Element insertion detection Tool for Long read (rMETL). Benchmarking results of simulated and real datasets demonstrate that rMETL has the ability to discover MEIs sensitively as well as prevent false positives. It is suited to produce high-quality MEI callsets in many genomics studies.
 20 | 
 21 | ---
 22 | ### Simulated datasets
 23 | 
 24 | The simulated datasets used for benchmarking are available at [Google Drive](https://drive.google.com/open?id=1ujV2C8e1PNAVhSkh9vKtjWLdG_OHcH-k)
 25 | 
 26 | ---
 27 | ### Memory usage
 28 | 
 29 | The memory usage of rMETL can fit the configurations of most modern servers and workstations.
 30 | Its peak memory footprint is about 7.05 gigabytes (default settings) on a server with an Intel Xeon CPU at 2.00 GHz and 1 terabyte of RAM, running Ubuntu Linux 14.04. The reads used for this measurement were aligned to the human reference genome hs37d5.
 31 | 
 32 | ---
 33 | ### Dependencies
 34 | 	
 35 | 	1. pysam
 36 | 	2. Biopython
 37 | 	3. ngmlr
 38 | 	4. samtools
 39 | 	5. cigar
 40 | 
 41 | 	Python version 2.7
 42 | 
 43 | ---
 44 | ### Installation
 45 | 
 46 | 	#install via pip
 47 | 	$ pip install rMETL
 48 | 	
 49 | 	#install via conda
 50 | 	$ conda install -c bioconda rmetl
 51 | 
 52 | 	#install from GitHub
 53 | 	$ git clone https://github.com/tjiangHIT/rMETL.git (git clone https://github.com/hitbc/rMETL.git)
 54 | 	$ cd rMETL/
 55 | 	$ pip install .
 56 | 
 57 | The current version of rMETL has been tested on a 64-bit Linux operating system.
 58 | 
 59 | **NOTE: Community users have contributed an up-to-date installation procedure (2023 onward); see** [here](https://github.com/tjiangHIT/rMETL/issues/8).
 60 | 
 61 | ---
 62 | ### Synopsis
 63 | Inference of putative MEI loci.
 64 | 
 65 | 	rMETL detection <alignments> <reference> <temp_dir> <output>
 66 | 
 67 | Realignment of chimeric read parts.
 68 | 
 69 | 	rMETL realignment <FASTA> <MEREF> <output>
 70 | 
 71 | Mobile Element Insertion calling.
 72 | 
 73 | 	rMETL calling <SAM> <reference> <out_type> <output>
 74 | 	
 75 | It is strongly recommended to create the output directory manually beforehand. :blush:
 76 | 
 77 | ---
 78 | ### Optional Parameters
 79 | 
 80 | #### Detection
 81 | 
 82 | | Parameters | Descriptions | Defaults |
 83 | | :------------ |:---------------|:---------------|
 84 | | MIN_SUPPORT   |Minimum number of reads that support a ME.| 5 |
 85 | | MIN_LENGTH    | Minimum length of ME to be reported.        |50|
 86 | | MIN_DISTANCE  | Minimum distance of two ME clusters. |20|
 87 | | THREADS       |Number of threads to use.|1|
 88 | | PRESETS       |The sequencing type <pacbio,ont> of the reads.|pacbio|
 89 | 
 90 | #### Realignment
 91 | 
 92 | | Parameters | Descriptions | Defaults |
 93 | | :------------ |:---------------|:---------------|
 94 | | THREADS       |Number of threads to use.|8|
 95 | | PRESETS       |The sequencing type <pacbio,ont> of the reads.|pacbio|
 96 | | SUBREAD_LENGTH       |Length of fragments reads are split into.|128|
 97 | | SUBREAD_CORRIDOR       |Length of corridor sub-reads are aligned with.|20|
 98 | 
 99 | #### Calling
100 | 
101 | | Parameters | Descriptions | Defaults |
102 | | :------------ |:---------------|:---------------|
103 | | HOMOZYGOUS       |The minimum score for a genotype to be reported as homozygous.|0.8|
104 | | HETEROZYGOUS       |The minimum score for a genotype to be reported as heterozygous.|0.3|
105 | | MIN_MAPQ       |Minimum mapping quality.|20|
106 | | CLIPPING_THRESHOLD  |Minimum threshold of realignment clipping.|0.5|
107 | | SAMPLE       |The name of the sample which is noted.|None|
108 | | MEI       |Enables rMETL to display MEI/MED only.|False|
109 | 
110 | ---
111 | ### Citation
112 | If you use rMETL, please cite:
113 | > Tao Jiang *et al*; rMETL: sensitive mobile element insertion detection with long read realignment, *Bioinformatics*, Volume 35, Issue 18, 15 September 2019, Pages 3484–3486, https://doi.org/10.1093/bioinformatics/btz106
114 | 
115 | ---
116 | ### Contact
117 | For advice, bug reports, or help, please post on [Github Issue](https://github.com/tjiangHIT/rMETL/issues) or contact tjiang@hit.edu.cn.
118 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL_cmdRunner.py:
--------------------------------------------------------------------------------
  1 | from string import Template
  2 | import tempfile
  3 | import subprocess, signal, logging, os, stat, sys
  4 | 
  5 | class Alarm(Exception):
  6 |     pass
  7 | 
  8 | def alarm_handler(signum, frame):
  9 |     raise Alarm
 10 |     
 11 | def setupLogging(debug=False):
 12 |     logLevel = logging.DEBUG if debug else logging.INFO
 13 |     logFormat = "%(asctime)s [%(levelname)s] %(message)s"
 14 |     logging.basicConfig( stream=sys.stderr, level=logLevel, format=logFormat )
 15 |     logging.info("Running %s" % " ".join(sys.argv))
 16 | 
 17 | def exe(cmd, timeout=-1):
 18 |     """
 19 |     Executes a command through the shell.
 20 |     timeout in minutes! so 1440 mean is 24 hours.
 21 |     -1 means never
 22 |     """
 23 |     proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, \
 24 |                             stderr=subprocess.STDOUT, close_fds=True,\
 25 |                             preexec_fn=os.setsid)
 26 |     signal.signal(signal.SIGALRM, alarm_handler)
 27 |     if timeout > 0:
 28 |         signal.alarm(int(timeout*60))  
 29 |     try:
 30 |         stdoutVal, stderrVal =  proc.communicate()
 31 |         signal.alarm(0)  # reset the alarm
 32 |     except Alarm:
 33 |         logging.error(("Command was taking too long. "
 34 |                        "Automatic Timeout Initiated after %d" % (timeout)))
 35 |         os.killpg(proc.pid, signal.SIGTERM)
 36 |         proc.kill()
 37 |         return 214,None,None
 38 |     
 39 |     retCode = proc.returncode
 40 |     return retCode,stdoutVal,stderrVal
 41 | 
 42 | class Command():
 43 |     def __init__(self, cmd, jobname, stdout, stderr):
 44 |         self.cmd = cmd
 45 |         self.jobname = jobname
 46 |         self.stdout = stdout
 47 |         self.stderr = stderr
 48 |     
 49 |     def asDict(self):
 50 |         return {"CMD":self.cmd, "JOBNAME":self.jobname, \
 51 |                 "STDOUT":self.stdout, "STDERR":self.stderr}
 52 |     
 53 | class CommandRunner():
 54 |     """
 55 |     Uses a command template to run stuff. This is helpful for cluster commands
 56 |     and chunking several commands together
 57 |     """
 58 |     def __init__(self, template=None, njobs=0):
 59 |         """
 60 |         template: a string that will become the template for submitting to your cluster:
 61 |             #you can also go ahead and specify a string.Template
 62 |             default is to not submit to your cluster
 63 |             ${CMD} > ${STDOUT} 2> ${STDERR}
 64 |         njobs: (0)
 65 |             for clumping commands together and submitting them in a script
 66 |         """
 67 |         if template is None:
 68 |             template = "${CMD} > ${STDOUT} 2> ${STDERR}"
 69 |             self.runType = "Running"
 70 |         else:
 71 |             self.runType = "Submitting"
 72 |         self.template = Template(template)
 73 |         self.njobs = njobs
 74 |     
 75 |     def __call__(self, cmds, wDir = None, id = None):
 76 |         """
 77 |         Executes Commands - can either be a list or a single Command
 78 |         wDir is the working directory where chunk scripts will be written
 79 |         if id is None a random identifier will be applied when chunking
 80 |         """
 81 |         if wDir is None:
 82 |             wDir = "./"
 83 |         
 84 |         if type(cmds) != list:
 85 |             cmd = self.buildCommand(cmds)
 86 |             return exe(cmd)
 87 |         
 88 |         if self.njobs == 0:
 89 |             outRet = []
 90 |             for c in cmds:
 91 |                 outRet.append(exe(self.buildCommand(c)))
 92 |             return outRet
 93 |         
 94 |         if id is None:
 95 |             id = tempfile.mkstemp(dir=wDir)[1]
 96 |         
 97 |         outputRet =[]
 98 |         for chunk, commands in enumerate( partition(cmds, self.njobs) ):
 99 |             outScript = open(os.path.join(wDir, "%s_chunk%d.sh" % (id, chunk)),'w')
100 |             outScript.write("#!/bin/bash\n\n")
101 |             for c in commands:
102 |                 outScript.write(c.cmd+"\n")
103 |             outScript.close()
104 |             #Add executeable 
105 |             existing_permissions = stat.S_IMODE(os.stat(outScript.name).st_mode)
106 |             if not os.access(outScript.name, os.X_OK):
107 |                 new_permissions = existing_permissions | stat.S_IXUSR
108 |                 os.chmod(outScript.name, new_permissions)
109 |                 
110 |             submit = Command(outScript.name, \
111 |                             id + "_chunk%d" % chunk, \
112 |                             os.path.join(wDir, id + ("_chunk%d.out" % chunk)), \
113 |                             os.path.join(wDir, id + ("_chunk%d.err" % chunk)))
114 |             cmd = self.buildCommand(submit)
115 |             outputRet.append(exe(cmd))
116 |             
117 |         return outputRet
118 |         
119 |     def checkTemplate(self):
120 |         """
121 |         Checks that my template works okay
122 |         """
123 |         temp.update({"CMD":"test", \
124 |                      "STDOUT":"testo", \
125 |                      "STDERR":"teste", \
126 |                      "JOBNAME":"testn"})
127 |         try:
128 |             w = self.template.substitute(temp)
129 |         except KeyError:
130 |             logging.error("Your submission template is invalid ")
131 |             sys.exit(1)
132 | 
133 |     def buildCommand(self, cmdSetup):
134 |         """
135 |         substitutes a template with a Command
136 |         """
137 |         return self.template.substitute(cmdSetup.asDict())
138 | 
def partition(n, m):
    """
    Helper function. splits list n into m partitions

    Items are dealt round-robin: item k goes to partition k % m.  Partitions
    that stay empty are dropped, so fewer than m lists may be returned.
    Always returns a list of lists (the earlier map()/filter() version only
    worked where those built-ins return lists).

    Raises ValueError when there are items to distribute but m < 1.
    """
    items = list(n)
    if items and m < 1:
        raise ValueError("cannot split items into %d partitions" % m)

    buckets = [[] for _ in range(m)]
    for position, item in enumerate(items):
        buckets[position % m].append(item)
    return [bucket for bucket in buckets if bucket]
152 | 


--------------------------------------------------------------------------------
/Concensus/super_TE.fa:
--------------------------------------------------------------------------------
1 | >Alu
2 | GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAACACGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTC
3 | >L1
4 | GGGGGAGGAGCCAAGATGGCCGAATAGGAACAGCTCCGGTCTACAGCTCCCAGCGTGAGCGACGCAGAAGACGGTGATTTCTGCATTTCCATCTGAGGTACCGGGTTCATCTCACTAGGGAGTGCCAGACAGTGGGCGCAGGCCAGTGTGTGTGCGCACCGTGCGCGAGCCGAAGCAGGGCGAGGCATTGCCTCACCTGGGAAGCGCAAGGGGTCAGGGAGTTCCCTTTCTGAGTCAAAGAAAGGGGTGACGGTCGCACCTGGAAAATCGGGTCACTCCCACCCGAATATTGCGCTTTTCAGACCGGCTTAAGAAACGGCGCACCACGAGACTATATCCCACACCTGGCTCGGAGGGTCCTACGCCCACGGAATCTCGCTGATTGCTAGCACAGCAGTCTGAGATCAAACTGCAAGGCGGCAACGAGGCTGGGGGAGGGGCGCCCGCCATTGCCCAGGCTTGCTTAGGTAAACAAAGCAGCCGGGAAGCTCGAACTGGGTGGAGCCCACCACAGCTCAAGGAGGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGGGCAGGGCACAGACAAACAAAAAGACAGCAGTAACCTCTGCAGACTTAAGTGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCCAGCACGCAGCTGGAGATCTGAGAACGGGCAGACAGACTGCCTCCTCAAGTGGGTCCCTGACTCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGGTCCTGTCTGTTAGAAGGAAAACTAACAACCAGAAAGGACATCTACACCGAAAACCCATCTGTACATCACCATCATCAAAGACCAAAAGTAGATAAAACCACAAAGATGGGGAAAAAACAGAACAGAAAAACTGGAAACTCTAAAACGCAGAGCGCCTCTCCTCCTCCAAAGGAACGCAGTTCCTCACCAGCAACGGAACAAAGCTGGATGGAGAATGATTTTGACGAGCTGAGAGAAGAAGGCTTCAGACGATCAAATTACTCTGAGCTACGGGAGGACATTCAAACCAAAGGCAAAGAAGTTGAAAACTTTGAAAAAAATTTAGAAGAATGTATAACTAGAATAACCAATACAGAGAAGTGCTTAAAGGAGCTGATGGAGCTGAAAACCAAGGCTCGAGAACTACGTGAAGAATGCAGAAGCCTCAGGAGCCGATGCGATCAACTGGAAGAAAGGGTATCAGCAATGGAAGATGAAATGAATGAAATGAAGCGAGAAGGGAAGTTTAGAGAAAAAAGAATAAAAAGAAATGAGCAAAGCCTCCAAGAAATATGGGACTATGTGAAAAGACCAAATCTACGTCTGATTGGTGTACCTGAAAGTGATGTGGAGAATGGAACCAAGTTGGAAAACACTCTGCAGGATATTATCCAGGAGAACTTCCCCAATCTAGCAAGGCAGGCCAACGTTCAGATTCAGGAAATACAGAGAACGCCACAAAGATACTCCTCGAGAAGAGCAACTCCAAGACACATAATTGTCAGATTCACCAAAGTTGAAATGAAGGAAAAAATGTTAAGGGCAGCCAGAGAGAAAGGTCGGGTTACCCTCAAAGGAAAGCCCATCAGACTAACAGTGGATCTCTCGGCAGAAACCCTACAAGCCAGAAGAGAGTGGGGGCCAATATTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAACTAAGCTTCATAAGTGAAGGAGAAATAAAATACTTTATAGACAAGCAAATGTTGAGAGATTTTGTCACCACCAGGCCTGCCCTAAAAGAGCTCCTGAAGGAAGCGCTAAACATGGAAAGGAACAACCGGTACCAGCCGCTGCAAAATCATGCCAAAATGTAAAGACCATCGAGACTAGGAAGAAACTGCATCAACTAATGAGCAAAATCACCAGCTAACATCATAATGACAG
GATCAAATTCACACATAACAATATTAACTTTAAATATAAATGGACTAAATTCTGCAATTAAAAGACACAGACTGGCAAGTTGGATAAAGAGTCAAGACCCATCAGTGTGCTGTATTCAGGAAACCCATCTCACGTGCAGAGACACACATAGGCTCAAAATAAAAGGATGGAGGAAGATCTACCAAGCCAATGGAAAACAAAAAAAGGCAGGGGTTGCAATCCTAGTCTCTGATAAAACAGACTTTAAACCAACAAAGATCAAAAGAGACAAAGAAGGCCATTACATAATGGTAAAGGGATCAATTCAACAAGAGGAGCTAACTATCCTAAATATTTATGCACCCAATACAGGAGCACCCAGATTCATAAAGCAAGTCCTCAGTGACCTACAAAGAGACTTAGACTCCCACACATTAATAATGGGAGACTTTAACACCCCACTGTCAACATTAGACAGATCAACGAGACAGAAAGTCAACAAGGATACCCAGGAATTGAACTCAGCTCTGCACCAAGCAGACCTAATAGACATCTACAGAACTCTCCACCCCAAATCAACAGAATATACCTTTTTTTCAGCACCACACCACACCTATTCCAAAATTGACCACATAGTTGGAAGTAAAGCTCTCCTCAGCAAATGTAAAAGAACAGAAATTATAACAAACTATCTCTCAGACCACAGTGCAATCAAACTAGAACTCAGGATTAAGAATCTCACTCAAAGCCGCTCAACTACATGGAAACTGAACAACCTGCTCCTGAATGACTACTGGGTACATAACGAAATGAAGGCAGAAATAAAGATGTTCTTTGAAACCAACGAGAACAAAGACACCACATACCAGAATCTCTGGGACGCATTCAAAGCAGTGTGTAGAGGGAAATTTATAGCACTAAATGCCTACAAGAGAAAGCAGGAAAGATCCAAAATTGACACCCTAACATCACAATTAAAAGAACTAGAAAAGCAAGAGCAAACACATTCAAAAGCTAGCAGAAGGCAAGAAATAACTAAAATCAGAGCAGAACTGAAGGAAATAGAGACACAAAAAACCCTTCAAAAAATCAATGAATCCAGGAGCTGGTTTTTTGAAAGGATCAACAAAATTGATAGACCGCTAGCAAGACTAATAAAGAAAAAAAGAGAGAAGAATCAAATAGACACAATAAAAAATGATAAAGGGGATATCACCACCGATCCCACAGAAATACAAACTACCATCAGAGAATACTACAAACACCTCTACGCAAATAAACTAGAAAATCTAGAAGAAATGGATACATTCCTCGACACATACACTCTCCCAAGACTAAACCAGGAAGAAGTTGAATCTCTGAATAGACCAATAACAGGCTCTGAAATTGTGGCAATAATCAATAGTTTACCAACCAAAAAGAGTCCAGGACCAGATGGATTCACAGCCGAATTCTACCAGAGGTACATGGAGGAACTGGTACCATTCCTTCTGAAACTATTCCAATCAATAGAAAAAGAGGGAATCCTCCCTAACTCATTTTATGAGGCCAGCATCATTCTGATACCAAAGCCGGGCAGAGACACAACCAAAAAAGAGAATTTTAGACCAATATCCTTGATGAACATTGATGCAAAAATCCTCAATAAAATACTGGCAAACCGAATCCAGCAGCACATCAAAAAGCTTATCCACCATGATCAAGTGGGCTTCATCCCTGGGATGCAAGGCTGGTTCAATATACGCAAATCAATAAATGTAATCCAGCATATAAACAGAGCCAAAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAAGCCTTTGACAAAATTCAACAACCCTTCATGCTAAAAACTCTCAATAAATTAGGTATTGATGGGACGTATTTCAAAATAATAAGAGCTATCTATGACAAACCCACAGCCAATATCATACTGAATGGGCAAAAACTGGAAGCATTCCCTTTGAAAACCGGCACAAGACAGGGATGCCCTCTCTCACCGCTCCTATTCAAC
ATAGTGTTGGAAGTTCTGGCCAGGGCAATCAGGCAGGAGAAGGAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCAAATTGTCCCTGTTTGCAGACGACATGATTGTATATCTAGAAAACCCCATCGTCTCAGCCCAAAATCTCCTTAAGCTGATAAGCAACTTCAGCAAAGTCTCAGGATACAAAATCAATGTACAAAAATCACAAGCATTCTTATACACCAACAACAGACAAACAGAGAGCCAAATCATGGGTGAACTCCCATTCGTAATTGCTTCAAAGAGAATAAAATACCTAGGAATCCAACTTACAAGGGATGTGAAGGACCTCTTCAAGGAGAACTACAAACCACTGCTCAAGGAAATAAAAGAGGACACAAACAAATGGAAGAACATTCCATGCTCATGGGTAGGAAGAATCAATATCGTGAAAATGGCCATACTGCCCAAGGTAATTTACAGATTCAATGCCATCCCCATCAAGCTACCAATGACTTTCTTCACAGAATTGGAAAAAACTACTTTAAAGTTCATATGGAACCAAAAAAGAGCCCGCATTGCCAAGTCAATCCTAAGCCAAAAGAACAAAGCTGGAGGCATCACACTACCTGACTTCAAACTATACTACAAGGCTACAGTAACCAAAACAGCATGGTACTGGTACCAAAACAGAGATATAGATCAATGGAACAGAACAGAGCCCTCAGAAATAATGCCGCATATCTACAACTATCTGATCTTTGACAAACCTGAGAAAAACAAGCAATGGGGAAAGGATTCCCTATTTAATAAATGGTGCTGGGAAAACTGGCTAGCCATATGTAGAAAGCTGAAACTGGATCCCTTCCTTACACCTTATACAAAAATCAATTCAAGATGGATTAAAGATTTAAACGTTAAACCTAAAACCATAAAAACCCTAGAAGAAAACCTAGGCATTACCATTCAGGACATAGGCGTGGGCAAGGACTTCATGTCCAAAACACCAAAAGCAATGGCAACAAAAGACAAAATTGACAAATGGGATCTAATTAAACTAAAGAGCTTCTGCACAGCAAAAGAAACTACCATCAGAGTGAACAGGCAACCTACAACATGGGAGAAAATTTTCGCAACCTACTCATCTGACAAAGGGCTAATATCCAGAATCTACAATGAACTTAAACAAATTTACAAGAAAAAAACAAACAACCCCATCAAAAAGTGGGCGAAGGACATGAACAGACACTTCTCAAAAGAAGACATTTATGCAGCCAAAAAACACATGAAGAAATGCTCATCATCACTGGCCATCAGAGAAATGCAAATCAAAACCACTATGAGATATCATCTCACACCAGTTAGAATGGCAATCATTAAAAAGTCAGGAAACAACAGGTGCTGGAGAGGATGCGGAGAAATAGGAACACTTTTACACTGTTGGTGGGACTGTAAACTAGTTCAACCATTGTGGAAGTCAGTGTGGCGATTCCTCAGGGATCTAGAACTAGAAATACCATTTGACCCAGCCATCCCATTACTGGGTATATACCCAAATGAGTATAAATCATGCTGCTATAAAGACACATGCACACGTATGTTTATTGCGGCACTATTCACAATAGCAAAGACTTGGAACCAACCCAAATGTCCAACAATGATAGACTGGATTAAGAAAATGTGGCACATATACACCATGGAATACTATGCAGCCATAAAAAATGATGAGTTCATATCCTTTGTAGGGACATGGATGAAATTGGAAACCATCATTCTCAGTAAACTATCGCAAGAACAAAAAACCAAACACCGCATATTCTCACTCATAGGTGGGAATTGAACAATGAGATCACATGGACACAGGAAGGGGAATATCACACTCTGGGGACTGTGGTGGGGTCGGGGGAGGGGGGAGGGATAGCATTGGGAGATATACCTAATGCTAGATGACACATTAGTGGGTGCAGCGCACCAGCATGGCACATGTATACATATGTAACTAACCTGCACAATGTGCAC
ATGTACCCTAAAACTTAGAGTAT
5 | >SVA
6 | CTCCCTCTCCCTCACCCTCTCCCCATGGTCTCCCTCTCCCTCTCTTTCCACGGTCTCCCTCTGATGCCGAGCCGAAGCTGGACGGTACTGCTGCCATCTCGGCTCACTGCAACCTCCCTGCCTGATTCTCCTGCCTCAGCTTGCCGAGTGCCTGCGATTGCAGGCGCGCGCCGCCACGCCTGACTGGTTTTCGTATTTTGTTAGTGGAGACGGGGTTTCGCTGTGTTGGCCGGGCTGGTCTCCAGCTCCTAACCGCGAGTGATCCACCAGCCTCGGCCTCCCGAGGTGCTGGGATTGCAGACGGAGTCTCGTTCACTCAGTGCTCAATGATGCCCAGGCTGGAGTGCAGTGGCGTGATCTCGGCTCGCTACAACCTCCACCTCCCAGCAGCCTGCCTTGGCCTCCCAAAGTGCCGAGATTGCAGCCTCTGCCCGGCCGCCACCCCGTCTGGGAAGTGAGGAGTGTCTCCGCCTGGCCACCCATCGTCTGGGATGTGAGGAGCGTCTCTGCCCTGCCGCCCATCGTCTGAGATGTGGGGAGCACCTCTGCCCGGCCGCCCCGTCCGGGATGTGAGGAGCGTCGCTGCCCGGCCGCCCCGTCTGAGAAGTGAGGAGACCCTCTGCCTGGCAACCGCTCCATCTGAGAAGTGAGGAGCCCCTCCGCCCGGCAGCCGCCCTGTCTGAGAAGTGAGGAGCCCCTCCGCCCAGCAGCCACCTGGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCGCCCCGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCAGCCCGGCCAGCCGCCCCGTCCGGGAAGTGAGGGGCGCCTCTGCCCGGCCGCCCCTACTGGGAAGTGAGGAGCCACTTTGCCCGGCCAGCCACTCTGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCGCCCCGTCCGGGAGGGAGGTGGGGGGATCAGCCCCCCGCCCAGCCAGCCGCCCCGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCGCCCTGTCCGGGAGGTGAGGGGCGCCTCTGCCCGGCCGCGCCTACTGGAAAGTGAGGAGCCCCTCTGCCCGGCCACCACCCCGTCTGGGAGGTGTGCCCAACAGCTCATTGAGAAGGGGCCATGATGACAATGGCGGTTTTGTGGAATAGAAAGGGGGGAAAGGTGGGGAAAAGATTGAGAAATCGGATGGTTGCCGTGTCTGTGTAGAAAGAGGTAGACCTGGGAGACTTTTCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGGGATCCTGTTGATCGGTGACCTTACCCCCAACCCTGTGCTCTCTGAAACATGTGCTGTATCCACTCAGGGTTGAATGGATTAAGAGCGGTGCAAGATGTGCTTTGTTAAACAGATGCTTGAAGGCAGCATGCTCCTTAAGAGTCATCACCACTCCCTAATCTCAAGTACCCAGGGACACAAACACTGCGGAAGGCCGCAGGGTCCTCTGCCTAGGAAAACCAGAGACCTTTGTTCACTTGTTTATCTGCTGACCTTCCCTCCACTATTGTCCTGTGACCCTGCCAAATCCCCCTCTGTGAGAAACACCCAAGAATGATCAAT
7 | 
8 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL_MEIcalling.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | ''' 
  4 |  * All rights Reserved, Designed By HIT-Bioinformatics   
  5 |  * @Description: Establish the ME callset
  6 |  * @author: Jiang Tao (tjiang@hit.edu.cn)
  7 |  * @date: Apr 24 2018
  8 |  * @version V1.0.4  
  9 | '''
 10 | 
 11 | import argparse
 12 | import logging
 13 | import sys
 14 | import time
 15 | import cigar
 16 | 
 17 | from collections import Counter
 18 | from rMETL.rMETL_version import __version__, __author__, __contact__
 19 | from rMETL.rMETL_genotype import simple_call_genotype
 20 | from rMETL.rMETL_cmdRunner import setupLogging
 21 | from rMETL.rMETL_utils import load_ref
 22 | 
# Banner and help text for the calling stage; parseArgs passes it to argparse
# as the parser description. Version, author and contact are filled in from
# rMETL_version.
USAGE="""\
           _  ___  _   _____   _______   _
     _ _  | ^_   _^ | |  ___| |__   __| | |
    | ^_| | | | | | | | |__      | |    | |
    | |   | | | | | | |  __|     | |    | |
    | |   | | | | | | | |___     | |    | |___
    |_|   |_| |_| |_| |_____|    |_|    |_____|

    rMETL - realignment-based Mobile Element insertion detection Tool for Long read

	Generate final MEI/MED callset in bed or vcf file.
	
	The output file called 'calling.bed' or 'calling.vcf'
	stores in output directory.
	
	rMETL V%s
	Author: %s
	Contact: %s
"""%(__version__, __author__, __contact__)
 42 | 
 43 | def acquire_count_max(_list_):
 44 | 	c = Counter(_list_)
 45 | 	return c.most_common(1)[0][0]
 46 | 
 47 | flag_dic = {0:1, \
 48 | 			16:2, \
 49 | 			256:0, \
 50 | 			272:0, \
 51 | 			2048:0, \
 52 | 			2064:0, \
 53 | 			4:0}
 54 | 
 55 | STRAND = {'1':'+', \
 56 | 		  '2':'-', \
 57 | 		  '*':'+-'}
 58 | 
 59 | cluster_dic = {}
 60 | 
 61 | strand_dic = {1:'+', \
 62 | 			  2:'-'}
 63 | 
 64 | class R_INFO(object):
 65 | 	"""store the infomation of the signal sequence"""
 66 | 	def __init__(self, Type, Chr, Pos, Len, GT):
 67 | 		self.Type = Type
 68 | 		self.Chr = Chr
 69 | 		self.Pos = Pos
 70 | 		self.Len = Len
 71 | 		self.GT = GT
 72 | 
 73 | def parse_name(seq):
 74 | 	chr = seq.split('*')[0]
 75 | 	breakpoint = seq.split('*')[1]
 76 | 	insert_size = seq.split('*')[2]
 77 | 	GT = seq.split('*')[3]
 78 | 	return chr, breakpoint, insert_size, GT
 79 | 
 80 | def parse_name_tp(line):
 81 | 	'''
 82 | 	resolution signatures for bed format
 83 | 	'''
 84 | 	seq = line.split('*')
 85 | 	Type = seq[0]
 86 | 	chr = seq[1]
 87 | 	pos = seq[2]
 88 | 	len = seq[3]
 89 | 	if Type == 'DEL':
 90 | 		rc = seq[4]
 91 | 		cov = seq[5]
 92 | 	else:
 93 | 		rc = seq[5]
 94 | 		cov = seq[6]
 95 | 	GT = rc+':'+cov
 96 | 	local_info = R_INFO(Type, chr, pos, len, GT)
 97 | 	return local_info
 98 | 
 99 | def clip_analysis(deal_cigar, clipping_threshold):
100 | 	'''
101 | 	resolution cogar
102 | 	'''
103 | 	seq = list(cigar.Cigar(deal_cigar).items())
104 | 	if seq[0][1] == 'S':
105 | 		first_pos = seq[0][0]
106 | 	else:
107 | 		first_pos = 0
108 | 	if seq[-1][1] == 'S':
109 | 		last_pos = seq[-1][0]
110 | 	else:
111 | 		last_pos = 0
112 | 	total_len = first_pos + last_pos
113 | 	signal_len = 0
114 | 	for i in seq:
115 | 		signal_len += i[0]
116 | 	if signal_len == 0:
117 | 		return 0
118 | 	if total_len*1.0 / signal_len >= clipping_threshold:
119 | 		return 0
120 | 	else:
121 | 		return 1
122 | 
def print_vcf_head(ref, sample):
    '''
    Build the VCF header.

    ref: mapping of contig name -> sequence; only len() of each value is
        used here.
    sample: sample name written as the last column title.

    Returns a list of header lines, each terminated by a newline.

    Compared with the earlier header this version removes a stray
    apostrophe from the AC/AF/AN descriptions, writes the ALT IDs without
    angle brackets, and declares the SVTYPE, SAMPLE and STRAND INFO keys
    that call_vcf writes into every record.
    '''
    Date = time.strftime("%Y%m%d")
    head = [
        "##fileformat=VCFv4.2\n",
        "##fileDate=%s\n"%(Date),
        "##source=rMETL\n",
    ]
    for name in ref:
        head.append("##contig=<ID=%s,length=%d>\n"%(name, len(ref[name])))
    head.extend([
        "##ALT=<ID=DEL,Description=\"Deletion relative to the reference\">\n",
        "##ALT=<ID=INS,Description=\"Insertion of sequence relative to the reference\">\n",
        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">\n",
        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n",
        "##INFO=<ID=SVLEN,Number=.,Type=String,Description=\"Difference in length between REF and ALT alleles\">\n",
        "##INFO=<ID=SAMPLE,Number=1,Type=String,Description=\"Sample description\">\n",
        "##INFO=<ID=STRAND,Number=1,Type=String,Description=\"Strand of the mobile element realignment\">\n",
        "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\n",
        "##INFO=<ID=PRECISE,Number=0,Type=Flag,Description=\"Precise structural variation\">\n",
        "##INFO=<ID=AC,Number=.,Type=Integer,Description=\"Allele count\">\n",
        "##INFO=<ID=AF,Number=.,Type=Float,Description=\"Allele frequency\">\n",
        "##INFO=<ID=AN,Number=.,Type=String,Description=\"Allele name\">\n",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
        "##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"#High-quality variant reads\">\n",
        "##FORMAT=<ID=DR,Number=1,Type=Integer,Description=\"#Reference reads\">\n",
    ])
    head.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n"%(sample))
    return head
149 | 
def parse_seq_head(line):
    '''
    Parse a '*'-separated signature name (vcf output path).

    The first four fields are type, chromosome, position and length.
    For 'DEL' the counts are read from fields 4 and 5, otherwise from
    fields 5 and 6, and are stored joined as "rc:cov" in GT.
    Returns an R_INFO.
    '''
    parts = line.split('*')
    kind = parts[0]
    chrom = parts[1]
    position = parts[2]
    size = parts[3]
    if kind == 'DEL':
        counts = (parts[4], parts[5])
    else:
        counts = (parts[5], parts[6])
    return R_INFO(kind, chrom, position, size, counts[0] + ':' + counts[1])
168 | 
169 | # ************************BED_FUNCTION*******************************
def call_bed(args):
    '''
    Build the final callset from the realignment SAM and write it in BED
    style to args.output + "calling.bed".

    Pass 1 keeps alignments whose flag maps to a non-zero strand code in
    flag_dic, whose MAPQ is at least args.min_mapq and that pass
    clip_analysis; each one votes "<TYPE:ME:subtype>" for its
    "chr*pos*len*GT" key in the module-level cluster_dic.
    Pass 2 (only when args.MEI == 'False') adds a plain "<TYPE>" entry for
    records with reference name '*' whose key received no vote.
    The most common label per key is written, sorted by chromosome and
    breakpoint.
    '''
    path = args.input
    out_path = args.output + "calling.bed"
    logging.info("Loading ME realignmets...")
    with open(path, 'r') as AlignmentFile:
        for line in AlignmentFile:
            seq = line.strip('\n').split('\t')
            # Skip blank lines and SAM header lines.
            if not seq[0] or seq[0][0] == '@':
                continue
            Flag = int(seq[1])
            sub_type = seq[2]
            MAPQ = int(seq[4])
            # Flags missing from flag_dic are ignored like the zero-coded
            # ones instead of raising KeyError.
            if flag_dic.get(Flag, 0) == 0 or MAPQ < args.min_mapq:
                continue
            if clip_analysis(seq[5], args.clipping_threshold) != 1:
                continue
            local_info = parse_name_tp(seq[0])
            key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, \
                local_info.Len, local_info.GT)
            if key not in cluster_dic:
                cluster_dic[key] = list()
            cluster_dic[key].append("<%s:ME:%s>"%(local_info.Type, sub_type))

    if args.MEI == 'False':
        with open(path, 'r') as AlignmentFile:
            for line in AlignmentFile:
                seq = line.strip('\n').split('\t')
                if not seq[0] or seq[0][0] == '@':
                    continue
                if seq[2] != '*':
                    continue
                local_info = parse_name_tp(seq[0])
                key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, \
                    local_info.Len, local_info.GT)
                # Only keys without any ME vote get the plain type label.
                if key not in cluster_dic:
                    cluster_dic[key] = ["<%s>"%(local_info.Type)]

    sort_list = list()
    for key in cluster_dic:
        chrom, breakpoint, insert_size, _gt = parse_name(key)
        final_type = acquire_count_max(cluster_dic[key])
        sort_list.append([chrom, breakpoint, insert_size, final_type])
    sort_list.sort(key = lambda x:(x[0], int(x[1])))

    logging.info("Writing results into disk...")
    with open(out_path, 'w') as out_file:
        # The header needs its own newline; without it the first record
        # was appended to the comment line.
        out_file.write("# Chromsome\tBreakpoint\tSV length\tMEI Type\n")
        for record in sort_list:
            out_file.write("\t".join(record)+"\n")
222 | # ************************BED_FUNCTION*******************************
223 | # 
224 | # 
225 | # 
226 | # ************************VCF_FUNCTION*******************************
def call_vcf(args):
    '''
    Build the final callset from the realignment SAM and write it as VCF
    to args.output + "calling.vcf".

    Pass 1 keeps alignments whose flag maps to a non-zero strand code in
    flag_dic, whose MAPQ is at least args.min_mapq and that pass
    clip_analysis; each one votes "<TYPE:ME:subtype>\\t<strand code>" for
    its "chr*pos*len*GT" key in the module-level cluster_dic.
    Pass 2 (only when args.MEI == 'False') adds "<TYPE>\\t*" for records
    with reference name '*' whose key received no vote.
    For every key the most common vote gives ALT and strand; the genotype
    comes from simple_call_genotype.
    '''
    path = args.input
    out_path = args.output + "calling.vcf"
    ref = load_ref(args.Reference)
    logging.info("Loading ME realignmets...")
    with open(path, 'r') as AlignmentFile:
        for line in AlignmentFile:
            seq = line.strip('\n').split('\t')
            # Skip blank lines and SAM header lines.
            if not seq[0] or seq[0][0] == '@':
                continue
            Flag = int(seq[1])
            sub_type = seq[2]
            MAPQ = int(seq[4])
            # Flags missing from flag_dic are ignored like the zero-coded
            # ones instead of raising KeyError.
            strand_code = flag_dic.get(Flag, 0)
            if strand_code == 0 or MAPQ < args.min_mapq:
                continue
            if clip_analysis(seq[5], args.clipping_threshold) != 1:
                continue
            local_info = parse_seq_head(seq[0])
            key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, \
                local_info.Len, local_info.GT)
            if key not in cluster_dic:
                cluster_dic[key] = list()
            cluster_dic[key].append("<%s:ME:%s>\t%d"%(local_info.Type, \
                sub_type, strand_code))

    if args.MEI == 'False':
        with open(path, 'r') as AlignmentFile:
            for line in AlignmentFile:
                seq = line.strip('\n').split('\t')
                if not seq[0] or seq[0][0] == '@':
                    continue
                if seq[2] != '*':
                    continue
                local_info = parse_name_tp(seq[0])
                key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, \
                    local_info.Len, local_info.GT)
                # Only keys without any ME vote get the plain type label.
                if key not in cluster_dic:
                    cluster_dic[key] = ["<%s>\t%s"%(local_info.Type, '*')]

    sort_list = list()
    for key in cluster_dic:
        chrom, breakpoint, insert_size, counts = parse_name(key)
        # One majority vote yields both the ALT label and the strand code.
        best = acquire_count_max(cluster_dic[key]).split('\t')
        sort_list.append([chrom, breakpoint, insert_size, best[0], counts, \
            STRAND[best[1]]])
    sort_list.sort(key = lambda x:(x[0], int(x[1])))
    head_info = print_vcf_head(ref, args.sample)

    logging.info("Writing results into disk...")
    with open(out_path, 'w') as out_file:
        for line in head_info:
            out_file.write(line)

        for ID, record in enumerate(sort_list):
            pair = record[4].split(':')
            concordant = int(pair[0])
            discordant = max(int(pair[1]) - concordant, 0)
            GT, GL, reliability = simple_call_genotype(concordant, \
                concordant+discordant, args.heterozygous, args.homozygous)

            precision = "PRECISE" if reliability == 1 else "IMPRECISE"
            INFO = "%s;SVTYPE=%s;SVLEN=%d;END=%d;SAMPLE=%s;STRAND=%s"%( \
                precision, record[3][1:4], int(record[2]), \
                int(record[1])+int(record[2])-1, args.sample, record[5])
            try:
                REF = ref[record[0]][int(record[1])-1]
            except Exception:
                # Best effort: the container returned by load_ref is not
                # visible here, so any lookup failure falls back to "N".
                # Unlike a bare except this lets KeyboardInterrupt and
                # SystemExit through.
                REF = "N"
            out_file.write("%s\t%s\t%d\t%s\t%s\t.\tPASS\t%s\tGT:DV:DR\t%s:%s\n"%( \
                record[0], record[1], ID, REF, record[3], INFO, GT, GL))
307 | # *************************VCF_FUNCTION*******************************
308 | # 
309 | # 
310 | # 
311 | # ************************MAIN_FUNCTION*******************************
def parseArgs(argv):
    '''
    Parse the command line of the calling stage.

    argv: list of argument strings (without the program name).
    Returns the argparse namespace. The positional "format" value is
    lower-cased, so the advertised spellings BED/VCF are accepted as well
    as bed/vcf.
    '''
    parser = argparse.ArgumentParser(prog="rMETL calling", description=USAGE, \
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("input", metavar="SAM", type=str, help="Input cluster.sam on STAGE realignment.")
    parser.add_argument("Reference", metavar="REFERENCE", type=str, \
        help="The reference genome in fasta format.")
    # The metavar shows upper-case names while run() compares against
    # lower-case ones; normalise here so both spellings work.
    parser.add_argument("format", metavar="[BED,VCF]", type=str.lower, \
        help="The format of the output file. [%(default)s]", default = "bed")
    parser.add_argument('output', type=str, help = "Directory to output final callset.")
    parser.add_argument('-hom', '--homozygous', \
        help = "The mininum score of a genotyping reported as a homozygous.[%(default)s]", \
        default = 0.8, type = float)
    parser.add_argument('-het','--heterozygous', \
        help = "The mininum score of a genotyping reported as a heterozygous.[%(default)s]", \
        default = 0.3, type = float)
    parser.add_argument('-q', '--min_mapq', help = "Mininum mapping quality.[%(default)s]", \
        default = 20, type = int)
    parser.add_argument('-c', '--clipping_threshold', \
        help = "Mininum threshold of realignment clipping.[%(default)s]", \
        default = 0.5, type = float)
    parser.add_argument('--sample', help = "Sample description", \
        default = "None", type = str)
    # --MEI is compared as the literal string 'False' by the callers.
    parser.add_argument('--MEI', help = "Enables rMETL to display MEI/MED only.[%(default)s]", \
        default = "True", type = str)
    args = parser.parse_args(argv)
    return args
338 | 
def run(argv):
    '''
    Entry point of the calling stage.

    Parses argv, sets up logging, then dispatches to call_bed or call_vcf
    according to args.format. Any other format logs an error and exits
    with status 1.
    '''
    args = parseArgs(argv)
    setupLogging(False)
    starttime = time.time()
    if args.format == "bed":
        call_bed(args)
    elif args.format == "vcf":
        call_vcf(args)
    else:
        logging.error("Invalid format.")
        # sys.exit rather than the interactive helper exit(), which is
        # only installed by the site module.
        sys.exit(1)
    logging.info("Finished in %0.2f seconds."%(time.time() - starttime))
351 | 
if __name__ == '__main__':
    # Pass the arguments after the program name; sys.argv[:1] would hand
    # the parser only the script path.
    run(sys.argv[1:])
354 | 


--------------------------------------------------------------------------------
/src/rMETL/rMETL_extraction.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | ''' 
  4 |  * All rights Reserved, Designed By HIT-Bioinformatics   
  5 |  * @Description: Parse the ME signatures from alignments
  6 |  * @author: Jiang Tao (tjiang@hit.edu.cn)
  7 |  * @date: Apr 24 2018
  8 |  * @version V1.0.4     
  9 | '''
 10 | 
 11 | import pysam
 12 | import cigar
 13 | import os
 14 | import argparse
 15 | import logging
 16 | import sys
 17 | import time
 18 | import gc
 19 | 
 20 | from multiprocessing import Pool
 21 | from rMETL.rMETL_version import __version__, __author__, __contact__
 22 | from rMETL.rMETL_concensus import construct_concensus_info
 23 | from rMETL.rMETL_genotype import add_genotype
 24 | from rMETL.rMETL_utils import load_ref, check_bai, call_ngmlr, call_samtools
 25 | from rMETL.rMETL_cmdRunner import setupLogging, exe
 26 | 
# Banner and help text for the extraction stage. Version, author and contact
# are filled in from rMETL_version.
USAGE="""\
           _  ___  _   _____   _______   _
     _ _  | ^_   _^ | |  ___| |__   __| | |
    | ^_| | | | | | | | |__      | |    | |
    | |   | | | | | | |  __|     | |    | |
    | |   | | | | | | | |___     | |    | |___
    |_|   |_| |_| |_| |_____|    |_|    |_____|

    rMETL - realignment-based Mobile Element insertion detection Tool for Long read


	Support reads aligned with Ngmlr and sorted with Samtools

	If input is a fastq or fasta format file, rMETL generates
	alignments with Ngmlr at first;

	If input is a sam format file, rMETL converts and sorts it
	to be a bam format file;

	If your input is a bam format file with index, rMETL extracts
	the ME signatures and collects the sub-sequence of them.

	The output is a fasta format file called 'potential.fa' 
	contains potentials non-reference ME clusters.

	rMETL V%s
	Author: %s
	Contact: %s
"""%(__version__, __author__, __contact__)
 56 | 
 57 | INS_flag = {1:'I'}
 58 | DEL_flag = {2:'D'}
 59 | clip_flag = {4:'S', 5:'H'}
 60 | global_ref = list()
 61 | 
 62 | # **********************check-input-format****************************
 63 | def decipherInput(input):
 64 | 	"""
 65 | 	resolution input format
 66 | 	"""
 67 | 	extension = input.split('.')[-1].lower()
 68 | 	choice = {"bam": 0, \
 69 | 			  "sam": 1, \
 70 | 			  "fasta": 2, \
 71 | 			  "fastq": 2, \
 72 | 			  "fa": 2, \
 73 | 			  "fq": 2}
 74 | 	return choice[extension]
 75 | # **********************check-input-format****************************
 76 | # 
 77 | # 
 78 | # 
 79 | # ************************mini-operations*****************************
 80 | def revcom_complement(s): 
 81 | 	'''
 82 | 	generation reverse complementary sequence.
 83 | 	all of the lowercase will be changed to capital letter.
 84 | 	'''
 85 | 	basecomplement = {'A': 'T', \
 86 | 					  'C': 'G', \
 87 | 					  'G': 'C', \
 88 | 					  'T': 'A', \
 89 | 					  'a': 'T', \
 90 | 					  'c': 'G', \
 91 | 					  'g': 'C', \
 92 | 					  't': 'A'} 
 93 | 	letters = list(s) 
 94 | 	letters = [basecomplement[base] for base in letters] 
 95 | 	return ''.join(letters)[::-1]
 96 | 
 97 | def detect_flag(Flag):
 98 | 	'''
 99 | 	identification of flag in BAM
100 | 		0 means unmapped read
101 | 		1 & 2 means primary mapping read with normal strand or reverse strand 
102 | 		3 & 4 means supplementary mapping read with normal strand or reverse strand
103 | 	'''
104 | 	Normal_foward = 1 >> 1
105 | 	Abnormal = 1 << 2
106 | 	Reverse_complement = 1 << 4
107 | 	Supplementary_map = 1 << 11
108 | 
109 | 	signal = {Abnormal: 0, \
110 | 			  Normal_foward: 1, \
111 | 			  Reverse_complement: 2, \
112 | 			  Supplementary_map: 3, \
113 | 			  Reverse_complement | Supplementary_map: 4}
114 | 
115 | 	back_sig = signal[Flag] if Flag in signal else 0
116 | 	return back_sig
117 | 
def acquire_clip_pos(deal_cigar):
    '''
    Summarise the CIGAR string of a supplementary mapping.

    Returns [leading, trailing, span]: the length of a leading 'S' item
    (0 if the first item is not 'S'), the length of a trailing 'S' item
    (0 if the last item is not 'S'), and the summed length of all 'M'
    and 'D' items.
    '''
    operations = list(cigar.Cigar(deal_cigar).items())
    head_len, head_op = operations[0]
    tail_len, tail_op = operations[-1]
    leading = head_len if head_op == 'S' else 0
    trailing = tail_len if tail_op == 'S' else 0
    span = sum(length for length, op in operations if op in ('M', 'D'))
    return [leading, trailing, span]
130 | # ************************mini-operations*****************************
131 | # 
132 | # 
133 | # 
134 | # ************************soft-clippings******************************
def store_clip_pos(locus, chr, seq, flag, CLIP_note):
    '''
    Record one clipping as [locus, seq, flag] in CLIP_note[chr].

    CLIP_note[chr] is a two-level table:
        level 1 key: int(locus / 10000)
        level 2 key: int((locus % 10000) / 50)
    Missing levels are created on demand; the record is appended to the
    list found at the second level.
    '''
    major = int(locus /10000)
    minor = int((locus % 10000) / 50)
    chrom_table = CLIP_note[chr]
    if major not in chrom_table:
        chrom_table[major] = dict()
    bucket_table = chrom_table[major]
    if minor not in bucket_table:
        bucket_table[minor] = list()
    bucket_table[minor].append([locus, seq, flag])
156 | 
def acquire_clip_locus(down, up, chr, CLIP_note):
    '''
    Collect the clippings stored for chr whose locus lies in [down, up].

    CLIP_note[chr] is the two-level table written by store_clip_pos
    (10000-base blocks, each split into 50-base buckets). Every block
    touched by the window is visited, so windows spanning more than two
    blocks are covered as well. Records are returned in block/bucket
    order as a list of [locus, seq, flag].
    '''
    list_clip = list()
    chrom_table = CLIP_note[chr]
    # Floor division gives the same block/bucket numbers on Python 2 and 3.
    first_block = int(down // 10000)
    last_block = int(up // 10000)
    for key_1 in range(first_block, last_block + 1):
        if key_1 not in chrom_table:
            continue
        # Only the first and last block are limited by the window; blocks
        # in between are scanned completely (buckets 0..199).
        low_bucket = int((down % 10000) // 50) if key_1 == first_block else 0
        high_bucket = int((up % 10000) // 50) if key_1 == last_block else 199
        block = chrom_table[key_1]
        for key_2 in range(low_bucket, high_bucket + 1):
            if key_2 not in block:
                continue
            for ele in block[key_2]:
                if ele[0] >= down and ele[0] <= up:
                    list_clip.append(ele)
    return list_clip
194 | # ************************soft-clippings******************************
195 | # 
196 | # 
197 | # 
198 | # ***********************resolution-reads*****************************
def organize_split_signal(chr, primary_info, Supplementary_info, \
    total_L, low_bandary):
    '''
    Derive insertion candidates from split (supplementary) alignments.

    primary_info is [start, end, leading clip, trailing clip, strand code]
    of the primary alignment; Supplementary_info is a list of
    comma-separated entries "chr,start,strand,cigar,...". Entries on
    another chromosome or strand are ignored. For the others a candidate
    [read start, read end, reference position] is appended when the
    unaligned read segment between both alignments is longer than
    low_bandary.
    '''
    strand_symbol = {1:'+', 2: '-'}
    overlap = list()
    for entry in Supplementary_info:
        fields = entry.split(',')
        sa_chr = fields[0]
        sa_start = int(fields[1])
        sa_cigar = fields[3]
        if strand_symbol[primary_info[4]] != fields[2] or chr != sa_chr:
            continue
        sa_clip = acquire_clip_pos(sa_cigar)
        if primary_info[0] < sa_start:
            # Primary alignment lies upstream of the supplementary one.
            if primary_info[3] + sa_clip[0] - total_L > low_bandary:
                overlap.append([total_L - primary_info[3], \
                    sa_clip[0], primary_info[1]])
        elif sa_clip[1] + primary_info[2] - total_L > low_bandary:
            # Supplementary alignment lies upstream of the primary one.
            overlap.append([total_L - sa_clip[1], \
                primary_info[2], sa_start + sa_clip[2] - 1])
    return overlap
225 | 
def parse_read(read, Chr_name, low_bandary, CLIP_note):
	'''
	Collect ME insertion/deletion signatures from one aligned read.

	read: alignment record; this function uses its flag, cigar,
		reference_start, reference_end, query_sequence, query_length
		and get_tags() (presumably a pysam AlignedSegment - confirm
		against the caller).
	Chr_name: chromosome key used to index CLIP_note.
	low_bandary: minimum length for an indel or clip to be recorded.
	CLIP_note: clipping table updated through store_clip_pos.

	Returns (INS_ME_pos, DEL_ME_pos):
		INS_ME_pos entries are [position, length, sequence];
		DEL_ME_pos entries are [position, length].
	Both lists are empty when detect_flag(read.flag) is 0.
	'''
	DEL_ME_pos = list()
	INS_ME_pos = list()
	process_signal = detect_flag(read.flag)
	if process_signal == 0:
		return INS_ME_pos, DEL_ME_pos

	# Add DEL:ME type call signal
	# shift counts reference bases consumed so far (op 0 and op 2 only).
	pos_start = read.reference_start
	shift = 0
	for element in read.cigar:
		if element[0] == 0:
			shift += element[1]
		if element[0] in DEL_flag and element[1] <= low_bandary:
			shift += element[1]
		if element[0] in DEL_flag and element[1] > low_bandary:
			DEL_ME_pos.append([pos_start+shift, element[1]])
			shift += element[1]

	# Add INS:ME type call signal
	# shift tracks the reference offset, _shift_read_ the offset in the read.
	pos_start = read.reference_start
	shift = 0
	_shift_read_ = 0
	pos_end = read.reference_end
	primary_clip_0 = 0
	primary_clip_1 = 0
	for element in read.cigar:
		if element[0] == 0 or element[0] == 2:
			shift += element[1]
		# NOTE(review): every op except 2 advances the read offset, including
		# hard clips (op 5); if query_sequence excludes hard-clipped bases the
		# slices below would be offset for such reads - confirm.
		if element[0] != 2:
			_shift_read_ += element[1]
		if element[0] in INS_flag and element[1] > low_bandary:
			# NOTE(review): this +1 stays in shift for all later elements, so
			# each reported insertion moves subsequent positions by one -
			# confirm this is intended.
			shift += 1
			MEI_contig = read.query_sequence[_shift_read_ - \
			element[1]:_shift_read_]
			INS_ME_pos.append([pos_start + shift, element[1], \
				MEI_contig])
		if element[0] in clip_flag:
			# shift == 0 means no reference base was consumed yet, i.e. the
			# clip is at the start of the alignment.
			if shift == 0:
				primary_clip_0 = element[1]
			else:
				primary_clip_1 = element[1]
			if element[1] > low_bandary:
				if shift == 0:
					# Leading clip: stored with flag 0.
					clip_pos = pos_start - 1
					clip_contig = read.query_sequence[:element[1]]
					store_clip_pos(clip_pos, Chr_name, clip_contig, \
						0, CLIP_note)
				else:
					# Trailing clip: stored with flag 1.
					clip_pos = pos_start + shift - 1
					clip_contig = read.query_sequence[read.query_length \
					- element[1]:]
					store_clip_pos(clip_pos, Chr_name, clip_contig, 1, \
						CLIP_note)

	# Signals 1 and 2 also get their SA tag resolved into extra insertion
	# candidates via organize_split_signal.
	if process_signal in [1, 2]:
		Tags = read.get_tags()
		chr = Chr_name
		primary_info = [pos_start, pos_end, primary_clip_0, primary_clip_1, \
		process_signal]
		for i in Tags:
			if i[0] == 'SA':
				# The SA value is split on ';' and the last piece is dropped.
				Supplementary_info = i[1].split(';')[:-1]
				overlap = organize_split_signal(chr, primary_info, \
					Supplementary_info, read.query_length, low_bandary)
				for k in overlap:
					MEI_contig = read.query_sequence[k[0]:k[1]]
					INS_ME_pos.append([k[2], k[1] - k[0], MEI_contig])
	return INS_ME_pos, DEL_ME_pos
300 | # ***********************resolution-reads*****************************
301 | # 
302 | # 
303 | # 
304 | # ***********************Cluster-Function*****************************
def merge_pos(pos_list, chr, evidence_read, SV_size, CLIP_note):
	'''
	INS: inner-cluster function

	Turns one group of insertion signals into labelled calls.

	pos_list      -- non-empty list of signals; element [0] of each signal
	                 is its start position.
	chr           -- chromosome name, copied into every call.
	evidence_read -- forwarded unchanged to construct_concensus_info().
	SV_size       -- forwarded unchanged to construct_concensus_info().
	CLIP_note     -- clip storage handed to acquire_clip_locus().

	Returns 0 when construct_concensus_info() returns 0; otherwise the list
	it returned, with each entry rewritten in place as
	["INS", chr] + entry + [number of entries].
	'''
	starts = [signal[0] for signal in pos_list]
	# Clips are looked up in a window padded by 10 bp around the group.
	window_low = min(starts) - 10
	window_high = max(starts) + 10
	nearby_clips = acquire_clip_locus(window_low, window_high, chr, CLIP_note)
	calls = construct_concensus_info(pos_list, nearby_clips, evidence_read, \
		SV_size)
	if calls == 0:
		return 0
	total = len(calls)
	# Same list object is updated slot by slot, as callers may hold on to it.
	for idx, call in enumerate(calls):
		calls[idx] = ["INS", chr] + call + [total]
	return calls
325 | 
def cluster(pos_list, chr, evidence_read, SV_size, low_bandary, CLIP_note):
	'''
	INS: outer-cluster function

	Walks the non-empty pos_list in the given order and chains a signal to
	the current group while its position is at most low_bandary beyond the
	position of the group's last signal. Every finished group is handed to
	merge_pos(); results different from 0 are collected and returned.
	'''
	merged = list()
	group = [pos_list[0]]
	for signal in pos_list[1:]:
		if signal[0] <= group[-1][0] + low_bandary:
			# Still within reach of the previous signal: same group.
			group.append(signal)
			continue
		# Gap too wide: close the current group and open a new one.
		calls = merge_pos(group, chr, evidence_read, SV_size, CLIP_note)
		if calls != 0:
			merged.append(calls)
		group = [signal]
	# The last group is still open once the walk ends.
	calls = merge_pos(group, chr, evidence_read, SV_size, CLIP_note)
	if calls != 0:
		merged.append(calls)
	return merged
346 | 
def merge_pos_del(pos_list, chr, Ref, evidence_read, SV_size):
	'''
	DEL: inner-cluster function

	Collapses one group of deletion signals into at most one call.

	pos_list      -- list of signals, each starting with [start, length].
	chr           -- chromosome name; must be a key of Ref to yield a call.
	Ref           -- mapping from chromosome name to a record whose .seq
	                 can be sliced.
	evidence_read -- minimum number of signals needed for a call.
	SV_size       -- minimum averaged size needed for a call.

	Returns an empty list, or a one-element list holding
	['DEL', chr, breakpoint, size, number of signals, deleted sequence].
	'''
	result = list()
	support = len(pos_list)
	# Bail out before averaging: nothing to report, and an empty group
	# would otherwise divide by zero.
	if support == 0 or support < evidence_read:
		return result
	# Floor division keeps both values integral so they stay usable as
	# slice bounds regardless of the interpreter's "/" semantics.
	start_pos = sum(ele[0] for ele in pos_list) // support
	size = sum(ele[0] + ele[1] for ele in pos_list) // support - start_pos
	if chr in Ref and size >= SV_size:
		result.append(['DEL', chr, start_pos, size, support, \
			str(Ref[chr].seq[start_pos:start_pos+size])])
	return result
366 | 
def cluster_del(pos_list, chr, Ref, evidence_read, SV_size, low_bandary):
	'''
	DEL: outer-cluster function

	Splits the non-empty pos_list, taken in the given order, into groups:
	a new group starts whenever a signal lies more than low_bandary past
	the last signal of the current group. Each group goes through
	merge_pos_del(); non-empty results are collected and returned.
	'''
	merged = list()
	group = [pos_list[0]]
	for signal in pos_list[1:]:
		if signal[0] <= group[-1][0] + low_bandary:
			# Close enough to the previous signal: extend the group.
			group.append(signal)
			continue
		# Too far away: settle the current group, then start afresh.
		calls = merge_pos_del(group, chr, Ref, evidence_read, SV_size)
		if calls:
			merged.append(calls)
		group = [signal]
	# Settle whatever group remains after the last signal.
	calls = merge_pos_del(group, chr, Ref, evidence_read, SV_size)
	if calls:
		merged.append(calls)
	return merged
387 | # ***********************Cluster-Function*****************************
388 | # 
389 | # 
390 | # 
391 | # ***********************Output-Function******************************
def combine_result(INS, DEL, path, chr):
	'''
	Merge results into one list and output it.

	Writes the records of INS and then DEL as two-line FASTA entries to
	"<path>signatures/<chr>_sig.fa". path is concatenated as-is, so it has
	to end with a path separator and the "signatures" directory must exist.

	INS -- list of groups; only records with exactly 8 fields are written.
	       Header: fields 0-4, 6 and 7 joined by '*'; sequence: field 5.
	DEL -- list of groups; only records with exactly 7 fields are written.
	       Header: fields 0-4 and 6 joined by '*'; sequence: field 5.
	'''
	output = "%ssignatures/%s_sig.fa"%(path, chr)
	# The with-block closes the handle even if a record fails to format.
	with open(output, 'w') as sig_file:
		for group in INS:
			for j in group:
				if len(j) != 8:
					continue
				key = "%s*%s*%d*%d*%s*%d*%d"%(j[0], j[1], j[2], j[3], j[4], \
					j[6], j[7])
				sig_file.write(">"+key+'\n')
				sig_file.write(j[5]+'\n')
		# Drop this function's reference before moving on to DEL.
		del INS
		gc.collect()
		for group in DEL:
			for j in group:
				if len(j) != 7:
					continue
				key = "%s*%s*%d*%d*%d*%d"%(j[0], j[1], j[2], j[3], j[4], j[6])
				sig_file.write(">%s\n"%(key))
				sig_file.write('%s\n'%(j[5]))
		del DEL
		gc.collect()
418 | # ***********************Output-Function******************************
419 | # 
420 | # 
421 | # 
422 | # ********************Signatures-extraction***************************
def single_pipe(out_path, chr, bam_path, low_bandary, evidence_read, SV_size):
	'''
	resolution signatures

	Processes one chromosome: every read fetched for chr is run through
	parse_read(), the collected INS and DEL signals are sorted by position
	and clustered, and the clusters are passed through add_genotype() and
	written by combine_result().

	out_path      -- directory prefix handed to combine_result().
	chr           -- chromosome to fetch from the alignment file.
	bam_path      -- path opened with pysam.AlignmentFile.
	low_bandary   -- threshold forwarded to parse_read(), the cluster
	                 functions and add_genotype().
	evidence_read -- forwarded to the cluster functions.
	SV_size       -- forwarded to the cluster functions.
	'''
	samfile = pysam.AlignmentFile(bam_path)
	# try/finally so the alignment file is released even when a step fails.
	try:
		logging.info("Resolving chromsome %s."%(chr))
		CLIP_note = {chr: dict()}
		cluster_pos_INS = list()
		cluster_pos_DEL = list()
		for read in samfile.fetch(chr):
			feed_back, feed_back_del = parse_read(read, chr, low_bandary, \
				CLIP_note)
			cluster_pos_INS += feed_back
			cluster_pos_DEL += feed_back_del
		cluster_pos_INS = sorted(cluster_pos_INS, key = lambda x:x[0])
		cluster_pos_DEL = sorted(cluster_pos_DEL, key = lambda x:x[0])
		if len(cluster_pos_INS) == 0:
			Cluster_INS = list()
		else:
			Cluster_INS = cluster(cluster_pos_INS, chr, evidence_read, \
				SV_size, low_bandary, CLIP_note)
			# Raw signals and clip records are not used past this point.
			del cluster_pos_INS
			del CLIP_note[chr]
			gc.collect()
		if len(cluster_pos_DEL) == 0:
			Cluster_DEL = list()
		else:
			# The reference is read from the module-level global_ref list.
			Ref = global_ref[0]
			Cluster_DEL = cluster_del(cluster_pos_DEL, chr, Ref, \
				evidence_read, SV_size, low_bandary)
			del cluster_pos_DEL
			gc.collect()
		logging.info("%d MEI/MED signal loci in the chromsome %s."%(\
			len(Cluster_INS)+len(Cluster_DEL), chr))
		combine_result(add_genotype(Cluster_INS, samfile, low_bandary), \
			add_genotype(Cluster_DEL, samfile, low_bandary), out_path, chr)
	finally:
		samfile.close()
461 | 
def multi_run_wrapper(args):
	'''
	Unpack one argument tuple and run single_pipe() with it, so the job
	can be dispatched through a pool call that passes a single object.
	'''
	outcome = single_pipe(*args)
	return outcome
464 | 
def _abort_on_cmd_failure(headline, retcode, stdout, stderr):
	'''
	Log the outcome of a failed shell command and exit with its return code.
	'''
	logging.error(headline)
	logging.error("RETCODE %d" % (retcode))
	logging.error("STDOUT %s" % (str(stdout)))
	logging.error("STDERR %s" % (str(stderr)))
	logging.error("Exiting")
	exit(retcode)

def load_sam_multi_processes(args):
	'''
	task scheduling

	Creates "<temp_dir>/signatures", loads the reference into global_ref,
	runs single_pipe() for every contig listed in the alignment index
	(contigs with the most reads are queued first) on a pool of
	args.threads processes, then concatenates the per-contig files into
	"<output_dir>/potential_ME.fa" and removes the signatures directory.

	args -- parsed command line; reads temp_dir, input, Reference, threads,
	        min_distance, min_support, min_length and output_dir.

	A failing worker is logged and does not stop the remaining steps.
	Exits with the command's return code if merging or cleaning fails.
	'''
	temporary_dir = args.temp_dir if args.temp_dir.endswith('/') else \
	"%s/"%(args.temp_dir)
	os.mkdir("%ssignatures"%temporary_dir)
	# Major Steps:
	# loading alignment file: bam format
	samfile = pysam.AlignmentFile(args.input)
	# loading reference genome
	Ref = load_ref(args.Reference)
	global_ref.append(Ref)
	# acquire the total numbers of the ref contigs
	index_stats = samfile.get_index_statistics()
	contig_num = len(index_stats)
	logging.info("The total number of chromsomes: %d"%(contig_num))
	# Task list of [#chr, #read], largest contigs first
	process_list = sorted(([i[0], i[3]] for i in index_stats), \
		key = lambda x:-x[1])
	# start to establish multiprocesses
	analysis_pools = Pool(processes = args.threads)
	# Keep every result handle: an exception raised inside a worker is only
	# reported when the handle is queried.
	pending = list()
	for i in process_list:
		para = [(temporary_dir, i[0], args.input, args.min_distance, \
			args.min_support, args.min_length)]
		pending.append((i[0], \
			analysis_pools.map_async(multi_run_wrapper, para)))
	analysis_pools.close()
	analysis_pools.join()
	samfile.close()
	for contig, handle in pending:
		try:
			handle.get()
		except Exception:
			logging.exception(\
				"Signature extraction failed for chromosome %s."%(contig))

	output_p = args.output_dir if args.output_dir.endswith('/') else \
	"%s/"%(args.output_dir)
	if not os.path.exists(output_p):
		os.mkdir(output_p)
	merge_cmd = ("cat %ssignatures/* > %spotential_ME.fa"%(temporary_dir, output_p))
	r, o, e = exe(merge_cmd)
	if r != 0:
		_abort_on_cmd_failure("Merging ME signatures failed!", r, o, e)
	logging.info("Cleaning temporary files.")
	cmd_remove_tempfile = ("rm -r %ssignatures"%(temporary_dir))
	r, o, e = exe(cmd_remove_tempfile)
	if r != 0:
		_abort_on_cmd_failure("Cleaning temporary files failed!", r, o, e)
521 | # ********************Signatures-extraction***************************
522 | # 
523 | # 
524 | # 
525 | # ************************MAIN_FUNCTION*******************************
def parseArgs(argv):
	'''
	Build the "rMETL detection" command-line parser and parse argv with it.

	Four positionals (input, Reference, temp_dir, output_dir) are followed
	by the optional settings -s, -l, -d, -t and -x. Returns the namespace
	produced by argparse.
	'''
	parser = argparse.ArgumentParser(
		prog="rMETL detection",
		description=USAGE,
		formatter_class=argparse.RawDescriptionHelpFormatter)

	# (name, metavar, help) -- registered in this order.
	positionals = [
		("input", "[SAM,BAM,FASTA,FASTQ]",
			"Input reads with/without alignment."),
		("Reference", "REFERENCE",
			"The reference genome in fasta format."),
		('temp_dir', None,
			"Temporary directory to use for distributed jobs."),
		('output_dir', None,
			"Directory to output potential ME loci."),
	]
	for name, placeholder, text in positionals:
		parser.add_argument(name, metavar=placeholder, type=str, help=text)

	# (short flag, long flag, help, default, type) -- registered in this order.
	optionals = [
		('-s', '--min_support',
			"Mininum number of reads that support a ME.[%(default)s]",
			5, int),
		('-l', '--min_length',
			"Mininum length of ME to be reported.[%(default)s]",
			50, int),
		('-d', '--min_distance',
			"Mininum distance of two ME signatures to be intergrated.[%(default)s]",
			20, int),
		('-t', '--threads',
			"Number of threads to use.[%(default)s]",
			8, int),
		('-x', '--presets',
			"The sequencing platform <pacbio,ont> of the reads.[%(default)s]",
			"pacbio", str),
	]
	for short_flag, long_flag, text, fallback, caster in optionals:
		parser.add_argument(short_flag, long_flag, help=text,
			default=fallback, type=caster)

	return parser.parse_args(argv)
554 | 
def run(argv):
	'''
	Entry point of the detection step.

	Parses argv, sets up logging, and prepares args.input according to the
	value returned by decipherInput():
	  0     -- check_bai() is consulted; a non-empty result replaces the
	           input path.
	  1     -- the input is passed through call_samtools().
	  other -- the input is aligned with call_ngmlr() and the result is
	           passed through call_samtools().
	Signature extraction then runs on the prepared input and the elapsed
	time is logged.
	'''
	args = parseArgs(argv)
	setupLogging(False)
	starttime = time.time()
	flag = decipherInput(args.input)

	if flag == 0:
		# detection
		indexed_bam = check_bai(args.input, args.temp_dir)
		if len(indexed_bam) != 0:
			args.input = indexed_bam
	elif flag == 1:
		# samtools transfer
		args.input = call_samtools(args.input, args.temp_dir)
	else:
		# inFile, ref, seq_type, nproc=1, outFile="map.sam", presets="pacbio"
		aligned = call_ngmlr(args.input, args.Reference, args.presets, \
			args.threads, args.temp_dir)
		args.input = call_samtools(aligned, args.temp_dir)
	# Every branch ends in the same extraction call.
	load_sam_multi_processes(args)
	logging.info("Finished in %0.2f seconds."%(time.time() - starttime))
582 | 
if __name__ == '__main__':
	# Hand over the arguments that follow the program name; sys.argv[0] is
	# the script path and must not be parsed as the first positional.
	run(sys.argv[1:])
585 | 


--------------------------------------------------------------------------------