├── src └── rMETL │ ├── __init__.py │ ├── rMETL_version.py │ ├── rMETL_genotype.py │ ├── rMETL │ ├── rMETL_utils.py │ ├── rMETL_concensus.py │ ├── rMETL_realign.py │ ├── rMETL_cmdRunner.py │ ├── rMETL_MEIcalling.py │ └── rMETL_extraction.py ├── .gitignore ├── LICENSE ├── setup.py ├── README.md └── Concensus └── super_TE.fa /src/rMETL/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################## 2 | ## Folders ## 3 | ############################## 4 | /dist/ 5 | /src/rMETL.egg-info/ 6 | /src/*.pyc 7 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_version.py: -------------------------------------------------------------------------------- 1 | # * @author: Jiang Tao (tjiang@hit.edu.cn) 2 | 3 | __version__ = '1.0.4' 4 | __author__ = 'Jiang Tao' 5 | __contact__ = 'tjiang@hit.edu.cn' 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 JiangTao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from setuptools import setup, find_packages 4 | 5 | LONG_DESCRIPTION = '''Mobile element insertion (MEI) is a major category of structure variations (SVs). \ 6 | The rapid development of long read sequencing provides the opportunity to sensitively discover MEIs. \ 7 | However, the signals of MEIs implied by noisy long reads are highly complex, due to the repetitiveness \ 8 | of mobile elements as well as the serious sequencing errors. Herein, we propose Realignment-based \ 9 | Mobile Element insertion detection Tool for Long read (rMETL). rMETL takes advantage of \ 10 | its novel chimeric read re-alignment approach to well handle complex MEI signals. \ 11 | Benchmarking results on simulated and real datasets demonstrated that rMETL has the ability \ 12 | to more sensitivity discover MEIs as well as prevent false positives. \ 13 | It is suited to produce high quality MEI callsets in many genomics studies.''' 14 | 15 | setup( 16 | name = "rMETL", 17 | version = "1.0.4", 18 | description = "realignment-based Mobile Element insertion detection Tool for Long read", 19 | author = "Jiang Tao", 20 | author_email = "tjiang@hit.edu.cn", 21 | url = "https://github.com/tjiangHIT/rMETL", 22 | license = "MIT", 23 | packages = find_packages("src"), 24 | package_dir = {"": "src"}, 25 | data_files = [("", ["LICENSE"])], 26 | scripts=['src/rMETL/rMETL'], 27 | long_description = LONG_DESCRIPTION, 28 | zip_safe = False, 29 | install_requires = ['pysam', 'Biopython', 'Cigar'] 30 | ) 31 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_genotype.py: -------------------------------------------------------------------------------- 1 | # * @author: Jiang Tao (tjiang@hit.edu.cn) 2 | 3 | GL_TAG = ['1/0', '0/1', '1/1'] 4 | 5 | def simple_call_genotype(Nalt, Ntotal, P_heterozygous, P_homozygous): 6 | bound_low = Ntotal * P_heterozygous 7 | bound_up = Ntotal * P_homozygous 8 | if Nalt < bound_low: 9 | # reliability = 0 10 | return GL_TAG[0], "%d:%d"%(Nalt, Ntotal - Nalt), 0 11 | elif bound_low <= Nalt and Nalt < bound_up: 12 | # reliability = 1 13 | return GL_TAG[1], "%d:%d"%(Nalt, Ntotal - Nalt), 1 14 | else: 15 | # reliability = 1 16 | return GL_TAG[2], "%d:%d"%(Nalt, Ntotal - Nalt), 1 17 | 18 | def simple_filter_genotype(Nalt, Ntotal, P_heterozygous): 19 | bound_low = Ntotal * P_heterozygous 20 | if Nalt < bound_low: 21 | return 0 22 | else: 23 | return 1 24 | 25 | def count_coverage(chr, s, e, f): 26 | total = 0 27 | for i in f.fetch(chr, s, e): 28 | total += 1 29 | return total 30 | 31 | def add_genotype(info_list, file, low_bandary): 32 | ''' 33 | allocate genotype for each MEI/MED 34 | ''' 35 | for i in xrange(len(info_list)): 36 | if info_list[i][0][0] == 'INS': 37 | chr = info_list[i][0][1] 38 | start = info_list[i][0][2]-low_bandary 39 | end = info_list[i][0][2] + low_bandary 40 | locus_cov = count_coverage(chr, start, end, file) 41 | for j in xrange(len(info_list[i])): 42 | info_list[i][j].append(locus_cov) 43 | else: 44 | for j in xrange(len(info_list[i])): 45 | chr = info_list[i][j][1] 46 | start = info_list[i][j][2] 47 | end = info_list[i][j][2]+info_list[i][j][3] 48 | locus_cov = count_coverage(chr, start, end, file) 49 | info_list[i][j].append(locus_cov) 50 | return info_list 51 | 52 | if __name__ == '__main__': 53 | pass 54 | -------------------------------------------------------------------------------- /src/rMETL/rMETL: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | * All rights Reserved, Designed By HIT-Bioinformatics 5 | * @Description: Control the rMETL pipeline 6 | * @author: Jiang Tao (tjiang@hit.edu.cn) 7 | * @date: Apr 24 2018 8 | * @version V1.0.4 9 | ''' 10 | 11 | import argparse 12 | import rMETL.rMETL_extraction as rMETL_extraction 13 | import rMETL.rMETL_realign as rMETL_realign 14 | import rMETL.rMETL_MEIcalling as rMETL_MEIcalling 15 | from rMETL.rMETL_version import __version__, __author__, __contact__ 16 | 17 | STAGES = {'detection': rMETL_extraction.run, \ 18 | 'realignment': rMETL_realign.run, \ 19 | 'calling': rMETL_MEIcalling.run} 20 | 21 | USAGE = '''\ 22 | _ ___ _ _____ _______ _ 23 | _ _ | ^_ _^ | | ___| |__ __| | | 24 | | ^_| | | | | | | | |__ | | | | 25 | | | | | | | | | | __| | | | | 26 | | | | | | | | | | |___ | | | |___ 27 | |_| |_| |_| |_| |_____| |_| |_____| 28 | 29 | rMETL - realignment-based Mobile Element insertion detection Tool for Long read 30 | 31 | STAGE is one of 32 | detection Inference of putative MEI loci. 33 | realignment Realignment of chimeric read parts. 34 | calling Mobile Element Insertion/Deletion calling. 35 | 36 | See README.md for documentation or --help for details 37 | Strongly recommend making output directory manually at first. 38 | 39 | rMETL V%s 40 | Author: %s 41 | Contact: %s 42 | '''%(__version__, __author__, __contact__) 43 | 44 | def parseArgs(): 45 | parser = argparse.ArgumentParser(prog='rMETL', description=USAGE, \ 46 | formatter_class=argparse.RawDescriptionHelpFormatter) 47 | parser.add_argument('stage', metavar='STAGE', choices=STAGES.keys(), \ 48 | type=str, help='Stage to execute') 49 | parser.add_argument('options', metavar='OPTIONS', nargs=argparse.REMAINDER, \ 50 | help='Options to pass to the stage') 51 | args = parser.parse_args() 52 | STAGES[args.stage](args.options) 53 | 54 | if __name__ == '__main__': 55 | parseArgs() 56 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*-coding:utf-8-*- 3 | 4 | import logging 5 | import os 6 | from Bio import SeqIO 7 | from rMETL.rMETL_cmdRunner import exe 8 | 9 | def load_ref(ref_g): 10 | logging.info("Loading reference genome...") 11 | return SeqIO.to_dict(SeqIO.parse(ref_g, "fasta")) 12 | 13 | def check_bai(file, tempdir): 14 | ''' 15 | check the index of a BAM file. 16 | ''' 17 | if os.path.exists(file+".bai"): 18 | logging.info("The bam file is legal.") 19 | return "" 20 | else: 21 | logging.info("The bam.bai is missed.") 22 | logging.info("Running Samtools sort...") 23 | bam_path = file[:-3] + "sorted.bam" 24 | cmd = ("samtools sort -@ 4 -O bam -T %s -o %s %s" % (tempdir, \ 25 | bam_path, file)) 26 | r, o, e = exe(cmd) 27 | if r != 0: 28 | logging.error("Samtools sort failed!") 29 | logging.error("RETCODE %d" % (r)) 30 | logging.error("STDOUT %s" % (str(o))) 31 | logging.error("STDERR %s" % (str(e))) 32 | logging.error("Exiting") 33 | exit(r) 34 | logging.info("Finished Samtools sort.") 35 | 36 | logging.info("Running Samtools index...") 37 | cmd = ("samtools index %s" % (bam_path)) 38 | r, o, e = exe(cmd) 39 | if r != 0: 40 | logging.error("Samtools index failed!") 41 | logging.error("RETCODE %d" % (r)) 42 | logging.error("STDOUT %s" % (str(o))) 43 | logging.error("STDERR %s" % (str(e))) 44 | logging.error("Exiting") 45 | exit(r) 46 | logging.info("Finished Samtools index.") 47 | return bam_path 48 | 49 | def call_ngmlr(inFile, ref, presets, nproc, outFile): 50 | """ 51 | run ngmlr to generate alignments 52 | """ 53 | outFile = outFile + "map.sam" 54 | logging.info("Running NGMLR...") 55 | cmd = ("ngmlr -r %s -q %s -o %s -t %d -x %s" % (ref, inFile, outFile, \ 56 | nproc, presets)) 57 | r, o, e = exe(cmd) 58 | 59 | if r != 0: 60 | logging.error("NGMLR mapping failed!") 61 | logging.error("RETCODE %d" % (r)) 62 | logging.error("STDOUT %s" % (str(o))) 63 | logging.error("STDERR %s" % (str(e))) 64 | logging.error("Exiting") 65 | exit(r) 66 | logging.info("Finished NGMLR mapping.") 67 | return outFile 68 | 69 | def call_samtools(file, tempdir): 70 | ''' 71 | run samtools to generate sorted BAM files. 72 | ''' 73 | logging.info("Running Samtools sort...") 74 | bam_path = file[:-3] + "bam" 75 | cmd = ("samtools view -Sb %s | samtools sort -@ 4 -O bam -T %s - > %s" % \ 76 | (file, tempdir, bam_path)) 77 | r, o, e = exe(cmd) 78 | if r != 0: 79 | logging.error("Samtools sort failed!") 80 | logging.error("RETCODE %d" % (r)) 81 | logging.error("STDOUT %s" % (str(o))) 82 | logging.error("STDERR %s" % (str(e))) 83 | logging.error("Exiting") 84 | exit(r) 85 | logging.info("Finished Samtools sort.") 86 | 87 | logging.info("Running Samtools index...") 88 | cmd = ("samtools index %s" % (bam_path)) 89 | r, o, e = exe(cmd) 90 | if r != 0: 91 | logging.error("Samtools index failed!") 92 | logging.error("RETCODE %d" % (r)) 93 | logging.error("STDOUT %s" % (str(o))) 94 | logging.error("STDERR %s" % (str(e))) 95 | logging.error("Exiting") 96 | exit(r) 97 | logging.info("Finished Samtools index.") 98 | return bam_path 99 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_concensus.py: -------------------------------------------------------------------------------- 1 | # * @author: Jiang Tao (tjiang@hit.edu.cn) 2 | 3 | from collections import Counter 4 | 5 | def acquire_count_max(_list_): 6 | c = Counter(_list_) 7 | return c.most_common(1)[0] 8 | # this is a tuple 9 | 10 | def construct_concensus_info(Ins_list, Clip_list, evidence_read, SV_size): 11 | total_count = len(Ins_list) + len(Clip_list) 12 | if total_count < evidence_read: 13 | return 0 14 | breakpoint = list() 15 | insert_size = list() 16 | boundary = list() 17 | for i in Ins_list: 18 | breakpoint.append(i[0]) 19 | insert_size.append(i[1]) 20 | for i in Clip_list: 21 | if i[2] == 1: 22 | breakpoint.append(i[0]) 23 | 24 | # ==============method_1===================== 25 | Prob_pos_1 = Counter(breakpoint).most_common(1)[0][0] 26 | # ==============method_2===================== 27 | Prob_pos_2 = sum(breakpoint)/len(breakpoint) 28 | Average_size = int(sum(insert_size)/len(insert_size)) 29 | if Average_size < SV_size: 30 | return 0 31 | 32 | local_info = list() 33 | local_name = [Prob_pos_2, Average_size] 34 | local_id = 0 35 | for i in Ins_list: 36 | info = local_name + [str(local_id), i[2]] 37 | local_id += 1 38 | local_info.append(info) 39 | for i in Clip_list: 40 | info = local_name + [str(local_id), i[1]] 41 | local_id += 1 42 | local_info.append(info) 43 | 44 | return local_info 45 | 46 | 47 | def construct_concensus_seq(Ins_list, Clip_list): 48 | ''' 49 | Ins_list: start position on reference genome 50 | Insertion size 51 | Insertion sequence 52 | Clip_list: clip position on reference genome 53 | clip sequence 54 | clip type(0 for left and 1 for right) 55 | ''' 56 | breakpoint = list() 57 | insert_size = list() 58 | for i in Ins_list: 59 | breakpoint.append(i[0]) 60 | insert_size.append(i[1]) 61 | for i in Clip_list: 62 | if i[2] == 1: 63 | breakpoint.append(i[0]) 64 | 65 | # ==============method_1===================== 66 | Prob_pos_1 = Counter(breakpoint).most_common(1)[0][0] 67 | # ==============method_2===================== 68 | Prob_pos_2 = sum(breakpoint)/len(breakpoint) 69 | Max_size = max(insert_size) 70 | Min_size = min(insert_size) 71 | Average_size = int(sum(insert_size)/len(insert_size)) 72 | 73 | Seq = dict() 74 | for i in Ins_list: 75 | for j in xrange(i[1]): 76 | pos = i[0] + j 77 | ch = i[2][j] 78 | if pos not in Seq: 79 | Seq[pos] = list() 80 | Seq[pos].append(ch) 81 | 82 | for i in Clip_list: 83 | if Average_size <= len(i[1]): 84 | boundary = Average_size 85 | else: 86 | boundary = len(i[1]) 87 | 88 | if i[2] == 0: 89 | local_clip_seq = i[1][len(i[1])-boundary:] 90 | for j in xrange(boundary): 91 | pos = i[0] + j 92 | ch = local_clip_seq[j] 93 | if pos not in Seq: 94 | Seq[pos] = list() 95 | Seq[pos].append(ch) 96 | else: 97 | for j in xrange(boundary): 98 | pos = i[0] + j 99 | ch = i[1][j] 100 | if pos not in Seq: 101 | Seq[pos] = list() 102 | Seq[pos].append(ch) 103 | Seq_trans = list() 104 | for key in Seq: 105 | if len(Seq[key]) < 5: 106 | continue 107 | Seq_trans.append([key, acquire_count_max(Seq[key])[0]]) 108 | Seq_trans = sorted(Seq_trans, key = lambda x:x[0]) 109 | final_consensus = str() 110 | for i in Seq_trans: 111 | if i[0] < Prob_pos_1: 112 | continue 113 | final_consensus += i[1] 114 | if len(final_consensus) > Average_size: 115 | break 116 | return final_consensus, Prob_pos_1 117 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_realign.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | * All rights Reserved, Designed By HIT-Bioinformatics 5 | * @Description: Classify the ME types 6 | * @author: Jiang Tao (tjiang@hit.edu.cn) 7 | * @date: Apr 24 2018 8 | * @version V1.0.4 9 | ''' 10 | 11 | import argparse 12 | import logging 13 | import sys 14 | import time 15 | 16 | from rMETL.rMETL_version import __version__, __author__, __contact__ 17 | from rMETL.rMETL_cmdRunner import setupLogging, exe 18 | 19 | USAGE="""\ 20 | _ ___ _ _____ _______ _ 21 | _ _ | ^_ _^ | | ___| |__ __| | | 22 | | ^_| | | | | | | | |__ | | | | 23 | | | | | | | | | | __| | | | | 24 | | | | | | | | | | |___ | | | |___ 25 | |_| |_| |_| |_| |_____| |_| |_____| 26 | 27 | rMETL - realignment-based Mobile Element insertion detection Tool for Long read 28 | 29 | Realignment of chimeric read parts. 30 | 31 | Aligner: NGMLR version 0.2.6 32 | TE refs: Alu concensus 33 | L1 concensus 34 | SVA concensus 35 | The output is a sam format file called 'cluster.sam'. 36 | 37 | rMETL V%s 38 | Author: %s 39 | Contact: %s 40 | """%(__version__, __author__, __contact__) 41 | 42 | # **************************Call-NGMLR******************************** 43 | def call_ngmlr(inFile, ref, presets, nproc, outFile, SUBREAD_LENGTH, SUBREAD_CORRIDOR): 44 | """ 45 | fq = input file 46 | automatically search for .sa 47 | """ 48 | outFile = outFile + "cluster.sam" 49 | logging.info("Running NGMLR...") 50 | cmd = ("ngmlr -r %s -q %s -o %s -t %d -x %s --subread-length %d --subread-corridor %d" \ 51 | % (ref, inFile, outFile, nproc, presets, SUBREAD_LENGTH, SUBREAD_CORRIDOR)) 52 | r, o, e = exe(cmd) 53 | if r != 0: 54 | logging.error("NGMLR mapping failed!") 55 | logging.error("RETCODE %d" % (r)) 56 | logging.error("STDOUT %s" % (str(o))) 57 | logging.error("STDERR %s" % (str(e))) 58 | logging.error("Exiting") 59 | exit(r) 60 | logging.info("Finished NGMLR mapping.") 61 | return outFile 62 | # **************************Call-NGMLR******************************** 63 | # 64 | # 65 | # 66 | # ************************MAIN_FUNCTION******************************* 67 | def parseArgs(argv): 68 | parser = argparse.ArgumentParser(prog="rMETL realignment", description=USAGE, \ 69 | formatter_class=argparse.RawDescriptionHelpFormatter) 70 | parser.add_argument("input", metavar="FASTA", type=str, help="Input potential_ME.fa on STAGE detection.") 71 | parser.add_argument("ME_Ref", type=str, help="The transposable element concensus in fasta format.") 72 | parser.add_argument('output', type=str, help = "Directory to output realignments.") 73 | parser.add_argument('-t', '--threads', help = "Number of threads to use.[%(default)s]", \ 74 | default = 8, type = int) 75 | parser.add_argument('-x', '--presets', \ 76 | help = "The sequencing platform of the reads.[%(default)s]", \ 77 | default = "pacbio", type = str) 78 | parser.add_argument('--subread_length', \ 79 | help = "Length of fragments reads are split into [%(default)s]", \ 80 | default = 128, type = int) 81 | parser.add_argument('--subread_corridor', \ 82 | help = "Length of corridor sub-reads are aligned with [%(default)s]", \ 83 | default = 20, type = int) 84 | args = parser.parse_args(argv) 85 | return args 86 | 87 | def run(argv): 88 | args = parseArgs(argv) 89 | setupLogging(False) 90 | starttime = time.time() 91 | call_ngmlr(args.input, args.ME_Ref, args.presets, args.threads, args.output, \ 92 | args.subread_length, args.subread_corridor) 93 | logging.info("Finished in %0.2f seconds."%(time.time() - starttime)) 94 | 95 | if __name__ == '__main__': 96 | run(sys.argv[:1]) 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | __ __ ______ _________ _ 2 | _ __ | \ / | | ____| |___ ___| | | 3 | | ^__| | \ / | | |___ | | | | 4 | | | | |\ \ / /| | | ___| | | | | 5 | | | | | \ \/ / | | | |____ | | | |____ 6 | |_| |_| \__/ |_| |______| |_| |______| 7 | 8 | rMETL - realignment-based Mobile Element insertion detection Tool for Long read 9 | **NOTE: The community users give the newest installation approach after 2023, which is referred to** [here](https://github.com/tjiangHIT/rMETL/issues/8). 10 | 11 | [![PyPI version](https://badge.fury.io/py/rMETL.svg)](https://badge.fury.io/py/rMETL) 12 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/version.svg)](https://anaconda.org/bioconda/rmetl) 13 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/license.svg)](https://anaconda.org/bioconda/rmetl) 14 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/platforms.svg)](https://anaconda.org/bioconda/rmetl) 15 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/rmetl/badges/latest_release_date.svg)](https://anaconda.org/bioconda/rmetl) 16 | 17 | --- 18 | ### Introduction 19 | Mobile element insertion (MEI) is a significant category of structure variations (SVs). The rapid development of long-read sequencing technologies provides the opportunity to detect MEIs sensitively. However, the signals of MEI implied by noisy long reads are highly complex due to the repetitiveness of mobile elements and the high sequencing error rates. Herein, we propose the Realignment-based Mobile Element insertion detection Tool for Long read (rMETL). Benchmarking results of simulated and real datasets demonstrate that rMETL has the ability to discover MEIs sensitively as well as prevent false positives. It is suited to produce high-quality MEI callsets in many genomics studies. 20 | 21 | --- 22 | ### Simulated datasets 23 | 24 | The simulated datasets used for benchmarking are available at [Google Drive](https://drive.google.com/open?id=1ujV2C8e1PNAVhSkh9vKtjWLdG_OHcH-k) 25 | 26 | --- 27 | ### Memory usage 28 | 29 | The memory usage of rMETL can fit the configurations of most modern servers and workstations. 30 | Its peak memory footprint is about 7.05 Gigabytes (default setting), on a server with Intel Xeon CPU at 2.00 GHz, 1 Terabytes RAM running Linux Ubuntu 14.04. These reads were aligned to human reference genome hs37d5. 31 | 32 | --- 33 | ### Dependences 34 | 35 | 1. pysam 36 | 2. Biopython 37 | 3. ngmlr 38 | 4. samtools 39 | 5. cigar 40 | 41 | Python version 2.7 42 | 43 | --- 44 | ### Installation 45 | 46 | #install via pip 47 | $ pip install rMETL 48 | 49 | #install via conda 50 | $ conda install -c bioconda rmetl 51 | 52 | #install from GitHub 53 | $ git clone https://github.com/tjiangHIT/rMETL.git (git clone https://github.com/hitbc/rMETL.git) 54 | $ cd rMETL/ 55 | $ pip install . 56 | 57 | The current version of rMETL has been tested on a 64-bit Linux operating system. 58 | 59 | **NOTE: The community users give the newest installation approach after 2023, which is referred to** [here](https://github.com/tjiangHIT/rMETL/issues/8). 60 | 61 | --- 62 | ### Synopsis 63 | Inference of putative MEI loci. 64 | 65 | rMETL.py detection 66 | 67 | Realignment of chimeric read parts. 68 | 69 | rMETL.py realignment 70 | 71 | Mobile Element Insertion calling. 72 | 73 | rMETL.py calling 74 | 75 | Strongly recommend making the output directory manually at first.:blush: 76 | 77 | --- 78 | ### Optional Parameters 79 | 80 | #### Detection 81 | 82 | | Parameters | Descriptions | Defaults | 83 | | :------------ |:---------------|:---------------| 84 | | MIN_SUPPORT |Mininum number of reads that support a ME.| 5 | 85 | | MIN_LENGTH | Minimum length of ME to be reported. |50| 86 | | MIN_DISTANCE | Minimum distance of two ME clusters. |20| 87 | | THREADS |Number of threads to use.|1| 88 | | PRESETS |The sequencing type of the reads.|pacbio| 89 | 90 | #### Realignment 91 | 92 | | Parameters | Descriptions | Defaults | 93 | | :------------ |:---------------|:---------------| 94 | | THREADS |Number of threads to use.|1| 95 | | PRESETS |The sequencing type of the reads.|pacbio| 96 | | SUBREAD_LENGTH |Length of fragments reads are split into.|128| 97 | | SUBREAD_CORRIDOR |Length of corridor sub-reads are aligned with.|20| 98 | 99 | #### Calling 100 | 101 | | Parameters | Descriptions | Defaults | 102 | | :------------ |:---------------|:---------------| 103 | | HOMOZYGOUS |The minimum score of a genotyping reported as homozygous.|0.8| 104 | | HETEROZYGOUS |The minimum score of a genotyping reported as a heterozygous.|0.3| 105 | | MIN_MAPQ |Mininum mapping quality.|20| 106 | | CLIPPING_THRESHOLD |Mininum threshold of realignment clipping.|0.5| 107 | | SAMPLE |The name of the sample which is noted.|None| 108 | | MEI |Enables rMETL to display MEI/MED only.|False| 109 | 110 | --- 111 | ### Citation 112 | If you use rMETL, please cite: 113 | > Tao Jiang *et al*; rMETL: sensitive mobile element insertion detection with long read realignment, *Bioinformatics*, Volume 35, Issue 18, 15 September 2019, Pages 3484–3486, https://doi.org/10.1093/bioinformatics/btz106 114 | 115 | --- 116 | ### Contact 117 | For advising, bug reporting, and requiring help, please post on [Github Issue](https://github.com/tjiangHIT/rMETL/issues) or contact tjiang@hit.edu.cn. 118 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_cmdRunner.py: -------------------------------------------------------------------------------- 1 | from string import Template 2 | import tempfile 3 | import subprocess, signal, logging, os, stat, sys 4 | 5 | class Alarm(Exception): 6 | pass 7 | 8 | def alarm_handler(signum, frame): 9 | raise Alarm 10 | 11 | def setupLogging(debug=False): 12 | logLevel = logging.DEBUG if debug else logging.INFO 13 | logFormat = "%(asctime)s [%(levelname)s] %(message)s" 14 | logging.basicConfig( stream=sys.stderr, level=logLevel, format=logFormat ) 15 | logging.info("Running %s" % " ".join(sys.argv)) 16 | 17 | def exe(cmd, timeout=-1): 18 | """ 19 | Executes a command through the shell. 20 | timeout in minutes! so 1440 mean is 24 hours. 21 | -1 means never 22 | """ 23 | proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, \ 24 | stderr=subprocess.STDOUT, close_fds=True,\ 25 | preexec_fn=os.setsid) 26 | signal.signal(signal.SIGALRM, alarm_handler) 27 | if timeout > 0: 28 | signal.alarm(int(timeout*60)) 29 | try: 30 | stdoutVal, stderrVal = proc.communicate() 31 | signal.alarm(0) # reset the alarm 32 | except Alarm: 33 | logging.error(("Command was taking too long. " 34 | "Automatic Timeout Initiated after %d" % (timeout))) 35 | os.killpg(proc.pid, signal.SIGTERM) 36 | proc.kill() 37 | return 214,None,None 38 | 39 | retCode = proc.returncode 40 | return retCode,stdoutVal,stderrVal 41 | 42 | class Command(): 43 | def __init__(self, cmd, jobname, stdout, stderr): 44 | self.cmd = cmd 45 | self.jobname = jobname 46 | self.stdout = stdout 47 | self.stderr = stderr 48 | 49 | def asDict(self): 50 | return {"CMD":self.cmd, "JOBNAME":self.jobname, \ 51 | "STDOUT":self.stdout, "STDERR":self.stderr} 52 | 53 | class CommandRunner(): 54 | """ 55 | Uses a command template to run stuff. This is helpful for cluster commands 56 | and chunking several commands together 57 | """ 58 | def __init__(self, template=None, njobs=0): 59 | """ 60 | template: a string that will become the template for submitting to your cluster: 61 | #you can also go ahead and specify a string.Template 62 | default is to not submit to your cluster 63 | ${CMD} > ${STDOUT} 2> ${STDERR} 64 | njobs: (0) 65 | for clumping commands together and submitting them in a script 66 | """ 67 | if template is None: 68 | template = "${CMD} > ${STDOUT} 2> ${STDERR}" 69 | self.runType = "Running" 70 | else: 71 | self.runType = "Submitting" 72 | self.template = Template(template) 73 | self.njobs = njobs 74 | 75 | def __call__(self, cmds, wDir = None, id = None): 76 | """ 77 | Executes Commands - can either be a list or a single Command 78 | wDir is the working directory where chunk scripts will be written 79 | if id is None a random identifier will be applied when chunking 80 | """ 81 | if wDir is None: 82 | wDir = "./" 83 | 84 | if type(cmds) != list: 85 | cmd = self.buildCommand(cmds) 86 | return exe(cmd) 87 | 88 | if self.njobs == 0: 89 | outRet = [] 90 | for c in cmds: 91 | outRet.append(exe(self.buildCommand(c))) 92 | return outRet 93 | 94 | if id is None: 95 | id = tempfile.mkstemp(dir=wDir)[1] 96 | 97 | outputRet =[] 98 | for chunk, commands in enumerate( partition(cmds, self.njobs) ): 99 | outScript = open(os.path.join(wDir, "%s_chunk%d.sh" % (id, chunk)),'w') 100 | outScript.write("#!/bin/bash\n\n") 101 | for c in commands: 102 | outScript.write(c.cmd+"\n") 103 | outScript.close() 104 | #Add executeable 105 | existing_permissions = stat.S_IMODE(os.stat(outScript.name).st_mode) 106 | if not os.access(outScript.name, os.X_OK): 107 | new_permissions = existing_permissions | stat.S_IXUSR 108 | os.chmod(outScript.name, new_permissions) 109 | 110 | submit = Command(outScript.name, \ 111 | id + "_chunk%d" % chunk, \ 112 | os.path.join(wDir, id + ("_chunk%d.out" % chunk)), \ 113 | os.path.join(wDir, id + ("_chunk%d.err" % chunk))) 114 | cmd = self.buildCommand(submit) 115 | outputRet.append(exe(cmd)) 116 | 117 | return outputRet 118 | 119 | def checkTemplate(self): 120 | """ 121 | Checks that my template works okay 122 | """ 123 | temp.update({"CMD":"test", \ 124 | "STDOUT":"testo", \ 125 | "STDERR":"teste", \ 126 | "JOBNAME":"testn"}) 127 | try: 128 | w = self.template.substitute(temp) 129 | except KeyError: 130 | logging.error("Your submission template is invalid ") 131 | sys.exit(1) 132 | 133 | def buildCommand(self, cmdSetup): 134 | """ 135 | substitutes a template with a Command 136 | """ 137 | return self.template.substitute(cmdSetup.asDict()) 138 | 139 | def partition(n,m): 140 | """ 141 | Helper function. splits list n into m partitions 142 | """ 143 | p = map(lambda x: list(), range(m)) 144 | index = 0 145 | for item in n: 146 | p[index].append(item) 147 | if index < m-1: 148 | index += 1 149 | else: 150 | index = 0 151 | return filter(lambda x: len(x)>0, p) 152 | -------------------------------------------------------------------------------- /Concensus/super_TE.fa: -------------------------------------------------------------------------------- 1 | >Alu 2 | GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAACACGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGGTGGCGGGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTC 3 | >L1 4 | GGGGGAGGAGCCAAGATGGCCGAATAGGAACAGCTCCGGTCTACAGCTCCCAGCGTGAGCGACGCAGAAGACGGTGATTTCTGCATTTCCATCTGAGGTACCGGGTTCATCTCACTAGGGAGTGCCAGACAGTGGGCGCAGGCCAGTGTGTGTGCGCACCGTGCGCGAGCCGAAGCAGGGCGAGGCATTGCCTCACCTGGGAAGCGCAAGGGGTCAGGGAGTTCCCTTTCTGAGTCAAAGAAAGGGGTGACGGTCGCACCTGGAAAATCGGGTCACTCCCACCCGAATATTGCGCTTTTCAGACCGGCTTAAGAAACGGCGCACCACGAGACTATATCCCACACCTGGCTCGGAGGGTCCTACGCCCACGGAATCTCGCTGATTGCTAGCACAGCAGTCTGAGATCAAACTGCAAGGCGGCAACGAGGCTGGGGGAGGGGCGCCCGCCATTGCCCAGGCTTGCTTAGGTAAACAAAGCAGCCGGGAAGCTCGAACTGGGTGGAGCCCACCACAGCTCAAGGAGGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGGGCAGGGCACAGACAAACAAAAAGACAGCAGTAACCTCTGCAGACTTAAGTGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCCAGCACGCAGCTGGAGATCTGAGAACGGGCAGACAGACTGCCTCCTCAAGTGGGTCCCTGACTCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGGTCCTGTCTGTTAGAAGGAAAACTAACAACCAGAAAGGACATCTACACCGAAAACCCATCTGTACATCACCATCATCAAAGACCAAAAGTAGATAAAACCACAAAGATGGGGAAAAAACAGAACAGAAAAACTGGAAACTCTAAAACGCAGAGCGCCTCTCCTCCTCCAAAGGAACGCAGTTCCTCACCAGCAACGGAACAAAGCTGGATGGAGAATGATTTTGACGAGCTGAGAGAAGAAGGCTTCAGACGATCAAATTACTCTGAGCTACGGGAGGACATTCAAACCAAAGGCAAAGAAGTTGAAAACTTTGAAAAAAATTTAGAAGAATGTATAACTAGAATAACCAATACAGAGAAGTGCTTAAAGGAGCTGATGGAGCTGAAAACCAAGGCTCGAGAACTACGTGAAGAATGCAGAAGCCTCAGGAGCCGATGCGATCAACTGGAAGAAAGGGTATCAGCAATGGAAGATGAAATGAATGAAATGAAGCGAGAAGGGAAGTTTAGAGAAAAAAGAATAAAAAGAAATGAGCAAAGCCTCCAAGAAATATGGGACTATGTGAAAAGACCAAATCTACGTCTGATTGGTGTACCTGAAAGTGATGTGGAGAATGGAACCAAGTTGGAAAACACTCTGCAGGATATTATCCAGGAGAACTTCCCCAATCTAGCAAGGCAGGCCAACGTTCAGATTCAGGAAATACAGAGAACGCCACAAAGATACTCCTCGAGAAGAGCAACTCCAAGACACATAATTGTCAGATTCACCAAAGTTGAAATGAAGGAAAAAATGTTAAGGGCAGCCAGAGAGAAAGGTCGGGTTACCCTCAAAGGAAAGCCCATCAGACTAACAGTGGATCTCTCGGCAGAAACCCTACAAGCCAGAAGAGAGTGGGGGCCAATATTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAACTAAGCTTCATAAGTGAAGGAGAAATAAAATACTTTATAGACAAGCAAATGTTGAGAGATTTTGTCACCACCAGGCCTGCCCTAAAAGAGCTCCTGAAGGAAGCGCTAAACATGGAAAGGAACAACCGGTACCAGCCGCTGCAAAATCATGCCAAAATGTAAAGACCATCGAGACTAGGAAGAAACTGCATCAACTAATGAGCAAAATCACCAGCTAACATCATAATGACAGGATCAAATTCACACATAACAATATTAACTTTAAATATAAATGGACTAAATTCTGCAATTAAAAGACACAGACTGGCAAGTTGGATAAAGAGTCAAGACCCATCAGTGTGCTGTATTCAGGAAACCCATCTCACGTGCAGAGACACACATAGGCTCAAAATAAAAGGATGGAGGAAGATCTACCAAGCCAATGGAAAACAAAAAAAGGCAGGGGTTGCAATCCTAGTCTCTGATAAAACAGACTTTAAACCAACAAAGATCAAAAGAGACAAAGAAGGCCATTACATAATGGTAAAGGGATCAATTCAACAAGAGGAGCTAACTATCCTAAATATTTATGCACCCAATACAGGAGCACCCAGATTCATAAAGCAAGTCCTCAGTGACCTACAAAGAGACTTAGACTCCCACACATTAATAATGGGAGACTTTAACACCCCACTGTCAACATTAGACAGATCAACGAGACAGAAAGTCAACAAGGATACCCAGGAATTGAACTCAGCTCTGCACCAAGCAGACCTAATAGACATCTACAGAACTCTCCACCCCAAATCAACAGAATATACCTTTTTTTCAGCACCACACCACACCTATTCCAAAATTGACCACATAGTTGGAAGTAAAGCTCTCCTCAGCAAATGTAAAAGAACAGAAATTATAACAAACTATCTCTCAGACCACAGTGCAATCAAACTAGAACTCAGGATTAAGAATCTCACTCAAAGCCGCTCAACTACATGGAAACTGAACAACCTGCTCCTGAATGACTACTGGGTACATAACGAAATGAAGGCAGAAATAAAGATGTTCTTTGAAACCAACGAGAACAAAGACACCACATACCAGAATCTCTGGGACGCATTCAAAGCAGTGTGTAGAGGGAAATTTATAGCACTAAATGCCTACAAGAGAAAGCAGGAAAGATCCAAAATTGACACCCTAACATCACAATTAAAAGAACTAGAAAAGCAAGAGCAAACACATTCAAAAGCTAGCAGAAGGCAAGAAATAACTAAAATCAGAGCAGAACTGAAGGAAATAGAGACACAAAAAACCCTTCAAAAAATCAATGAATCCAGGAGCTGGTTTTTTGAAAGGATCAACAAAATTGATAGACCGCTAGCAAGACTAATAAAGAAAAAAAGAGAGAAGAATCAAATAGACACAATAAAAAATGATAAAGGGGATATCACCACCGATCCCACAGAAATACAAACTACCATCAGAGAATACTACAAACACCTCTACGCAAATAAACTAGAAAATCTAGAAGAAATGGATACATTCCTCGACACATACACTCTCCCAAGACTAAACCAGGAAGAAGTTGAATCTCTGAATAGACCAATAACAGGCTCTGAAATTGTGGCAATAATCAATAGTTTACCAACCAAAAAGAGTCCAGGACCAGATGGATTCACAGCCGAATTCTACCAGAGGTACATGGAGGAACTGGTACCATTCCTTCTGAAACTATTCCAATCAATAGAAAAAGAGGGAATCCTCCCTAACTCATTTTATGAGGCCAGCATCATTCTGATACCAAAGCCGGGCAGAGACACAACCAAAAAAGAGAATTTTAGACCAATATCCTTGATGAACATTGATGCAAAAATCCTCAATAAAATACTGGCAAACCGAATCCAGCAGCACATCAAAAAGCTTATCCACCATGATCAAGTGGGCTTCATCCCTGGGATGCAAGGCTGGTTCAATATACGCAAATCAATAAATGTAATCCAGCATATAAACAGAGCCAAAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAAGCCTTTGACAAAATTCAACAACCCTTCATGCTAAAAACTCTCAATAAATTAGGTATTGATGGGACGTATTTCAAAATAATAAGAGCTATCTATGACAAACCCACAGCCAATATCATACTGAATGGGCAAAAACTGGAAGCATTCCCTTTGAAAACCGGCACAAGACAGGGATGCCCTCTCTCACCGCTCCTATTCAACATAGTGTTGGAAGTTCTGGCCAGGGCAATCAGGCAGGAGAAGGAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCAAATTGTCCCTGTTTGCAGACGACATGATTGTATATCTAGAAAACCCCATCGTCTCAGCCCAAAATCTCCTTAAGCTGATAAGCAACTTCAGCAAAGTCTCAGGATACAAAATCAATGTACAAAAATCACAAGCATTCTTATACACCAACAACAGACAAACAGAGAGCCAAATCATGGGTGAACTCCCATTCGTAATTGCTTCAAAGAGAATAAAATACCTAGGAATCCAACTTACAAGGGATGTGAAGGACCTCTTCAAGGAGAACTACAAACCACTGCTCAAGGAAATAAAAGAGGACACAAACAAATGGAAGAACATTCCATGCTCATGGGTAGGAAGAATCAATATCGTGAAAATGGCCATACTGCCCAAGGTAATTTACAGATTCAATGCCATCCCCATCAAGCTACCAATGACTTTCTTCACAGAATTGGAAAAAACTACTTTAAAGTTCATATGGAACCAAAAAAGAGCCCGCATTGCCAAGTCAATCCTAAGCCAAAAGAACAAAGCTGGAGGCATCACACTACCTGACTTCAAACTATACTACAAGGCTACAGTAACCAAAACAGCATGGTACTGGTACCAAAACAGAGATATAGATCAATGGAACAGAACAGAGCCCTCAGAAATAATGCCGCATATCTACAACTATCTGATCTTTGACAAACCTGAGAAAAACAAGCAATGGGGAAAGGATTCCCTATTTAATAAATGGTGCTGGGAAAACTGGCTAGCCATATGTAGAAAGCTGAAACTGGATCCCTTCCTTACACCTTATACAAAAATCAATTCAAGATGGATTAAAGATTTAAACGTTAAACCTAAAACCATAAAAACCCTAGAAGAAAACCTAGGCATTACCATTCAGGACATAGGCGTGGGCAAGGACTTCATGTCCAAAACACCAAAAGCAATGGCAACAAAAGACAAAATTGACAAATGGGATCTAATTAAACTAAAGAGCTTCTGCACAGCAAAAGAAACTACCATCAGAGTGAACAGGCAACCTACAACATGGGAGAAAATTTTCGCAACCTACTCATCTGACAAAGGGCTAATATCCAGAATCTACAATGAACTTAAACAAATTTACAAGAAAAAAACAAACAACCCCATCAAAAAGTGGGCGAAGGACATGAACAGACACTTCTCAAAAGAAGACATTTATGCAGCCAAAAAACACATGAAGAAATGCTCATCATCACTGGCCATCAGAGAAATGCAAATCAAAACCACTATGAGATATCATCTCACACCAGTTAGAATGGCAATCATTAAAAAGTCAGGAAACAACAGGTGCTGGAGAGGATGCGGAGAAATAGGAACACTTTTACACTGTTGGTGGGACTGTAAACTAGTTCAACCATTGTGGAAGTCAGTGTGGCGATTCCTCAGGGATCTAGAACTAGAAATACCATTTGACCCAGCCATCCCATTACTGGGTATATACCCAAATGAGTATAAATCATGCTGCTATAAAGACACATGCACACGTATGTTTATTGCGGCACTATTCACAATAGCAAAGACTTGGAACCAACCCAAATGTCCAACAATGATAGACTGGATTAAGAAAATGTGGCACATATACACCATGGAATACTATGCAGCCATAAAAAATGATGAGTTCATATCCTTTGTAGGGACATGGATGAAATTGGAAACCATCATTCTCAGTAAACTATCGCAAGAACAAAAAACCAAACACCGCATATTCTCACTCATAGGTGGGAATTGAACAATGAGATCACATGGACACAGGAAGGGGAATATCACACTCTGGGGACTGTGGTGGGGTCGGGGGAGGGGGGAGGGATAGCATTGGGAGATATACCTAATGCTAGATGACACATTAGTGGGTGCAGCGCACCAGCATGGCACATGTATACATATGTAACTAACCTGCACAATGTGCACATGTACCCTAAAACTTAGAGTAT 5 | >SVA 6 | CTCCCTCTCCCTCACCCTCTCCCCATGGTCTCCCTCTCCCTCTCTTTCCACGGTCTCCCTCTGATGCCGAGCCGAAGCTGGACGGTACTGCTGCCATCTCGGCTCACTGCAACCTCCCTGCCTGATTCTCCTGCCTCAGCTTGCCGAGTGCCTGCGATTGCAGGCGCGCGCCGCCACGCCTGACTGGTTTTCGTATTTTGTTAGTGGAGACGGGGTTTCGCTGTGTTGGCCGGGCTGGTCTCCAGCTCCTAACCGCGAGTGATCCACCAGCCTCGGCCTCCCGAGGTGCTGGGATTGCAGACGGAGTCTCGTTCACTCAGTGCTCAATGATGCCCAGGCTGGAGTGCAGTGGCGTGATCTCGGCTCGCTACAACCTCCACCTCCCAGCAGCCTGCCTTGGCCTCCCAAAGTGCCGAGATTGCAGCCTCTGCCCGGCCGCCACCCCGTCTGGGAAGTGAGGAGTGTCTCCGCCTGGCCACCCATCGTCTGGGATGTGAGGAGCGTCTCTGCCCTGCCGCCCATCGTCTGAGATGTGGGGAGCACCTCTGCCCGGCCGCCCCGTCCGGGATGTGAGGAGCGTCGCTGCCCGGCCGCCCCGTCTGAGAAGTGAGGAGACCCTCTGCCTGGCAACCGCTCCATCTGAGAAGTGAGGAGCCCCTCCGCCCGGCAGCCGCCCTGTCTGAGAAGTGAGGAGCCCCTCCGCCCAGCAGCCACCTGGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCGCCCCGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCAGCCCGGCCAGCCGCCCCGTCCGGGAAGTGAGGGGCGCCTCTGCCCGGCCGCCCCTACTGGGAAGTGAGGAGCCACTTTGCCCGGCCAGCCACTCTGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCGCCCCGTCCGGGAGGGAGGTGGGGGGATCAGCCCCCCGCCCAGCCAGCCGCCCCGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCGCCCTGTCCGGGAGGTGAGGGGCGCCTCTGCCCGGCCGCGCCTACTGGAAAGTGAGGAGCCCCTCTGCCCGGCCACCACCCCGTCTGGGAGGTGTGCCCAACAGCTCATTGAGAAGGGGCCATGATGACAATGGCGGTTTTGTGGAATAGAAAGGGGGGAAAGGTGGGGAAAAGATTGAGAAATCGGATGGTTGCCGTGTCTGTGTAGAAAGAGGTAGACCTGGGAGACTTTTCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGGGATCCTGTTGATCGGTGACCTTACCCCCAACCCTGTGCTCTCTGAAACATGTGCTGTATCCACTCAGGGTTGAATGGATTAAGAGCGGTGCAAGATGTGCTTTGTTAAACAGATGCTTGAAGGCAGCATGCTCCTTAAGAGTCATCACCACTCCCTAATCTCAAGTACCCAGGGACACAAACACTGCGGAAGGCCGCAGGGTCCTCTGCCTAGGAAAACCAGAGACCTTTGTTCACTTGTTTATCTGCTGACCTTCCCTCCACTATTGTCCTGTGACCCTGCCAAATCCCCCTCTGTGAGAAACACCCAAGAATGATCAAT 7 | 8 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_MEIcalling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | * All rights Reserved, Designed By HIT-Bioinformatics 5 | * @Description: Establish the ME callset 6 | * @author: Jiang Tao (tjiang@hit.edu.cn) 7 | * @date: Apr 24 2018 8 | * @version V1.0.4 9 | ''' 10 | 11 | import argparse 12 | import logging 13 | import sys 14 | import time 15 | import cigar 16 | 17 | from collections import Counter 18 | from rMETL.rMETL_version import __version__, __author__, __contact__ 19 | from rMETL.rMETL_genotype import simple_call_genotype 20 | from rMETL.rMETL_cmdRunner import setupLogging 21 | from rMETL.rMETL_utils import load_ref 22 | 23 | USAGE="""\ 24 | _ ___ _ _____ _______ _ 25 | _ _ | ^_ _^ | | ___| |__ __| | | 26 | | ^_| | | | | | | | |__ | | | | 27 | | | | | | | | | | __| | | | | 28 | | | | | | | | | | |___ | | | |___ 29 | |_| |_| |_| |_| |_____| |_| |_____| 30 | 31 | rMETL - realignment-based Mobile Element insertion detection Tool for Long read 32 | 33 | Generate final MEI/MED callset in bed or vcf file. 34 | 35 | The output file called 'calling.bed' or 'calling.vcf' 36 | stores in output directory. 37 | 38 | rMETL V%s 39 | Author: %s 40 | Contact: %s 41 | """%(__version__, __author__, __contact__) 42 | 43 | def acquire_count_max(_list_): 44 | c = Counter(_list_) 45 | return c.most_common(1)[0][0] 46 | 47 | flag_dic = {0:1, \ 48 | 16:2, \ 49 | 256:0, \ 50 | 272:0, \ 51 | 2048:0, \ 52 | 2064:0, \ 53 | 4:0} 54 | 55 | STRAND = {'1':'+', \ 56 | '2':'-', \ 57 | '*':'+-'} 58 | 59 | cluster_dic = {} 60 | 61 | strand_dic = {1:'+', \ 62 | 2:'-'} 63 | 64 | class R_INFO(object): 65 | """store the infomation of the signal sequence""" 66 | def __init__(self, Type, Chr, Pos, Len, GT): 67 | self.Type = Type 68 | self.Chr = Chr 69 | self.Pos = Pos 70 | self.Len = Len 71 | self.GT = GT 72 | 73 | def parse_name(seq): 74 | chr = seq.split('*')[0] 75 | breakpoint = seq.split('*')[1] 76 | insert_size = seq.split('*')[2] 77 | GT = seq.split('*')[3] 78 | return chr, breakpoint, insert_size, GT 79 | 80 | def parse_name_tp(line): 81 | ''' 82 | resolution signatures for bed format 83 | ''' 84 | seq = line.split('*') 85 | Type = seq[0] 86 | chr = seq[1] 87 | pos = seq[2] 88 | len = seq[3] 89 | if Type == 'DEL': 90 | rc = seq[4] 91 | cov = seq[5] 92 | else: 93 | rc = seq[5] 94 | cov = seq[6] 95 | GT = rc+':'+cov 96 | local_info = R_INFO(Type, chr, pos, len, GT) 97 | return local_info 98 | 99 | def clip_analysis(deal_cigar, clipping_threshold): 100 | ''' 101 | resolution cogar 102 | ''' 103 | seq = list(cigar.Cigar(deal_cigar).items()) 104 | if seq[0][1] == 'S': 105 | first_pos = seq[0][0] 106 | else: 107 | first_pos = 0 108 | if seq[-1][1] == 'S': 109 | last_pos = seq[-1][0] 110 | else: 111 | last_pos = 0 112 | total_len = first_pos + last_pos 113 | signal_len = 0 114 | for i in seq: 115 | signal_len += i[0] 116 | if signal_len == 0: 117 | return 0 118 | if total_len*1.0 / signal_len >= clipping_threshold: 119 | return 0 120 | else: 121 | return 1 122 | 123 | def print_vcf_head(ref, sample): 124 | ''' 125 | generation of VCF head 126 | ''' 127 | import time 128 | Date = time.strftime("%Y%m%d") 129 | head = list() 130 | head.append("##fileformat=VCFv4.2\n") 131 | head.append("##fileDate=%s\n"%(Date)) 132 | head.append("##source=rMETL\n") 133 | for i in ref: 134 | head.append("##contig=\n"%(i, len(ref[i]))) 135 | head.append("##ALT=,Description=\"Deletion relative to the reference\">\n") 136 | head.append("##ALT=,Description=\"Insertion of sequence relative to the reference\">\n") 137 | head.append("##INFO=\n") 138 | head.append("##INFO=\n") 139 | head.append("##INFO=\n") 140 | head.append("##INFO=\n") 141 | head.append("##INFO=\n") 142 | head.append("##INFO=\n") 143 | head.append("##INFO=\n") 144 | head.append("##FORMAT=\n") 145 | head.append("##FORMAT=\n") 146 | head.append("##FORMAT=\n") 147 | head.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n"%(sample)) 148 | return head 149 | 150 | def parse_seq_head(line): 151 | ''' 152 | resolution signatures for vcf format 153 | ''' 154 | seq = line.split('*') 155 | Type = seq[0] 156 | chr = seq[1] 157 | pos = seq[2] 158 | len = seq[3] 159 | if Type == 'DEL': 160 | rc = seq[4] 161 | cov = seq[5] 162 | else: 163 | rc = seq[5] 164 | cov = seq[6] 165 | GT = rc+':'+cov 166 | local_info = R_INFO(Type, chr, pos, len, GT) 167 | return local_info 168 | 169 | # ************************BED_FUNCTION******************************* 170 | def call_bed(args): 171 | path = args.input 172 | out_path = args.output + "calling.bed" 173 | AlignmentFile = open(path, 'r') 174 | logging.info("Loading ME realignmets...") 175 | for line in AlignmentFile: 176 | seq = line.strip('\n').split('\t') 177 | if seq[0][0] == '@': 178 | continue 179 | local_info = parse_name_tp(seq[0]) 180 | Flag = int(seq[1]) 181 | sub_type = seq[2] 182 | MAPQ = int(seq[4]) 183 | cigar = seq[5] 184 | cigar_flag = clip_analysis(cigar, args.clipping_threshold) 185 | if flag_dic[Flag] != 0 and MAPQ >= args.min_mapq and cigar_flag == 1: 186 | key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, local_info.Len, \ 187 | local_info.GT) 188 | if key not in cluster_dic: 189 | cluster_dic[key] = list() 190 | cluster_dic[key].append("<%s:ME:%s>"%(local_info.Type, sub_type)) 191 | AlignmentFile.close() 192 | 193 | if args.MEI == 'False': 194 | AlignmentFile = open(path, 'r') 195 | for line in AlignmentFile: 196 | seq = line.strip('\n').split('\t') 197 | if seq[0][0] == '@': 198 | continue 199 | local_info = parse_name_tp(seq[0]) 200 | Flag = int(seq[1]) 201 | sub_type = seq[2] 202 | if sub_type == '*': 203 | key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, local_info.Len, \ 204 | local_info.GT) 205 | if key not in cluster_dic: 206 | cluster_dic[key] = list() 207 | cluster_dic[key].append("<%s>"%(local_info.Type)) 208 | AlignmentFile.close() 209 | 210 | sort_list = list() 211 | for i in cluster_dic: 212 | chr, breakpoint, insert_size, GT = parse_name(i) 213 | final_type = acquire_count_max(cluster_dic[i]) 214 | sort_list.append([chr, breakpoint, insert_size, final_type]) 215 | sort_list = sorted(sort_list, key = lambda x:(x[0], int(x[1]))) 216 | file = open(out_path, 'w') 217 | logging.info("Writing results into disk...") 218 | file.write("# Chromsome\tBreakpoint\tSV length\tMEI Type") 219 | for i in sort_list: 220 | file.write("\t".join(i)+"\n") 221 | file.close() 222 | # ************************BED_FUNCTION******************************* 223 | # 224 | # 225 | # 226 | # ************************VCF_FUNCTION******************************* 227 | def call_vcf(args): 228 | path = args.input 229 | out_path = args.output + "calling.vcf" 230 | ref = load_ref(args.Reference) 231 | AlignmentFile = open(path, 'r') 232 | logging.info("Loading ME realignmets...") 233 | for line in AlignmentFile: 234 | seq = line.strip('\n').split('\t') 235 | if seq[0][0] == '@': 236 | continue 237 | local_info = parse_seq_head(seq[0]) 238 | Flag = int(seq[1]) 239 | sub_type = seq[2] 240 | MAPQ = int(seq[4]) 241 | cigar = seq[5] 242 | cigar_flag = clip_analysis(cigar, args.clipping_threshold) 243 | if flag_dic[Flag] != 0 and MAPQ >= args.min_mapq and cigar_flag == 1: 244 | key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, local_info.Len, \ 245 | local_info.GT) 246 | if key not in cluster_dic: 247 | cluster_dic[key] = list() 248 | cluster_dic[key].append("<%s:ME:%s>\t%d"%(local_info.Type, sub_type, \ 249 | flag_dic[Flag])) 250 | AlignmentFile.close() 251 | 252 | if args.MEI == 'False': 253 | AlignmentFile = open(path, 'r') 254 | for line in AlignmentFile: 255 | seq = line.strip('\n').split('\t') 256 | if seq[0][0] == '@': 257 | continue 258 | local_info = parse_name_tp(seq[0]) 259 | Flag = int(seq[1]) 260 | sub_type = seq[2] 261 | if sub_type == '*': 262 | key = "%s*%s*%s*%s"%(local_info.Chr, local_info.Pos, local_info.Len, \ 263 | local_info.GT) 264 | if key not in cluster_dic: 265 | cluster_dic[key] = list() 266 | cluster_dic[key].append("<%s>\t%s"%(local_info.Type, '*')) 267 | AlignmentFile.close() 268 | 269 | sort_list = list() 270 | for i in cluster_dic: 271 | chr, breakpoint, insert_size, GT = parse_name(i) 272 | final_type = acquire_count_max(cluster_dic[i]).split('\t')[0] 273 | strand = STRAND[acquire_count_max(cluster_dic[i]).split('\t')[1]] 274 | sort_list.append([chr, breakpoint, insert_size, final_type, GT, strand]) 275 | sort_list = sorted(sort_list, key = lambda x:(x[0], int(x[1]))) 276 | head_info = print_vcf_head(ref, args.sample) 277 | 278 | file = open(out_path, 'w') 279 | logging.info("Writing results into disk...") 280 | 281 | for line in head_info: 282 | file.write(line) 283 | 284 | ID = 0 285 | for i in sort_list: 286 | concordant = int(i[4].split(':')[0]) 287 | discordant = int(i[4].split(':')[1]) - int(i[4].split(':')[0]) 288 | if discordant < 0: 289 | discordant = 0 290 | GT, GL, reliability = simple_call_genotype(concordant, concordant+discordant, \ 291 | args.heterozygous, args.homozygous) 292 | 293 | if reliability == 1: 294 | INFO = "PRECISE;SVTYPE=%s;SVLEN=%d;END=%d;SAMPLE=%s;STRAND=%s"%(i[3][1:4], \ 295 | int(i[2]), int(i[1])+int(i[2])-1, args.sample, i[5]) 296 | else: 297 | INFO = "IMPRECISE;SVTYPE=%s;SVLEN=%d;END=%d;SAMPLE=%s;STRAND=%s"%(i[3][1:4], \ 298 | int(i[2]), int(i[1])+int(i[2])-1, args.sample, i[5]) 299 | try: 300 | REF = ref[i[0]][int(i[1])-1] 301 | except: 302 | REF = "N" 303 | file.write("%s\t%s\t%d\t%s\t%s\t.\tPASS\t%s\tGT:DV:DR\t%s:%s\n"%(i[0], i[1], \ 304 | ID, REF, i[3], INFO, GT, GL)) 305 | ID += 1 306 | file.close() 307 | # *************************VCF_FUNCTION******************************* 308 | # 309 | # 310 | # 311 | # ************************MAIN_FUNCTION******************************* 312 | def parseArgs(argv): 313 | parser = argparse.ArgumentParser(prog="rMETL calling", description=USAGE, \ 314 | formatter_class=argparse.RawDescriptionHelpFormatter) 315 | parser.add_argument("input", metavar="SAM", type=str, help="Input cluster.sam on STAGE realignment.") 316 | parser.add_argument("Reference", metavar="REFERENCE", type=str, \ 317 | help="The reference genome in fasta format.") 318 | parser.add_argument("format", metavar="[BED,VCF]", type=str, \ 319 | help="The format of the output file. [%(default)s]", default = "bed") 320 | parser.add_argument('output', type=str, help = "Directory to output final callset.") 321 | parser.add_argument('-hom', '--homozygous', \ 322 | help = "The mininum score of a genotyping reported as a homozygous.[%(default)s]", \ 323 | default = 0.8, type = float) 324 | parser.add_argument('-het','--heterozygous', \ 325 | help = "The mininum score of a genotyping reported as a heterozygous.[%(default)s]", \ 326 | default = 0.3, type = float) 327 | parser.add_argument('-q', '--min_mapq', help = "Mininum mapping quality.[%(default)s]", \ 328 | default = 20, type = int) 329 | parser.add_argument('-c', '--clipping_threshold', \ 330 | help = "Mininum threshold of realignment clipping.[%(default)s]", \ 331 | default = 0.5, type = float) 332 | parser.add_argument('--sample', help = "Sample description", \ 333 | default = "None", type = str) 334 | parser.add_argument('--MEI', help = "Enables rMETL to display MEI/MED only.[%(default)s]", \ 335 | default = "True", type = str) 336 | args = parser.parse_args(argv) 337 | return args 338 | 339 | def run(argv): 340 | args = parseArgs(argv) 341 | setupLogging(False) 342 | starttime = time.time() 343 | if args.format == "bed": 344 | call_bed(args) 345 | elif args.format == "vcf": 346 | call_vcf(args) 347 | else: 348 | logging.error("Invalid format.") 349 | exit(1) 350 | logging.info("Finished in %0.2f seconds."%(time.time() - starttime)) 351 | 352 | if __name__ == '__main__': 353 | run(sys.argv[:1]) 354 | -------------------------------------------------------------------------------- /src/rMETL/rMETL_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | * All rights Reserved, Designed By HIT-Bioinformatics 5 | * @Description: Parse the ME signatures from alignments 6 | * @author: Jiang Tao (tjiang@hit.edu.cn) 7 | * @date: Apr 24 2018 8 | * @version V1.0.4 9 | ''' 10 | 11 | import pysam 12 | import cigar 13 | import os 14 | import argparse 15 | import logging 16 | import sys 17 | import time 18 | import gc 19 | 20 | from multiprocessing import Pool 21 | from rMETL.rMETL_version import __version__, __author__, __contact__ 22 | from rMETL.rMETL_concensus import construct_concensus_info 23 | from rMETL.rMETL_genotype import add_genotype 24 | from rMETL.rMETL_utils import load_ref, check_bai, call_ngmlr, call_samtools 25 | from rMETL.rMETL_cmdRunner import setupLogging, exe 26 | 27 | USAGE="""\ 28 | _ ___ _ _____ _______ _ 29 | _ _ | ^_ _^ | | ___| |__ __| | | 30 | | ^_| | | | | | | | |__ | | | | 31 | | | | | | | | | | __| | | | | 32 | | | | | | | | | | |___ | | | |___ 33 | |_| |_| |_| |_| |_____| |_| |_____| 34 | 35 | rMETL - realignment-based Mobile Element insertion detection Tool for Long read 36 | 37 | 38 | Support reads aligned with Ngmlr and sorted with Samtools 39 | 40 | If input is a fastq or fasta format file, rMETL generates 41 | alignments with Ngmlr at first; 42 | 43 | If input is a sam format file, rMETL converts and sorts it 44 | to be a bam format file; 45 | 46 | If your input is a bam format file with index, rMETL extracts 47 | the ME signatures and collects the sub-sequence of them. 48 | 49 | The output is a fasta format file called 'potential.fa' 50 | contains potentials non-reference ME clusters. 51 | 52 | rMETL V%s 53 | Author: %s 54 | Contact: %s 55 | """%(__version__, __author__, __contact__) 56 | 57 | INS_flag = {1:'I'} 58 | DEL_flag = {2:'D'} 59 | clip_flag = {4:'S', 5:'H'} 60 | global_ref = list() 61 | 62 | # **********************check-input-format**************************** 63 | def decipherInput(input): 64 | """ 65 | resolution input format 66 | """ 67 | extension = input.split('.')[-1].lower() 68 | choice = {"bam": 0, \ 69 | "sam": 1, \ 70 | "fasta": 2, \ 71 | "fastq": 2, \ 72 | "fa": 2, \ 73 | "fq": 2} 74 | return choice[extension] 75 | # **********************check-input-format**************************** 76 | # 77 | # 78 | # 79 | # ************************mini-operations***************************** 80 | def revcom_complement(s): 81 | ''' 82 | generation reverse complementary sequence. 83 | all of the lowercase will be changed to capital letter. 84 | ''' 85 | basecomplement = {'A': 'T', \ 86 | 'C': 'G', \ 87 | 'G': 'C', \ 88 | 'T': 'A', \ 89 | 'a': 'T', \ 90 | 'c': 'G', \ 91 | 'g': 'C', \ 92 | 't': 'A'} 93 | letters = list(s) 94 | letters = [basecomplement[base] for base in letters] 95 | return ''.join(letters)[::-1] 96 | 97 | def detect_flag(Flag): 98 | ''' 99 | identification of flag in BAM 100 | 0 means unmapped read 101 | 1 & 2 means primary mapping read with normal strand or reverse strand 102 | 3 & 4 means supplementary mapping read with normal strand or reverse strand 103 | ''' 104 | Normal_foward = 1 >> 1 105 | Abnormal = 1 << 2 106 | Reverse_complement = 1 << 4 107 | Supplementary_map = 1 << 11 108 | 109 | signal = {Abnormal: 0, \ 110 | Normal_foward: 1, \ 111 | Reverse_complement: 2, \ 112 | Supplementary_map: 3, \ 113 | Reverse_complement | Supplementary_map: 4} 114 | 115 | back_sig = signal[Flag] if Flag in signal else 0 116 | return back_sig 117 | 118 | def acquire_clip_pos(deal_cigar): 119 | ''' 120 | resolution of cigar in supplementary mapping 121 | ''' 122 | seq = list(cigar.Cigar(deal_cigar).items()) 123 | first_pos = seq[0][0] if seq[0][1] == 'S' else 0 124 | last_pos = seq[-1][0] if seq[-1][1] == 'S' else 0 125 | bias = 0 126 | for i in seq: 127 | if i[1] in ['M', 'D']: 128 | bias += i[0] 129 | return [first_pos, last_pos, bias] 130 | # ************************mini-operations***************************** 131 | # 132 | # 133 | # 134 | # ************************soft-clippings****************************** 135 | def store_clip_pos(locus, chr, seq, flag, CLIP_note): 136 | ''' 137 | A data structure store info of soft-clippings. 138 | It has two hashtables: 139 | 1. key1 is an integer calculated by coordinate / 10000; 140 | 2. key2 is an integer calculated by (coordinate % 10000) / 50. 141 | ''' 142 | hash_1 = int(locus /10000) 143 | mod = locus % 10000 144 | hash_2 = int(mod / 50) 145 | element = [locus, seq, flag] 146 | if hash_1 not in CLIP_note[chr]: 147 | CLIP_note[chr][hash_1] = dict() 148 | CLIP_note[chr][hash_1][hash_2] = list() 149 | CLIP_note[chr][hash_1][hash_2].append(element) 150 | else: 151 | if hash_2 not in CLIP_note[chr][hash_1]: 152 | CLIP_note[chr][hash_1][hash_2] = list() 153 | CLIP_note[chr][hash_1][hash_2].append(element) 154 | else: 155 | CLIP_note[chr][hash_1][hash_2].append(element) 156 | 157 | def acquire_clip_locus(down, up, chr, CLIP_note): 158 | ''' 159 | search soft-clippings within limited region 160 | ''' 161 | list_clip = list() 162 | if int(up/10000) == int(down/10000): 163 | key_1 = int(down/10000) 164 | if key_1 not in CLIP_note[chr]: 165 | return list_clip 166 | for i in xrange(int((up%10000)/50)-int((down%10000)/50)+1): 167 | key_2 = int((down%10000)/50)+i 168 | if key_2 not in CLIP_note[chr][key_1]: 169 | continue 170 | for ele in CLIP_note[chr][key_1][key_2]: 171 | if ele[0] >= down and ele[0] <= up: 172 | list_clip.append(ele) 173 | else: 174 | key_1 = int(down/10000) 175 | if key_1 in CLIP_note[chr]: 176 | for i in xrange(200-int((down%10000)/50)): 177 | key_2 = int((down%10000)/50)+i 178 | if key_2 not in CLIP_note[chr][key_1]: 179 | continue 180 | for ele in CLIP_note[chr][key_1][key_2]: 181 | if ele[0] >= down and ele[0] <= up: 182 | list_clip.append(ele) 183 | key_1 += 1 184 | if key_1 not in CLIP_note[chr]: 185 | return list_clip 186 | for i in xrange(int((up%10000)/50)+1): 187 | key_2 = i 188 | if key_2 not in CLIP_note[chr][key_1]: 189 | continue 190 | for ele in CLIP_note[chr][key_1][key_2]: 191 | if ele[0] >= down and ele[0] <= up: 192 | list_clip.append(ele) 193 | return list_clip 194 | # ************************soft-clippings****************************** 195 | # 196 | # 197 | # 198 | # ***********************resolution-reads***************************** 199 | def organize_split_signal(chr, primary_info, Supplementary_info, \ 200 | total_L, low_bandary): 201 | ''' 202 | resolution split alignments 203 | ''' 204 | overlap = list() 205 | for i in Supplementary_info: 206 | seq = i.split(',') 207 | local_chr = seq[0] 208 | local_start = int(seq[1]) 209 | local_cigar = seq[3] 210 | dic_starnd = {1:'+', 2: '-'} 211 | if dic_starnd[primary_info[4]] != seq[2]: 212 | continue 213 | if chr != local_chr: 214 | continue 215 | local_set = acquire_clip_pos(local_cigar) 216 | if primary_info[0] < local_start: 217 | if primary_info[3]+local_set[0]-total_L > low_bandary: 218 | overlap.append([total_L - primary_info[3], \ 219 | local_set[0], primary_info[1]]) 220 | else: 221 | if local_set[1]+primary_info[2]-total_L > low_bandary: 222 | overlap.append([total_L - local_set[1], \ 223 | primary_info[2], local_start+local_set[2]-1]) 224 | return overlap 225 | 226 | def parse_read(read, Chr_name, low_bandary, CLIP_note): 227 | ''' 228 | Check: 1.Flag 229 | 2.Supplementary mapping 230 | 3.Seq 231 | ''' 232 | DEL_ME_pos = list() 233 | INS_ME_pos = list() 234 | process_signal = detect_flag(read.flag) 235 | if process_signal == 0: 236 | return INS_ME_pos, DEL_ME_pos 237 | 238 | # Add DEL:ME type call signal 239 | pos_start = read.reference_start 240 | shift = 0 241 | for element in read.cigar: 242 | if element[0] == 0: 243 | shift += element[1] 244 | if element[0] in DEL_flag and element[1] <= low_bandary: 245 | shift += element[1] 246 | if element[0] in DEL_flag and element[1] > low_bandary: 247 | DEL_ME_pos.append([pos_start+shift, element[1]]) 248 | shift += element[1] 249 | 250 | # Add INS:ME type call signal 251 | pos_start = read.reference_start 252 | shift = 0 253 | _shift_read_ = 0 254 | pos_end = read.reference_end 255 | primary_clip_0 = 0 256 | primary_clip_1 = 0 257 | for element in read.cigar: 258 | if element[0] == 0 or element[0] == 2: 259 | shift += element[1] 260 | if element[0] != 2: 261 | _shift_read_ += element[1] 262 | if element[0] in INS_flag and element[1] > low_bandary: 263 | shift += 1 264 | MEI_contig = read.query_sequence[_shift_read_ - \ 265 | element[1]:_shift_read_] 266 | INS_ME_pos.append([pos_start + shift, element[1], \ 267 | MEI_contig]) 268 | if element[0] in clip_flag: 269 | if shift == 0: 270 | primary_clip_0 = element[1] 271 | else: 272 | primary_clip_1 = element[1] 273 | if element[1] > low_bandary: 274 | if shift == 0: 275 | clip_pos = pos_start - 1 276 | clip_contig = read.query_sequence[:element[1]] 277 | store_clip_pos(clip_pos, Chr_name, clip_contig, \ 278 | 0, CLIP_note) 279 | else: 280 | clip_pos = pos_start + shift - 1 281 | clip_contig = read.query_sequence[read.query_length \ 282 | - element[1]:] 283 | store_clip_pos(clip_pos, Chr_name, clip_contig, 1, \ 284 | CLIP_note) 285 | 286 | if process_signal in [1, 2]: 287 | Tags = read.get_tags() 288 | chr = Chr_name 289 | primary_info = [pos_start, pos_end, primary_clip_0, primary_clip_1, \ 290 | process_signal] 291 | for i in Tags: 292 | if i[0] == 'SA': 293 | Supplementary_info = i[1].split(';')[:-1] 294 | overlap = organize_split_signal(chr, primary_info, \ 295 | Supplementary_info, read.query_length, low_bandary) 296 | for k in overlap: 297 | MEI_contig = read.query_sequence[k[0]:k[1]] 298 | INS_ME_pos.append([k[2], k[1] - k[0], MEI_contig]) 299 | return INS_ME_pos, DEL_ME_pos 300 | # ***********************resolution-reads***************************** 301 | # 302 | # 303 | # 304 | # ***********************Cluster-Function***************************** 305 | def merge_pos(pos_list, chr, evidence_read, SV_size, CLIP_note): 306 | ''' 307 | INS: inner-cluster function 308 | ''' 309 | start = list() 310 | end = list() 311 | for ele in pos_list: 312 | start.append(ele[0]) 313 | end.append(ele[0] + ele[1]) 314 | search_down = min(start) - 10 315 | search_up = max(start) + 10 316 | temp_clip = acquire_clip_locus(search_down, search_up, chr, CLIP_note) 317 | result = construct_concensus_info(pos_list, temp_clip, evidence_read, \ 318 | SV_size) 319 | if result != 0: 320 | for i in xrange(len(result)): 321 | result[i] = ["INS", chr] + result[i] + [len(result)] 322 | return result 323 | else: 324 | return 0 325 | 326 | def cluster(pos_list, chr, evidence_read, SV_size, low_bandary, CLIP_note): 327 | ''' 328 | INS: outer-cluster function 329 | ''' 330 | _cluster_ = list() 331 | temp = list() 332 | temp.append(pos_list[0]) 333 | for pos in pos_list[1:]: 334 | if temp[-1][0] + low_bandary < pos[0]: 335 | result = merge_pos(temp, chr, evidence_read, SV_size, CLIP_note) 336 | if result != 0: 337 | _cluster_.append(result) 338 | temp = list() 339 | temp.append(pos) 340 | else: 341 | temp.append(pos) 342 | result = merge_pos(temp, chr, evidence_read, SV_size, CLIP_note) 343 | if result != 0: 344 | _cluster_.append(result) 345 | return _cluster_ 346 | 347 | def merge_pos_del(pos_list, chr, Ref, evidence_read, SV_size): 348 | ''' 349 | DEL: inner-cluster function 350 | ''' 351 | start = list() 352 | end = list() 353 | for ele in pos_list: 354 | start.append(ele[0]) 355 | end.append(ele[0] + ele[1]) 356 | breakpoint = sum(start)/len(start) 357 | size = sum(end)/len(end) - breakpoint 358 | result = list() 359 | if len(pos_list) < evidence_read: 360 | return result 361 | else: 362 | if chr in Ref and size >= SV_size: 363 | result.append(['DEL', chr, breakpoint, size, len(pos_list), \ 364 | str(Ref[chr].seq[breakpoint:breakpoint+size])]) 365 | return result 366 | 367 | def cluster_del(pos_list, chr, Ref, evidence_read, SV_size, low_bandary): 368 | ''' 369 | DEL: outer-cluster function 370 | ''' 371 | _cluster_ = list() 372 | temp = list() 373 | temp.append(pos_list[0]) 374 | for pos in pos_list[1:]: 375 | if temp[-1][0] + low_bandary < pos[0]: 376 | result = merge_pos_del(temp, chr, Ref, evidence_read, SV_size) 377 | if len(result) != 0: 378 | _cluster_.append(result) 379 | temp = list() 380 | temp.append(pos) 381 | else: 382 | temp.append(pos) 383 | result = merge_pos_del(temp, chr, Ref, evidence_read, SV_size) 384 | if len(result) != 0: 385 | _cluster_.append(result) 386 | return _cluster_ 387 | # ***********************Cluster-Function***************************** 388 | # 389 | # 390 | # 391 | # ***********************Output-Function****************************** 392 | def combine_result(INS, DEL, path, chr): 393 | ''' 394 | Merge results into one list and output it. 395 | ''' 396 | output = "%ssignatures/%s_sig.fa"%(path, chr) 397 | file = open(output, 'w') 398 | for i in INS: 399 | for j in i: 400 | if len(j) != 8: 401 | continue 402 | key = "%s*%s*%d*%d*%s*%d*%d"%(j[0], j[1], j[2], j[3], j[4], j[6], \ 403 | j[7]) 404 | file.write(">"+key+'\n') 405 | file.write(j[5]+'\n') 406 | del INS 407 | gc.collect() 408 | for i in DEL: 409 | for j in i: 410 | if len(j) != 7: 411 | continue 412 | key = "%s*%s*%d*%d*%d*%d"%(j[0], j[1], j[2], j[3], j[4], j[6]) 413 | file.write(">%s\n"%(key)) 414 | file.write('%s\n'%(j[5])) 415 | del DEL 416 | gc.collect() 417 | file.close() 418 | # ***********************Output-Function****************************** 419 | # 420 | # 421 | # 422 | # ********************Signatures-extraction*************************** 423 | def single_pipe(out_path, chr, bam_path, low_bandary, evidence_read, SV_size): 424 | ''' 425 | resolution signatures 426 | ''' 427 | samfile = pysam.AlignmentFile(bam_path) 428 | CLIP_note = dict() 429 | logging.info("Resolving chromsome %s."%(chr)) 430 | if chr not in CLIP_note: 431 | CLIP_note[chr] = dict() 432 | cluster_pos_INS = list() 433 | cluster_pos_DEL = list() 434 | for read in samfile.fetch(chr): 435 | feed_back, feed_back_del = parse_read(read, chr, low_bandary, CLIP_note) 436 | cluster_pos_INS += feed_back 437 | cluster_pos_DEL += feed_back_del 438 | cluster_pos_INS = sorted(cluster_pos_INS, key = lambda x:x[0]) 439 | cluster_pos_DEL = sorted(cluster_pos_DEL, key = lambda x:x[0]) 440 | if len(cluster_pos_INS) == 0: 441 | Cluster_INS = list() 442 | else: 443 | Cluster_INS = cluster(cluster_pos_INS, chr, evidence_read, SV_size, \ 444 | low_bandary, CLIP_note) 445 | del cluster_pos_INS 446 | del CLIP_note[chr] 447 | gc.collect() 448 | if len(cluster_pos_DEL) == 0: 449 | Cluster_DEL = list() 450 | else: 451 | Ref = global_ref[0] 452 | Cluster_DEL = cluster_del(cluster_pos_DEL, chr, Ref, evidence_read, \ 453 | SV_size, low_bandary) 454 | del cluster_pos_DEL 455 | gc.collect() 456 | logging.info("%d MEI/MED signal loci in the chromsome %s."%(len(Cluster_INS)+\ 457 | len(Cluster_DEL), chr)) 458 | combine_result(add_genotype(Cluster_INS, samfile, low_bandary), \ 459 | add_genotype(Cluster_DEL, samfile, low_bandary), out_path, chr) 460 | samfile.close() 461 | 462 | def multi_run_wrapper(args): 463 | return single_pipe(*args) 464 | 465 | def load_sam_multi_processes(args): 466 | ''' 467 | task scheduling 468 | ''' 469 | temporary_dir = args.temp_dir if args.temp_dir.endswith('/') else \ 470 | "%s/"%(args.temp_dir) 471 | os.mkdir("%ssignatures"%temporary_dir) 472 | # Major Steps: 473 | # loading alignment file: bam format 474 | samfile = pysam.AlignmentFile(args.input) 475 | # loading reference genome 476 | Ref = load_ref(args.Reference) 477 | global_ref.append(Ref) 478 | # acquire the total numbers of the ref contigs 479 | contig_num = len(samfile.get_index_statistics()) 480 | logging.info("The total number of chromsomes: %d"%(contig_num)) 481 | # Thread scheduling 482 | process_list = list() 483 | for i in samfile.get_index_statistics(): 484 | process_list.append([i[0], i[3]]) 485 | # #chr #read 486 | process_list = sorted(process_list, key = lambda x:-x[1]) 487 | # start to establish multiprocesses 488 | analysis_pools = Pool(processes = args.threads) 489 | # Acquire_Chr_name 490 | for i in process_list: 491 | para = [(temporary_dir, i[0], args.input, args.min_distance, \ 492 | args.min_support, args.min_length)] 493 | analysis_pools.map_async(multi_run_wrapper, para) 494 | analysis_pools.close() 495 | analysis_pools.join() 496 | samfile.close() 497 | 498 | output_p = args.output_dir if args.output_dir.endswith('/') else \ 499 | "%s/"%(args.output_dir) 500 | if not os.path.exists(output_p): 501 | os.mkdir(output_p) 502 | merge_cmd = ("cat %ssignatures/* > %spotential_ME.fa"%(temporary_dir, output_p)) 503 | r, o, e = exe(merge_cmd) 504 | if r != 0: 505 | logging.error("Merging ME signatures failed!") 506 | logging.error("RETCODE %d" % (r)) 507 | logging.error("STDOUT %s" % (str(o))) 508 | logging.error("STDERR %s" % (str(e))) 509 | logging.error("Exiting") 510 | exit(r) 511 | logging.info("Cleaning temporary files.") 512 | cmd_remove_tempfile = ("rm -r %ssignatures"%(temporary_dir)) 513 | r, o, e = exe(cmd_remove_tempfile) 514 | if r != 0: 515 | logging.error("Cleaning temporary files failed!") 516 | logging.error("RETCODE %d" % (r)) 517 | logging.error("STDOUT %s" % (str(o))) 518 | logging.error("STDERR %s" % (str(e))) 519 | logging.error("Exiting") 520 | exit(r) 521 | # ********************Signatures-extraction*************************** 522 | # 523 | # 524 | # 525 | # ************************MAIN_FUNCTION******************************* 526 | def parseArgs(argv): 527 | parser = argparse.ArgumentParser(prog="rMETL detection", \ 528 | description=USAGE, formatter_class=argparse.RawDescriptionHelpFormatter) 529 | parser.add_argument("input", metavar="[SAM,BAM,FASTA,FASTQ]", type=str, \ 530 | help="Input reads with/without alignment.") 531 | parser.add_argument("Reference", metavar="REFERENCE", type=str, \ 532 | help="The reference genome in fasta format.") 533 | parser.add_argument('temp_dir', type=str, \ 534 | help = "Temporary directory to use for distributed jobs.") 535 | parser.add_argument('output_dir', type=str, \ 536 | help = "Directory to output potential ME loci.") 537 | parser.add_argument('-s', '--min_support',\ 538 | help = "Mininum number of reads that support a ME.[%(default)s]", \ 539 | default = 5, type = int) 540 | parser.add_argument('-l', '--min_length', \ 541 | help = "Mininum length of ME to be reported.[%(default)s]", \ 542 | default = 50, type = int) 543 | parser.add_argument('-d', '--min_distance', \ 544 | help = "Mininum distance of two ME signatures to be intergrated.[%(default)s]", \ 545 | default = 20, type = int) 546 | parser.add_argument('-t', '--threads', \ 547 | help = "Number of threads to use.[%(default)s]", default = 8, \ 548 | type = int) 549 | parser.add_argument('-x', '--presets', \ 550 | help = "The sequencing platform of the reads.[%(default)s]", \ 551 | default = "pacbio", type = str) 552 | args = parser.parse_args(argv) 553 | return args 554 | 555 | def run(argv): 556 | args = parseArgs(argv) 557 | setupLogging(False) 558 | starttime = time.time() 559 | flag = decipherInput(args.input) 560 | 561 | if flag == 0: 562 | # detection 563 | result = check_bai(args.input, args.temp_dir) 564 | if len(result) == 0: 565 | load_sam_multi_processes(args) 566 | else: 567 | args.input = result 568 | load_sam_multi_processes(args) 569 | elif flag == 1: 570 | bam_path = call_samtools(args.input, args.temp_dir) 571 | args.input = bam_path 572 | load_sam_multi_processes(args) 573 | # samtools transfer 574 | else: 575 | # inFile, ref, seq_type, nproc=1, outFile="map.sam", presets="pacbio" 576 | file = call_ngmlr(args.input, args.Reference, args.presets, \ 577 | args.threads, args.temp_dir) 578 | bam_path = call_samtools(file, args.temp_dir) 579 | args.input = bam_path 580 | load_sam_multi_processes(args) 581 | logging.info("Finished in %0.2f seconds."%(time.time() - starttime)) 582 | 583 | if __name__ == '__main__': 584 | run(sys.argv[:1]) 585 | --------------------------------------------------------------------------------