├── .gitignore ├── LICENSE ├── README.md ├── circlemap ├── Coverage.py ├── __init__.py ├── __version__.py ├── bam2bam.py ├── call.py ├── circle_map.py ├── extract_circle_SV_reads.py ├── realigner.py ├── repeats.py ├── simulations.py └── utils.py ├── setup.py ├── tests ├── profile_circle_map.py └── run_call.py └── tutorial ├── repetitive_region1.fastq ├── repetitive_region2.fastq ├── unknown_circle_reads_1.fastq └── unknown_circle_reads_2.fastq /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python 2 | 3 | ### Python ### 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/simulations.cpython-36.pyc 6 | __pycache__/ 7 | __pycache__/*pyc 8 | *.py[cod] 9 | *$py.class 10 | *.pyc 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | .hypothesis/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # End of https://www.gitignore.io/api/python 102 | #don't track Pycharm files 103 | 104 | .screenlog.0 105 | 106 | # Created by https://www.gitignore.io/api/pycharm 107 | # Edit at https://www.gitignore.io/?templates=pycharm 108 | 109 | ### PyCharm ### 110 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 111 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 112 | src/Circle-Map.iml 113 | src/misc.xml 114 | modules.xml 115 | # User-specific stuff 116 | src/.idea/**/workspace.xml 117 | src/.idea/**/tasks.xml 118 | .idea/**/usage.statistics.xml 119 | .idea/**/dictionaries 120 | .idea/**/shelf 121 | 122 | # Generated files 123 | .idea/**/contentModel.xml 124 | 125 | # Sensitive or high-churn files 126 | .idea/**/dataSources/ 127 | .idea/**/dataSources.ids 128 | .idea/**/dataSources.local.xml 129 | .idea/**/sqlDataSources.xml 130 | .idea/**/dynamic.xml 131 | .idea/**/uiDesigner.xml 132 | .idea/**/dbnavigator.xml 133 | 134 | # Gradle 135 | .idea/**/gradle.xml 136 | .idea/**/libraries 137 | 138 | # Gradle and Maven with auto-import 139 | # When using Gradle or Maven with auto-import, you should exclude module files, 140 | # since they will be recreated, and may cause churn. Uncomment if using 141 | # auto-import. 142 | # .idea/modules.xml 143 | # .idea/*.iml 144 | # .idea/modules 145 | 146 | # CMake 147 | cmake-build-*/ 148 | 149 | # Mongo Explorer plugin 150 | .idea/**/mongoSettings.xml 151 | 152 | # File-based project format 153 | *.iws 154 | 155 | # IntelliJ 156 | out/ 157 | 158 | # mpeltonen/sbt-idea plugin 159 | .idea_modules/ 160 | 161 | # JIRA plugin 162 | atlassian-ide-plugin.xml 163 | 164 | # Cursive Clojure plugin 165 | .idea/replstate.xml 166 | 167 | # Crashlytics plugin (for Android Studio and IntelliJ) 168 | com_crashlytics_export_strings.xml 169 | crashlytics.properties 170 | crashlytics-build.properties 171 | fabric.properties 172 | 173 | # Editor-based Rest Client 174 | .idea/httpRequests 175 | 176 | # Android studio 3.1+ serialized cache file 177 | .idea/caches/build_file_checksums.ser 178 | 179 | # JetBrains templates 180 | **___jb_tmp___ 181 | 182 | ### PyCharm Patch ### 183 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 184 | 185 | # *.iml 186 | # modules.xml 187 | # .idea/misc.xml 188 | # *.ipr 189 | 190 | # Sonarlint plugin 191 | .idea/sonarlint 192 | 193 | # End of https://www.gitignore.io/api/pycharm 194 | # 195 | build/ 196 | Circle_Map.egg-info/ 197 | dist/ 198 | 199 | #ignore tutorial files 200 | tutorial/*bam 201 | tutorial/*bed 202 | tutorial/*sam 203 | tutorial/*sam 204 | tutorial/*bai 205 | .idea 206 | src/.idea 207 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Iñigo Prada Luengo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to Circle-Map official repository! 2 | [![PyPI](https://img.shields.io/pypi/v/Circle-Map.svg)](https://pypi.python.org/pypi/Circle-Map) 3 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/circle-map/badges/version.svg)](https://anaconda.org/bioconda/circle-map) 4 | [![Bioconda Downloads](https://anaconda.org/bioconda/circle-map/badges/downloads.svg)](https://anaconda.org/bioconda/circle-map) 5 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/circle-map/badges/latest_release_date.svg)](https://anaconda.org/bioconda/circle-map) 6 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/circle-map/badges/license.svg)](https://github.com/iprada/Circle-Map/blob/master/LICENSE) 7 | 8 | Circle-Map is an easy to install, python package that implements all the steps required to detect extrachromosomal DNA circles. The package contains easy to run algorithms to accurately detect circular DNA formed from mappable and non mappable regions of a genome. 9 | 10 | 11 | ## Why should I use Circle-Map? 12 | 13 | Circle-Map takes as input an alignment of reads to a reference genome (e.g. a *BWA-MEM* generated *BAM* file) and like other methods, it will use those alignments to detect cases were the read has been split into two segments (e.g. split reads) to detect genomic rearrangements supporting a circular DNA structure. 14 | 15 | However, this approach results in many split read alignments being missed because the aligner is not able to map both split segments of the read, either because they are too short or because they align to too many places. In this cases, the aligner will report a read alignment containing some of the bases unmapped (e.g soft-clipped reads). 16 | 17 | Unlike other methods, Circle-Map is able to map both segments of the soft-clipped reads by realigning the unmapped parts probabilistically to a graph representation of the circular DNA breakpoints. Which allows for a more accurate detection of the circular DNA breakpoints. In our recent paper (https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3160-3) we show how this approach dramatically increases sensitivity while retaining high precision. 18 | 19 | 20 | ## Getting started 21 | 22 | ### Installation 23 | 24 | Circle-Map runs on GNU/Linux operating systems, **requires >=python3.6** and can be installed and set-up using the following ways: 25 | 26 | This will install Circle-Map, and all the external packages required to run every part of Circle-Map software. 27 | 28 | Installation using **pip**: 29 | 30 | python -m pip install Circle-Map 31 | 32 | Installation using **conda**: 33 | 34 | conda install -c bioconda circle-map 35 | 36 | **Note**: If you want to simulate circular DNA short reads, you will need to install [BBMap](https://sourceforge.net/projects/bbmap/) and [ART](https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm) on your system. 37 | 38 | ### Using Circle-Map 39 | 40 | Now you are ready to get started detecting circular DNA. We have created a [Circle-Map wiki](https://github.com/iprada/Circle-Map/wiki) that explains step by step how you can go from your raw sequencing reads to interpretable results. In the wiki, you can try and learn using Circle-Map with the following tutorials: 41 | 42 | * [Tutorial: identification of mappable circular DNA using Circle Map Realign](https://github.com/iprada/Circle-Map/wiki/Tutorial:-Identification-of-circular-DNA-using-Circle-Map-Realign) 43 | 44 | * [Tutorial: identification of repetitive circular DNA using Circle Map Repeats](https://github.com/iprada/Circle-Map/wiki/Tutorial:-Identification-of-repetitive-circular-DNA-using-Circle-Map-Repeats) 45 | 46 | 47 | Once you have detected circular DNA in your samples you will probably be interested at looking at the output files. To help you understanding the output, we have created a page explaining what is the information provided by Circle-Map in every column of its output. You can find the information in the following link: 48 | 49 | * [Circle-Map output explanation](https://github.com/iprada/Circle-Map/wiki/Circle-Map-output-files) 50 | 51 | 52 | 53 | 54 | ## Getting help 55 | 56 | The best place for getting help, feedback,report bugs or request new features is to post an [issue](https://github.com/iprada/Circle-Map/issues). You can also reach me at xsh723 at dot binf dot ku dot dk 57 | 58 | ## Citing 59 | 60 | If you use Circle-Map Realign, please cite: 61 | 62 | * Prada-Luengo, I., Krogh, A., Maretty, L. & Regenberg,B. Sensitive detection of circular DNAs at single-nucleotide resolution using guided realignment of partially aligned reads. BMC Bioinformatics 20, 663 (2019) doi:10.1186/s12859-019-3160-3 63 | 64 | If you use Circle-Map Repeats please cite: 65 | 66 | * Prada-Luengo, I., Møller, H.D., Henriksen, R.A., Gao, Q., Larsen, C..E, Alizadeh, S., Maretty, L., Houseley, J. & Regenberg, B., Replicative aging is associated with loss of genetic heterogeneity from extrachromosomal circular DNA in Saccharomyces cerevisiae. Nucleic Acids Research gkaa545, doi:10.1093/nar/gkaa545 67 | 68 | ## License 69 | 70 | Circle-Map is freely available under the [MIT license](https://opensource.org/licenses/MIT) 71 | 72 | ## Acknowledgements 73 | 74 | Circle-Map is being developed by Iñigo Prada-Luengo, Anders Krogh, Lasse Maretty and Birgitte Regenberg at the University of Copenhagen 75 | -------------------------------------------------------------------------------- /circlemap/Coverage.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | import os 23 | import pysam as ps 24 | import pybedtools as bt 25 | import numpy as np 26 | 27 | 28 | class coverage: 29 | """Class for managing the coverage metrics of circle-map""" 30 | 31 | def __init__(self,sorted_bam,eccdna_bed,extension,mapq,inside_length,directory): 32 | 33 | self.bam = ps.AlignmentFile(directory + "/" + sorted_bam, "rb") 34 | self.bed = eccdna_bed 35 | 36 | #length of out 37 | self.ext = extension 38 | self.mapq = mapq 39 | 40 | #length of the region for the ratio 41 | self.ilen = inside_length 42 | 43 | def print_parameters(self): 44 | print("Running coverage computations \n") 45 | 46 | 47 | 48 | def get_wg_coverage(self): 49 | """Generator that takes as input a sorted bam and a merged bam of the circles in the whole genome and returns a numpy 50 | array for every interval with the coverage""" 51 | 52 | 53 | 54 | reference_contigs = self.bam.header['SQ'] 55 | 56 | header_dict = {} 57 | for reference in reference_contigs: 58 | header_dict[reference['SN']] = reference['LN'] 59 | 60 | merged_bed = self.bed.sort().merge() 61 | 62 | for interval in merged_bed: 63 | 64 | coverage_dict = {} 65 | if interval.start - self.ext < 0: 66 | start = 0 67 | 68 | else: 69 | start = interval.start - self.ext 70 | 71 | if header_dict[interval.chrom] < (interval.end + self.ext): 72 | end = interval.end + self.ext 73 | else: 74 | end = interval.end 75 | 76 | cov = self.bam.count_coverage(contig=interval.chrom, start=start, end=end, quality_threshold=self.mapq) 77 | summarized_cov = np.array([cov[0], cov[1], cov[2], cov[3]]).sum(axis=0) 78 | 79 | # save memory, convert to uint32. 80 | summ_cov = np.uint32(summarized_cov) 81 | 82 | print("Computing coverage on interval %s:%s-%s" % (interval.chrom,interval.start,interval.end)) 83 | coverage_dict[bt.Interval(interval.chrom, start, end)] = summ_cov 84 | 85 | yield(coverage_dict,header_dict) 86 | 87 | 88 | def compute_coverage(self,cov_generator): 89 | 90 | 91 | """Function that takes as input generator returning coverage numpy arrays and file with summarized statistics 92 | of the coverage within the intervals""" 93 | 94 | print("Computing the coverage of the identified eccDNA") 95 | print("Merging intervals for coverage computation") 96 | 97 | output = [] 98 | for cov_dict,header_dict in cov_generator: 99 | for key,value in cov_dict.items(): 100 | 101 | 102 | overlaps = bt.BedTool(self.bed.all_hits(key)) 103 | 104 | 105 | for interval in overlaps: 106 | 107 | # compute array slicing indices 108 | start = interval.start -key.start 109 | end = interval.end - key.start 110 | 111 | 112 | if start - self.ext < 0: 113 | ext_start = 0 114 | else: 115 | ext_start = start - self.ext 116 | 117 | if header_dict[interval.chrom] < (end+ self.ext): 118 | ext_end = header_dict[interval.chrom] 119 | else: 120 | ext_end = end + self.ext 121 | 122 | # slice extended array and coverage array 123 | ext_array = value[ext_start:ext_end] 124 | region_array = value[start:end] 125 | 126 | 127 | 128 | 129 | 130 | 131 | try: 132 | 133 | mean = np.mean(region_array) 134 | sd = np.std(region_array) 135 | 136 | interval.append(str(mean)) 137 | interval.append(str(sd)) 138 | 139 | except: 140 | 141 | interval.append('NA') 142 | interval.append('NA') 143 | 144 | 145 | # compute ratios 146 | 147 | try: 148 | 149 | start_coverage_ratio = np.sum(region_array[0:self.ilen]) / np.sum( 150 | ext_array[0:(self.ilen + self.ext)]) 151 | end_coverage_ratio = np.sum(region_array[-self.ilen:]) / np.sum(ext_array[-(self.ilen + self.ext):]) 152 | 153 | interval.append(str(start_coverage_ratio)) 154 | interval.append(str(end_coverage_ratio)) 155 | 156 | except: 157 | 158 | interval.append('NA') 159 | interval.append('NA') 160 | 161 | try: 162 | 163 | 164 | zero_frac = np.count_nonzero(region_array == 0) / len(region_array) 165 | interval.append(str(zero_frac)) 166 | 167 | except: 168 | 169 | interval.append('NA') 170 | output.append(interval) 171 | 172 | 173 | return(bt.BedTool(output)) 174 | -------------------------------------------------------------------------------- /circlemap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iprada/Circle-Map/e1d122a4bc3d9f36ae00fcf0cbcdfef77ada7c90/circlemap/__init__.py -------------------------------------------------------------------------------- /circlemap/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.1.5' 2 | -------------------------------------------------------------------------------- /circlemap/bam2bam.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | from __future__ import division 24 | 25 | 26 | import os 27 | import sys 28 | from Bio.Seq import Seq 29 | import time 30 | from circlemap.utils import * 31 | import pandas as pd 32 | import traceback 33 | import multiprocessing as mp 34 | import warnings 35 | import datetime 36 | 37 | 38 | 39 | class bam2bam: 40 | """Class for managing the realignment and eccDNA indetification of circle-map""" 41 | 42 | queue = mp.Manager().Queue() 43 | 44 | def __init__(self, input_bam,output,qname_bam,genome_fasta,directory,mapq_cutoff,insert_size_mapq,std_extension, 45 | insert_size_sample_size,gap_open,gap_ext,n_hits,prob_cutoff,min_soft_clipped_length, 46 | interval_p_cut,ncores,locker,verbose,pid,edit_distance_frac, 47 | remap_splits,only_discordants,score,insert_size,manager): 48 | #I/O 49 | self.edit_distance_frac = edit_distance_frac 50 | self.ecc_dna_str = input_bam 51 | self.output = output 52 | self.qname_bam = qname_bam 53 | self.directory = directory 54 | self.genome_fa = genome_fasta 55 | 56 | #realignment parameters 57 | 58 | # probabilistic realignment options 59 | self.n_hits = n_hits 60 | self.prob_cutoff = prob_cutoff 61 | self.min_sc_length = min_soft_clipped_length 62 | self.mapq_cutoff = mapq_cutoff 63 | self.interval_p = interval_p_cut 64 | self.remap = remap_splits 65 | self.only_discordants = only_discordants 66 | self.score = score 67 | self.insert = insert_size 68 | 69 | # affine gap scoring options 70 | self.gap_open = gap_open 71 | self.gap_ext = gap_ext 72 | 73 | 74 | #insert size stimation parameters 75 | self.insert_size_mapq = insert_size_mapq 76 | self.std_extenstion = std_extension 77 | self.insert_sample_size = insert_size_sample_size 78 | 79 | #regular options 80 | self.cores = ncores 81 | self.verbose = verbose 82 | self.lock = locker 83 | 84 | 85 | 86 | #for instances running on the same directoiry 87 | 88 | self.pid = pid 89 | 90 | #parallel enviroment 91 | self.read_list = manager.list() 92 | self.read_count = manager.Value('i', 0) 93 | self.write_round = manager.Value('i', 0) 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | def listener_writer(self,bam): 102 | 103 | f = open('test.sam',"w") 104 | 105 | header = bam.header 106 | 107 | while True: 108 | 109 | # Read from the queue and do nothing 110 | read = self.queue.get() 111 | 112 | 113 | if read == "DONE": 114 | f.close() 115 | print("breaking") 116 | bam.close() 117 | break 118 | else: 119 | 120 | pysam_read = ps.AlignedSegment.fromstring(read,bam.header) 121 | f.write(read + "\n") 122 | bam.write(pysam_read) 123 | 124 | def kill(self): 125 | print("KILLING") 126 | self.queue.put("DONE") 127 | 128 | def beta_version_warning(self): 129 | """Warn the user that this is experimental""" 130 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S: You are using a beta version feature")) 131 | warnings.warn("The bam2bam feature on Circle-Map is experimental. The development of this feature is active, but" 132 | " have in mind that it might produce unintended results. Check https://github.com/iprada/Circle-Map" 133 | " for the development status.") 134 | 135 | 136 | 137 | 138 | def realign(self,peaks): 139 | """Function that will iterate trough the bam file containing reads indicating eccDNA structural variants and 140 | will output a bed file containing the soft-clipped reads, the discordant and the coverage within the interval""" 141 | 142 | #open files for every process 143 | try: 144 | peaks_pd = pd.DataFrame.from_records(peaks,columns=['chrom', 'start', 'end']) 145 | genome_fa = ps.FastaFile(self.genome_fa) 146 | ecc_dna = ps.AlignmentFile(self.ecc_dna_str,"rb") 147 | 148 | begin = time.time() 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | # compute insert size distribution 158 | 159 | insert_metrics = self.insert 160 | 161 | 162 | #define realignment extension interval 163 | extension = insert_metrics[0] + self.std_extenstion*insert_metrics[1] 164 | 165 | 166 | iteration = 0 167 | 168 | 169 | 170 | for index,interval in peaks_pd.iterrows(): 171 | 172 | 173 | 174 | 175 | try: 176 | 177 | 178 | #find out the prior distribution (mate alignment positions). 179 | candidate_mates = get_mate_intervals(ecc_dna,interval,self.mapq_cutoff,self.verbose,self.only_discordants) 180 | 181 | 182 | 183 | 184 | 185 | 186 | if len(candidate_mates) > 0 or candidate_mates != None: 187 | 188 | 189 | realignment_interval_extended = get_realignment_intervals(candidate_mates,extension,self.interval_p, 190 | self.verbose) 191 | 192 | 193 | if realignment_interval_extended is None: 194 | continue 195 | 196 | 197 | 198 | iteration_results = [] 199 | for index,mate_interval in realignment_interval_extended.iterrows(): 200 | 201 | iteration += 1 202 | 203 | 204 | 205 | #sample realignment intervals 206 | #fasta file fetch is 1 based that why I do +1 207 | 208 | plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])+1),int(int(mate_interval['end'])+1)).upper() 209 | interval_length = len(plus_coding_interval) 210 | minus_coding_interval = str(Seq(plus_coding_interval).complement()) 211 | 212 | # precompute the denominators of the error model. They will be constants for every interval 213 | plus_base_freqs = background_freqs(plus_coding_interval) 214 | 215 | minus_base_freqs = {'T':plus_base_freqs['A'],'A':plus_base_freqs['T'], 216 | 'C':plus_base_freqs['G'],'G':plus_base_freqs['C']} 217 | 218 | minus_base_freqs = np.array([plus_base_freqs['T'],plus_base_freqs['A'],plus_base_freqs['G'],plus_base_freqs['C']]) 219 | plus_base_freqs = np.array([plus_base_freqs['A'],plus_base_freqs['T'],plus_base_freqs['C'],plus_base_freqs['G']]) 220 | 221 | 222 | #note that I am getting the reads of the interval. Not the reads of the mates 223 | 224 | for read in ecc_dna.fetch(interval['chrom'],int(interval['start']),int(interval['end']),multiple_iterators=True): 225 | 226 | 227 | if is_soft_clipped(read): 228 | 229 | if read.mapq >= self.mapq_cutoff: 230 | 231 | # no need to realignment 232 | if read.has_tag('SA') and self.remap != True: 233 | 234 | # check realignment from SA tag 235 | support = circle_from_SA(read, self.mapq_cutoff, mate_interval) 236 | 237 | if support is None: 238 | pass 239 | 240 | else: 241 | 242 | if support['support'] == True: 243 | self.queue.put(read.to_string()) 244 | 245 | else: 246 | # uninformative read 247 | pass 248 | 249 | 250 | 251 | else: 252 | #sc length 253 | sc_len = len(get_longest_soft_clipped_bases(read)['seq']) 254 | 255 | 256 | if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),int(read.pos), 257 | int(mate_interval.start),int(mate_interval.end)) == True: 258 | if sc_len >= self.min_sc_length: 259 | edits_allowed = adaptative_myers_k(sc_len, self.edit_distance_frac) 260 | #realignment 261 | 262 | realignment_dict = realign(read,self.n_hits,plus_coding_interval,minus_coding_interval, 263 | plus_base_freqs,minus_base_freqs,self.gap_open,self.gap_ext,self.verbose,edits_allowed) 264 | 265 | 266 | if realignment_dict == None: 267 | 268 | pass 269 | 270 | else: 271 | #calc edit distance allowed 272 | prob = realignment_probability(realignment_dict,interval_length) 273 | if prob >= self.prob_cutoff and realignment_dict['alignments'][1][3] <= edits_allowed: 274 | 275 | # here I have to retrieve the nucleotide mapping positions. Which should be the 276 | # the left sampling pysam coordinate - edlib coordinates 277 | 278 | read_end = rightmost_from_read(read) 279 | 280 | #aln start on the reference 281 | soft_clip_start = int(mate_interval['start'])+ int(realignment_dict['alignments'][1][0][0]) 282 | 283 | soft_clip_end = int(mate_interval['start']) + int(realignment_dict['alignments'][1][0][1]) 284 | 285 | score = sc_len*prob 286 | 287 | 288 | # I store the read name to the output, so that a read counts as 1 no matter it is SC in 2 pieces 289 | # Soft-clipped aligned upstream. Primary aligned downstream 290 | if read.reference_start < int(mate_interval['start']) + int( 291 | realignment_dict['alignments'][1][0][0]): 292 | # construct tag 293 | sa_tag = realignment_read_to_SA_string(realignment_dict, 294 | prob, interval['chrom'], 295 | soft_clip_start) 296 | 297 | 298 | #read.tags += [('SA', sa_tag)] 299 | 300 | self.queue.put(read.to_string()) 301 | 302 | 303 | 304 | # soft-clipped aligned downstream primary alignment is upstream 305 | elif read.reference_start + int(mate_interval['start']) + int( 306 | realignment_dict['alignments'][1][0][0]): 307 | 308 | sa_tag = realignment_read_to_SA_string(realignment_dict, 309 | prob, interval[ 310 | 'chrom'], 311 | soft_clip_start) 312 | 313 | read.tags += [('SA', sa_tag)] 314 | 315 | self.queue.put(read.to_string()) 316 | 317 | else: 318 | # uninformative read 319 | pass 320 | 321 | 322 | 323 | else: 324 | pass 325 | else: 326 | 327 | pass 328 | 329 | else: 330 | pass 331 | else: 332 | pass 333 | 334 | 335 | except BaseException as e: 336 | traceback.print_exc(file=sys.stdout) 337 | warnings.warn( 338 | "Failed on interval %s due to the error %s" % ( 339 | str(interval), str(e))) 340 | return([1,1]) 341 | 342 | 343 | 344 | 345 | ecc_dna.close() 346 | genome_fa.close() 347 | 348 | except: 349 | print("Failed on cluster:") 350 | print(traceback.print_exc(file=sys.stdout)) 351 | return([1,1]) 352 | 353 | genome_fa.close() 354 | ecc_dna.close() 355 | 356 | 357 | return([0,0]) -------------------------------------------------------------------------------- /circlemap/call.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iprada/Circle-Map/e1d122a4bc3d9f36ae00fcf0cbcdfef77ada7c90/circlemap/call.py -------------------------------------------------------------------------------- /circlemap/circle_map.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | 24 | import argparse 25 | import sys 26 | from functools import partial 27 | import os 28 | import time 29 | import pandas as pd 30 | import pysam as ps 31 | from circlemap.extract_circle_SV_reads import readExtractor 32 | from circlemap.realigner import realignment 33 | from circlemap.bam2bam import bam2bam 34 | from circlemap.repeats import repeat 35 | from circlemap.utils import merge_final_output, filter_by_ratio, start_realign, start_simulate, mutate, insert_size_dist 36 | from circlemap.Coverage import coverage 37 | import multiprocessing as mp 38 | import pybedtools as bt 39 | from circlemap.simulations import sim_ecc_reads 40 | import subprocess as sp 41 | import glob 42 | from tqdm import * 43 | from circlemap.__version__ import __version__ as cm_version 44 | import datetime 45 | 46 | class circle_map: 47 | 48 | def __getpid__(self): 49 | 50 | pid = os.getpid() 51 | return (pid) 52 | 53 | def __init__(self): 54 | self.parser = argparse.ArgumentParser( 55 | description='Circle-Map', 56 | usage='''Circle-Map [options] 57 | 58 | version=%s 59 | contact= https://github.com/iprada/Circle-Map/issues 60 | 61 | The Circle-Map suite 62 | 63 | Commands: 64 | 65 | ReadExtractor Extracts circular DNA read candidates 66 | Realign Realign circular DNA read candidates 67 | bam2bam Realign circular DNA read candidates and report them on a new BAM file 68 | Repeats Identify circular DNA from repetitive regions 69 | Simulate Simulate circular DNA 70 | 71 | ''' % cm_version) 72 | subparsers = self.parser.add_subparsers() 73 | 74 | self.readextractor = subparsers.add_parser( 75 | name="ReadExtractor", 76 | description='Extracts circular DNA read candidates', 77 | prog="Circle-Map ReadExtractor", 78 | usage='''Circle-Map ReadExtractor [options]''' 79 | 80 | ) 81 | 82 | self.realigner = subparsers.add_parser( 83 | name="Realign", 84 | description='Realign circular DNA read candidates', 85 | prog="Circle-Map Realign", 86 | usage='''Circle-Map Realign [options]''' 87 | 88 | ) 89 | 90 | self.repeats = subparsers.add_parser( 91 | name="Repeats", 92 | description='Identify circular DNA from repetitive regions', 93 | prog="Circle-Map Repeats", 94 | usage='''Circle-Map Repeats [options]''' 95 | 96 | ) 97 | self.simulate = subparsers.add_parser( 98 | name="Simulate", 99 | description='Simulate eccDNA NGS datastes', 100 | prog="Circle-Map Reepeats", 101 | usage='''Circle-Map Simulate [options]''' 102 | 103 | ) 104 | self.simulate = subparsers.add_parser( 105 | name="bam2bam", 106 | description='Realign the soft-clipped reads and report', 107 | prog="Circle-Map bam2bam", 108 | usage='''Circle-Map bam2bam [options]''' 109 | 110 | ) 111 | 112 | if len(sys.argv) <= 1: 113 | self.parser.print_help() 114 | time.sleep(0.01) 115 | sys.stderr.write("\nNo argument given to Circle-Map" 116 | "\nExiting\n") 117 | sys.exit(0) 118 | 119 | else: 120 | if sys.argv[1] == "ReadExtractor": 121 | 122 | 123 | self.subprogram = self.args_readextractor() 124 | self.args = self.subprogram.parse_args(sys.argv[2:]) 125 | 126 | object = readExtractor(self.args.i, self.args.output, self.args.directory, self.args.quality, 127 | self.args.nodiscordant, 128 | self.args.nohardclipped, self.args.nosoftclipped, self.args.verbose, 129 | self.subprogram) 130 | object.extract_sv_circleReads() 131 | 132 | elif sys.argv[1] == "Realign": 133 | self.subprogram = self.args_realigner() 134 | self.args = self.subprogram.parse_args(sys.argv[2:]) 135 | 136 | # get clusters 137 | splitted, sorted_bam, begin = start_realign(self.args.i, self.args.output, self.args.threads, 138 | self.args.verbose, self.__getpid__(), 139 | self.args.clustering_dist) 140 | 141 | 142 | 143 | 144 | sorted_bam.close() 145 | #get global insert size prior 146 | metrics = insert_size_dist(self.args.sample_size, self.args.insert_mapq, self.args.qbam) 147 | 148 | 149 | # pool based parallel of religment 150 | m = mp.Manager() 151 | 152 | lock = m.Lock() 153 | 154 | object = realignment(self.args.i, self.args.qbam, self.args.sbam, self.args.fasta, 155 | self.args.directory, 156 | self.args.mapq, 157 | self.args.insert_mapq, self.args.std, self.args.sample_size, 158 | self.args.gap_open, 159 | self.args.gap_ext, self.args.nhits, self.args.cut_off, self.args.min_sc, 160 | self.args.merge_fraction, self.args.interval_probability, self.args.output, 161 | self.args.threads, self.args.allele_frequency, lock, self.args.split, 162 | self.args.ratio, self.args.verbose, self.__getpid__(), 163 | self.args.edit_distance_fraction, self.args.remap_splits, 164 | self.args.only_discordants, self.args.split, 165 | self.args.split_quality, metrics,self.args.number_of_discordants) 166 | 167 | 168 | pool = mp.Pool(processes=self.args.threads) 169 | 170 | 171 | #progress bar 172 | with tqdm(total=len(splitted)) as pbar: 173 | for i,exits in tqdm(enumerate(pool.imap_unordered(object.realign, splitted))): 174 | pbar.update() 175 | #kill if process returns 1,1 176 | if exits == [1,1]: 177 | pool.close() 178 | pool.terminate() 179 | pbar.close() 180 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), 181 | "An error happenend during execution. Exiting") 182 | sys.exit() 183 | 184 | pbar.close() 185 | pool.close() 186 | pool.join() 187 | output = merge_final_output(self.args.sbam, self.args.output, begin, self.args.split, 188 | self.args.directory, 189 | self.args.merge_fraction, self.__getpid__()) 190 | 191 | # compute coverage statistics 192 | if self.args.no_coverage == False: 193 | 194 | coverage_object = coverage(self.args.sbam, output, 195 | self.args.bases, self.args.cmapq, self.args.extension, 196 | self.args.directory) 197 | 198 | #Generator function for the coverage calculations 199 | output = coverage_object.compute_coverage(coverage_object.get_wg_coverage()) 200 | filtered_output = filter_by_ratio(output, self.args.ratio) 201 | filtered_output.to_csv(r'%s' % self.args.output, header=None, index=None, sep='\t', mode='w') 202 | 203 | else: 204 | output.saveas("%s" % self.args.output) 205 | 206 | elif sys.argv[1] == "bam2bam": 207 | self.subprogram = self.args_bam2bam() 208 | self.args = self.subprogram.parse_args(sys.argv[2:]) 209 | 210 | # get clusters 211 | splitted, sorted_bam, begin = start_realign(self.args.i, self.args.output, self.args.threads, 212 | self.args.verbose, self.__getpid__(), 213 | self.args.clustering_dist) 214 | 215 | 216 | #create output bam 217 | circle_sv_reads = ps.AlignmentFile(self.args.output, "wb", template=sorted_bam) 218 | 219 | sorted_bam.close() 220 | #get global insert size prior 221 | metrics = insert_size_dist(self.args.sample_size, self.args.insert_mapq, self.args.qbam) 222 | 223 | 224 | 225 | 226 | manager = mp.Manager() 227 | 228 | 229 | 230 | 231 | 232 | lock = manager.Lock() 233 | 234 | object = bam2bam(self.args.i,self.args.output,self.args.qbam, self.args.fasta, 235 | self.args.directory, 236 | self.args.mapq, 237 | self.args.insert_mapq, self.args.std, self.args.sample_size, 238 | self.args.gap_open, 239 | self.args.gap_ext, self.args.nhits, self.args.cut_off, self.args.min_sc, 240 | self.args.interval_probability, 241 | self.args.threads, lock, 242 | self.args.verbose, self.__getpid__(), 243 | self.args.edit_distance_fraction, self.args.remap_splits, 244 | self.args.only_discordants, 245 | self.args.split_quality, metrics,manager) 246 | 247 | object.beta_version_warning() 248 | 249 | 250 | pool = mp.Pool(processes=self.args.threads) 251 | # create writer process 252 | 253 | writer_p = mp.Process(target=object.listener_writer, args=(circle_sv_reads,)) 254 | writer_p.daemon = True 255 | writer_p.start() 256 | #progress bar 257 | with tqdm(total=len(splitted)) as pbar: 258 | for i,exits in tqdm(enumerate(pool.imap_unordered(object.realign,splitted))): 259 | pbar.update() 260 | # kill if process returns 1,1 261 | if exits == [1, 1]: 262 | pool.close() 263 | pool.terminate() 264 | pbar.close() 265 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), 266 | "An error happenend during execution. Exiting") 267 | sys.exit() 268 | 269 | pbar.close() 270 | pool.close() 271 | pool.join() 272 | object.kill() 273 | writer_p.terminate() 274 | writer_p.join() 275 | 276 | circle_sv_reads.close() 277 | print("Done") 278 | 279 | 280 | elif sys.argv[1] == "Repeats": 281 | 282 | self.subprogram = self.args_repeats() 283 | self.args = self.subprogram.parse_args(sys.argv[2:]) 284 | 285 | 286 | object = repeat(self.args.i, self.args.directory, self.args.mismatch, self.args.fraction, 287 | self.args.read_number) 288 | 289 | bed = object.find_circles() 290 | 291 | coverage_object = coverage(self.args.i, bed, 292 | self.args.bases, self.args.cmapq, self.args.extension, 293 | self.args.directory) 294 | 295 | output = coverage_object.compute_coverage(coverage_object.get_wg_coverage()) 296 | 297 | filtered_output = filter_by_ratio(output, self.args.ratio) 298 | filtered_output.to_csv(r'%s' % self.args.output, header=None, index=None, sep='\t', mode='w') 299 | 300 | 301 | 302 | elif sys.argv[1] == "Simulate": 303 | 304 | self.subprogram = self.args_simulate() 305 | self.args = self.subprogram.parse_args(sys.argv[2:]) 306 | 307 | sim_pid = start_simulate(self.__getpid__()) 308 | 309 | lock = mp.Lock() 310 | 311 | paired_end_fastq_1 = open("%s_1.fastq" % self.args.base_name, "w") 312 | paired_end_fastq_2 = open("%s_2.fastq" % self.args.base_name, "w") 313 | paired_end_fastq_1.close() 314 | paired_end_fastq_2.close() 315 | 316 | # mutate reference genome 317 | if self.args.variants == True: 318 | mutate(self.args.g, sim_pid, self.args.Indels, self.args.substitution, self.args.java_memory) 319 | 320 | 321 | manager = mp.Manager() 322 | # Shared memory object 323 | circle_list = manager.list() 324 | skipped_circles = mp.Value('i', 0) 325 | correct_circles = mp.Value('i', 0) 326 | jobs = [] 327 | # init the processes 328 | 329 | for i in range(self.args.processes): 330 | p = mp.Process(target=sim_ecc_reads, 331 | args=(self.args.g, self.args.read_length, self.args.directory, 332 | int(round(self.args.read_number / self.args.processes)), 333 | self.args.skip_region, self.args.base_name, 334 | self.args.mean_insert_size, self.args.error, 335 | self.args.mean_coverage, lock, i, circle_list, 336 | "%s_1.fastq" % self.args.base_name, "%s_2.fastq" % self.args.base_name, 337 | skipped_circles, 338 | correct_circles, self.args.insRate, self.args.insRate2, self.args.delRate, 339 | self.args.delRate2, sim_pid,)) 340 | jobs.append(p) 341 | p.start() 342 | # kill the process 343 | for p in jobs: 344 | p.join() 345 | print("Skipped %s circles, that overlapped the provided regions to exclude" % skipped_circles.value) 346 | print("Simulated %s circles across %s parallel processes" % ( 347 | correct_circles.value, self.args.processes)) 348 | print("Writting to disk bed file containing the simulated circle coordinates") 349 | 350 | bt.BedTool(list(circle_list)).saveas(self.args.output) 351 | 352 | else: 353 | self.parser.print_help() 354 | time.sleep(0.01) 355 | sys.stderr.write("\nWrong argument given to Circle-Map" 356 | "\nExiting\n") 357 | sys.exit(0) 358 | 359 | def args_readextractor(self): 360 | 361 | parser = self.readextractor 362 | 363 | parser._action_groups.pop() 364 | required = parser.add_argument_group('required arguments') 365 | optional = parser.add_argument_group('optional arguments') 366 | # prefixing the argument with -- means it's optional 367 | # input and output 368 | 369 | required.add_argument('-i', metavar='', help="Input: query name sorted bam file") 370 | 371 | if "-i" in sys.argv: 372 | optional.add_argument('-o', '--output', metavar='', 373 | help="Ouput: Reads indicating circular DNA structural variants", 374 | default="circle_%s" % sys.argv[sys.argv.index("-i") + 1]) 375 | 376 | optional.add_argument('-dir', '--directory', metavar='', 377 | help="Working directory, default is the working directory", 378 | default=os.getcwd()) 379 | 380 | # mapping quality cutoff 381 | 382 | optional.add_argument('-q', '--quality', type=int, metavar='', 383 | help="bwa-mem mapping quality cutoff. Default value 10", 384 | default=10) 385 | 386 | # read extraction options 387 | # extract discordant reads 388 | optional.add_argument('-nd', '--nodiscordant', help="Turn off discordant (R2F1 oriented) read extraction", 389 | action='store_true') 390 | 391 | # soft-clipped argument 392 | optional.add_argument('-nsc', '--nosoftclipped', help="Turn off soft-clipped read extraction", 393 | action='store_true') 394 | # extract hard-clippped reads 395 | optional.add_argument('-nhc', '--nohardclipped', help="Turn off hard-clipped read extraction", 396 | action='store_true') 397 | 398 | # verbose level 399 | 400 | optional.add_argument('-v', '--verbose', type=int, metavar='', 401 | help='Verbose level, 1=error,2=warning, 3=message', 402 | choices=[1, 2, 3], default=3) 403 | 404 | else: 405 | optional.add_argument('-o', '--output', metavar='', 406 | help="Ouput: Reads indicating circular DNA structural variants") 407 | 408 | optional.add_argument('-dir', '--directory', metavar='', 409 | help="Working directory, default is the working directory", 410 | default=os.getcwd()) 411 | 412 | # mapping quality cutoff 413 | optional.add_argument('-q', '--quality', type=int, metavar='', 414 | help="bwa-mem mapping quality cutoff. Default value 10", 415 | default=10) 416 | 417 | # read extraction options 418 | # extract discordant reads 419 | optional.add_argument('-nd', '--nodiscordant', help="Turn off discordant (R2F1 oriented) read extraction", 420 | action='store_true') 421 | 422 | # soft-clipped argument 423 | optional.add_argument('-nsc', '--nosoftclipped', help="Turn off soft-clipped read extraction", 424 | action='store_true') 425 | # extract hard-clippped reads 426 | optional.add_argument('-nhc', '--nohardclipped', help="Turn off hard-clipped read extraction", 427 | action='store_true') 428 | 429 | # verbose level 430 | 431 | optional.add_argument('-v', '--verbose', type=int, metavar='', 432 | help='Verbose level, 1=error,2=warning, 3=message. Default=3', 433 | choices=[1, 2, 3], default=3) 434 | 435 | parser.print_help() 436 | 437 | time.sleep(0.01) 438 | sys.stderr.write( 439 | "\nNo input or output input given to readExtractor, be sure that you are providing the flags'-i' and '-o'" 440 | "\nExiting\n") 441 | sys.exit(0) 442 | 443 | # parse the commands 444 | 445 | if len(sys.argv[2:]) == 0: 446 | parser.print_help() 447 | time.sleep(0.01) 448 | sys.stderr.write("\nNo arguments given to read extractor. Exiting\n") 449 | sys.exit(0) 450 | 451 | return (parser) 452 | 453 | def args_realigner(self): 454 | parser = self.realigner 455 | 456 | # declare the different groups for the parser 457 | parser._action_groups.pop() 458 | io_options = parser.add_argument_group('Input/Output options') 459 | alignment_options = parser.add_argument_group('Alignment options') 460 | out_decision = parser.add_argument_group('eccDNA output options') 461 | i_size_estimate = parser.add_argument_group('Insert size estimation options') 462 | interval = parser.add_argument_group('Interval processing options') 463 | coverage_metrics = parser.add_argument_group('Coverage metrics options') 464 | running = parser.add_argument_group('Running options') 465 | 466 | io_options.add_argument('-i', metavar='', 467 | help="Input: bam file containing the reads extracted by ReadExtractor") 468 | io_options.add_argument('-qbam', metavar='', help="Input: query name sorted bam file") 469 | io_options.add_argument('-sbam', metavar='', help="Input: coordinate sorted bam file") 470 | io_options.add_argument('-fasta', metavar='', help="Input: Reference genome fasta file") 471 | 472 | if "-i" and "-qbam" and "-fasta" in sys.argv: 473 | # output 474 | 475 | io_options.add_argument('-o', '--output', metavar='', help="Output filename", 476 | default="circle_%s.bed" % sys.argv[sys.argv.index("-i") + 1]) 477 | 478 | # alignment 479 | alignment_options.add_argument('-n', '--nhits', type=int, metavar='', 480 | help="Number of realignment attempts. Default: 10", 481 | default=10) 482 | 483 | alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', 484 | help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", 485 | default=0.99) 486 | 487 | alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', 488 | help="Minimum soft-clipped length to attempt the realignment. Default: 8", 489 | default=8) 490 | 491 | alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', 492 | help="Gap open penalty in the position specific scoring matrix. Default: 5", 493 | default=5) 494 | 495 | alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', 496 | help="Gap extension penalty in the position specific scoring matrix. Default: 1", 497 | default=1) 498 | 499 | alignment_options.add_argument('-q', '--mapq', type=int, metavar='', 500 | help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", 501 | default=20) 502 | 503 | alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', 504 | help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", 505 | default=0.05) 506 | 507 | alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', 508 | help="Minium split score to output an interval. Default (0.0)", 509 | default=0.0) 510 | 511 | alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally the split reads", 512 | action='store_true') 513 | 514 | # insert size 515 | 516 | i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', 517 | help="Mapq cutoff for stimating the insert size distribution. Default 60", 518 | default=60) 519 | 520 | i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', 521 | help="Standard deviations of the insert size to extend the intervals. Default 5", 522 | default=4) 523 | 524 | i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', 525 | help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", 526 | default=100000) 527 | 528 | # Interval options 529 | 530 | interval.add_argument('-f', '--merge_fraction', type=float, metavar='', 531 | help="Merge intervals reciprocally overlapping by a fraction. Default 0.99", 532 | default=0.99) 533 | 534 | interval.add_argument('-P', '--interval_probability', type=float, metavar='', 535 | help="Skip edges of the graph with a probability below the threshold. Default: 0.01", 536 | default=0.01) 537 | interval.add_argument('-K', '--clustering_dist', type=int, metavar='', 538 | help="Cluster reads that are K nucleotides appart in the same node. Default: 500", 539 | default=500) 540 | 541 | interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", 542 | action='store_false') 543 | interval.add_argument('-F', '--allele_frequency', type=float, metavar='', 544 | help="Minimum allele frequency required to report the circle interval. Default (0.1)", 545 | default=0.1) 546 | # When to call a circle 547 | 548 | out_decision.add_argument('-S', '--split', type=int, metavar='', 549 | help="Number of required split reads to output a eccDNA. Default: 0", 550 | default=0) 551 | 552 | out_decision.add_argument('-O', '--number_of_discordants', type=int, metavar='', 553 | help="Number of required discordant reads for intervals with only discordants. Default: 3", 554 | default=3) 555 | out_decision.add_argument('-r', '--ratio', type=float, metavar='', 556 | help="Minimum in/out required coverage ratio. Default: 0.0", 557 | default=0.0) 558 | 559 | # coverage metrics 560 | 561 | coverage_metrics.add_argument('-N', '--no_coverage', help="Don't compute coverage statistics", 562 | action='store_true') 563 | 564 | coverage_metrics.add_argument('-b', '--bases', type=int, metavar='', 565 | help="Number of bases to extend for computing the coverage ratio. Default: 200", 566 | default=200) 567 | 568 | coverage_metrics.add_argument('-cq', '--cmapq', type=int, metavar='', 569 | help="Minimum mapping quality treshold for coverage computation. Default: 0", 570 | default=0) 571 | 572 | coverage_metrics.add_argument('-E', '--extension', type=int, metavar='', 573 | help="Number of bases inside the eccDNA breakpoint coordinates to compute the ratio. Default: 100", 574 | default=100) 575 | 576 | # run options 577 | 578 | running.add_argument('-t', '--threads', type=int, metavar='', 579 | help="Number of threads to use.Default 1", 580 | default=1) 581 | 582 | running.add_argument('-dir', '--directory', metavar='', 583 | help="Working directory, default is the working directory", 584 | default=os.getcwd()) 585 | 586 | running.add_argument('-v', '--verbose', type=int, metavar='', 587 | help='Verbose level, 1=error,2=warning, 3=message', 588 | choices=[1, 2, 3], default=3) 589 | 590 | 591 | 592 | else: 593 | 594 | # output 595 | 596 | io_options.add_argument('-o', metavar='', help="Output filename") 597 | 598 | alignment_options.add_argument('-n', '--nhits', type=int, metavar='', 599 | help="Number of realignment attempts. Default: 10", 600 | default=10) 601 | 602 | alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', 603 | help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", 604 | default=0.99) 605 | 606 | alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', 607 | help="Minimum soft-clipped length to attempt the realignment. Default: 8", 608 | default=8) 609 | 610 | alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', 611 | help="Gap open penalty in the position specific scoring matrix. Default: 5", 612 | default=5) 613 | 614 | alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', 615 | help="Gap extension penalty in the position specific scoring matrix. Default: 1", 616 | default=1) 617 | 618 | alignment_options.add_argument('-q', '--mapq', type=int, metavar='', 619 | help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", 620 | default=20) 621 | 622 | alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', 623 | help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", 624 | default=0.05) 625 | 626 | alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', 627 | help="Minium split score to output an interval. Default (0.0)", 628 | default=0.0) 629 | alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally bwa-mem split reads", 630 | action='store_true') 631 | 632 | # insert size 633 | 634 | i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', 635 | help="Mapq cutoff for stimating the insert size distribution. Default 60", 636 | default=60) 637 | 638 | i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', 639 | help="Standard deviations of the insert size to extend the intervals. Default 5", 640 | default=5) 641 | 642 | i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', 643 | help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", 644 | default=100000) 645 | 646 | # Interval options 647 | 648 | interval.add_argument('-f', '--merge_fraction', type=float, metavar='', 649 | help="Merge intervals reciprocally overlapping by a fraction. Default 0.99", 650 | default=0.99) 651 | 652 | interval.add_argument('-P', '--interval_probability', type=float, metavar='', 653 | help="Skip edges of the graph with a probability below the threshold. Default: 0.01", 654 | default=0.01) 655 | interval.add_argument('-K', '--clustering_dist', type=int, metavar='', 656 | help="Cluster reads that are K nucleotides appart in the same node. Default: 500", 657 | default=500) 658 | interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", 659 | action='store_true') 660 | interval.add_argument('-F', '--allele_frequency', type=float, metavar='', 661 | help="Minimum allele frequency required to report the circle interval. Default (0.1)", 662 | default=0.1) 663 | 664 | # When to call a circle 665 | 666 | out_decision.add_argument('-S', '--split', type=int, metavar='', 667 | help="Number of required split reads to output a eccDNA. Default: 0", 668 | default=0) 669 | out_decision.add_argument('-O', '--number_of_discordants', type=int, metavar='', 670 | help="Number of required discordant reads for intervals with only discordants. Default: 3", 671 | default=3) 672 | 673 | out_decision.add_argument('-r', '--ratio', type=float, metavar='', 674 | help="Minimum in/out required coverage ratio. Default: 0.0", 675 | default=0.0) 676 | 677 | # coverage metrics 678 | 679 | coverage_metrics.add_argument('-N', '--no_coverage', help="Don't compute coverage statistics", 680 | action='store_true') 681 | 682 | coverage_metrics.add_argument('-b', '--bases', type=int, metavar='', 683 | help="Number of bases to extend for computing the coverage ratio. Default: 200", 684 | default=200) 685 | 686 | coverage_metrics.add_argument('-cq', '--cmapq', type=int, metavar='', 687 | help="Minimum mapping quality treshold for coverage computation. Default: 0", 688 | default=0) 689 | 690 | coverage_metrics.add_argument('-E', '--extension', type=int, metavar='', 691 | help="Number of bases inside the eccDNA breakpoint coordinates to compute the ratio. Default: 100", 692 | default=100) 693 | 694 | # Running options 695 | 696 | running.add_argument('-t', '--threads', type=int, metavar='', 697 | help="Number of threads to use.Default 1", 698 | default=1) 699 | 700 | running.add_argument('-dir', '--directory', metavar='', 701 | help="Working directory, default is the working directory", 702 | default=os.getcwd()) 703 | 704 | running.add_argument('-v', '--verbose', type=int, metavar='', 705 | help='Verbose level, 1=error,2=warning, 3=message', 706 | choices=[1, 2, 3], default=3) 707 | 708 | # find out which arguments are missing 709 | 710 | parser.print_help() 711 | 712 | time.sleep(0.01) 713 | sys.stderr.write("\nInput does not match. Check that you provide the -i, -qbam and -fasta options" 714 | "\nExiting\n") 715 | sys.exit(0) 716 | 717 | if len(sys.argv[2:]) == 0: 718 | parser.print_help() 719 | time.sleep(0.01) 720 | sys.stderr.write("\nNo arguments given to Realign. Exiting\n") 721 | sys.exit(0) 722 | 723 | return (parser) 724 | 725 | 726 | def args_bam2bam(self): 727 | parser = self.realigner 728 | 729 | # declare the different groups for the parser 730 | parser._action_groups.pop() 731 | io_options = parser.add_argument_group('Required') 732 | alignment_options = parser.add_argument_group('Alignment options') 733 | i_size_estimate = parser.add_argument_group('Insert size estimation options') 734 | interval = parser.add_argument_group('Interval processing options') 735 | running = parser.add_argument_group('Running options') 736 | 737 | io_options.add_argument('-i', metavar='', 738 | help="Input: bam file containing the reads extracted by ReadExtractor") 739 | io_options.add_argument('-qbam', metavar='', help="Input: query name sorted bam file") 740 | io_options.add_argument('-fasta', metavar='', help="Input: Reference genome fasta file") 741 | io_options.add_argument('-o', '--output', metavar='', help="Output BAM name") 742 | 743 | if "-i" and "-qbam" and "-fasta" and "-o" in sys.argv: 744 | # output 745 | 746 | 747 | 748 | # alignment 749 | alignment_options.add_argument('-n', '--nhits', type=int, metavar='', 750 | help="Number of realignment attempts. Default: 10", 751 | default=10) 752 | 753 | alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', 754 | help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", 755 | default=0.99) 756 | 757 | alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', 758 | help="Minimum soft-clipped length to attempt the realignment. Default: 8", 759 | default=8) 760 | 761 | alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', 762 | help="Gap open penalty in the position specific scoring matrix. Default: 5", 763 | default=5) 764 | 765 | alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', 766 | help="Gap extension penalty in the position specific scoring matrix. Default: 1", 767 | default=1) 768 | 769 | alignment_options.add_argument('-q', '--mapq', type=int, metavar='', 770 | help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", 771 | default=20) 772 | 773 | alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', 774 | help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", 775 | default=0.05) 776 | 777 | alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', 778 | help="Minium split score to output an interval. Default (0.0)", 779 | default=0.0) 780 | 781 | alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally the split reads", 782 | action='store_true') 783 | 784 | # insert size 785 | 786 | i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', 787 | help="Mapq cutoff for stimating the insert size distribution. Default 60", 788 | default=60) 789 | 790 | i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', 791 | help="Standard deviations of the insert size to extend the intervals. Default 5", 792 | default=4) 793 | 794 | i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', 795 | help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", 796 | default=100000) 797 | 798 | # Interval options 799 | 800 | 801 | 802 | interval.add_argument('-P', '--interval_probability', type=float, metavar='', 803 | help="Skip edges of the graph with a probability below the threshold. Default: 0.01", 804 | default=0.01) 805 | interval.add_argument('-K', '--clustering_dist', type=int, metavar='', 806 | help="Cluster reads that are K nucleotides appart in the same node. Default: 500", 807 | default=500) 808 | 809 | interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", 810 | action='store_false') 811 | 812 | 813 | 814 | # run options 815 | 816 | running.add_argument('-t', '--threads', type=int, metavar='', 817 | help="Number of threads to use.Default 1", 818 | default=1) 819 | 820 | running.add_argument('-dir', '--directory', metavar='', 821 | help="Working directory, default is the working directory", 822 | default=os.getcwd()) 823 | 824 | running.add_argument('-v', '--verbose', type=int, metavar='', 825 | help='Verbose level, 1=error,2=warning, 3=message', 826 | choices=[1, 2, 3], default=3) 827 | 828 | 829 | 830 | else: 831 | 832 | # output 833 | 834 | alignment_options.add_argument('-n', '--nhits', type=int, metavar='', 835 | help="Number of realignment attempts. Default: 10", 836 | default=10) 837 | 838 | alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', 839 | help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", 840 | default=0.99) 841 | 842 | alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', 843 | help="Minimum soft-clipped length to attempt the realignment. Default: 8", 844 | default=8) 845 | 846 | alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', 847 | help="Gap open penalty in the position specific scoring matrix. Default: 5", 848 | default=5) 849 | 850 | alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', 851 | help="Gap extension penalty in the position specific scoring matrix. Default: 1", 852 | default=1) 853 | 854 | alignment_options.add_argument('-q', '--mapq', type=int, metavar='', 855 | help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", 856 | default=20) 857 | 858 | alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', 859 | help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", 860 | default=0.05) 861 | 862 | alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', 863 | help="Minium split score to output an interval. Default (0.0)", 864 | default=0.0) 865 | alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally bwa-mem split reads", 866 | action='store_true') 867 | 868 | # insert size 869 | 870 | i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', 871 | help="Mapq cutoff for stimating the insert size distribution. Default 60", 872 | default=60) 873 | 874 | i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', 875 | help="Standard deviations of the insert size to extend the intervals. Default 5", 876 | default=5) 877 | 878 | i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', 879 | help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", 880 | default=100000) 881 | 882 | # Interval options 883 | 884 | 885 | interval.add_argument('-P', '--interval_probability', type=float, metavar='', 886 | help="Skip edges of the graph with a probability below the threshold. Default: 0.01", 887 | default=0.01) 888 | interval.add_argument('-K', '--clustering_dist', type=int, metavar='', 889 | help="Cluster reads that are K nucleotides appart in the same node. Default: 500", 890 | default=500) 891 | interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", 892 | action='store_true') 893 | 894 | 895 | # Running options 896 | 897 | running.add_argument('-t', '--threads', type=int, metavar='', 898 | help="Number of threads to use.Default 1", 899 | default=1) 900 | 901 | running.add_argument('-dir', '--directory', metavar='', 902 | help="Working directory, default is the working directory", 903 | default=os.getcwd()) 904 | 905 | running.add_argument('-v', '--verbose', type=int, metavar='', 906 | help='Verbose level, 1=error,2=warning, 3=message', 907 | choices=[1, 2, 3], default=3) 908 | 909 | # find out which arguments are missing 910 | 911 | parser.print_help() 912 | 913 | time.sleep(0.01) 914 | sys.stderr.write("\nInput does not match. Check that you provide the -i, -qbam and -fasta options" 915 | "\nExiting\n") 916 | sys.exit(0) 917 | 918 | if len(sys.argv[2:]) == 0: 919 | parser.print_help() 920 | time.sleep(0.01) 921 | sys.stderr.write("\nNo arguments given to bam2bam. Exiting\n") 922 | sys.exit(0) 923 | 924 | return (parser) 925 | 926 | def args_repeats(self): 927 | 928 | parser = self.repeats 929 | 930 | parser._action_groups.pop() 931 | required = parser.add_argument_group('required arguments') 932 | optional = parser.add_argument_group('optional arguments') 933 | # prefixing the argument with -- means it's optional 934 | # input and output 935 | 936 | required.add_argument('-i', metavar='', help="Input: coordinate name sorted bam file") 937 | 938 | if "-i" in sys.argv: 939 | 940 | optional.add_argument('-o', '--output', metavar='', 941 | help="Ouput: Reads indicating circular DNA structural variants from repeat regions", 942 | default="circle_repeats_%s" % sys.argv[sys.argv.index("-i") + 1]) 943 | 944 | optional.add_argument('-dir', '--directory', metavar='', 945 | help="Working directory, default is the working directory", 946 | default=os.getcwd()) 947 | 948 | # coverage metrics 949 | optional.add_argument('-m', '--mismatch', metavar='', 950 | help="Number of mismatches allowed on the reads", 951 | default=2) 952 | 953 | optional.add_argument('-b', '--bases', type=int, metavar='', 954 | help="Number of bases to extend for computing the coverage ratio. Default: 200", 955 | default=200) 956 | 957 | optional.add_argument('-cq', '--cmapq', type=int, metavar='', 958 | help="Minimum mapping quality treshold for coverage computation. Default: 0", 959 | default=0) 960 | 961 | optional.add_argument('-E', '--extension', type=int, metavar='', 962 | help="Number of bases inside the eccDNA coordinates to compute the ratio. Default: 100", 963 | default=100) 964 | 965 | optional.add_argument('-r', '--ratio', type=float, metavar='', 966 | help="Minimum in/out required ratio. Default: 0.6", 967 | default=0.6) 968 | 969 | optional.add_argument('-f', '--fraction', type=float, metavar='', 970 | help="Required fraction to merge the intervals of the double mapped reads. Default 0.8", 971 | default=0.8) 972 | 973 | optional.add_argument('-n', '--read_number', metavar='', 974 | help="Minimum number of reads required to output", 975 | default=20) 976 | 977 | 978 | 979 | else: 980 | 981 | optional.add_argument('-o', '--output', metavar='', 982 | help="Ouput: Reads indicating circular DNA structural variants", 983 | ) 984 | 985 | optional.add_argument('-dir', '--directory', metavar='', 986 | help="Working directory, default is the working directory", 987 | default=os.getcwd()) 988 | 989 | # coverage metrics 990 | 991 | optional.add_argument('-m', '--mismatch', metavar='', 992 | help="Number of mismatches allowed on the reads", 993 | default=2) 994 | 995 | optional.add_argument('-b', '--bases', type=int, metavar='', 996 | help="Number of bases to extend for computing the coverage ratio. Default: 200", 997 | default=200) 998 | 999 | optional.add_argument('-cq', '--cmapq', type=int, metavar='', 1000 | help="Minimum mapping quality treshold for coverage computation. Default: 0", 1001 | default=0.6) 1002 | 1003 | optional.add_argument('-E', '--extension', type=int, metavar='', 1004 | help="Number of bases inside the eccDNA coordinates to compute the ratio. Default: 100", 1005 | default=100) 1006 | 1007 | optional.add_argument('-r', '--ratio', type=float, metavar='', 1008 | help="Minimum in/out required ratio. Default: 0.6", 1009 | default=0.6) 1010 | 1011 | optional.add_argument('-f', '--fraction', type=float, metavar='', 1012 | help="Required fraction to merge the intervals of the double mapped reads. Default 0.8", 1013 | default=0.8) 1014 | 1015 | optional.add_argument('-n', '--read_number', metavar='', 1016 | help="Minimum number of reads required to output", 1017 | default=20) 1018 | 1019 | parser.print_help() 1020 | 1021 | time.sleep(0.01) 1022 | sys.stderr.write("\nNo input input given to Repeats, be sure that you are providing the flag '-i'" 1023 | "\nExiting\n") 1024 | sys.exit(0) 1025 | 1026 | # parse the commands 1027 | 1028 | if len(sys.argv[2:]) == 0: 1029 | parser.print_help() 1030 | time.sleep(0.01) 1031 | sys.stderr.write("\nNo arguments given to Repeats. Exiting\n") 1032 | sys.exit(0) 1033 | 1034 | return (parser) 1035 | 1036 | def args_simulate(self): 1037 | 1038 | parser = self.simulate 1039 | 1040 | parser._action_groups.pop() 1041 | required = parser.add_argument_group('required arguments') 1042 | optional = parser.add_argument_group('optional arguments') 1043 | # prefixing the argument with -- means it's optional 1044 | # input and output 1045 | 1046 | if "-g" and "-N" in sys.argv: 1047 | required.add_argument('-g', metavar='', 1048 | help="Genome fasta file (Needs to be indexed with samtools faidx)") 1049 | required.add_argument('-N', '--read-number', type=int, metavar='', 1050 | help="Number of reads to simulate") 1051 | optional.add_argument('-o', '--output', default='simulated.bed', 1052 | help="Output file name") 1053 | optional.add_argument('-dir', '--directory', metavar='', 1054 | help="Working directory, default is the working directory", 1055 | default=os.getcwd()) 1056 | optional.add_argument('-b', '--base-name', metavar='', default='simulated', 1057 | help="Fastq output basename") 1058 | optional.add_argument('-s', '--skip-region', metavar='', default=None, 1059 | help="Regions of the genome to skip the simulation. The input needs to be in bed format") 1060 | optional.add_argument('-r', '--read-length', metavar='', type=int, default=150, 1061 | help="Read length to simulate") 1062 | optional.add_argument('-m', '--mean-insert-size', metavar='', type=int, default=300, 1063 | help="Mean of the insert size distribution") 1064 | optional.add_argument('-c', '--mean-coverage', metavar='', type=int, default=30, 1065 | help="Mean sequencing coverage within the eccDNA coordinates") 1066 | optional.add_argument('-p', '--processes', metavar='', type=int, default=1, 1067 | help="Mean sequencing coverage within the eccDNA coordinates") 1068 | 1069 | optional.add_argument('-v', '--variants', action='store_true', 1070 | help="If set to true, introduce mutations in the reference genome prior to simulating" 1071 | "reads.") 1072 | optional.add_argument('-S', '--substitution', metavar='', type=float, default=0.0001, 1073 | help="Fraction of base substitutions to introduce on the genome. Default: 0.0001") 1074 | 1075 | optional.add_argument('-I', '--Indels', metavar='', type=float, default=0.001, 1076 | help="Fraction of indels to introduce on the genome. Default: 0.001") 1077 | optional.add_argument('-J', '--java_memory', metavar='', type=str, default="-Xmx16g", 1078 | help="Java memory allocation, required for mutating the genome. Default: -Xmx16g") 1079 | 1080 | optional.add_argument('-e', '--error', action='store_true', 1081 | help="Introduce sequencing errors ( Uses ART on the background)") 1082 | 1083 | optional.add_argument('-i', '--instrument', metavar='', type=str, default="HS25", 1084 | help="Illumina sequecing instrument to simulate reads from (Default HiSeq 2500)") 1085 | 1086 | optional.add_argument('-ir', '--insRate', metavar='', type=float, default=0.00009, 1087 | help="the first-read insertion rate (default: 0.00009)") 1088 | optional.add_argument('-ir2', '--insRate2', metavar='', type=float, default=0.00015, 1089 | help="the second-read insertion rate (default: 0.00015)") 1090 | optional.add_argument('-dr', '--delRate', metavar='', type=float, default=0.00011, 1091 | help="the first-read deletion rate (default: 0.00011)") 1092 | optional.add_argument('-dr2', '--delRate2', metavar='', type=float, default=0.00023, 1093 | help="the second-read deletion rate (default: 0.00023)") 1094 | else: 1095 | required.add_argument('-g', metavar='', 1096 | help="Genome fasta file (Needs to be indexed with samtools faidx)") 1097 | required.add_argument('-N', '--read-number', type=int, metavar='', 1098 | help="Number of reads to simulate") 1099 | optional.add_argument('-o', '--output', default='simulated.bed', 1100 | help="Output file name") 1101 | optional.add_argument('-dir', '--directory', metavar='', 1102 | help="Working directory, default is the working directory", 1103 | default=os.getcwd()) 1104 | optional.add_argument('-b', '--base-name', metavar='', default='simulated', 1105 | help="Fastq output basename") 1106 | optional.add_argument('-s', '--skip-region', metavar='', default=None, 1107 | help="Regions of the genome to skip the simulation. The input needs to be in bed format") 1108 | optional.add_argument('-r', '--read-length', metavar='', type=int, default=150, 1109 | help="Read length to simulate") 1110 | optional.add_argument('-m', '--mean-insert', metavar='', type=int, default=300, 1111 | help="Mean of the insert size distribution") 1112 | 1113 | optional.add_argument('-c', '--mean-coverage', metavar='', type=int, default=30, 1114 | help="Mean sequencing coverage within the eccDNA coordinates") 1115 | 1116 | optional.add_argument('-p', '--processes', metavar='', type=int, default=1, 1117 | help="Number of parallel processes to use") 1118 | 1119 | optional.add_argument('-v', '--variants', action='store_true', 1120 | help="If set to true, introduce mutations in the reference genome prior to simulating" 1121 | "reads.") 1122 | optional.add_argument('-S', '--substitution', metavar='', type=float, default=0.0001, 1123 | help="Fraction of base substitutions to introduce on the genome. Default: 0.0001") 1124 | 1125 | optional.add_argument('-I', '--Indels', metavar='', type=float, default=0.001, 1126 | help="Fraction of indels to introduce on the genome. Default: 0.001") 1127 | optional.add_argument('-J', '--java_memory', metavar='', type=str, default="-Xmx16g", 1128 | help="Java memory allocation, required for mutating the genome. Default: -Xmx16g") 1129 | 1130 | optional.add_argument('-e', '--error', action='store_true', 1131 | help="Introduce sequencing errors ( Uses ART on the background)") 1132 | 1133 | optional.add_argument('-i', '--instrument', metavar='', type=str, default="HS25", 1134 | help="Illumina sequecing instrument to simulate reads from (Default HiSeq 2500)") 1135 | optional.add_argument('-ir', '--insRate', metavar='', type=float, default=0.00009, 1136 | help="the first-read insertion rate (default: 0.00009). Requires -e") 1137 | optional.add_argument('-ir2', '--insRate2', metavar='', type=float, default=0.00015, 1138 | help="the second-read insertion rate (default: 0.00015). Requires -e") 1139 | optional.add_argument('-dr', '--delRate', metavar='', type=float, default=0.00011, 1140 | help="the first-read deletion rate (default: 0.00011). Requires -e") 1141 | optional.add_argument('-dr2', '--delRate2', metavar='', type=float, default=0.00023, 1142 | help="the second-read deletion rate (default: 0.00023). Requires -e") 1143 | 1144 | parser.print_help() 1145 | 1146 | time.sleep(0.01) 1147 | sys.stderr.write( 1148 | "\nNo input input given to Simulate, be sure that you are providing the flags '-g' and '-N'" 1149 | "\nExiting\n") 1150 | 1151 | sys.exit(0) 1152 | 1153 | 1154 | if len(sys.argv[2:]) == 0: 1155 | parser.print_help() 1156 | time.sleep(0.01) 1157 | sys.stderr.write("\nNo arguments given to Simulate. Exiting\n") 1158 | 1159 | 1160 | 1161 | 1162 | return (parser) 1163 | 1164 | def main(): 1165 | run = circle_map() 1166 | pid = run.__getpid__() 1167 | # clean 1168 | os.system("rm -rf temp_files_%s" % pid) 1169 | 1170 | if __name__ == '__main__': 1171 | main() 1172 | 1173 | -------------------------------------------------------------------------------- /circlemap/extract_circle_SV_reads.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | import pysam as ps 24 | import os 25 | from circlemap.utils import * 26 | import time 27 | import sys 28 | import warnings 29 | 30 | 31 | class readExtractor: 32 | """Class for managing the read extracting part of circle map""" 33 | def __init__(self,sorted_bam,output_bam,working_dir,mapq_cutoff,extract_discordant,extract_soft_clipped,extract_hard_clipped, 34 | verbose,parser 35 | ): 36 | #input-output 37 | self.sorted_bam = sorted_bam 38 | self.output_bam = output_bam 39 | #working place 40 | self.working_dir = working_dir 41 | 42 | #read options 43 | self.no_discordants = extract_discordant 44 | self.no_soft_clipped = extract_soft_clipped 45 | self.no_hard_clipped = extract_hard_clipped 46 | 47 | #mapq cutoff 48 | 49 | self.mapq_cutoff = mapq_cutoff 50 | 51 | #verbose level 52 | self.verbose = int(verbose) 53 | #parser options 54 | self.parser = parser 55 | 56 | def extract_sv_circleReads(self): 57 | 58 | """Function that extracts Structural Variant reads that indicate circular DNA, 59 | The programme with extract soft-clipped reads and R2F1 (<- ->) oriented reads""" 60 | 61 | os.chdir(self.working_dir) 62 | 63 | #input 64 | if os.path.isabs(self.sorted_bam): 65 | raw_bam = ps.AlignmentFile(self.sorted_bam, "rb") 66 | else: 67 | raw_bam = ps.AlignmentFile(self.working_dir + "/" + self.sorted_bam, "rb") 68 | 69 | #HD the tag for the header line. SO indicates sorting order of the alignements 70 | if 'HD' in raw_bam.header: 71 | 72 | if raw_bam.header['HD']['SO'] != 'queryname': 73 | sys.stderr.write( 74 | "The input bam header says that bam is not sorted by queryname. It is sorted by %s\n\n" % (raw_bam.header['HD']['SO'])) 75 | sys.stderr.write( 76 | "Sort your bam file queryname with the following command:\n\n\tsamtools sort -n -o output.bam input.bam") 77 | 78 | time.sleep(0.01) 79 | 80 | self.parser.print_help() 81 | sys.exit(1) 82 | else: 83 | 84 | if self.verbose >=2: 85 | warnings.warn("WARNING:Circle-Map does not know if the input bam is queryname sorted\n Please check that, the output would be unexpected otherwise") 86 | print("As sanity check, sort your bam file queryname with the following command:\n\n\tsamtools sort -n -o output.bam input.bam") 87 | 88 | 89 | 90 | 91 | if os.path.isabs(self.output_bam): 92 | circle_sv_reads = ps.AlignmentFile(self.output_bam , "wb", template=raw_bam) 93 | else: 94 | circle_sv_reads = ps.AlignmentFile(self.working_dir + "/" + self.output_bam , "wb", template=raw_bam) 95 | 96 | 97 | #modify the tag to unsorted 98 | if 'HD' in raw_bam.header == True: 99 | circle_sv_reads.header['HD']['SO'] = 'unsorted' 100 | 101 | if self.verbose >=3: 102 | print("Extracting circular structural variants") 103 | 104 | #timing 105 | begin = time.time() 106 | 107 | 108 | #cache read1. operate in read2. this speed-ups the search 109 | read1 = '' 110 | 111 | #counter for processed reads 112 | processed_reads = 0 113 | 114 | for read in raw_bam: 115 | 116 | if self.verbose >=3: 117 | processed_reads +=1 118 | 119 | 120 | if (processed_reads/1000000).is_integer() == True: 121 | partial_timer = time.time() 122 | partial_time = (partial_timer - begin)/60 123 | print("Processed %s reads in %s mins" % (processed_reads,round(partial_time,3))) 124 | 125 | if read.is_read1: 126 | read1 = read 127 | else: 128 | if read.is_read2 and read.qname == read1.qname: 129 | # both reads in memory 130 | read2 = read 131 | 132 | #both reads need to be mapped 133 | if read1.is_unmapped == False and read2.is_unmapped == False: 134 | 135 | 136 | if read2.is_reverse and read1.is_reverse == False: 137 | 138 | 139 | # read2 leftmost mapping position smaller than read1 leftmost mapping position 140 | if read2.reference_start < read1.reference_start: 141 | 142 | 143 | #aligned to the same chromosome 144 | if read1.reference_id == read2.reference_id: 145 | 146 | 147 | if read1.mapq >= self.mapq_cutoff and read2.mapq >= self.mapq_cutoff: 148 | 149 | 150 | #is discordant extraction turn off? 151 | 152 | if self.no_discordants == False: 153 | 154 | #add mate mapping quality info 155 | read1.tags += [('MQ',read2.mapq)] 156 | read2.tags += [('MQ', read1.mapq)] 157 | 158 | circle_sv_reads.write(read1) 159 | circle_sv_reads.write(read2) 160 | else: 161 | 162 | pass 163 | else: 164 | 165 | #extract soft-clipped if the mapq is high enough 166 | write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, 167 | self.no_hard_clipped, self.mapq_cutoff) 168 | 169 | write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, 170 | self.no_hard_clipped, self.mapq_cutoff) 171 | 172 | 173 | 174 | else: 175 | 176 | write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, 177 | self.no_hard_clipped, self.mapq_cutoff) 178 | 179 | write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, 180 | self.no_hard_clipped, self.mapq_cutoff) 181 | 182 | 183 | else: 184 | 185 | #if the leftmost mapping condition is not met check if they are soft-clipped 186 | write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, 187 | self.no_hard_clipped, self.mapq_cutoff) 188 | 189 | write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, 190 | self.no_hard_clipped, self.mapq_cutoff) 191 | 192 | 193 | else: 194 | 195 | #check soft-clipped if R2F1 orientation is not True 196 | 197 | write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, 198 | self.no_hard_clipped, self.mapq_cutoff) 199 | 200 | write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, 201 | self.no_hard_clipped, self.mapq_cutoff) 202 | 203 | 204 | else: 205 | 206 | #check read 1 and read two for independent unmaps 207 | if read1.is_unmapped == False: 208 | write_clipped_read(circle_sv_reads, read1,read2, self.no_soft_clipped, 209 | self.no_hard_clipped, self.mapq_cutoff, own_mapq=True) 210 | 211 | if read2.is_unmapped == False: 212 | write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, 213 | self.no_hard_clipped, self.mapq_cutoff, own_mapq=True) 214 | 215 | else: 216 | # reads are not queryname sorted and cannot be processed in paired mode 217 | warnings.warn("Unpaired reads found. Is your bam file queryname sorted?") 218 | 219 | 220 | end = time.time() 221 | 222 | circle_sv_reads.close() 223 | 224 | 225 | 226 | if self.verbose >=3: 227 | 228 | 229 | print("finished extracting reads. Elapsed time:", (end - begin) / 60, "mins") 230 | 231 | print("Thanks for using Circle-Map") 232 | -------------------------------------------------------------------------------- /circlemap/realigner.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | from __future__ import division 24 | 25 | 26 | import os 27 | import sys 28 | from Bio.Seq import Seq 29 | import time 30 | from circlemap.utils import * 31 | import pandas as pd 32 | import traceback 33 | 34 | 35 | 36 | class realignment: 37 | """Class for managing the realignment and eccDNA indetification of circle-map""" 38 | 39 | def __init__(self, input_bam,qname_bam,sorted_bam,genome_fasta,directory,mapq_cutoff,insert_size_mapq,std_extension, 40 | insert_size_sample_size,gap_open,gap_ext,n_hits,prob_cutoff,min_soft_clipped_length,overlap_frac, 41 | interval_p_cut, output_name,ncores,af,locker,split,ratio,verbose,pid,edit_distance_frac, 42 | remap_splits,only_discordants,splits,score,insert_size,discordant_filter): 43 | #I/O 44 | self.edit_distance_frac = edit_distance_frac 45 | self.ecc_dna_str = input_bam 46 | self.qname_bam = qname_bam 47 | self.sorted_bam_str = sorted_bam 48 | self.directory = directory 49 | self.genome_fa = genome_fasta 50 | 51 | #realignment parameters 52 | 53 | # probabilistic realignment options 54 | self.n_hits = n_hits 55 | self.prob_cutoff = prob_cutoff 56 | self.min_sc_length = min_soft_clipped_length 57 | self.mapq_cutoff = mapq_cutoff 58 | self.interval_p = interval_p_cut 59 | self.remap = remap_splits 60 | self.only_discordants = only_discordants 61 | self.split = splits 62 | self.score = score 63 | self.af= af 64 | self.insert = insert_size 65 | 66 | # affine gap scoring options 67 | self.gap_open = gap_open 68 | self.gap_ext = gap_ext 69 | 70 | 71 | #insert size stimation parameters 72 | self.insert_size_mapq = insert_size_mapq 73 | self.std_extenstion = std_extension 74 | self.insert_sample_size = insert_size_sample_size 75 | 76 | 77 | 78 | #output options 79 | 80 | self.overlap_fraction = overlap_frac 81 | self.output = output_name 82 | self.discordant_filter = discordant_filter 83 | 84 | 85 | #regular options 86 | self.cores = ncores 87 | self.verbose = verbose 88 | self.lock = locker 89 | 90 | #this two parameters don't work on this class. They are here for printing the parameters 91 | self.split = split 92 | self.ratio = ratio 93 | 94 | #for instances running on the same directoiry 95 | 96 | self.pid = pid 97 | 98 | 99 | 100 | 101 | 102 | def print_parameters(self): 103 | 104 | print("Running realignment\n") 105 | print("Probabilistic realignment parameters:\n" 106 | "\tAlignments to consider: %s \n" 107 | "\tProbability cut-off to consider as mapped: %s \n" 108 | "\tMinimum soft-clipped length to attemp realignment: %s \n" 109 | "\tMinimum bwa mem mapping quality to consider: %s \n" 110 | "\tGap open penalty: %s \n" 111 | "\tGap extension penalty: %s \n" 112 | % (self.n_hits, self.prob_cutoff,self.min_sc_length,self.mapq_cutoff,self.gap_open, self.gap_ext)) 113 | 114 | print("Interval extension parameters:\n" 115 | "\tInsert size mapping quality cut-off: %s \n" 116 | "\tNumber of read to sample: %s \n" 117 | "\tNumber of standard deviations to extend the realignment intervals: %s \n" 118 | % (self.insert_size_mapq,self.insert_sample_size,self.std_extenstion)) 119 | 120 | print("eccDNA output options: \n" 121 | "\tSplit read cut-off: %s \n" 122 | "\tCoverage ratio cut-off: %s \n" % (self.split,self.ratio)) 123 | 124 | 125 | print("Interval processing options: \n" 126 | "\tMerging fraction: %s \n" 127 | "\tInterval probability cut-off: %s \n" 128 | % (self.overlap_fraction,self.interval_p)) 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | def realign(self,peaks): 137 | """Function that will iterate trough the bam file containing reads indicating eccDNA structural variants and 138 | will output a bed file containing the soft-clipped reads, the discordant and the coverage within the interval""" 139 | 140 | #open files for every process 141 | try: 142 | peaks_pd = pd.DataFrame.from_records(peaks,columns=['chrom', 'start', 'end']) 143 | sorted_bam = ps.AlignmentFile(self.sorted_bam_str, "rb") 144 | genome_fa = ps.FastaFile(self.genome_fa) 145 | ecc_dna = ps.AlignmentFile(self.ecc_dna_str,"rb") 146 | 147 | begin = time.time() 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | # compute insert size distribution 157 | 158 | insert_metrics = self.insert 159 | 160 | 161 | #define realignment extension interval 162 | extension = insert_metrics[0] + self.std_extenstion*insert_metrics[1] 163 | 164 | 165 | iteration = 0 166 | 167 | 168 | results = [] 169 | only_discordants = [] 170 | 171 | 172 | 173 | for index,interval in peaks_pd.iterrows(): 174 | 175 | 176 | 177 | if check_size_and_write(results,only_discordants,self.output,self.lock,self.directory,self.overlap_fraction,self.pid) == True: 178 | results = [] 179 | only_discordants = [] 180 | 181 | try: 182 | 183 | 184 | 185 | 186 | #find out the prior distribution (mate alignment positions). 187 | candidate_mates = get_mate_intervals(ecc_dna,interval,self.mapq_cutoff,self.verbose,self.only_discordants) 188 | 189 | 190 | 191 | 192 | 193 | 194 | if len(candidate_mates) > 0 or candidate_mates != None: 195 | 196 | 197 | realignment_interval_extended = get_realignment_intervals(candidate_mates,extension,self.interval_p, 198 | self.verbose) 199 | 200 | 201 | if realignment_interval_extended is None: 202 | continue 203 | 204 | 205 | iteration_results = [] 206 | iteration_discordants = [] 207 | disorcordants_per_it = 0 208 | for index,mate_interval in realignment_interval_extended.iterrows(): 209 | 210 | iteration += 1 211 | 212 | 213 | 214 | #sample realignment intervals 215 | #fasta file fetch is 1 based that why I do +1 216 | 217 | plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])+1),int(int(mate_interval['end'])+1)).upper() 218 | interval_length = len(plus_coding_interval) 219 | minus_coding_interval = str(Seq(plus_coding_interval).complement()) 220 | 221 | # precompute the denominators of the error model. They will be constants for every interval 222 | plus_base_freqs = background_freqs(plus_coding_interval) 223 | 224 | minus_base_freqs = {'T':plus_base_freqs['A'],'A':plus_base_freqs['T'], 225 | 'C':plus_base_freqs['G'],'G':plus_base_freqs['C']} 226 | 227 | minus_base_freqs = np.array([plus_base_freqs['T'],plus_base_freqs['A'],plus_base_freqs['G'],plus_base_freqs['C']]) 228 | plus_base_freqs = np.array([plus_base_freqs['A'],plus_base_freqs['T'],plus_base_freqs['C'],plus_base_freqs['G']]) 229 | 230 | 231 | #note that I am getting the reads of the interval. Not the reads of the mates 232 | 233 | for read in ecc_dna.fetch(interval['chrom'],int(interval['start']),int(interval['end']),multiple_iterators=True): 234 | 235 | 236 | if is_soft_clipped(read): 237 | 238 | if read.mapq >= self.mapq_cutoff: 239 | 240 | # no need to realignment 241 | if read.has_tag('SA') and self.remap != True: 242 | 243 | 244 | #check realignment from SA tag 245 | support = circle_from_SA(read, self.mapq_cutoff, mate_interval) 246 | 247 | 248 | 249 | if support is None: 250 | pass 251 | 252 | else: 253 | 254 | if support['support'] == True: 255 | 256 | score = len(get_longest_soft_clipped_bases(read)['seq'])* (1-phred_to_prob(np.array(int(read.get_tag('SA').split(',')[4]),dtype=np.float64))) 257 | 258 | #compute mapping positions 259 | 260 | read_end = rightmost_from_read(read) 261 | 262 | supplementary_end = rightmost_from_sa(support['leftmost'],support['cigar']) 263 | 264 | 265 | 266 | # I store the read name to the output, so that a read counts as 1 no matter it is SC in 2 pieces 267 | if read.reference_start < support['leftmost']: 268 | 269 | iteration_results.append([interval['chrom'],read.reference_start,(supplementary_end-1),read.qname,iteration,float(round(score,2))]) 270 | 271 | elif read.reference_start > support['leftmost']: 272 | 273 | iteration_results.append( 274 | [interval['chrom'], (support['leftmost']-1), read_end, read.qname,iteration,float(round(score,2))]) 275 | 276 | else: 277 | #uninformative read 278 | pass 279 | 280 | 281 | 282 | else: 283 | #sc length 284 | sc_len = len(get_longest_soft_clipped_bases(read)['seq']) 285 | 286 | 287 | if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),int(read.pos), 288 | int(mate_interval.start),int(mate_interval.end)) == True: 289 | 290 | 291 | if sc_len >= self.min_sc_length: 292 | edits_allowed = adaptative_myers_k(sc_len, self.edit_distance_frac) 293 | #realignment 294 | 295 | realignment_dict = realign(read,self.n_hits,plus_coding_interval,minus_coding_interval, 296 | plus_base_freqs,minus_base_freqs,self.gap_open,self.gap_ext,self.verbose,edits_allowed) 297 | 298 | 299 | if realignment_dict == None: 300 | 301 | pass 302 | 303 | else: 304 | #calc edit distance allowed 305 | prob = realignment_probability(realignment_dict,interval_length) 306 | if prob >= self.prob_cutoff and realignment_dict['alignments'][1][3] <= edits_allowed: 307 | 308 | # here I have to retrieve the nucleotide mapping positions. Which should be the 309 | # the left sampling pysam coordinate - edlib coordinates 310 | 311 | read_end = rightmost_from_read(read) 312 | 313 | 314 | soft_clip_start = int(mate_interval['start'])+ int(realignment_dict['alignments'][1][0][0]) 315 | 316 | soft_clip_end = int(mate_interval['start']) + int(realignment_dict['alignments'][1][0][1]) 317 | 318 | score = sc_len*prob 319 | 320 | 321 | # I store the read name to the output, so that a read counts as 1 no matter it is SC in 2 pieces 322 | if read.reference_start < int(mate_interval['start']) + int( 323 | realignment_dict['alignments'][1][0][0]): 324 | 325 | iteration_results.append([interval['chrom'], read.reference_start, soft_clip_end+1, read.qname,iteration,float(round(score,2))]) 326 | 327 | elif read.reference_start + int(mate_interval['start']) + int( 328 | realignment_dict['alignments'][1][0][0]): 329 | 330 | iteration_results.append([interval['chrom'], soft_clip_start, read_end, read.qname,iteration,float(round(score,2))]) 331 | 332 | else: 333 | # uninformative read 334 | pass 335 | 336 | 337 | 338 | else: 339 | pass 340 | else: 341 | 342 | pass 343 | 344 | else: 345 | pass 346 | else: 347 | #discordant reads 348 | #R2F1 oriented when iterating trough R2 349 | if read.is_reverse == True and read.mate_is_reverse == False: 350 | if read.is_read2: 351 | if read.reference_start < read.next_reference_start: 352 | # discordant read 353 | disorcordants_per_it +=1 354 | iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname]) 355 | 356 | 357 | 358 | 359 | #R2F1 when iterating trough F1 360 | elif read.is_reverse == False and read.mate_is_reverse == True: 361 | if read.is_read2 == False: 362 | if read.next_reference_start < read.reference_start: 363 | disorcordants_per_it +=1 364 | iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname]) 365 | 366 | 367 | #second pass to add discordant read info 368 | if len(iteration_results) > 0: 369 | 370 | 371 | results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1]) 372 | 373 | 374 | elif len(iteration_discordants) > 0: 375 | discordant_bed = pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end']) 376 | 377 | discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg( 378 | {'chrom': 'first', 'start': 'first', 'end': 'last', 'read': 'count'}) 379 | 380 | 381 | for index,disc_interval in discordant_bed.iterrows(): 382 | only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0]) 383 | 384 | 385 | 386 | except BaseException as e: 387 | 388 | traceback.print_exc(file=sys.stdout) 389 | warnings.warn( 390 | "Failed on interval %s due to the error %s" % ( 391 | str(interval), str(e))) 392 | return([1,1]) 393 | 394 | 395 | ecc_dna.close() 396 | genome_fa.close() 397 | 398 | 399 | # Write process output to disk 400 | output = iteration_merge(only_discordants,results, 401 | self.overlap_fraction,self.split,self.score, 402 | self.min_sc_length,sorted_bam,self.af,insert_metrics[0],insert_metrics[1],self.discordant_filter) 403 | 404 | write_to_disk(output, self.output, self.lock, self.directory, self.pid) 405 | 406 | 407 | except: 408 | print("Failed on cluster:") 409 | print(traceback.print_exc(file=sys.stdout)) 410 | return([1,1]) 411 | 412 | sorted_bam.close() 413 | genome_fa.close() 414 | ecc_dna.close() 415 | 416 | 417 | return([0,0]) -------------------------------------------------------------------------------- /circlemap/repeats.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | import pysam as ps 24 | import pybedtools as bt 25 | import os 26 | import time 27 | from circlemap.utils import merge_coverage_bed,rightmost_from_sa 28 | 29 | 30 | class repeat: 31 | """Class for indentifying repeat derived eccDNA by looking of the reads with two alignments""" 32 | 33 | 34 | def __init__(self,bam,directory,mismatch,fraction,read_number): 35 | self.bam = bam 36 | self.dir = directory 37 | self.mismatch = mismatch 38 | self.fraction = fraction 39 | self.number = read_number 40 | 41 | def find_circles(self): 42 | 43 | begin = time.time() 44 | 45 | os.chdir("%s" % self.dir) 46 | 47 | bam = ps.AlignmentFile("%s" % self.bam,'rb') 48 | 49 | 50 | 51 | print("Iterating trough the bam file") 52 | 53 | output = [] 54 | for read in bam: 55 | 56 | try: 57 | if read.has_tag('XA'): 58 | tag = read.get_tag('XA').split(';')[:-1] 59 | 60 | read_edit_distance = read.get_tag('NM') 61 | 62 | if read_edit_distance <= self.mismatch and len(tag) ==1: 63 | 64 | read_chrom = bam.get_reference_name(read.reference_id) 65 | chrom = tag[0].split(',')[0] 66 | 67 | 68 | if chrom == read_chrom: 69 | 70 | 71 | aln = int(tag[0].split(',')[1][1:]) 72 | 73 | if aln < read.reference_start: 74 | 75 | interval = [chrom,aln,read.reference_start+ read.infer_read_length(),1] 76 | 77 | output.append(interval) 78 | 79 | else: 80 | 81 | interval = [chrom,read.reference_start,rightmost_from_sa(aln,tag[0].split(',')[2]),1] 82 | output.append(interval) 83 | 84 | 85 | except BaseException as e: 86 | print(e) 87 | 88 | 89 | 90 | bed = merge_coverage_bed(output,self.fraction,self.number) 91 | 92 | #add dots to read metrics stats 93 | 94 | with_dot = [] 95 | for interval in bed: 96 | interval.append(".") 97 | with_dot.append(interval) 98 | 99 | bed= bt.BedTool(with_dot) 100 | 101 | return(bed) 102 | -------------------------------------------------------------------------------- /circlemap/simulations.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | import numpy as np 24 | import os 25 | import pysam as ps 26 | from Bio import SeqIO 27 | from io import StringIO 28 | import random as rd 29 | from Bio.Seq import Seq 30 | from Bio.Alphabet import generic_dna 31 | import sys 32 | import pybedtools as bt 33 | import time 34 | import subprocess as sp 35 | import warnings 36 | 37 | 38 | def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq,insert_size,errors,mean_cov,locker, 39 | process,sim_circles,paired_end_fastq_1,paired_end_fastq_2,skipped,correct,ins_rate1,ins_rate2,del_rate1, 40 | del_rate2,sim_pid): 41 | """Function that takes as arguments a genome fasta file, weights each chromosome based on the length 42 | and simulates single end eccDNA reads 43 | """ 44 | 45 | 46 | # Get the length of the chromosomes and store them in a sequence dictionary 47 | chromosomes = {} 48 | whole_genome_len = 0 49 | for rec in SeqIO.parse(genome_fasta, 'fasta'): 50 | name = rec.id 51 | seqLen = len(rec) 52 | whole_genome_len += seqLen 53 | chromosomes[name] = seqLen 54 | #chromosome sampling probability weighted based on its length 55 | weighted_chromosomes = {} 56 | for contigs in chromosomes: 57 | weighted_chromosomes[contigs] = chromosomes[contigs]/whole_genome_len 58 | 59 | 60 | 61 | 62 | contig_list = [] 63 | weights = [] 64 | for contigs, value in weighted_chromosomes.items(): 65 | weights.append(value) 66 | contig_list.append(contigs) 67 | 68 | #Simulate the reads: 69 | 70 | 71 | 72 | os.chdir(directory) 73 | circle_bed = [] 74 | 75 | 76 | 77 | 78 | set_of_reads = [] 79 | set_of_left_reads = [] 80 | set_of_right_reads = [] 81 | 82 | circle_number = 0 83 | #reads simulated by a process 84 | n_of_reads = 0 85 | # 86 | n_of_reads_it = 0 87 | 88 | begin = time.time() 89 | #simulated reads 90 | while n_of_reads < reads + 1: 91 | 92 | 93 | #sample weighted chromosome 94 | #set random seed, important for paralell 95 | np.random.seed() 96 | chr = np.random.choice(contig_list, p=weights) 97 | 98 | # decide ecDNA length 99 | 100 | #sample circle length 101 | circle_length = rd.randint(150,350) 102 | 103 | 104 | # linear decrease in coverage based on circle length 105 | 106 | 107 | 108 | 109 | 110 | # compute circles sequencing coverage 111 | 112 | rounds_of_sim = (circle_length * mean_cov)/(read_length*2) 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | # take in to account short length contigs 121 | #start position can't be bigger than (chr_length-circle_length) 122 | chr_pos_start = rd.randint(0,(chromosomes[chr] - circle_length)) 123 | #set end 124 | if chromosomes[chr] == (chromosomes[chr] - circle_length): 125 | chr_pos_end = chromosomes[chr] 126 | else: 127 | chr_pos_end = chr_pos_start + circle_length 128 | 129 | #if user of provides regions to exclude, check within it is on the region. and skip it 130 | if exclude_regions != None and bt.BedTool(exclude_regions).sort().any_hits(bt.Interval(chr,chr_pos_start,chr_pos_end)) != 0: 131 | #hit in a gap region 132 | # shared memory object between processes. It is use to track the number of skipped circles 133 | with skipped.get_lock(): 134 | skipped.value+=1 135 | continue 136 | else: 137 | #shared memory object between processes. It is use to track the number of correctly simulated circles 138 | with correct.get_lock(): 139 | correct.value+=1 140 | #save each circle positions, so that then I can check true circles 141 | 142 | 143 | first_line = [chr, chr_pos_start, chr_pos_end] 144 | 145 | #create class object outside the loop 146 | new_read = sim_paired_end(n_of_reads, insert_size, genome_fasta, chr, chr_pos_start, 147 | chr_pos_end, read_length, circle_number,process) 148 | 149 | #simulation rounds 150 | for each_sim in range(0,round(int(rounds_of_sim))): 151 | 152 | 153 | if errors == True: 154 | 155 | 156 | 157 | if (n_of_reads_it+1) !=1000: 158 | 159 | # sim the read 160 | get_seq = new_read.simulate_read() 161 | # put it in fastq format 162 | simulated_reads = sim_paired_end.simulate_read_with_errors(new_read, get_seq[0], get_seq[1], 163 | get_seq[2],ins_rate1,ins_rate2,del_rate1, 164 | del_rate2,sim_pid) 165 | if simulated_reads != None: 166 | # save the read 167 | assert len(set_of_left_reads) == len(set_of_right_reads) 168 | set_of_left_reads.append(simulated_reads[0]) 169 | set_of_right_reads.append(simulated_reads[1]) 170 | assert len(set_of_left_reads) == len(set_of_right_reads) 171 | n_of_reads += 1 172 | n_of_reads_it += 1 173 | 174 | else: 175 | continue 176 | 177 | 178 | 179 | 180 | else: 181 | 182 | # simulate reads and save to disk 183 | get_seq = new_read.simulate_read() 184 | simulated_reads = sim_paired_end.simulate_read_with_errors(new_read, get_seq[0], get_seq[1], 185 | get_seq[2],ins_rate1,ins_rate2,del_rate1, 186 | del_rate2,sim_pid) 187 | set_of_left_reads.append(simulated_reads[0]) 188 | set_of_right_reads.append(simulated_reads[1]) 189 | 190 | # save to disk 191 | assert len(set_of_left_reads) == len(set_of_right_reads) 192 | locker.acquire() 193 | print("Process %s: writting to disk 10000 reads" % process ) 194 | fastq_1 = open(paired_end_fastq_1, "a") 195 | SeqIO.write(set_of_left_reads,fastq_1, "fastq") 196 | fastq_1.close() 197 | fastq_2 = open(paired_end_fastq_2, "a") 198 | SeqIO.write(set_of_right_reads,fastq_2, "fastq") 199 | fastq_2.close() 200 | locker.release() 201 | assert len(set_of_left_reads) == len(set_of_right_reads) 202 | 203 | n_of_reads += 1 204 | n_of_reads_it += 1 205 | 206 | # sim the first read of the list 207 | new_read = sim_paired_end(n_of_reads, insert_size, genome_fasta, chr, chr_pos_start, 208 | chr_pos_end, read_length, circle_number,process) 209 | get_seq = new_read.simulate_read() 210 | simulated_reads = sim_paired_end.simulate_read_with_errors(new_read, get_seq[0], get_seq[1], 211 | get_seq[2],ins_rate1,ins_rate2,del_rate1, 212 | del_rate2,sim_pid) 213 | assert len(set_of_left_reads) == len(set_of_right_reads) 214 | set_of_left_reads = [simulated_reads[0]] 215 | set_of_right_reads = [simulated_reads[1]] 216 | assert len(set_of_left_reads) == len(set_of_right_reads) 217 | n_of_reads += 1 218 | n_of_reads_it = 1 219 | 220 | 221 | else: 222 | 223 | if (n_of_reads_it+1) != 10000: 224 | 225 | #sim the read 226 | get_seq = new_read.simulate_read() 227 | #put it in fastq format 228 | simulated_reads = sim_paired_end.simulate_perfect_read(new_read,get_seq[0], get_seq[1], get_seq[2]) 229 | #save the read 230 | set_of_left_reads.append(simulated_reads[0]) 231 | set_of_right_reads.append(simulated_reads[1]) 232 | n_of_reads +=1 233 | n_of_reads_it += 1 234 | 235 | 236 | 237 | else: 238 | #simulate reads and save to disk 239 | get_seq = new_read.simulate_read() 240 | simulated_reads = sim_paired_end.simulate_perfect_read(new_read, get_seq[0], get_seq[1], 241 | get_seq[2]) 242 | set_of_left_reads.append(simulated_reads[0]) 243 | set_of_right_reads.append(simulated_reads[1]) 244 | 245 | #save to disk 246 | locker.acquire() 247 | assert len(set_of_left_reads) == len(set_of_right_reads) 248 | print("Process %s: writting to disk 10000 reads" % process) 249 | fastq_1 = open(paired_end_fastq_1, "a") 250 | SeqIO.write(set_of_left_reads, fastq_1, "fastq") 251 | fastq_1.close() 252 | fastq_2 = open(paired_end_fastq_2, "a") 253 | SeqIO.write(set_of_right_reads, fastq_2, "fastq") 254 | fastq_2.close() 255 | assert len(set_of_left_reads) == len(set_of_right_reads) 256 | locker.release() 257 | 258 | 259 | 260 | n_of_reads += 1 261 | n_of_reads_it += 1 262 | 263 | #sim the first read of the list 264 | new_read = sim_paired_end(n_of_reads, insert_size, genome_fasta, chr, chr_pos_start, 265 | chr_pos_end, read_length, circle_number,process) 266 | get_seq = new_read.simulate_read() 267 | simulated_reads = sim_paired_end.simulate_perfect_read(new_read, get_seq[0], get_seq[1], 268 | get_seq[2]) 269 | 270 | assert len(set_of_left_reads) == len(set_of_right_reads) 271 | set_of_left_reads = [simulated_reads[0]] 272 | set_of_right_reads = [simulated_reads[1]] 273 | assert len(set_of_left_reads) == len(set_of_right_reads) 274 | n_of_reads +=1 275 | n_of_reads_it = 1 276 | 277 | 278 | 279 | 280 | 281 | circle_bed.append(first_line) 282 | 283 | # last save to disk 284 | 285 | locker.acquire() 286 | fastq_1 = open(paired_end_fastq_1, "a") 287 | SeqIO.write(set_of_left_reads, fastq_1, "fastq") 288 | fastq_1.close() 289 | fastq_2 = open(paired_end_fastq_2, "a") 290 | SeqIO.write(set_of_right_reads, fastq_2, "fastq") 291 | fastq_2.close() 292 | locker.release() 293 | 294 | 295 | 296 | 297 | #shared memory between the processes.This is a list that every process will rate the simulated circles 298 | for element in circle_bed: 299 | sim_circles.append(element) 300 | 301 | 302 | 303 | 304 | class sim_paired_end: 305 | 306 | #init the class 307 | def __init__(self,read_number,insert_size,genome_fa,chr,chr_pos_start,chr_pos_end,read_length,circle_id,process): 308 | self.read_number = read_number 309 | self.insert_size = insert_size 310 | self.genome_fa = genome_fa 311 | self.chr = chr 312 | self.chr_pos_start = chr_pos_start 313 | self.chr_pos_end = chr_pos_end 314 | self.read_length = read_length 315 | self.circle_id = circle_id 316 | self.process = process 317 | 318 | def simulate_read(self): 319 | """Function that simulates perfect paired-end reads""" 320 | 321 | fastafile = ps.FastaFile(self.genome_fa) 322 | # left split read 323 | 324 | insert = int(np.random.normal(self.insert_size, (self.insert_size / 12), 1)) 325 | start = int(np.random.randint(self.chr_pos_start, (self.chr_pos_end + 1))) 326 | left_end = start + self.read_length 327 | total_end = start + int(np.round(insert)) 328 | right_start = total_end - self.read_length 329 | if total_end > self.chr_pos_end: 330 | # split read scenario or insert spanning split read scenario 331 | if left_end > self.chr_pos_end: 332 | # left read spanning split read scenario 333 | # left_read 334 | left_dntps = self.chr_pos_end - start 335 | right_dntps = self.read_length - left_dntps 336 | 337 | # the error could be here 338 | left_split_read = fastafile.fetch(self.chr, start, self.chr_pos_end) 339 | right_split_read = fastafile.fetch(self.chr, self.chr_pos_start, (self.chr_pos_start + right_dntps)) 340 | left_read = left_split_read + right_split_read 341 | 342 | # right_read 343 | right_start = self.chr_pos_start + int(round(self.insert_size - left_dntps - self.read_length)) 344 | right_read = fastafile.fetch(self.chr, right_start, (right_start + self.read_length)) 345 | 346 | # assertion to check the error here 347 | 348 | common_id = "%s|%s|%s:%s-%s:%s|%s:%s|1|%s" % ( 349 | self.read_number, self.chr, start, self.chr_pos_end, self.chr_pos_start, (self.chr_pos_start + right_dntps), right_start, 350 | (right_start + self.read_length), self.circle_id) 351 | 352 | 353 | 354 | 355 | else: 356 | if right_start > self.chr_pos_end: 357 | # insert spanning split read scenario 358 | left_read = fastafile.fetch(self.chr, start, (start + self.read_length)) 359 | right_start = self.chr_pos_start + (right_start - self.chr_pos_end) 360 | right_read = fastafile.fetch(self.chr, right_start, (right_start + self.read_length)) 361 | common_id = "%s|%s|%s:%s|%s:%s|3|%s" % ( 362 | self.read_number, self.chr, start, (start + self.read_length), right_start, (right_start + self.read_length), self.circle_id) 363 | else: 364 | # right split read scenario 365 | assert right_start <= self.chr_pos_end 366 | assert (right_start + self.read_length) > self.chr_pos_end 367 | left_read = fastafile.fetch(self.chr, start, (start + self.read_length)) 368 | 369 | # compute right dntps 370 | left_dntps = self.chr_pos_end - right_start 371 | right_dntps = self.read_length - left_dntps 372 | left_split_read = fastafile.fetch(self.chr, right_start, self.chr_pos_end) 373 | right_split_read = fastafile.fetch(self.chr, self.chr_pos_start, (self.chr_pos_start + right_dntps)) 374 | right_read = left_split_read + right_split_read 375 | common_id = "%s|%s|%s:%s|%s:%s-%s:%s|2|%s" % ( 376 | self.read_number,self.chr, start, (start + self.read_length), right_start, self.chr_pos_end, self.chr_pos_start, 377 | (self.chr_pos_start + 378 | right_dntps), self.circle_id) 379 | 380 | 381 | else: 382 | # non split read scenario 383 | left_read = fastafile.fetch(self.chr, start, (start + self.read_length)) 384 | # correct right read start 385 | right_read = fastafile.fetch(self.chr, right_start, (right_start + self.read_length)) 386 | common_id = "%s|%s|%s:%s|%s:%s|0|%s" % ( 387 | self.read_number, self.chr, start, (start + self.read_length), right_start, (right_start + self.read_length), self.circle_id) 388 | 389 | return(right_read,left_read,common_id) 390 | 391 | 392 | 393 | def simulate_perfect_read(self,right_read,left_read,common_id): 394 | # put all together 395 | # unique identifiers for right and left reads 396 | right_read_id = "2:N:0:CGCTGTG" 397 | right_id = common_id + " " + right_read_id 398 | left_read_id = "1:N:0:CGCTGTG" 399 | left_id = common_id + " " + left_read_id 400 | quality = "I" * self.read_length 401 | # get the reverse complement of the right read 402 | right_read = Seq(right_read, generic_dna) 403 | right_read = right_read.reverse_complement() 404 | 405 | left_read = left_read.upper() 406 | 407 | right_read = right_read.upper() 408 | 409 | 410 | fastq_left = "@%s\n%s\n+\n%s\n" % (left_id, left_read, quality) 411 | fastq_right = "@%s\n%s\n+\n%s\n" % (right_id, right_read, quality) 412 | 413 | right_record = SeqIO.read(StringIO(fastq_right), "fastq") 414 | left_record = SeqIO.read(StringIO(fastq_left), "fastq") 415 | return (left_record, right_record) 416 | 417 | 418 | def simulate_read_with_errors(self,right_read, left_read, common_id,ins_rate1,ins_rate2,del_rate1, 419 | del_rate2,pid): 420 | # put all together 421 | # unique identifiers for right and left reads 422 | dir = os.getcwd() 423 | os.chdir("temp_files_%s" % pid) 424 | 425 | right_read_id = "2:N:0:CGCTGTG" 426 | right_id = common_id + "space" + right_read_id 427 | left_read_id = "1:N:0:CGCTGTG" 428 | left_id = common_id + "space" + left_read_id 429 | 430 | # attemp to use art to simulate the quality scores and the error rate 431 | #create a one read genome 432 | left_fasta = open("left_read_%s.fa" % (self.process), "w") 433 | left_fasta.write(">" + left_id + "\n" + str(left_read) + "\n") 434 | # sim the read with art 435 | left_fasta.close() 436 | 437 | sp.call("art_illumina -q -na -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i left_read_%s.fa -l %s -f 1 -o left%s" % 438 | (ins_rate1,ins_rate2,del_rate1,del_rate2,self.process,self.read_length,self.process), 439 | shell=True,stdout=sp.DEVNULL, stderr=sp.STDOUT) 440 | 441 | 442 | with open("left%s.fq" % (self.process), 'r') as left: 443 | left_read = left.read().replace('space', ' ').replace('1:N:0:CGCTGTG-1', '1:N:0:CGCTGTG') 444 | 445 | 446 | 447 | 448 | # get the reverse complement of the right read 449 | right_read = Seq(right_read, generic_dna) 450 | right_read = right_read.reverse_complement() 451 | 452 | right_fasta = open("right_read_%s.fa" % (self.process), "w") 453 | right_fasta.write(">" + right_id + "\n" + str(right_read) + "\n") 454 | right_fasta.close() 455 | # sim the read with art 456 | 457 | sp.call("art_illumina -na -q -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i right_read_%s.fa -l %s -f 1 -o right%s" % 458 | (ins_rate1,ins_rate2,del_rate1,del_rate2,self.process,self.read_length,self.process), 459 | shell=True,stdout=sp.DEVNULL, stderr=sp.STDOUT) 460 | 461 | with open("right%s.fq" % (self.process), 'r') as right: 462 | right_read = right.read().replace('space', ' ').replace('1:N:0:CGCTGTG-1', '2:N:0:CGCTGTG') 463 | 464 | #sometimes the reading fails. I introduce this to capture it 465 | try: 466 | 467 | right_record = SeqIO.read(StringIO(right_read), "fastq") 468 | left_record = SeqIO.read(StringIO(left_read), "fastq") 469 | os.chdir(dir) 470 | return (left_record, right_record) 471 | except ValueError as v: 472 | 473 | warnings.warn('Catched ValueError in a sampling round. Skipping') 474 | os.chdir(dir) 475 | return(None) 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | -------------------------------------------------------------------------------- /circlemap/utils.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Iñigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | import pysam as ps 24 | import pybedtools as bt 25 | import warnings 26 | import numpy as np 27 | import pandas as pd 28 | import itertools as it 29 | import edlib 30 | import os 31 | import subprocess as sp 32 | import glob 33 | import time 34 | import sys 35 | from scipy import stats as st 36 | import random 37 | import re 38 | from numba import jit 39 | import math 40 | import datetime 41 | 42 | 43 | 44 | 45 | 46 | def is_soft_clipped(read): 47 | 48 | """Function that checks the CIGAR string of the sam file and returns true if the read is soft-clipped""" 49 | 50 | # cigar 4 equals to S in pysam sam representation 51 | match = 0 52 | for cigar in read.cigar: 53 | if cigar[0] == 4: 54 | match +=1 55 | else: 56 | pass 57 | 58 | if match > 0: 59 | return(True) 60 | 61 | else: 62 | return(False) 63 | 64 | def is_hard_clipped(read): 65 | 66 | """Function that checks the CIGAR string of the sam file and returns true if the read is hard-clipped""" 67 | 68 | # cigar 5 equals to H in pysam sam representation 69 | match = 0 70 | for cigar in read.cigar: 71 | if cigar[0] == 5: 72 | match += 1 73 | else: 74 | pass 75 | 76 | if match > 0: 77 | return (True) 78 | 79 | else: 80 | return (False) 81 | 82 | def rightmost_from_read(read): 83 | """Function that takes as input a read a returns its rightmost mapping position""" 84 | 85 | rightmost = 0 86 | 87 | #matches, deletions and ref skip consume reference 88 | for cigar in read.cigar: 89 | 90 | if cigar[0] == 0: 91 | rightmost += cigar[1] 92 | 93 | elif cigar[0] == 2: 94 | rightmost += cigar[1] 95 | 96 | elif cigar[0] == 3: 97 | rightmost += cigar[1] 98 | 99 | 100 | return(read.reference_start + rightmost) 101 | 102 | def rightmost_from_sa(leftmost,sa_cigar): 103 | """Function that takes as input the leftmost position of a supplementary alignment and returns it rightmost mapping 104 | position""" 105 | 106 | 107 | #the SA alignment is 1 based 108 | rightmost = int(leftmost)-1 109 | 110 | cigar = [''.join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] 111 | # matches, deletions and ref skip consume reference 112 | match_index = [x for x in range(len(cigar)) if cigar[x] == 'M'] 113 | deletion_index = [x for x in range(len(cigar)) if cigar[x] == 'D'] 114 | ambiguous_index = [x for x in range(len(cigar)) if cigar[x] == 'N'] 115 | 116 | 117 | for index in match_index: 118 | rightmost += int(cigar[index-1]) 119 | 120 | for index in deletion_index: 121 | rightmost += int(cigar[index-1]) 122 | 123 | 124 | for index in ambiguous_index: 125 | rightmost += int(cigar[index-1]) 126 | 127 | assert rightmost >= (int(leftmost)-1) 128 | 129 | return(rightmost) 130 | 131 | 132 | 133 | 134 | 135 | def aligned_bases(read): 136 | 137 | """Function that counts the number of aligned bases from the CIGAR string and returns and integer""" 138 | 139 | aligned = 0 140 | 141 | for cigar in read.cigar: 142 | if cigar[0] == 0: 143 | aligned += cigar[1] 144 | else: 145 | pass 146 | assert aligned >= 0 147 | return(aligned) 148 | 149 | def aligned_bases_from_sa(sa_cigar): 150 | 151 | """Function that gets as input the SA tag CIGAR and reports the number of bases that where matched to the genome""" 152 | 153 | cigar = [''.join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] 154 | 155 | 156 | match_index = [x for x in range(len(cigar)) if cigar[x]=='M'] 157 | 158 | aligned = 0 159 | #if only one hit 160 | if type(match_index) == int: 161 | aligned += int(cigar[match_index -1]) 162 | 163 | #when there are more than 1 hits 164 | else: 165 | assert type(match_index) == list 166 | 167 | for index in match_index: 168 | aligned += int(cigar[index - 1]) 169 | 170 | assert aligned >=0 171 | return(aligned) 172 | 173 | 174 | def genome_alignment_from_cigar(sa_cigar): 175 | 176 | """Function that gets as input the SA tag CIGAR and returns the length of the alignment interval in the genome it 177 | will look at the number of matches and deletions in the CIGAR, as they are the elements that will explain the genome 178 | alignment 179 | """ 180 | aligned = 0 181 | 182 | cigar = [''.join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] 183 | 184 | #do it for the matches 185 | match_index = [x for x in range(len(cigar)) if cigar[x]=='M'] 186 | 187 | 188 | # if only one hit 189 | if type(match_index) == int: 190 | aligned += int(cigar[match_index - 1]) 191 | 192 | # when there are more than 1 hits 193 | else: 194 | assert type(match_index) == list 195 | 196 | for index in match_index: 197 | aligned += int(cigar[index - 1]) 198 | 199 | 200 | if 'D' in cigar == True: 201 | 202 | deletion_index = cigar.index('D') 203 | 204 | # if only one hit 205 | if type(deletion_index) == int: 206 | aligned += int(cigar[deletion_index - 1]) 207 | 208 | # when there are more than 1 hits 209 | else: 210 | assert type(deletion_index) == list 211 | 212 | for index in deletion_index: 213 | aligned += int(cigar[index - 1]) 214 | 215 | assert aligned >=0 216 | return(aligned) 217 | 218 | 219 | 220 | 221 | 222 | def bam_circ_sv_peaks(bam,input_bam_name,cores,verbose,pid,clusters): 223 | """Function that takes as input a bam file and returns a merged bed file of the genome covered by the bam, it will create 224 | and index too""" 225 | 226 | # check bam header for sorting state 227 | 228 | 229 | 230 | #check the header of the bam file for the sorting state, and sort if necessary 231 | 232 | if 'HD' in bam.header: 233 | if bam.header['HD']['SO'] == 'queryname': 234 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Bam is sorted by queryname, exiting") 235 | 236 | bam.close() 237 | sys.exit() 238 | 239 | 240 | 241 | 242 | 243 | elif bam.header['HD']['SO'] == 'unsorted': 244 | 245 | bam.close() 246 | 247 | print("Bam is unsorted, exiting") 248 | sys.exit() 249 | 250 | 251 | 252 | 253 | elif bam.header['HD']['SO'] == 'coordinate': 254 | 255 | bam.close() 256 | # this handles Circle-Map bam2bam 257 | if input_bam_name != None: 258 | sorted_bam = ps.AlignmentFile("%s" % input_bam_name) 259 | 260 | 261 | 262 | 263 | 264 | else: 265 | if verbose < 2: 266 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:")) 267 | warnings.warn( 268 | "WARNING: the bam file does not have an SO tag.\nCircle-Map cannot check if the bam file is sorted by coordinate.\n If the bam file is not sorted by coordinate the program will file") 269 | print( 270 | "As sanity check, sort your bam file coordinate with the following command:\n\n\tsamtools sort -o output.bam input.bam") 271 | 272 | 273 | else: 274 | 275 | if verbose < 2: 276 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:")) 277 | warnings.warn( 278 | "WARNING: the bam file does not have an HD tag.\nCircle-Map cannot check if the bam file is sorted by coordinate.\n If the bam file is not sorted by coordinate the program will file") 279 | print( 280 | "As sanity check, sort your bam file coordinate with the following command:\n\n\tsamtools sort -o output.bam input.bam") 281 | 282 | 283 | #from bam to BedGraph 284 | 285 | sp.call("bedtools genomecov -bg -ibam %s | sort -T temp_files_%s -k 1,1 -k2,2n | mergeBed -d %s -c 4 -o mean | sort -r -n -k 4,4 > temp_files_%s/peaks.bed" % 286 | (input_bam_name,pid,clusters,pid),shell=True) 287 | 288 | #Decide number of chunks 289 | chunks = cores * 100 290 | #Create empty list 291 | split_peaks = [] 292 | for i in range(0, chunks): 293 | split_peaks.append([]) 294 | 295 | #put chunks in the list 296 | counter = 0 297 | for interval in bt.BedTool("temp_files_%s/peaks.bed" % pid): 298 | if counter == chunks: 299 | counter = 0 300 | 301 | if int(interval[2]) - int(interval[1]) > 500: 302 | w_start = int(interval[1]) 303 | while w_start < int(interval[2]): 304 | splitted = [interval.chrom, str(w_start), str(w_start + 300)] 305 | w_start += 300 306 | if counter == chunks: 307 | counter = 0 308 | split_peaks[counter].append(splitted) 309 | else: 310 | split_peaks[counter].append(splitted) 311 | counter +=1 312 | else: 313 | split_peaks[counter].append([interval.chrom, str(interval.start), str(interval.end)]) 314 | counter += 1 315 | 316 | 317 | return(sorted_bam,split_peaks) 318 | 319 | 320 | def get_mate_intervals(sorted_bam,interval,mapq_cutoff,verbose,only_discordants): 321 | 322 | """Function that takes as input a sorted bam, an interval and the mapq cutoff and returns the mate alignment positions 323 | (the realignment prior) intervals""" 324 | 325 | try: 326 | 327 | 328 | 329 | candidate_mates = [] 330 | for read in sorted_bam.fetch(interval['chrom'], int(interval['start']), int(interval['end']),multiple_iterators=True): 331 | 332 | if read.mapq >= mapq_cutoff: 333 | 334 | # create mate interval based on the soft-clipped SA alignments 335 | if is_soft_clipped(read) == True and read.has_tag('SA'): 336 | if only_discordants != True: 337 | 338 | read_chr = sorted_bam.get_reference_name(read.reference_id) 339 | suplementary = read.get_tag('SA') 340 | 341 | # [chr, left_most start, "strand,CIGAR,mapq, edit_distance] 342 | supl_info = [x.strip() for x in suplementary.split(',')] 343 | 344 | if read_chr == supl_info[0] and int(supl_info[4]) >= mapq_cutoff: 345 | 346 | # split read with the same orientation 347 | if (read.is_reverse == True and supl_info[2] == '-') or ( 348 | read.is_reverse == False and supl_info[2] == '+'): 349 | 350 | # SA is downstream, the interval is start, start+read length 351 | 352 | if read.reference_start > int(supl_info[1]): 353 | 354 | ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) 355 | 356 | # ref_alignment_length * 2 is done for extending the realignment region 357 | # "SA" means that the realignment prior has been generated by a supplementary alignment 358 | # L means that the SA is aligned to to a rightmost part. 359 | 360 | mate_interval = [interval['chrom'], int(supl_info[1]) - (ref_alignment_length), 361 | (int(supl_info[1]) + (ref_alignment_length)), "SA", "L",str( 362 | 1-phred_to_prob(np.array(int(supl_info[4]),dtype=np.float64)))] 363 | 364 | candidate_mates.append(mate_interval) 365 | 366 | 367 | # SA is upstream, the interval is end - read length, end 368 | elif read.reference_start < int(supl_info[1]): 369 | 370 | ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) 371 | 372 | # ref_alignment_length * 2 is done for extending the realignment region, "SA" means that the realignment prior has been generated 373 | # by a supplementary alignment. R means that the SA is aligned to to a rightmost part. 374 | 375 | mate_interval = [interval['chrom'], (int(supl_info[1]) - (ref_alignment_length)), 376 | int(supl_info[1]) + (ref_alignment_length), "SA", "R",str(1-phred_to_prob(np.array(int(supl_info[4]),dtype=np.float64)))] 377 | 378 | candidate_mates.append(mate_interval) 379 | else: 380 | pass 381 | 382 | 383 | 384 | 385 | # check discordant reads (R2F1 orientation) 386 | elif read.is_unmapped == False and read.mate_is_unmapped == False: 387 | 388 | # check R2F1 orientation,when the R2 read 389 | if read.is_reverse == True and read.mate_is_reverse == False: 390 | 391 | # R2F1 order 392 | if read.reference_start < read.next_reference_start: 393 | 394 | if read.reference_id == read.next_reference_id: 395 | # create mate interval 396 | read_length = read.infer_query_length() 397 | 398 | # DR means that the realignment prior has been generated by the discordants. R means 399 | # that the mate has been aligned to a rightmost part 400 | 401 | 402 | 403 | 404 | 405 | mate_interval = [interval['chrom'], read.next_reference_start, 406 | (read.next_reference_start + read_length), "DR", 407 | "R",str(1-phred_to_prob(np.array(read.get_tag('MQ'),dtype=np.float64)))] 408 | candidate_mates.append(mate_interval) 409 | 410 | 411 | # R2F1 when iterating trough F1 read 412 | elif read.is_reverse == False and read.mate_is_reverse == True: 413 | 414 | if read.next_reference_start < read.reference_start: 415 | 416 | if read.reference_id == read.next_reference_id: 417 | # create mate interval 418 | read_length = read.infer_query_length() 419 | 420 | # L means that the mate is aligned to a leftmost part 421 | 422 | mate_interval = [interval['chrom'], read.next_reference_start, 423 | (read.next_reference_start + read_length), 424 | "DR", "L",str(1-phred_to_prob(np.array(read.get_tag('MQ'),dtype=np.float64)))] 425 | candidate_mates.append(mate_interval) 426 | else: 427 | 428 | if only_discordants != True: 429 | # soft clipped without and SA and hard clipped reads (secondary) 430 | 431 | 432 | if is_soft_clipped(read) == True and read.has_tag('SA') == False: 433 | # mate interval is whole chromosome 434 | 435 | if 'SQ' in sorted_bam.header: 436 | 437 | for reference in sorted_bam.header['SQ']: 438 | 439 | if reference['SN'] == sorted_bam.get_reference_name(read.reference_id): 440 | # LR is added just not to crash the program 441 | 442 | mate_interval = [interval['chrom'], 1, reference['LN'], "SC", "LR",0] 443 | 444 | candidate_mates.append(mate_interval) 445 | 446 | 447 | else: 448 | 449 | if verbose < 2: 450 | 451 | warnings.warn( 452 | "WARNING: the bam file does not have a SQ tag. Circle-Map cannot check the reference length for realigning\n" 453 | "soft clipped reads without a SA tag, hence, skipping. Please, check if your bam file is truncated") 454 | 455 | elif is_hard_clipped(read): 456 | 457 | # all hard clipped reads have SA tag with bwa, but just as sanity 458 | 459 | if read.has_tag('SA'): 460 | 461 | read_chr = sorted_bam.get_reference_name(read.reference_id) 462 | 463 | suplementary = read.get_tag('SA') 464 | 465 | # [chr, left_most start, "strand,CIGAR,mapq, edit_distance] 466 | supl_info = [x.strip() for x in suplementary.split(',')] 467 | 468 | if read_chr == supl_info[0] and int(supl_info[4]) >= mapq_cutoff: 469 | 470 | # SA alignment with the same orientation 471 | if (read.is_reverse == True and supl_info[2] == '-') or ( 472 | read.is_reverse == False and supl_info[2] == '+'): 473 | 474 | # SA is downstream, the interval is start, start+read length 475 | 476 | if read.reference_start > int(supl_info[1]): 477 | 478 | ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) 479 | 480 | # ref_alignment_length * 2 is done for extending the realignment region 481 | # "SA" means that the realignment prior has been generated by a supplementary alignment 482 | # L means that the SA is in a downstream region 483 | 484 | mate_interval = [interval['chrom'], int(supl_info[1]) - (ref_alignment_length * 2), 485 | (int(supl_info[1]) + (ref_alignment_length * 2)), "SA", "L",str(1-phred_to_prob(int(supl_info[4])))] 486 | 487 | candidate_mates.append(mate_interval) 488 | 489 | 490 | # SA is upstream, the interval is end - read length, end 491 | elif read.reference_start < int(supl_info[1]): 492 | 493 | ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) 494 | 495 | # ref_alignment_length * 2 is done for extending the realignment region, "SA" means that the realignment prior has been generated 496 | # by a supplementary alignment 497 | # R means that the SA is in a upstream region 498 | 499 | mate_interval = [interval['chrom'], 500 | (int(supl_info[1]) - (ref_alignment_length * 2)), 501 | int(supl_info[1]) + (ref_alignment_length * 2), "SA", "R",str(1-phred_to_prob(int(supl_info[4])))] 502 | 503 | 504 | 505 | candidate_mates.append(mate_interval) 506 | else: 507 | pass 508 | 509 | 510 | 511 | else: 512 | # low mapping quality reads, do nothing 513 | pass 514 | 515 | #this function should return the candidate mates (realignment prior, discordant intervals/split read intervals and soft-clipped reads) 516 | return(candidate_mates) 517 | 518 | except BaseException as e: 519 | 520 | warnings.warn( 521 | "WARNING: Could not get mate interval priors for the interval %s due to the following error %s \n Skipping interval" % (str(interval),str(e))) 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | def insert_size_dist(sample_size,mapq_cutoff,qname_bam): 534 | """Function that takes as input a queryname sorted bam and computes the mean insert a size and 535 | the standard deviation from. This number is computed from the F1R2 read with a user defined sample size, 536 | using a user defined mapping quality cutoff in both reads""" 537 | 538 | 539 | whole_bam = ps.AlignmentFile(qname_bam, "rb") 540 | 541 | counter = 0 542 | insert_length = [] 543 | read1 = '' 544 | 545 | # this is similar to the code of read extractor. I save the first read in memory and then I operate 546 | # in both reads together 547 | for read in whole_bam: 548 | 549 | 550 | if read.is_read1: 551 | read1 = read 552 | else: 553 | if read.is_read2 and read.qname == read1.qname: 554 | read2 = read 555 | # both reads in memory 556 | if read1.mapq >= mapq_cutoff and read2.mapq >= mapq_cutoff: 557 | if read1.is_proper_pair: 558 | if is_hard_clipped(read1) == False and is_hard_clipped(read2) == False: 559 | if is_soft_clipped(read1) == False and is_soft_clipped(read2) == False: 560 | if read1.is_reverse == False and read2.is_reverse == True: 561 | if read1.tlen > 0: 562 | insert_length.append(read1.tlen) 563 | counter += 1 564 | 565 | if counter >= sample_size: 566 | break 567 | else: 568 | pass 569 | 570 | 571 | mean = np.mean(insert_length) 572 | std = np.std(insert_length) 573 | return(mean, std) 574 | 575 | def normalize_probability_matrix(pandas_df): 576 | return(pandas_df) 577 | 578 | def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,verbose): 579 | 580 | 581 | """Function that takes as input a bed file with the read type information and will remove the soft-clipped if there 582 | are more informative priors (DR,SA). If there are only soft-clipped reads, they will be saved to a bed file to attemp 583 | lonely soft-clipped read rescue""" 584 | 585 | try: 586 | 587 | labels = ['chrom', 'start', 'end', 'read_type', 'orientation','probability'] 588 | candidate_mates_dataframe = pd.DataFrame.from_records(bed_prior, columns=labels) 589 | 590 | read_types = candidate_mates_dataframe.read_type.unique() 591 | orientation = candidate_mates_dataframe.orientation.unique() 592 | 593 | 594 | 595 | 596 | 597 | #this contains the sumatory over all probabilities 598 | sum = 0 599 | 600 | 601 | if np.any(read_types == 'SC') == False: 602 | 603 | 604 | # nothing. Sort and merge 605 | 606 | 607 | candidate_mates_dataframe = candidate_mates_dataframe.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) 608 | candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) 609 | 610 | candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) 611 | 612 | sum = np.sum(float(x[3]) for index, x in candidate_mates.iterrows()) 613 | candidate_mates['probability'] = candidate_mates['probability'] / sum 614 | 615 | 616 | elif np.any(read_types == 'SC') == True and (np.any(read_types == 'DR') == True or np.any(read_types == 'SA') == True): 617 | #remove lines with sc 618 | 619 | candidate_mates_no_sc = candidate_mates_dataframe.drop(candidate_mates_dataframe[candidate_mates_dataframe.read_type == 'SC'].index) 620 | candidate_mates_dataframe = candidate_mates_no_sc.sort_values(by=['chrom', 'start', 'end'],ascending=[True, True, True]) 621 | candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) 622 | 623 | 624 | candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) 625 | 626 | 627 | sum = np.sum(float(x[3]) for index,x in candidate_mates.iterrows()) 628 | candidate_mates['probability'] = candidate_mates['probability']/sum 629 | 630 | 631 | 632 | else: 633 | #only soft clipped 634 | 635 | return(None) 636 | 637 | extended = [] 638 | 639 | 640 | 641 | 642 | #if argmax is turn on interval_p is 0 643 | if interval_p_cutoff == 0: 644 | #argmax(probability) 645 | 646 | candidate_mates = candidate_mates.loc[candidate_mates['probability'] == candidate_mates['probability'].max()] 647 | 648 | for item,row in candidate_mates.iterrows(): 649 | 650 | if ('LR' in orientation) or ('L' and 'R' in orientation): 651 | 652 | 653 | start = row['start'] - interval_extension 654 | 655 | end = row['end'] + interval_extension 656 | 657 | if start < 0: 658 | extended.append([row['chrom'], str(0), int(round(end)),float(row['probability'])]) 659 | 660 | else: 661 | extended.append([row['chrom'], int(round(start)), int(round(end)),float(row['probability'])]) 662 | 663 | elif 'L' in orientation: 664 | 665 | start = row['start'] - interval_extension 666 | 667 | if start < 0: 668 | extended.append([row['chrom'], str(0), row['end'],float(row['probability'])]) 669 | 670 | else: 671 | extended.append([row['chrom'], int(round(start)), row['end'],float(row['probability'])]) 672 | 673 | elif 'R' in orientation: 674 | 675 | end = row['end'] + interval_extension 676 | 677 | extended.append([row['chrom'], row['start'], int(round(end)),float(row['probability'])]) 678 | 679 | return (pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability'])) 680 | 681 | else: 682 | 683 | for index,interval in candidate_mates.iterrows(): 684 | 685 | #small pseudocount to denominator to avoid div by zero 686 | 687 | 688 | if interval['probability'] >= interval_p_cutoff: 689 | 690 | if ('LR' in orientation) or ('L' and 'R' in orientation): 691 | 692 | 693 | start = interval['start'] - interval_extension 694 | 695 | end = interval['end'] + interval_extension 696 | 697 | if start < 0: 698 | extended.append([interval['chrom'], str(0), int(round(end)),float(interval['probability'])]) 699 | 700 | else: 701 | extended.append([interval['chrom'], int(round(start)), int(round(end)),float(interval['probability'])]) 702 | 703 | elif 'L' in orientation: 704 | 705 | start = interval['start'] - interval_extension 706 | 707 | if start < 0: 708 | extended.append([interval['chrom'], str(0), interval['end'],float(interval['probability'])]) 709 | 710 | else: 711 | extended.append([interval['chrom'], int(round(start)), interval['end'],float(interval['probability'])]) 712 | 713 | elif 'R' in orientation: 714 | 715 | end = interval['end'] + interval_extension 716 | 717 | extended.append([interval['chrom'], interval['start'], int(round(end)),float(interval['probability'])]) 718 | 719 | 720 | return(pd.DataFrame.from_records(extended,columns=['chrom','start','end','probability']).sort_values(by=['probability'],ascending=[False])) 721 | 722 | 723 | except BaseException as e: 724 | 725 | 726 | warnings.warn( 727 | "WARNING: Could not compute the probability for the mate interval priors %s due to the following error %s \n Skipping intervals" % ( 728 | str(bed_prior), str(e))) 729 | 730 | 731 | 732 | 733 | def circle_from_SA(read,mapq_cutoff,mate_interval): 734 | 735 | """Function that takes as input a read (soft-clipped) with a Suplementary alignment the mapping quality cut-off 736 | and the mate intervals and checks if it fits the conditions to call a circle. Will return True if the supplementary 737 | alignment matches the interval""" 738 | 739 | suplementary = read.get_tag('SA') 740 | 741 | #this list will have the following information [chr,left_most start,"strand,CIGAR,mapq, edit_distance] 742 | 743 | supl_info = [x.strip() for x in suplementary.split(',')] 744 | 745 | #mapq filter 746 | if int(supl_info[4]) > mapq_cutoff: 747 | #chromosome filter 748 | if supl_info[0] == mate_interval['chrom']: 749 | #aligned to the mate interval 750 | if int(mate_interval['start']) < int(supl_info[1]) < int(mate_interval['end']): 751 | 752 | #orientation 753 | if read.is_reverse == True and supl_info[2] == '-': 754 | return{'support' : True, 'leftmost': int(supl_info[1]), 'cigar' : supl_info[3]} 755 | 756 | elif read.is_reverse == False and supl_info[2] == '+': 757 | 758 | return{'support' : True, 'leftmost' : int(supl_info[1]), 'cigar' : supl_info[3]} 759 | 760 | else: 761 | 762 | return{'support' : False} 763 | 764 | else: 765 | 766 | return{'support' : False} 767 | 768 | else: 769 | return{'support' : False} 770 | @jit(nopython=True) 771 | def number_encoding(seq): 772 | """Function that takes as input a DNA sequence, and encodes the sequence to numbers, so that it can be accelerated 773 | with numba""" 774 | encoded = [] 775 | for i in seq: 776 | if i == "A": 777 | encoded.append(1) 778 | elif i == "T": 779 | encoded.append(2) 780 | elif i == "C": 781 | encoded.append(3) 782 | elif i == "G": 783 | encoded.append(4) 784 | return(np.array(encoded)) 785 | 786 | 787 | def check_alphabet(sequence): 788 | """Function that takes as input a sequence and it will check that there is at least a letter matching the alphabet 789 | in the sequence, returning true.""" 790 | 791 | code = "ATCG" 792 | 793 | for base in sequence: 794 | if base in code: 795 | return(True) 796 | return(False) 797 | 798 | def check_compatibility(seq1,seq2): 799 | """Function that takes as input two DNA sequence and checks whether their alphabets have at least one element 800 | in common. This due to an old bug in edlib""" 801 | 802 | for base in seq1: 803 | 804 | for base2 in seq2: 805 | 806 | if base == base2: 807 | 808 | return(True) 809 | 810 | return(False) 811 | 812 | @jit(nopython=True) 813 | def phred_to_prob(values): 814 | """Function that takes as input a numpy array with phred base quality scores and returns an array with base probabi- 815 | lity scores""" 816 | return(10**((values*-1)/10)) 817 | 818 | def get_longest_soft_clipped_bases(read): 819 | """Function that takes as input the cigar string and returns a dictionary containing the longest soft-clipped part of 820 | the read, the quality values and the read mapping quality""" 821 | 822 | read_cigar = read.cigar 823 | 824 | 825 | #get index of the soft-clipped in the cigar 826 | match_index = [x for x in range(len(read_cigar)) if read_cigar[x][0] == 4] 827 | 828 | 829 | # soft-clipped in only one side 830 | if len(match_index) == 1: 831 | 832 | #return first n soft-clipped 833 | if match_index == [0]: 834 | return{'seq': read.seq[0:read_cigar[0][1]],'qual': read.query_qualities[0:read_cigar[0][1]],'mapq':read.mapq} 835 | 836 | #return last n nucleotides 837 | elif match_index[0] == (len(read_cigar)-1): 838 | 839 | return {'seq':read.seq[-read_cigar[match_index[0]][1]:], 840 | 'qual':read.query_qualities[-read_cigar[match_index[0]][1]:],'mapq':read.mapq} 841 | 842 | 843 | 844 | 845 | 846 | 847 | #soft-clipped in both sides of the read 848 | else: 849 | 850 | #make sure that is soft-clipped on both sides 851 | 852 | try: 853 | 854 | assert read_cigar[0][0] == 4 and read_cigar[-1][0] == 4 855 | 856 | # longest soft-clipped are first n nucleotides 857 | if read_cigar[0][1] >= read_cigar[-1][1]: 858 | 859 | 860 | 861 | return {'seq': read.seq[0:read_cigar[0][1]],'qual': read.query_qualities[0:read_cigar[0][1]], 862 | 'mapq':read.mapq} 863 | 864 | else: 865 | 866 | return{'seq':read.seq[-read_cigar[-1][1]:],'qual': read.query_qualities[-read_cigar[-1][1]:], 867 | 'mapq': read.mapq} 868 | 869 | except AssertionError as e: 870 | 871 | print(e) 872 | 873 | def background_freqs(seq): 874 | """Function that takes as input the sequence of the nucletide frequencies in the realignment interval""" 875 | 876 | return{nucleotide: seq.count(nucleotide)/len(seq) for nucleotide in 'ATCG'} 877 | 878 | 879 | 880 | 881 | 882 | def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freqs,gap_open,gap_extend,verbose,max_edit): 883 | 884 | 885 | """Function that takes as input a read, the number of hits to find and the plus and minus strand and will return 886 | the number of hits, the sequencing qualities for that read and the g+c content of the realignment interval""" 887 | 888 | 889 | #get soft-clipped read 890 | soft_clipped_read = get_longest_soft_clipped_bases(read) 891 | 892 | #encoding of DNA and operations A,T,C,G,=,X,DI. THis is done for Numba 893 | nuc_and_ops = np.array([1,2,3,4,5,6,7]) 894 | encoded_nucs = number_encoding(soft_clipped_read['seq']) 895 | 896 | hits = 0 897 | 898 | min_score = len(soft_clipped_read['seq']) 899 | 900 | 901 | top_hits = {} 902 | 903 | 904 | if read.is_reverse: 905 | 906 | while hits < n_hits and min_score >= -10: 907 | 908 | alignment = edlib.align(soft_clipped_read['seq'], minus_strand, mode='HW', task='path') 909 | if hits ==0: 910 | if alignment['editDistance'] > max_edit: 911 | return(None) 912 | 913 | 914 | 915 | for location in alignment['locations']: 916 | 917 | 918 | 919 | mask_bases = 'X' * ( location[1] - location[0]) 920 | 921 | 922 | minus_strand = minus_strand[:location[0]] + mask_bases + minus_strand[location[1]:] 923 | 924 | hits += 1 925 | 926 | 927 | score = pssm(phred_to_prob(np.array(soft_clipped_read['qual'],dtype=np.float64)), encoded_nucs, 928 | edlib_cigar_to_iterable(alignment['cigar']),minus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) 929 | 930 | if score < min_score: 931 | min_score = score 932 | 933 | 934 | top_hits[hits] = (location,alignment['cigar'],score,alignment['editDistance'],"-") 935 | 936 | else: 937 | # the search was exaustive 938 | hits +=n_hits 939 | 940 | else: 941 | #min socre stops the search if the score is orders of magnitude smaller that the top score given the edit 942 | #distance 943 | while hits < n_hits and min_score >= -10: 944 | 945 | 946 | 947 | alignment = edlib.align(soft_clipped_read['seq'], plus_strand, mode='HW', task='path') 948 | #stop search if edit distance is to high 949 | if hits ==0: 950 | if alignment['editDistance'] > max_edit: 951 | return (None) 952 | 953 | 954 | for location in alignment['locations']: 955 | 956 | mask_bases = 'X' * ( location[1] - location[0]) 957 | 958 | plus_strand = plus_strand[:location[0]] + mask_bases + plus_strand[location[1]:] 959 | 960 | hits += 1 961 | 962 | score = pssm(phred_to_prob(np.array(soft_clipped_read['qual'],dtype=np.float64)), encoded_nucs, 963 | edlib_cigar_to_iterable(alignment['cigar']), plus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) 964 | 965 | if score < min_score: 966 | min_score = score 967 | 968 | top_hits[hits] = (location,alignment['cigar'],score,alignment['editDistance'],"+") 969 | 970 | else: 971 | 972 | hits +=n_hits 973 | 974 | 975 | 976 | 977 | return({'alignments':top_hits,'mapq_prior': soft_clipped_read['mapq']}) 978 | 979 | 980 | def edlib_cigar_to_iterable(edlib_cigar): 981 | """Function that takes as input the edlib cigar and parses it to get it in a iterable manner""" 982 | #encoding of DNA and operations A,T,C,G,=,X,ID 983 | #nuc_and_ops = np.array([1,2,3,4,5,6,7]) 984 | 985 | length = [] 986 | operations = [] 987 | 988 | for i in re.findall(r'\d+[IDX=]',edlib_cigar): 989 | length.append(int(i[0])) 990 | if i[1] == '=': 991 | operations.append(5) 992 | elif i[1] == 'X': 993 | operations.append(6) 994 | elif i[1] == 'I' or 'D': 995 | operations.append(7) 996 | 997 | 998 | return(np.array(length),np.array(operations)) 999 | 1000 | 1001 | @jit(nopython=True) 1002 | def pssm(seq_prob,seq_nucl,iterable_cigar,base_freqs,gap_open,gap_extend,nuc_and_ops,verbose): 1003 | """Function that takes as input the sequencing probabilities and cigar string and returns the log2 pssm of the read""" 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | #start positon to operate in the pssm. This is done to iterate over the operations in the cigar, and keep track of 1010 | # were I am in the seq and quality values 1011 | seq_pos = 0 1012 | indel_penalty = 0 1013 | 1014 | 1015 | 1016 | #iterate trough CIGAR operations 1017 | for index in range(0,len(iterable_cigar[0])): 1018 | 1019 | operation_length = iterable_cigar[0][index] 1020 | end = operation_length + seq_pos 1021 | 1022 | 1023 | 1024 | operation = iterable_cigar[1][index] 1025 | 1026 | 1027 | #match, 1 minus prob(base called wrong) 1028 | if operation == nuc_and_ops[4]: 1029 | 1030 | for nucleotide in range(seq_pos,end): 1031 | 1032 | if seq_nucl[nucleotide] == nuc_and_ops[0]: 1033 | 1034 | seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide]))/base_freqs[0]) 1035 | 1036 | elif seq_nucl[nucleotide] == nuc_and_ops[1]: 1037 | 1038 | seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide])) / base_freqs[1]) 1039 | 1040 | elif seq_nucl[nucleotide] == nuc_and_ops[2]: 1041 | 1042 | seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide])) / base_freqs[2]) 1043 | 1044 | elif seq_nucl[nucleotide] == nuc_and_ops[3]: 1045 | 1046 | seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide])) / base_freqs[3]) 1047 | 1048 | 1049 | 1050 | seq_pos += operation_length 1051 | 1052 | 1053 | 1054 | elif operation == nuc_and_ops[5]: 1055 | 1056 | 1057 | for nucleotide in range(seq_pos,end): 1058 | 1059 | if seq_nucl[nucleotide] == nuc_and_ops[0]: 1060 | 1061 | seq_prob[nucleotide] = np.log2( 1062 | (seq_prob[nucleotide]/3)/base_freqs[0]) 1063 | 1064 | 1065 | elif seq_nucl[nucleotide] == nuc_and_ops[1]: 1066 | 1067 | seq_prob[nucleotide] = np.log2( 1068 | (seq_prob[nucleotide]/3)/base_freqs[1]) 1069 | 1070 | elif seq_nucl[nucleotide] == nuc_and_ops[2]: 1071 | 1072 | seq_prob[nucleotide] = np.log2( 1073 | (seq_prob[nucleotide]/3)/base_freqs[2]) 1074 | 1075 | 1076 | elif seq_nucl[nucleotide] == nuc_and_ops[3]: 1077 | 1078 | seq_prob[nucleotide] = np.log2( 1079 | (seq_prob[nucleotide]/3)/base_freqs[3]) 1080 | 1081 | 1082 | 1083 | 1084 | elif seq_nucl[nucleotide] == nuc_and_ops[6]: 1085 | 1086 | if verbose < 2: 1087 | 1088 | seq_prob[nucleotide] = 0 1089 | print("Warning:Ambiguous base found in nucleotide sequence. Assigning score of 0 in the log2 pssm") 1090 | 1091 | seq_pos += operation_length 1092 | 1093 | 1094 | elif operation == nuc_and_ops[6]: 1095 | 1096 | #affine gap scoring model 1097 | indel_penalty += gap_open + gap_extend*(operation_length-1) 1098 | 1099 | 1100 | return(np.sum(seq_prob)-indel_penalty) 1101 | 1102 | 1103 | def realignment_probability(hit_dict,interval_length): 1104 | """Function that takes as input the realignment dictionary and returns the alignment probability of the best hit""" 1105 | 1106 | 1107 | best_hit = hit_dict['alignments'][1][2] 1108 | 1109 | #this might be included on the denominator 1110 | try: 1111 | posterior = 2**best_hit/(np.sum((2**value[2]) for key,value in hit_dict['alignments'].items())) 1112 | except ZeroDivisionError as e: 1113 | print(e) 1114 | warnings.warn("ZeroDivisionError caught while computing the realignment posterior probability." 1115 | "Setting posterior probability to 0") 1116 | posterior = 0 1117 | return(posterior) 1118 | 1119 | 1120 | 1121 | 1122 | 1123 | 1124 | 1125 | def fraction(start1,start2,end1,end2,read1,read2): 1126 | """Function that performs a first round of merging. If the realigned intervals and SA intervals overlap, and are ca- 1127 | lled within the same iteration (which means that it is the same circle probably) they will be merged""" 1128 | 1129 | #check that they come from the same read 1130 | read_match = (read1 == read2)*1 1131 | 1132 | 1133 | #calculate distance between the two intervals 1134 | distance = (abs(start1-start2) + abs(end1-end2)) 1135 | 1136 | #overlap of interval 1 on interval 2 1137 | one_overlap_two = 1 - (distance/(end1-start1)) 1138 | #overlap of interval two on interval 1 1139 | two_overlap_one = 1 - (distance/(end2-start2)) 1140 | 1141 | return(one_overlap_two + two_overlap_one + read_match) 1142 | 1143 | 1144 | 1145 | def merge_fraction(chrom1,x1,x2,chrom2,y1,y2): 1146 | """compute overlap (reciprocal) of the interval y over interval x""" 1147 | 1148 | distance = (np.minimum(x2.values,y2.values) - np.maximum(x1.values,y1.values)) 1149 | 1150 | 1151 | 1152 | one_overlap_two = distance/(y2.values-y1.values) 1153 | 1154 | two_overlap_one = distance/(x2.values-x1.values) 1155 | 1156 | 1157 | # check if they are on the same chromosome and the amount of overlap if so 1158 | return(pd.Series(chrom1 == chrom2) + pd.Series(two_overlap_one.clip(0)) + pd.Series(one_overlap_two.clip(0))) 1159 | 1160 | 1161 | def iteration_merge(only_discordants,results,fraction,splits,score,sc_len,bam,af,insert,std,n_discordant): 1162 | """finction that merges the results of every iteration and filters the data by allele frequency""" 1163 | 1164 | norm_fraction = 3 1165 | 1166 | parsed_discordants = [] 1167 | for interval in only_discordants: 1168 | interval.append(0) 1169 | parsed_discordants.append(interval) 1170 | 1171 | 1172 | discordant_bed = bt.BedTool(parsed_discordants) 1173 | 1174 | 1175 | 1176 | 1177 | 1178 | unparsed_pd = pd.DataFrame.from_records(results, 1179 | columns=['chrom', 'start', 'end', 'read', 'iteration','score', 'discordants']) 1180 | 1181 | unparsed_pd = unparsed_pd.sort_values(['iteration','chrom','start','end']).reset_index() 1182 | 1183 | 1184 | grouped = unparsed_pd.groupby(merge_fraction(unparsed_pd.iteration.shift(), unparsed_pd.start.shift(), 1185 | unparsed_pd.end.shift(), unparsed_pd.iteration, 1186 | unparsed_pd.start, 1187 | unparsed_pd.end).lt(norm_fraction).cumsum()).agg( 1188 | {'chrom': 'first', 'start': 'min', 'end': 'max', 'discordants': 'max', 'read': 'sum','score':'sum'}) 1189 | 1190 | bedtool_output = bt.BedTool.from_dataframe(grouped) 1191 | 1192 | 1193 | 1194 | 1195 | 1196 | 1197 | allele_free = bedtool_output.cat(discordant_bed, postmerge=False) 1198 | write = [] 1199 | 1200 | for interval in allele_free: 1201 | try: 1202 | if int(interval[4]) != 0: 1203 | if (int(interval[4])) >= splits and float(interval[5]) > score: 1204 | start_cov = bam.count(contig=interval[0], 1205 | start=int(interval[1]), stop=int(interval[1])+1 1206 | ,read_callback='nofilter') 1207 | 1208 | end_cov = bam.count(contig=interval[0], 1209 | start=int(interval[2])-1, stop=int(interval[2]) 1210 | ,read_callback='nofilter') 1211 | 1212 | 1213 | 1214 | circle_af = ((int(interval[4]) * 2)) / ((start_cov+end_cov+0.01)/2) 1215 | if circle_af >=af: 1216 | write.append(interval) 1217 | else: 1218 | if int(interval[3]) >= n_discordant: 1219 | start_cov = bam.count(contig=interval[0],start=int(interval[1]), stop=int(interval[1]) + 1, 1220 | read_callback='nofilter') 1221 | 1222 | end_cov = bam.count(contig=interval[0], 1223 | start=int(interval[2]) - 1, stop=int(interval[2]), 1224 | read_callback='nofilter') 1225 | 1226 | circle_af = (int(interval[3])) / ((start_cov+end_cov+0.01)/2) 1227 | 1228 | if circle_af >= af: 1229 | write.append(interval) 1230 | except BaseException as e: 1231 | print(e) 1232 | pass 1233 | 1234 | return(bt.BedTool(write)) 1235 | 1236 | 1237 | 1238 | 1239 | 1240 | def merge_final_output(bam,results,begin,splits,dir,fraction,pid): 1241 | 1242 | """Function that takes as input the final results, and merge reciprocal intervals (this is done to combine the output 1243 | of different clusters)""" 1244 | 1245 | 1246 | 1247 | bam = ps.AlignmentFile(bam, "rb") 1248 | os.chdir("temp_files_%s/" % pid) 1249 | 1250 | # multiply *2 for reciprocal overlap +1 to check chromosome 1251 | norm_fraction = (fraction*2)+1 1252 | 1253 | unparsed_bed = bt.BedTool(os.path.basename(results)) 1254 | 1255 | 1256 | 1257 | 1258 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Writting final output to disk") 1259 | 1260 | 1261 | unparsed_pd = unparsed_bed.to_dataframe( 1262 | names=['chrom', 'start', 'end', 'discordants', 'sc','score']) 1263 | 1264 | 1265 | 1266 | second_merging_round = unparsed_pd.sort_values(['chrom', 'start', 'end']).reset_index() 1267 | #merge the output 1268 | # merge_fraction calculates the degree of overlap between the two genomic intervals 1269 | #lt(norm_freaction) looks the ones that surpass the merging threshold (returns 0 if true, 1 if not) 1270 | # Cumsum calculates the cumulative sum over the output of lt. Which is then used for the grouping. 1271 | #If the cumulative sum is the same for two rows, they are merged 1272 | final_output = second_merging_round.groupby( 1273 | merge_fraction(second_merging_round.chrom.shift(), second_merging_round.start.shift(), 1274 | second_merging_round.end.shift(),second_merging_round.chrom,second_merging_round.start,second_merging_round.end).lt(norm_fraction).cumsum()).agg( 1275 | {'chrom': 'first', 'start': 'min', 'end': 'max', 'discordants' : 'max', 'sc': 'sum','score':'sum'}) 1276 | 1277 | unfiltered_output = bt.BedTool.from_dataframe(final_output) 1278 | 1279 | # filter splits 1280 | 1281 | filtered = [] 1282 | for interval in unfiltered_output: 1283 | 1284 | if (int(interval[4])+int(interval[3])) >= splits: 1285 | if int(interval[1]) != 0: 1286 | interval[1] = int(interval[1])+1 1287 | filtered.append(interval) 1288 | 1289 | filtered_output = bt.BedTool(filtered) 1290 | 1291 | os.chdir("%s" % dir) 1292 | 1293 | 1294 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Finished!") 1295 | 1296 | end = time.time() 1297 | 1298 | total_time = (end - begin) / 60 1299 | 1300 | 1301 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Circle-Map Realign finished indentifying circles in %s \n" % total_time) 1302 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Circle-Map has identified %s circles\n" % len(filtered_output)) 1303 | 1304 | 1305 | 1306 | return(filtered_output) 1307 | 1308 | 1309 | def write_to_disk(partial_bed,output,locker,dir,pid): 1310 | 1311 | """function that writes to disk the results of every worker thread""" 1312 | 1313 | 1314 | locker.acquire() 1315 | os.chdir("%s/temp_files_%s/" % (dir,pid)) 1316 | output_bed = bt.BedTool('%s' % os.path.basename(output)) 1317 | writer_bed = output_bed.cat(partial_bed,postmerge=False) 1318 | writer_bed.saveas('%s' % os.path.basename(output)) 1319 | os.chdir("%s" % dir) 1320 | locker.release() 1321 | 1322 | def start_realign(circle_bam,output,threads,verbose,pid,clusters): 1323 | """Function that start the realigner function 1324 | - Splits the clusters to cores and removes the from disk the bedtools intermediates""" 1325 | 1326 | begin = time.time() 1327 | 1328 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Realigning reads using Circle-Map\n") 1329 | 1330 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Clustering structural variant reads\n") 1331 | 1332 | 1333 | eccdna_bam = ps.AlignmentFile("%s" % circle_bam, "rb") 1334 | 1335 | sp.call("mkdir temp_files_%s" % pid, shell=True) 1336 | 1337 | 1338 | sorted_bam,splitted = bam_circ_sv_peaks(eccdna_bam,circle_bam,threads,verbose,pid,clusters) 1339 | 1340 | 1341 | 1342 | 1343 | # split to cores 1344 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Splitting clusters to to processors\n") 1345 | os.chdir("temp_files_%s" % pid) 1346 | sp.call("touch %s" % os.path.basename(output), shell=True) 1347 | os.chdir("../") 1348 | 1349 | #this releases from tmp file the unmerged and peak file 1350 | bt.cleanup() 1351 | 1352 | return(splitted,sorted_bam,begin) 1353 | 1354 | def start_simulate(pid): 1355 | """Function for starting Circle-Map simulate""" 1356 | 1357 | print("\nRunning Circle-Map Simulate\n") 1358 | 1359 | 1360 | sp.call("mkdir temp_files_%s" % pid, shell=True) 1361 | 1362 | 1363 | return(pid) 1364 | 1365 | def mutate(genome,pid,indel,snp,java_mem): 1366 | """Function that takes as input the path of the genome,the indel ans substitution rate, and it will create a sinthetic 1367 | genome introducing random mutations on the fasta sequence and providing a vcf""" 1368 | 1369 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Introducing mutations in the fasta genome") 1370 | print("\t Indel rate: %s" % indel) 1371 | print("\t Substitution rate: %s" % snp) 1372 | sp.call("mutate.sh %s in=%s out=temp_files_%s/mutated.fa subrate=%s indelrate=%s" % (java_mem,genome,pid,snp,indel),shell=True) 1373 | 1374 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Simulating reads") 1375 | 1376 | return(None) 1377 | 1378 | 1379 | 1380 | 1381 | 1382 | 1383 | def check_size_and_write(results,only_discortants,output,lock,directory,fraction,pid): 1384 | """Function that checks if the intervals in memory are to big. And writes them to disk to release memory.""" 1385 | 1386 | 1387 | if sys.getsizeof(results) < 100000000: 1388 | return(False) 1389 | 1390 | 1391 | else: 1392 | 1393 | partial_bed = iteration_merge(only_discortants, results,fraction) 1394 | 1395 | print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Writting %s circular intervals to disk" % len(partial_bed)) 1396 | write_to_disk(partial_bed,output,lock,directory,pid) 1397 | 1398 | return(True) 1399 | 1400 | def merge_coverage_bed(results,frac,number): 1401 | 1402 | """Function that takes as bed file containing the coordinates of the double mapped reads and 1403 | returns the merged bed file containing the information about the clusters""" 1404 | 1405 | fraction = (frac*2)+1 1406 | 1407 | unparsed_pd = pd.DataFrame.from_records(results,columns=['chrom', 'start', 'end','item']) 1408 | 1409 | 1410 | 1411 | sort = unparsed_pd.sort_values(by=['chrom', 'start', 'end']).reset_index(drop=True) 1412 | 1413 | merging_out = sort.groupby( 1414 | merge_fraction(sort.chrom, sort.start, 1415 | sort.end,sort.chrom.shift(),sort.start.shift(),sort.end.shift()).lt(fraction).cumsum()).agg( 1416 | {'chrom': 'first', 'start': 'min', 'end': 'max','item': 'sum'}) 1417 | 1418 | merging_out = merging_out.drop(merging_out[merging_out.item < number].index) 1419 | 1420 | merging_out = merging_out.sort_values(by=['chrom', 'start', 'end']).reset_index(drop=True) 1421 | 1422 | 1423 | 1424 | 1425 | 1426 | 1427 | 1428 | final_output = merging_out.groupby( 1429 | merge_fraction(merging_out.chrom, merging_out.start, 1430 | merging_out.end, merging_out.chrom.shift(), merging_out.start.shift(),merging_out.end.shift()).lt(fraction).cumsum()).agg( 1431 | {'chrom': 'first', 'start': 'min', 'end': 'last', 'item': 'sum'}) 1432 | 1433 | 1434 | bedtool_output = bt.BedTool.from_dataframe(final_output) 1435 | 1436 | return(bedtool_output) 1437 | 1438 | def filter_by_ratio(eccdna_bed,cutoff): 1439 | """Function that takes as input the eccDNA bed and returns the data filtered by tha change at the start and the end 1440 | """ 1441 | 1442 | #circle list is a shared memory object 1443 | circle_list = [] 1444 | unparsed_pd = eccdna_bed.to_dataframe( 1445 | names=['chrom', 'start', 'end', 'discordants', 'soft-clipped', 'score', 'mean','std','start_ratio','end_ratio','continuity']) 1446 | for item, row in unparsed_pd.iterrows(): 1447 | 1448 | 1449 | if float(row[8]) > cutoff or float(row[9]) > cutoff: 1450 | circle_list.append([row['chrom'],row['start'],row['end'],row['discordants'],row['soft-clipped'], 1451 | row['score'],row['mean'],row['std'],row['start_ratio'],row['end_ratio'],row['continuity']]) 1452 | 1453 | output = pd.DataFrame.from_records(circle_list,columns=['chrom', 'start', 'end', 'discordants', 'soft-clipped', 'score', 'mean','std', 1454 | 'start_ratio','end_ratio','continuity']) 1455 | 1456 | return(output) 1457 | 1458 | 1459 | def merge_bed(discordants_pd): 1460 | """Function that takes as input a bed file and returns a pandas dataframe indicating if files should be merged. This 1461 | function will merge everything that is overlapping by at least 1bp""" 1462 | #check range overlap 1463 | overlap = ((discordants_pd.start - discordants_pd.shift().end) - 1).lt(0) 1464 | #check chr overlap 1465 | chr_overlap = (discordants_pd.chrom == discordants_pd.shift().chrom) 1466 | #if both bools are succesful returns a 2 1467 | return ((overlap * 1 + chr_overlap * 1).lt(2).cumsum()) 1468 | 1469 | def assign_discordants(split_bed,discordant_bed,insert_mean,insert_std): 1470 | """Function that takes as input the the discordant reads supporting an interval and assigns them to the 1471 | interval if they are close by (using the insert size estimate)""" 1472 | 1473 | max_dist = (insert_mean) + (5 * insert_std) 1474 | 1475 | splits = pd.DataFrame.from_records(split_bed, columns=['chrom', 'start', 'end', 'read', 'iteration', 1476 | 'score']).sort_values(['chrom', 'start', 'end']) 1477 | 1478 | splits['score'] = splits['score'].astype(float) 1479 | 1480 | merged_splits = splits.groupby(['chrom', 'start', 'end', 'iteration']).agg( 1481 | {'chrom': 'first', 'start': 'first', 'end': 'max', 'read': 'nunique', 'iteration': 'first', 'score': 'sum'}) 1482 | 1483 | merged_splits['read'] = merged_splits['read'].astype(int) 1484 | discordant_bed = pd.DataFrame.from_records(discordant_bed,columns=['chrom', 'start', 'end', 'read']) 1485 | 1486 | if len(discordant_bed) > 0: 1487 | assigned_splits = [] 1488 | 1489 | for index, row in merged_splits.iterrows(): 1490 | chrom_filt = discordant_bed[(discordant_bed['chrom'] == row['chrom'])] 1491 | start_filt = chrom_filt[ 1492 | (chrom_filt['start'] > row['start']) & ((chrom_filt['start'] - row['start']) < max_dist)] 1493 | end_filt = start_filt[(start_filt['end'] < row['end']) & ((row['end'] - start_filt['end']) < max_dist)] 1494 | 1495 | assigned_splits.append( 1496 | [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)]) 1497 | 1498 | return (assigned_splits) 1499 | 1500 | else: 1501 | assigned_splits = [] 1502 | for index,row in merged_splits.iterrows(): 1503 | assigned_splits.append( 1504 | [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), 1505 | 0]) 1506 | 1507 | return(assigned_splits) 1508 | 1509 | def adaptative_myers_k(sc_len,edit_frac): 1510 | """Calculate the edit distance allowed as a function of the read length""" 1511 | return(float(sc_len*edit_frac)) 1512 | @jit(nopython=True) 1513 | def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_start,mate_interval_end): 1514 | """Input a read and the mate interval in the graph. The function checks whether the alignment would be linear (splicing) 1515 | or colinear. Will return false, in order to not attemp realignment. This is mainly thought for skipping deletions and 1516 | RNA splicing""" 1517 | 1518 | 1519 | #check left soft-clipped 1520 | if read_start_cigar == 4: 1521 | #graph needs to be upstream or looping to itself 1522 | if int(mate_interval_start) > aln_start: 1523 | return (True) 1524 | elif aln_start < int(mate_interval_end): 1525 | #looping to itself 1526 | return (True) 1527 | else: 1528 | return (False) 1529 | #check right softclipped 1530 | if read_end_cigar == 4: 1531 | # graph needs to be downstream or looping to itself 1532 | if int(mate_interval_end) < aln_start: 1533 | return (True) 1534 | elif aln_start > int(mate_interval_start): 1535 | #looping to itself 1536 | return (True) 1537 | else: 1538 | return (False) 1539 | 1540 | @jit(nopython=True) 1541 | def prob_to_phred(prob): 1542 | """Function that takes as input a probability and returns a phred-scaled probability. Rounded to the nearest decimal""" 1543 | if prob == 1.0: 1544 | prob = 0.999999999 1545 | return(int(np.around(-10*np.log10(1-prob)))) 1546 | 1547 | def realignment_read_to_SA_string(realignment_dict,prob,chrom,soft_clip_start): 1548 | """Function that takes as input the realignment dict, the alignment posterior probability and the chromosome and 1549 | returns an SA string""" 1550 | 1551 | sa_tag = chrom + "," + str(soft_clip_start) + "," + str(realignment_dict['alignments'][1][4]) + "," \ 1552 | + realignment_dict['alignments'][1][1] + ","+ str(prob_to_phred(prob)) + "," + str(realignment_dict['alignments'][1][3]) + ";" 1553 | return(sa_tag) 1554 | 1555 | def write_clipped_read(bam,read,mate,no_soft_clipped,no_hard_clipped,mapq_cutoff,own_mapq=False): 1556 | """Function that takes as input a bam file and a read and writes the read to the bam file""" 1557 | # If mate is unmapped, own mapq is set to true and the read will get its own mapq 1558 | if read.has_tag('MQ'): 1559 | if is_soft_clipped(read) == True: 1560 | 1561 | if no_soft_clipped == False: 1562 | 1563 | # gets its on mapq since mate is unmapped 1564 | if read.mapq >= mapq_cutoff: 1565 | bam.write(read) 1566 | 1567 | else: 1568 | pass 1569 | else: 1570 | if is_hard_clipped(read) == True: 1571 | if no_hard_clipped == False: 1572 | bam.write(read) 1573 | 1574 | else: 1575 | 1576 | if is_soft_clipped(read) == True: 1577 | 1578 | if no_soft_clipped == False: 1579 | 1580 | # gets its on mapq since mate is unmapped 1581 | if read.mapq >= mapq_cutoff: 1582 | if own_mapq == True: 1583 | read.tags += [('MQ', read.mapq)] 1584 | else: 1585 | read.tags += [('MQ', mate.mapq)] 1586 | 1587 | bam.write(read) 1588 | 1589 | else: 1590 | pass 1591 | else: 1592 | if is_hard_clipped(read) == True: 1593 | if no_hard_clipped == False: 1594 | if read.mapq >= mapq_cutoff: 1595 | if own_mapq == True: 1596 | read.tags += [('MQ', read.mapq)] 1597 | else: 1598 | read.tags += [('MQ', mate.mapq)] 1599 | 1600 | bam.write(read) 1601 | 1602 | return(None) 1603 | 1604 | 1605 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #MIT License 2 | # 3 | #Copyright (c) 2019 Inigo Prada Luengo 4 | # 5 | #Permission is hereby granted, free of charge, to any person obtaining a copy 6 | #of this software and associated documentation files (the "Software"), to deal 7 | #in the Software without restriction, including without limitation the rights 8 | #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | #copies of the Software, and to permit persons to whom the Software is 10 | #furnished to do so, subject to the following conditions: 11 | # 12 | #The above copyright notice and this permission notice shall be included in all 13 | #copies or substantial portions of the Software. 14 | # 15 | #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | #SOFTWARE. 22 | 23 | from setuptools import setup,find_packages 24 | from circlemap.__version__ import __version__ 25 | 26 | setup(name='Circle-Map', 27 | version=__version__, 28 | description='Circular DNA analysis tools', 29 | author='Inigo Prada-Luengo', 30 | url='https://github.com/iprada/Circle-Map', 31 | packages=find_packages(), 32 | install_requires=[ 33 | 'pysam>=0.15.2','pybedtools>=0.8.0','pandas>=0.24.2','biopython==1.77','numpy>=1.16.3', 34 | 'edlib>=1.2.3','numba>=0.45.0','tqdm>=4.31.1','scipy>=1.2.1' 35 | ], 36 | entry_points={ 37 | 'console_scripts': [ 38 | 'Circle-Map = circlemap.circle_map:main' 39 | ], 40 | 41 | }, 42 | classifiers=[ 43 | 'License :: OSI Approved :: MIT License' 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /tests/profile_circle_map.py: -------------------------------------------------------------------------------- 1 | from circlemap.utils import start_realign,insert_size_dist 2 | import multiprocessing as mp 3 | from circlemap.realigner import realignment 4 | import os 5 | from tqdm import * 6 | 7 | input = "/home/iprada/faststorage/projects/6_aged_yeast/working_directory/aligned/BM3/sort_circle_qname_BM3.bam" 8 | qbam = "/home/iprada/faststorage/projects/6_aged_yeast/working_directory/aligned/BM3/qname_BM3.bam" 9 | sort_bam = "/home/iprada/faststorage/projects/6_aged_yeast/working_directory/aligned/BM3/sorted_BM3.bam" 10 | fasta = "/home/iprada/faststorage/reference_Data/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/8_plasmids_genome/yeast_8_plasmids.fa" 11 | 12 | splitted, sorted_bam, begin = start_realign(input,"profiling_output.bed", 1,3,1,500) 13 | 14 | sorted_bam.close() 15 | #get global insert size prior 16 | metrics = insert_size_dist(100000,60,qbam) 17 | 18 | 19 | 20 | lock = mp.Lock() 21 | 22 | object = realignment(input, qbam, sort_bam, fasta, 23 | os.getcwd(), 24 | 20,60, 4, 100000,5,1, 10, 0.99, 6,0.95,0.01,"profiling_output.bed",16,0.1, lock, 0,0.0,1,1, 25 | 0.05, False,False, 0,0.0, metrics, 3) 26 | 27 | 28 | 29 | object.realign(splitted[0]) 30 | -------------------------------------------------------------------------------- /tests/run_call.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iprada/Circle-Map/e1d122a4bc3d9f36ae00fcf0cbcdfef77ada7c90/tests/run_call.py --------------------------------------------------------------------------------