├── .github └── workflows │ ├── cmseq_ci.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── README_class.md ├── cmseq ├── __init__.py ├── breadth_depth.py ├── cmseq.py ├── consensus.py ├── consensus_aDNA.py ├── filter.py ├── poly.py └── polymut.py ├── recipe └── meta.yaml └── setup.py /.github/workflows/cmseq_ci.yml: -------------------------------------------------------------------------------- 1 | name: CMSeq_ci 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install flake8 pytest 23 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 24 | # - name: Lint with flake8 25 | # run: | 26 | # # stop the build if there are Python syntax errors or undefined names 27 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 28 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 29 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 30 | - name: Install CMSeq 31 | run: | 32 | pip install . 33 | - name: Check cmseq help message 34 | run: | 35 | breadth_depth.py --help 36 | consensus_aDNA.py --help 37 | consensus.py --help 38 | poly.py --help 39 | polymut.py --help -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Moreno Zolfo, Nicolai Karcher 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CMSeq #
2 |
3 | CMSeq is a set of commands to provide an interface to .bam files for coverage and sequence consensus.
4 |
5 | **Requires:**
6 |
7 | * samtools (> 1.x)
8 | * numpy
9 | * pysam
10 | * pandas
11 | * Biopython with bcbio-gff module _(warning: Biopython <= 1.76 is required for `polymut.py`)_
12 |
13 | ## Installation
14 |
15 | [![PyPi version](https://pypip.in/v/CMSeq/badge.png)](https://pypi.org/project/CMSeq/) [![Anaconda-Server Badge](https://anaconda.org/bioconda/cmseq/badges/version.svg)](https://anaconda.org/bioconda/cmseq) [![Anaconda-Server Badge](https://anaconda.org/bioconda/cmseq/badges/installer/conda.svg)](https://conda.anaconda.org/bioconda)
16 |
17 | ### Install via source and pip ###
18 |
19 | ```
20 | git clone https://github.com/SegataLab/cmseq.git
21 | pip install .
22 | ```
23 |
24 | ### Install via PyPi ###
25 |
26 | `pip install CMSeq`
27 |
28 | ### Install via Bioconda ###
29 |
30 | `conda install -c bioconda cmseq`
31 |
32 | ## Functions
33 |
34 | * [Breadth and Depth of Coverage](#breadth-and-depth-of-coverage-with-breadth_depthpy)
35 | * [Polymorphic Rate on CDS](#polymorphic-rate-over-protein-coding-genes-with-polymutpy)
36 | * [Polymorphic Rate on the whole genome](#polymorphic-rate-with-polypy)
37 | * [Reference free consensus](#reference-free-but-guided-consensus-with-consensuspy)
38 |
39 | **Note: CMSeq can be used [as python module](README_class.md) as well**
40 |
41 | ## Breadth and Depth of coverage with breadth_depth.py
42 |
43 | Provides breadth and depth of coverage of the references of a BAM alignment file, in tabular format. The file must be sorted and indexed (alternatively, --sortindex can be used).
44 |
45 |
46 | ```
47 | usage: breadth_depth.py [-h] [-c REFERENCE ID] [-f] [--sortindex]
48 |                         [--minlen MINLEN] [--minqual MINQUAL]
49 |                         [--mincov MINCOV] [--truncate TRUNCATE]
50 |                         BAMFILE
51 | ```
52 |
53 |
54 | ```
55 | positional arguments:
56 |   BAMFILE               The file on which to operate
57 |
58 | optional arguments:
59 |   -h, --help            show this help message and exit
60 |   -c REFERENCE ID, --contig REFERENCE ID
61 |                         Focus on a subset of references in the BAM file. Can
62 |                         be a list of references separated by commas or a FASTA
63 |                         file (the IDs are used to subset)
64 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
65 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
66 |                         unset ALL reads are considered (bedtools genomecov
67 |                         style). Default: unset
68 |   --sortindex           Sort and index the file
69 |   --minlen MINLEN       Minimum Reference Length for a reference to be
70 |                         considered
71 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
72 |                         than this will be discarded. This is performed BEFORE
73 |                         --mincov. Default: 30
74 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
75 |                         calculation. Positions with a lower depth of coverage
76 |                         will be discarded (i.e. considered as zero-coverage
77 |                         positions). This is calculated AFTER --minqual.
78 |                         Default: 1
79 |   --truncate TRUNCATE   Number of nucleotides that are truncated at either
80 |                         contigs end before calculating coverage values.
81 |
82 | ```
83 |
84 |
85 | Breadth and Depth of coverage outputs a table with the breadth of coverage, average and median depth-of-coverage of each reference.
Values are calculated only on the covered portion of the reference:
86 |
87 | |contig|Breadth|Depth (avg)|Depth (median)|
88 | |------|-------|-----------|--------------|
89 | |EF401177.1.1491|0.101274312542|1.0|1.0|
90 | |EF405039.1.1494|0.101070950469|2.69536423841|3.0|
91 | |all_contigs|-|1.84768211921|1.0|
92 |
93 | The last line is a summary line calculated as if all the reads were coming from the same (big) contig.
94 |
95 | ### Examples ###
96 |
97 | Extract breadth and depth of coverage for all the references within a sorted and indexed `BAM` file
98 |
99 |
100 | ```
101 | breadth_depth.py mybam.sorted.bam
102 | ```
103 |
104 | Extract breadth and depth of coverage for all the references within an unsorted `BAM` file
105 |
106 |
107 | ```
108 | breadth_depth.py --sortindex mybam.unsorted.bam
109 | ```
110 |
111 | Extract breadth and depth of coverage for all the references within a sorted `BAM` file, counting only bases with a minimum quality of 20 and positions with a minimum coverage of 10
112 |
113 |
114 | ```
115 | breadth_depth.py --mincov 10 --minqual 20 mybam.bam
116 | ```
117 |
118 | Extract breadth and depth of coverage for the references genome_1 and genome_2 within a sorted `BAM` file
119 |
120 |
121 | ```
122 | breadth_depth.py -c genome_1,genome_2 mybam.bam
123 | ```
124 |
125 | Extract breadth and depth of coverage for the references present in MYFASTA.fasta, within a sorted `BAM` file
126 |
127 |
128 | ```
129 | breadth_depth.py -c MYFASTA.fasta mybam.sorted.bam
130 | ```
131 |
132 | ## Polymorphic rate over protein-coding genes with polymut.py
133 |
134 | **Warning:** Biopython <= 1.76 is required for `polymut.py`
135 |
136 | This function calculates polymorphic site rates over protein-coding genes. It considers dominant and second-dominant alleles over protein-coding genes on the nucleotide level, translates the ORFs into proteins and then calculates and outputs the number of
137 | synonymous and non-synonymous mutations (on the protein level) between the dominant and second-dominant protein sequences.
138 | Positions with a ratio between second-dominant and dominant allele coverage smaller than dominant_frq_thrsh are considered non-variant.
139 |
140 | This function was used in [Pasolli et al., 2019](https://pubmed.ncbi.nlm.nih.gov/30661755/) as an ad-hoc measure to calculate strain heterogeneity in metagenomes.
141 |
142 | Since the likelihood of finding more than one strain in the same gut varies strongly across gut commensals (as does within-species genetic diversity), this function does not allow a rigorous classification of metagenomes into strain-mixed and non-strain-mixed. However, it can be shown that - considering polymorphic site rates over e.g. the core genes of any given species - samples with a higher polymorphic site rate are more likely to harbour more than one strain.
143 |
144 | Please supply a GFF file from Prokka and make sure that the contig names in the BAM file and the GFF file can be matched.
145 |
146 |
147 | ```
148 | usage: polymut.py [-h] [-c REFERENCE ID] [-f] [--sortindex] [--minlen MINLEN]
149 |                   [--minqual MINQUAL] [--mincov MINCOV]
150 |                   [--dominant_frq_thrsh DOMINANT_FRQ_THRSH]
151 |                   [--gff_file GFF_FILE]
152 |                   BAMFILE
153 |
154 | Reports the polymorphic rate of each reference (polymorphic bases / total
155 | bases). Focuses only on covered regions (i.e.
depth >= 1)
156 |
157 | positional arguments:
158 |   BAMFILE               The file on which to operate
159 |
160 | optional arguments:
161 |   -h, --help            show this help message and exit
162 |   -c REFERENCE ID, --contig REFERENCE ID
163 |                         Focus on a subset of references in the BAM file. Can
164 |                         be a list of references separated by commas or a FASTA
165 |                         file (the IDs are used to subset)
166 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
167 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
168 |                         unset ALL reads are considered (bedtools genomecov
169 |                         style). Default: unset
170 |   --sortindex           Sort and index the file
171 |   --minlen MINLEN       Minimum Reference Length for a reference to be
172 |                         considered. Default: 0
173 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
174 |                         than this will be discarded. This is performed BEFORE
175 |                         --mincov. Default: 30
176 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
177 |                         calculation. Positions with a lower depth of coverage
178 |                         will be discarded (i.e. considered as zero-coverage
179 |                         positions). This is calculated AFTER --minqual.
180 |                         Default: 1
181 |   --dominant_frq_thrsh DOMINANT_FRQ_THRSH
182 |                         Cutoff for degree of `allele dominance` for a position
183 |                         to be considered polymorphic. Default: 0.8
184 |   --gff_file GFF_FILE   GFF file used to extract protein-coding genes
185 |
186 | ```
187 |
188 | The function prints three values:
189 | * the total number of non-synonymous mutations
190 | * the total number of synonymous mutations
191 | * the total number of considered positions (the number of positions with coverage higher than the value specified with --mincov)
192 |
193 | Please note that this function is meant to be used on multi-contig genomes, so **polymut.py reports the sum of non-synonymous and synonymous positions** for all the contigs considered. If you specify a list of contigs with `-c`, only those will be considered.
194 |
195 |
196 | ### Examples ###
197 |
198 | Calculate the number of non-synonymous mutations, synonymous mutations and the total number of considered positions (on the nucleotide level!) over your contig of interest.
199 |
200 | ```
201 | python polymut.py -c "contig_of_interest" bam_of_interest.bam --mincov 10 --minqual 30 --dominant_frq_thrsh 0.8 --gff_file gff_from_prokka.gff
202 | ```
203 |
204 | ## Polymorphic Rate with poly.py
205 |
206 | Provides the polymorphic rate of each reference in a sorted and indexed BAMFILE. The polymorphic rate is defined as: number_of_polymorphic_sites / number_of_total_nucleotides. Beware that *number_of_total_nucleotides* depends on --minqual and --mincov: a position that is not covered (e.g. coverage = 0) is not counted in the denominator.
207 |
208 |
209 | ```
210 | usage: poly.py [-h] [-c REFERENCE ID] [-f] [--sortindex] [--minlen MINLEN]
211 |                [--minqual MINQUAL] [--mincov MINCOV] [--pvalue PVALUE]
212 |                [--seq_err SEQ_ERR] [--dominant_frq_thrsh DOMINANT_FRQ_THRSH]
213 |                BAMFILE
214 |
215 | Reports the polymorphic rate of each reference (polymorphic bases / total
216 | bases). Focuses only on covered regions (i.e. depth >= 1)
217 |
218 | positional arguments:
219 |   BAMFILE               The file on which to operate
220 |
221 | optional arguments:
222 |   -h, --help            show this help message and exit
223 |   -c REFERENCE ID, --contig REFERENCE ID
224 |                         Focus on a subset of references in the BAM file.
Can
225 |                         be a list of references separated by commas or a FASTA
226 |                         file (the IDs are used to subset)
227 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
228 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
229 |                         unset ALL reads are considered (bedtools genomecov
230 |                         style). Default: unset
231 |   --sortindex           Sort and index the file
232 |   --minlen MINLEN       Minimum Reference Length for a reference to be
233 |                         considered. Default: 0
234 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
235 |                         than this will be discarded. This is performed BEFORE
236 |                         --mincov. Default: 30
237 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
238 |                         calculation. Positions with a lower depth of coverage
239 |                         will be discarded (i.e. considered as zero-coverage
240 |                         positions). This is calculated AFTER --minqual.
241 |                         Default: 1
242 |   --pvalue PVALUE       Binomial p-value threshold for the binomial-polymorphic
243 |                         test. Default: 0.01
244 |   --seq_err SEQ_ERR     Sequencing error rate. Default: 0.001
245 |   --dominant_frq_thrsh DOMINANT_FRQ_THRSH
246 |                         Cutoff for degree of `allele dominance` for a position
247 |                         to be considered polymorphic. Default: 0.8
248 |
249 | ```
250 |
251 | The output is structured as follows:
252 |
253 | ```
254 | |referenceID|dominant_allele_distr_mean|dominant_allele_distr_perc_10|...|dominant_allele_distr_sd|tot_covered_bases|tot_polymorphic_bases|polymorphic_rate|
255 | |----|----|----|----|----|----|----|----|
256 | |EF401177.1.1491|-|-|...|-|151.00|0.00|0.00|
257 | |EF405039.1.1494|0.65|0.67|...|0.04|151.00|13.00|0.09|
258 | |-GENOME-|0.65|0.67|...|0.04|302.00|13.00|0.04|
259 | ```
260 |
261 | As for ``breadth_depth.py``, the polymorphic rate analysis is subject to ``mincov``, ``minqual``, and ``minlen``. Additionally, two parameters (``dominant_frq_thrsh`` and ``pvalue``) decide when a site is considered polymorphic:
262 |
263 |
264 | * ``dominant_frq_thrsh`` is a frequency threshold: if the majority (dominant) allele frequency at position x is greater than the threshold, x is considered non-polymorphic. Otherwise, a binomial test is performed to assess whether x is polymorphic (polymorphic if p < ``pvalue``)
265 |
266 |
267 | ### Examples ###
268 |
269 | Extract polymorphic rate from a sorted and indexed bam file
270 |
271 |
272 | ```
273 | poly.py mybam.sorted.bam
274 | ```
275 |
276 |
277 | Extract polymorphic rate from an unsorted bam file
278 |
279 |
280 | ```
281 | poly.py --sortindex mybam.unsorted.bam
282 | ```
283 |
284 |
285 | Extract polymorphic rate from an unsorted bam file, counting only bases with minimum quality of 30 and minimum position-coverage of 10
286 |
287 |
288 | ```
289 | poly.py --sortindex --mincov 10 --minqual 30 mybam.unsorted.bam
290 | ```
291 |
292 |
293 | Extract polymorphic rate from an unsorted bam file, only for reads aligning against genome_1 or genome_2. Consider polymorphic only sites with majority-allele frequency < 70%
294 |
295 |
296 | ```
297 | poly.py --sortindex -c genome_1,genome_2 --dominant_frq_thrsh 0.7 mybam.unsorted.bam
298 | ```
299 |
300 |
301 | ## Reference Free (but guided) consensus with consensus.py
302 |
303 | Provides the Reference Free consensus for the references in a BAM alignment file, reconstructing the sequence from the raw reads, in FASTA format to standard output. The file must be sorted and indexed (alternatively, --sortindex can be used). Note that the length of the reconstructed sequence is bounded by the original length of the reference. Over that length, not all the positions may be covered.
This can happen because:
304 |
305 | * there are no reads mapping to the position
306 | * there are too few reads (*i.e. < ``mincov``*) mapping to the position
307 | * the reads that map to the position have a low quality (*i.e. < ``minqual``*)
308 | * the distribution of nucleotides at that position is potentially problematic (*i.e. dominant_allele_frequency < ``dominant_frq_thrsh``*): in this case, the position is excluded to reduce noise.
309 |
310 |
311 | ```
312 | usage: consensus.py [-h] [-c REFERENCE ID] [-f] [--sortindex]
313 |                     [--minqual MINQUAL] [--mincov MINCOV]
314 |                     [--dominant_frq_thrsh DOMINANT_FRQ_THRSH]
315 |                     [--minlen MINLEN] [--trim TRIM]
316 |                     BAMFILE
317 |
318 | outputs the consensus in FASTA format. Non-covered positions (or quality-
319 | trimmed positions) are reported as dashes: -
320 |
321 | positional arguments:
322 |   BAMFILE               The file on which to operate
323 |
324 | optional arguments:
325 |   -h, --help            show this help message and exit
326 |   -c REFERENCE ID, --contig REFERENCE ID
327 |                         Focus on a subset of references in the BAM file. Can
328 |                         be a list of references separated by commas or a FASTA
329 |                         file (the IDs are used to subset)
330 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
331 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
332 |                         unset ALL reads are considered (bedtools genomecov
333 |                         style). Default: unset
334 |   --sortindex           Sort and index the file
335 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
336 |                         than this will be discarded. This is performed BEFORE
337 |                         --mincov. Default: 30
338 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
339 |                         calculation. Positions with a lower depth of coverage
340 |                         will be discarded (i.e. considered as zero-coverage
341 |                         positions). This is calculated AFTER --minqual.
342 |                         Default: 0
343 |   --dominant_frq_thrsh DOMINANT_FRQ_THRSH
344 |                         Cutoff for degree of `allele dominance` for a position
345 |                         to be considered polymorphic. Default: 0.8
346 |   --minlen MINLEN       Minimum Reference Length for a reference to be
347 |                         considered. Default: 0
348 |   --trim TRIM           Trim the reads before computing the consensus.
349 |                         A value of 10:10 means that the first and last 10 positions
350 |                         of each read will be ignored. Default: None
351 | ```
352 |
353 |
354 |
355 | Note that positions with a majority allele frequency lower than dominant_frq_thrsh will be considered "problematic" and substituted with a "-", even with sufficient coverage and quality.
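To make the masking rule concrete, here is a minimal sketch (illustrative only, not CMSeq's internal API) of how a single pileup column is resolved under ``mincov`` and ``dominant_frq_thrsh``:

```
# Illustrative sketch, not CMSeq's API: resolve one pileup column into a
# consensus character following the rules described above.
def call_position(base_counts, mincov=1, dominant_frq_thrsh=0.8):
    # base_counts: per-base coverage at this position, e.g. {'A': 12, 'C': 1, 'G': 0, 'T': 0}
    depth = sum(base_counts.values())
    if depth < mincov:
        return '-'  # not enough (high-quality) coverage
    dominant = max(sorted(base_counts), key=base_counts.get)
    if base_counts[dominant] / depth < dominant_frq_thrsh:
        return '-'  # ambiguous ("problematic") position, masked
    return dominant

print(call_position({'A': 9, 'C': 1, 'G': 0, 'T': 0}))  # A (dominance 0.9 >= 0.8)
print(call_position({'A': 6, 'C': 4, 'G': 0, 'T': 0}))  # - (dominance 0.6 < 0.8)
```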
356 | 357 | 358 | 359 | ``` 360 | consensus.py ~/tmp.bam.sorted -c EF401177.1.1491,EF405039.1.1494 --mincov 1 --dominant_frq_thrsh 0.5 361 | >EF401177.1.1491_consensus 362 | ------------------------------------------------------------ 363 | ------------------------------------------------------------ 364 | ------------------------------------------------------------ 365 | ------------------------------------------------------------ 366 | ------------------------------------------------------------ 367 | ------------------------------------------------------------ 368 | ------------------------------------------------------------ 369 | ------------------------------------------------------------ 370 | -----------------------------------TACGTAGGGGGCAAGCGTTATCCGG 371 | ATTTACTGGGTGTAAAGGGAGCGTAGACGGCGAGACAAGTCTGAAGTGAAAGCCCGGGGC 372 | TCAACCCCGGGACTGCTTTGGAAACTGCCTTGCTAGAGTGCTGGAGAGGTAAGTGGAATT 373 | CCTAGT------------------------------------------------------ 374 | ------------------------------------------------------------ 375 | ------------------------------------------------------------ 376 | ------------------------------------------------------------ 377 | ------------------------------------------------------------ 378 | ------------------------------------------------------------ 379 | ------------------------------------------------------------ 380 | ------------------------------------------------------------ 381 | ------------------------------------------------------------ 382 | ------------------------------------------------------------ 383 | ------------------------------------------------------------ 384 | ------------------------------------------------------------ 385 | ------------------------------------------------------------ 386 | --------------------------------------------------- 387 | >EF405039.1.1494_consensus 388 | ------------------------------------------------------------ 389 | ------------------------------------------------------------ 390 | ------------------------------------------------------------ 391 | ------------------------------------------------------------ 392 | ------------------------------------------------------------ 393 | ------------------------------------------------------------ 394 | ------------------------------------------------------------ 395 | ------------------------------------------------------------ 396 | -------------------------------------TACGTAGGTGGCAAGCGTTATCC 397 | GGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGTCTGCAAGTCAGATGTGAAATCCATGG 398 | GCTCAACCCATGAACTGCATTTGAAACTGTAGATCTTGAGTGTCGGAGGGGCAATCGGAA 399 | TTCCTAGT---------------------------------------------------- 400 | ------------------------------------------------------------ 401 | ------------------------------------------------------------ 402 | ------------------------------------------------------------ 403 | ------------------------------------------------------------ 404 | ------------------------------------------------------------ 405 | ------------------------------------------------------------ 406 | ------------------------------------------------------------ 407 | ------------------------------------------------------------ 408 | ------------------------------------------------------------ 409 | ------------------------------------------------------------ 410 | ------------------------------------------------------------ 411 | ------------------------------------------------------------ 412 | ------------------------------------------------------ 413 | ``` 414 | 
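Since non-reconstructed positions are padded with dashes, the fraction of each reference that was actually called can be computed from the FASTA output. A hypothetical post-processing snippet (not part of CMSeq), assuming the consensus above was redirected to `consensus.fasta`:

```
from Bio import SeqIO

for record in SeqIO.parse("consensus.fasta", "fasta"):
    called = sum(1 for base in record.seq if base != '-')
    print(record.id, called / len(record.seq))
```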
415 |
416 | ### Examples ###
417 |
418 |
419 | Extract the consensus of all the references from a sorted and indexed BAM file, in FASTA format:
420 |
421 |
422 | ```
423 | consensus.py mybam.sorted.bam
424 | ```
425 |
426 | Extract the consensus of all the references from an unsorted BAM file, in FASTA format:
427 |
428 |
429 | ```
430 | consensus.py --sortindex mybam.unsorted.bam
431 | ```
432 |
433 | Extract the consensus of genome_1 and genome_2 from a BAM file. Positions with coverage lower than 5 are ignored (- is reported instead of the base call):
434 |
435 |
436 | ```
437 | consensus.py --mincov 5 -c genome_1,genome_2 mybam.sorted.bam
438 | ```
439 |
440 | Extract the consensus of genome_1 and genome_2 from a BAM file. Positions covered by fewer than 5 "high quality" bases are ignored (- is reported instead of the base call). Additionally, positions where the dominant base accounts for less than 50% of the reads will be substituted by a "-":
441 |
442 |
443 | ```
444 | consensus.py --mincov 5 --minqual 30 -c genome_1,genome_2 --dominant_frq_thrsh 0.5 mybam.sorted.bam
445 | ```
446 |
447 | Same as above, but a FASTA file is used to filter references instead:
448 |
449 |
450 | ```
451 | consensus.py --mincov 5 --minqual 30 -c FILTER_FASTA.fasta --dominant_frq_thrsh 0.5 mybam.sorted.bam
452 | ```
453 |
454 | ### Ancient DNA consensus
455 |
456 | Extract the consensus of a genome from a BAM file in an ancient-metagenomics setting. Positions with coverage lower than 5 and damage probability (Stats_out_MCMC_correct_prob.csv from mapDamage2) higher than 0.95 are ignored.
457 |
458 |
459 | ```
460 | consensus_aDNA.py --mincov 5 -r reference.fna --pos_specific_prob_tab Stats_out_MCMC_correct_prob.csv --pos_damage_prob_thrsh 0.95 mybam.sorted.bam
461 | ```
462 |
--------------------------------------------------------------------------------
/README_class.md:
--------------------------------------------------------------------------------
1 | # CMSeq #
2 |
3 |
4 | * Provides an interface for .bam files
5 | * reference free consensus
6 | * Breadth and Depth of coverage
7 |
8 | Requires samtools (> 1.x), numpy, pysam, matplotlib and seaborn
9 |
10 | ## Use as Python Module ##
11 |
12 | ### class BamFile ###
13 |
14 | Represents the collection of contigs/references of a bam file
15 |
16 | To create a new BamFile from an unsorted BAM file:
17 | ```
18 | #!python
19 | collection = cmseq.BamFile(BAM_FILE_PATH,sort=True,index=True,minlen=0)
20 | ```
21 |
22 | An optional argument ``filterInputList`` can be passed to BamFile, to filter only some references. ``filterInputList`` can be:
23 | * a string of comma-separated IDs
24 | * the path to a FASTA file (its sequence IDs are used as the filter list)
25 |
26 | To start from a pre-sorted and indexed bam file:
27 | ```
28 | #!python
29 | collection = cmseq.BamFile(BAM_FILE_PATH)
30 | ```
31 |
32 | To set the pysam stepper to a custom value (e.g.
`all`, which avoids secondary alignments, or `nofilter`, which includes secondary alignments):
33 | ```
34 | #!python
35 | # Choose a custom stepper for all the contigs of the BAMFILE
36 | collection = cmseq.BamFile(BAM_FILE_PATH,stepper='all')
37 | ```
38 |
39 | To take into account only references (/contigs) longer than N, use `minlen`:
40 | ```
41 | #!python
42 | # Build the collection only on contigs / references longer than 5000
43 | collection = cmseq.BamFile(BAM_FILE_PATH,minlen=5000)
44 | ```
45 |
46 | ### class BamContig ###
47 |
48 | Represents a reference that reads map against
49 |
50 | To create a new BamContig:
51 | *Note*: this is NOT needed if a BamFile instance has been created before, as this is done automatically for each contig within the bamfile
52 |
53 | ```
54 | #!python
55 | contig = cmseq.BamContig(bamHandle,contigName,contigLength)
56 | ```
57 |
58 | * bamHandle: a pysam AlignmentFile instance, pointing to the original bam file (sorted and indexed)
59 | * contigName: the name of the contig/reference in the bam file
60 | * contigLength: the length of that contig/reference
61 |
62 | **Reference Free Consensus**
63 |
64 | reference_free_consensus(): returns a string, as long as the reference, with the consensus.
65 |
66 | The function can use the optional parameters:
67 |
68 | * `minqual`: the consensus will be based only on those nucleotides with a base quality of at least `minqual`. **Default: 30** (`CMSEQ_DEFAULTS.minqual`)
69 | * `mincov`: the consensus will be based only on those positions with at least MINCOV coverage (after the quality filtering of `minqual`). **Default: 1**, meaning everything is used.
70 |
71 | * `consensus_rule`: a custom consensus function that takes as input a python dictionary.
72 | The function is applied to each column of the samtools pileup.
73 | The dictionary has this structure: {'A':0,'T':0,'C':0,'G':0,'N':0} and stores the counts (coverages) of each nucleotide at that position ("N" = anything else). The function must return a char.
74 | The default function is: `lambda array: max(array, key=array.get)` (pure majority rule).
75 | The function is applied only to positions that meet the requirements of `minqual` and `mincov`. Other positions are reported as "-"
76 |
77 | * `trimReads`: a tuple specifying the range of each read to be skipped when computing the consensus. If set to (10,10), the first and last 10 positions of each read will not be used to compute the consensus. Default is None, which means nothing will be trimmed.
78 | Examples
79 | ```
80 | #!python
81 | # Get the simplest majority rule (default) consensus of REFERENCE_NAME:
82 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus())
83 |
84 | # Get the simplest majority rule (default) consensus of REFERENCE_NAME considering positions covered by at least 5 reads with qualities higher than 33:
85 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(mincov=5,minqual=33))
86 |
87 | # Use a custom consensus rule: return X for each position
88 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(consensus_rule=lambda array: 'X'))
89 | ```
90 |
91 | **Depth of Coverage**
92 |
93 | BamContig.**depth_of_coverage()**: returns a tuple, with the (mean_coverage,median_coverage) values, calculated over the positions that have a coverage of at least 1 (at least one mapping read on that position).
Optionally, can take:
94 |
95 | * `minqual`: the nucleotides considered are only those that have a quality score of at least `minqual`. **Default: 30**
96 | * `mincov`: the depth is based only on those positions with at least MINCOV coverage (after the quality filtering of `minqual`). **Default: 10**
97 |
98 | **Breadth of Coverage**
99 |
100 | BamContig.**breadth_of_coverage**: returns a float, with the percentage of the total reference length covered by reads. It takes the same optional parameters `mincov` and `minqual` as *depth_of_coverage*.
101 |
102 | **Polymorphic Rate**
103 |
104 | BamContig.**polymorphism_rate**: returns a DataFrame, with the statistics of polymorphic positions, over the total number of reconstructable positions. It takes the same optional parameters `mincov` and `minqual` as *depth_of_coverage*.
105 |
106 | **Set the Pysam stepper**
107 |
108 | BamContig.**set_stepper(VALUE)**: resets the pysam stepper for the reference. VALUE can be `all` or `nofilter`, as per the pysam specifications. By default the stepper is set to 'nofilter' (bedtools style).
109 |
110 | ### Examples ###
111 |
112 | Create a new instance of a BamFile. An unsorted, unindexed bam file can be provided and will be sorted and indexed within the module:
113 |
114 | ```
115 | #!python
116 | import cmseq
117 | collection = cmseq.BamFile(BAM_FILE_PATH,sort=True,index=True)
118 | ```
119 |
120 | Iterate over each contig represented in the BAM/SAM file:
121 |
122 | ```
123 | #!python
124 | for i in collection.get_contigs():
125 |     print(i,collection.get_contig_by_label(i).reference_free_consensus())
126 |     print(collection.get_contig_by_label(i).depth_of_coverage()) #(mean,median)
127 |     print(collection.get_contig_by_label(i).breadth_of_coverage())
128 | ```
129 | Select a custom contig and get its consensus sequence by majority rule:
130 | ```
131 | #!python
132 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus())
133 | ```
134 |
135 | Select a custom contig and plot its coverage:
136 | ```
137 | #!python
138 | collection.get_contig_by_label("REFERENCE_NAME").plot_coverage('out.pdf')
139 | ```
140 |
141 | Select a custom contig and get its consensus sequence by majority rule, only for positions covered by at least 10 high quality reads:
142 |
143 | ```
144 | #!python
145 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(mincov=10,minqual=33))
146 | ```
147 |
148 | Select a custom contig and get a custom consensus sequence, with "+" where coverage is higher than or equal to 2, "-" otherwise:
149 |
150 | ```
151 | #!python
152 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(consensus_rule=lambda array: '+' if sum(array.values()) >= 2 else '-'))
153 | ```
154 |
155 | Do the same as before, without using the BamFile class, but with pysam only. The bam file needs to be sorted and indexed!
156 |
157 | ```
158 | #!python
159 | import pysam,cmseq
160 | bamHandle = pysam.AlignmentFile(BAM_PATH, "rb")
161 | lengths = dict((r,l) for r,l in zip(bamHandle.references,bamHandle.lengths))
162 | contig = cmseq.BamContig(bamHandle,TARGET_CONTIG,lengths[TARGET_CONTIG])
163 |
164 | print(contig.reference_free_consensus(consensus_rule=lambda array: '+' if sum(array.values()) >= 2 else '-'))
165 |
166 | ```
--------------------------------------------------------------------------------
/cmseq/__init__.py:
--------------------------------------------------------------------------------
1 | from cmseq.cmseq import CMSEQ_DEFAULTS
2 | from cmseq.cmseq import BamFile
3 | from cmseq.cmseq import BamContig
4 |
5 | __version__ = '1.0.4'
--------------------------------------------------------------------------------
/cmseq/breadth_depth.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | from cmseq.cmseq import CMSEQ_DEFAULTS
4 | from cmseq.cmseq import BamFile
5 | from cmseq import __version__
6 |
7 | import pandas as pd
8 | import numpy as np
9 | import argparse
10 |
11 |
12 |
13 | def bd_from_file():
14 |     parser = argparse.ArgumentParser(description="calculate the Breadth and Depth of coverage of BAMFILE.")
15 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
16 |
17 |     parser.add_argument('BAMFILE', help='The file on which to operate')
18 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
19 |     parser.add_argument('-f', help='If set unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) are excluded. If unset ALL reads are considered (bedtools genomecov style). Default: unset',action='store_true')
20 |     parser.add_argument('--sortindex', help='Sort and index the file',action='store_true')
21 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered',default=CMSEQ_DEFAULTS.minlen, type=int)
22 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: 30', type=int, default=CMSEQ_DEFAULTS.minqual)
23 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual.
Default: 1', type=int, default=CMSEQ_DEFAULTS.mincov) 24 | parser.add_argument('--truncate', help='Number of nucleotides that are truncated at either contigs end before calculating coverage values.', type=float, default=0) 25 | parser.add_argument('--combine', help='Combine all contigs into one giant contig and report it at the end', action='store_true') 26 | 27 | #print vars(args) 28 | args = parser.parse_args() 29 | si = True if args.sortindex else False 30 | mode = 'all' if args.f else 'nofilter' 31 | 32 | bf = BamFile(args.BAMFILE,sort=si,index=si,stepper=mode,minlen=args.minlen,filterInputList=args.contig,minimumReadsAligning=args.mincov) 33 | 34 | print('Contig\tBreadth\tDepth avg\tDepth median') 35 | 36 | all_coverage_values = [] 37 | for i in bf.get_contigs_obj(): 38 | bd_result = i.breadth_and_depth_of_coverage(minqual=args.minqual,mincov=args.mincov,trunc=args.truncate) 39 | 40 | if not all(np.isnan(x) for x in [bd_result[0],bd_result[1],bd_result[2]]): 41 | print (i.name+'\t'+str(bd_result[0])+'\t'+str(bd_result[1])+'\t'+str(bd_result[2])) 42 | 43 | if args.combine: 44 | all_coverage_values.extend(bd_result[3]) 45 | 46 | if args.combine: 47 | if np.all(np.isnan(all_coverage_values)): 48 | print ("all_contigs"+'\t-\t'+str("NaN")+'\t'+str("NaN")) 49 | else: 50 | print ("all_contigs"+'\t-\t'+str(np.nanmean(all_coverage_values)) + '\t'+str(np.nanmedian(all_coverage_values))) 51 | 52 | 53 | if __name__ == "__main__": 54 | bd_from_file() -------------------------------------------------------------------------------- /cmseq/cmseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | import os 4 | import pysam 5 | import numpy as np 6 | import math 7 | import sys 8 | from scipy import stats 9 | from collections import defaultdict 10 | import pickle,os 11 | 12 | def _initt(terminating_,_consensus_bamFile,_consensus_args): 13 | global terminating 14 | global consensus_args 15 | global consensus_bamFile 16 | terminating = terminating_ 17 | consensus_args = _consensus_args 18 | consensus_bamFile = _consensus_bamFile 19 | 20 | 21 | class CMSEQ_DEFAULTS: 22 | minqual = 30 23 | mincov = 1 24 | minlen = 0 25 | poly_error_rate = 0.001 26 | poly_pvalue_threshold = 0.01 27 | poly_dominant_frq_thrsh = 0.8 28 | trimReads = None 29 | 30 | 31 | class BamFile: 32 | bam_handle = None 33 | bamFile = None 34 | contigs = {} 35 | 36 | def __init__(self,bamFile,sort=False,index=False,stepper='nofilter',minlen=CMSEQ_DEFAULTS.minlen,filterInputList=None,minimumReadsAligning=None): 37 | if not os.path.isfile(bamFile): 38 | raise Exception(bamFile+' is not accessible, or is not a file') 39 | 40 | if sort: 41 | import subprocess 42 | fp = bamFile+'.sorted' 43 | subprocess.call(['samtools','sort',bamFile,'-o',bamFile+'.sorted']) 44 | else: fp = bamFile 45 | 46 | if index: pysam.index(fp) 47 | 48 | self.bamFile = fp 49 | 50 | bamHandle = pysam.AlignmentFile(fp, "rb") 51 | 52 | self.bam_handle = bamHandle 53 | 54 | if filterInputList is not None: 55 | 56 | toList=[] 57 | if isinstance(filterInputList, list): 58 | toList = filterInputList 59 | 60 | elif os.path.isfile(filterInputList): 61 | from Bio import SeqIO 62 | 63 | with open(filterInputList, "r") as infile: 64 | 65 | for record in SeqIO.parse(infile, "fasta"): 66 | 67 | toList.append(record.id) 68 | else: 69 | toList = [element for element in filterInputList.split(',')] 70 | 71 | if minimumReadsAligning: 72 | self.contigs = 
dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning)) 73 | else: 74 | self.contigs = dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList)) 75 | 76 | else: 77 | if minimumReadsAligning: 78 | self.contigs = dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning)) 79 | else: 80 | self.contigs = dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen)) 81 | 82 | def get_contigs(self): return iter(self.contigs.keys()) 83 | def get_contigs_obj(self): return iter(self.contigs.values()) 84 | def get_contig_by_label(self,contigID): return (self.contigs[contigID] if contigID in self.contigs else None) 85 | 86 | def parse_gff(self, inputGFF): 87 | ''' 88 | get a list of contigs plus 0-indexed gene-coordinates and sense-ness of protein coding regions from a gff file. 89 | Only tested with prokka GFF files. 90 | ''' 91 | from BCBio import GFF 92 | import Bio 93 | import re 94 | import warnings 95 | 96 | def rev_comp(string): 97 | string = string.upper() 98 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N' : 'N'} 99 | bases = list(string) 100 | bases = [complement[base] for base in bases] 101 | bases.reverse() 102 | return ''.join(bases) 103 | 104 | try: 105 | with open(inputGFF) as in_handle: 106 | _ = next(GFF.parse(in_handle)) 107 | except: 108 | print ('Parsing of GFF failed. This is probably because your biopython version is too new. Try downgrading to 1.76 or older') 109 | sys.exit(1) 110 | 111 | with open(inputGFF) as in_handle: 112 | 113 | for rec in GFF.parse(in_handle): 114 | tmp = [] 115 | for r in rec.features: 116 | if "minced" in r.qualifiers['source'][0] or "Minced" in r.qualifiers['source'][0]: 117 | # This catches CRISPR repeats. 118 | continue 119 | if r.sub_features: 120 | prodigal_bool = 'Prodigal' in r.sub_features[0].qualifiers['source'][0] or 'prodigal' in r.sub_features[0].qualifiers['source'][0] 121 | else: 122 | prodigal_bool = 'Prodigal' in r.qualifiers['source'][0] or 'prodigal' in r.qualifiers['source'][0] 123 | 124 | if prodigal_bool: 125 | # Prokka not only finds protein sequences, but also t-/r-RNA sequences. In order to only parse protein coding sequences, 126 | # I search for Prodigal/Prodigal in the source entry of the sub_features attribute. 127 | 128 | # the sub_features attribute of a seq_record object is apparently deprecated. I couldn't find any other way to access 129 | # the required information, though. Should probably be fixed when I can. 130 | indices = str(r.location).split('[')[1].split(']')[0].split(':') 131 | indices = [int(x) for x in indices] 132 | sense = str(r.location).split('(')[1].split(')')[0] 133 | if sense == "-": 134 | gene_seq = rev_comp(rec.seq[indices[0]:indices[1]]) 135 | else: 136 | gene_seq = rec.seq[indices[0]:indices[1]] 137 | 138 | if (str(gene_seq[0:3]) == "ATG" or str(gene_seq[0:3]) == "GTG" or str(gene_seq[0:3]) == "TTG"): 139 | pass 140 | else: 141 | warnings.warn(str(r.id) + " doesn't start with a common start codon. Beware. 
Continuing.") 142 | 143 | if (str(gene_seq[-3:]) == "TAG" or str(gene_seq[-3:]) == "TAA" or str(gene_seq[-3:]) == "TGA"): 144 | pass 145 | else: 146 | warnings.warn(str(r.id) + " doesn't stop with a usual stop codon. Beware. Continuing.") 147 | tmp.append((indices, sense)) 148 | 149 | if str(rec.id) in self.contigs: 150 | self.contigs[str(rec.id)].annotations.append(tmp) 151 | else: 152 | warnings.warn(str(rec.id) + " is not tracked by the BAMFile.") 153 | 154 | 155 | 156 | 157 | def parallel_reference_free_consensus(self,ncores=4,**kwargs): 158 | import multiprocessing as mp 159 | 160 | terminating = mp.Event() 161 | 162 | with mp.Pool(initializer=_initt, initargs=(terminating,self.bamFile,kwargs),processes=ncores) as pool: 163 | res= [x for x in pool.imap_unordered(BamFile._parallel_consensus_worker, self.contigs.keys())] 164 | return res 165 | 166 | @staticmethod 167 | def _parallel_consensus_worker(contigName): 168 | 169 | if not terminating.is_set(): 170 | try: 171 | t=BamFile(consensus_bamFile,filterInputList=[contigName]) 172 | return (contigName,t.get_contig_by_label(contigName).reference_free_consensus(**consensus_args)) 173 | except Exception as e: 174 | terminating.set() 175 | raise 176 | else: 177 | terminating.set() 178 | 179 | class BamContig: 180 | 181 | coverage = None 182 | consensus = '' 183 | name = None 184 | length = None 185 | stepper = 'nofilter' 186 | annotations = None 187 | 188 | def __init__(self,bamHandle,contigName,contigLength,stepper='nofilter'): 189 | 190 | self.name = contigName 191 | self.length = contigLength 192 | self.bam_handle = bamHandle 193 | self.stepper=stepper 194 | self.annotations = [] 195 | 196 | 197 | def set_stepper(self,ns): 198 | if ns in ['all','nofilter']: self.stepper = ns 199 | 200 | 201 | def majority_rule(data_array): 202 | freq_array= data_array['base_freq'] 203 | 204 | 205 | if any([v>0 for v in freq_array.values()]): 206 | return max(sorted(freq_array), key=freq_array.get) 207 | else: 208 | return 'N' 209 | 210 | def majority_rule_polymorphicLoci(data_array): 211 | 212 | # Masks the consensus sequence with "*" when a polymorphic locus is found according 213 | # to dominant_frq_thrsh defined p-value 214 | 215 | freq_array= data_array['base_freq'] 216 | poly_pvalue= data_array['p'] 217 | 218 | if poly_pvalue <= 0.05: 219 | return "*" 220 | elif any([v>0 for k,v in freq_array.items() if k != 'N']): 221 | return max(sorted(freq_array), key=freq_array.get) 222 | else: 223 | return 'N' 224 | 225 | def reference_free_consensus(self,consensus_rule=majority_rule,mincov=CMSEQ_DEFAULTS.mincov,minqual=CMSEQ_DEFAULTS.minqual,dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,noneCharacter='-',BAM_tagFilter=None, trimReads=None): 226 | 227 | consensus_positions = {} 228 | 229 | #print("A",mincov,minqual,dominant_frq_thrsh) 230 | for pileupcolumn,position_data in self.get_base_stats(min_read_depth=mincov, min_base_quality=minqual,dominant_frq_thrsh=dominant_frq_thrsh,BAM_tagFilter=BAM_tagFilter,trimReads=trimReads,error_rate=CMSEQ_DEFAULTS.poly_error_rate).items(): 231 | consensus_positions[pileupcolumn] = consensus_rule(position_data) 232 | 233 | if len(consensus_positions) > 0 : 234 | self.consensus = ''.join([(consensus_positions[position] if position in consensus_positions else noneCharacter) for position in range(1,self.length+1)]) 235 | else: 236 | self.consensus = noneCharacter*self.length 237 | 238 | #del consensus_positions 239 | 240 | return self.consensus 241 | 242 | 243 | 244 | def 
baseline_PSR(self,mincov=10,minqual=30,pvalue=0.01,error_rate=0.001,dominant_frq_thrsh=0.8,binom=None):
245 |         # This function estimates the polymorphic site rate over the input contig assuming that there are no truly polymorphic sites
246 |         # (meaning that all observed polymorphisms are due to random sequencing error). The test also puts a threshold on the "dominance"
247 |         # of the allele, meaning that it only reports a polymorphic base if the binomial test indicates significance AND the position is NOT sufficiently
248 |         # dominated by the dominant base. Defaults to 0.8 dominance (dominant / all).
249 |         from scipy import stats
250 |
251 |         polymorphic_empirical_loci = 0
252 |
253 |         # Get coverage values of the contig
254 |         depthsList = self.get_all_base_values('base_cov', min_base_quality=minqual,min_read_depth=mincov)
255 |
256 |         # Also get the dominant allele frequency of the contig
257 |         dominantFreq = self.get_all_base_values('ratio_max2all', min_base_quality=minqual,min_read_depth=mincov)
258 |
259 |         # For each position, draw depth-times from a bernoulli with success rate 1-error_rate.
260 |         # Determine significance based on a binomial test, as in the regular test for polymorphism.
261 |         for depth, da_freq in zip(depthsList, dominantFreq):
262 |             base_max = sum(stats.bernoulli.rvs(1-error_rate, size=depth))
263 |             if binom and base_max in binom and depth in binom[base_max]:
264 |                 p = binom[base_max][depth]
265 |             else:
266 |                 p = stats.binom.cdf(base_max, depth, 1.0-error_rate)
267 |             if p < pvalue and da_freq < dominant_frq_thrsh:
268 |                 polymorphic_empirical_loci+=1
269 |         PSR = float(polymorphic_empirical_loci) / float(len(depthsList))
270 |         return PSR
271 |
272 |     def get_base_stats_for_poly(self,minqual=CMSEQ_DEFAULTS.minqual):
273 |
274 |         from scipy import stats
275 |         import numpy
276 |         from collections import defaultdict
277 |         import pickle,os
278 |         from itertools import chain
279 |         import sys
280 |         import pandas as pd
281 |
282 |         ATCG=('A','C','G','T')
283 |         rev_dict={'A':'T', 'T':'A', 'G':'C', 'C':'G'}
284 |         def rev_pos(cur_pos, gene_start, gene_end):
285 |             # This function mirrors nucleotide positions in a gene on the gene's 'mid-part'
286 |             # (so nucleotide 1 in a gene is mapped to the last nucleotide in the gene, nucleotide 2 is mapped to the one before the last nucleotide on the gene, and so forth)
287 |             # The gene_length variable is misnamed. It should be called gene_length_minus_one..
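            # Worked example of the mirroring below: for a gene spanning [100, 110),
            # gene_length = (110 - 1) - 100 = 9, so cur_pos 100 maps to 109,
            # 101 maps to 108, ..., and 109 maps back to 100.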
288 | gene_length = ((gene_end - 1) - gene_start) 289 | distance_from_start = cur_pos - gene_start 290 | return(cur_pos + (gene_length - 2 * distance_from_start)) 291 | 292 | if not self.annotations: 293 | base_stats = [None] * self.length 294 | for base_pileup in self.bam_handle.pileup(self.name,stepper=self.stepper): 295 | 296 | base_freq = {'A':0,'C':0,'G':0,'T':0,'N':0} 297 | for matched_read in base_pileup.pileups: 298 | 299 | if not matched_read.is_del and not matched_read.is_refskip: 300 | b = matched_read.alignment.query_sequence[matched_read.query_position].upper() 301 | q = matched_read.alignment.query_qualities[matched_read.query_position] 302 | 303 | #print self.name,matched_read.query_position, b,q 304 | 305 | if q >= minqual and b in ATCG: base_freq[b] += 1 306 | #else: print "Q",q,"B",b 307 | #print "Filling",base_pileup.pos,"with", base_freq 308 | if sum(base_freq.values()) > 0: 309 | base_stats[base_pileup.pos] = ((base_freq['A'],base_freq['C'],base_freq['G'],base_freq['T']),base_pileup.pos) 310 | else: 311 | base_stats = [] 312 | # Generate pileups gene-wise 313 | # I use the 'truncate' parameter to only obtain the parsed start and stop positions. Without truncate, all positions with reads covering the parsed positions are returned. 314 | # I wrote a function that reverses a given gene position, which is used to effectively revert genes on the anti-sense strand. 315 | # Furthermore, for each read's nucleotide over a given position I write out the complement 316 | genes_and_positions = dict() 317 | for gene_idx in range(0, len(self.annotations[0])): 318 | genes_and_positions[gene_idx] = self.annotations[0][gene_idx] 319 | 320 | for gene_idx in genes_and_positions: 321 | gene_stats = [None] * (genes_and_positions[gene_idx][0][1] - genes_and_positions[gene_idx][0][0]) 322 | pos_on_gene = 0 323 | bam_pileup = self.bam_handle.pileup(self.name, int(genes_and_positions[gene_idx][0][0]), int(genes_and_positions[gene_idx][0][1]), stepper=self.stepper, truncate = True) 324 | if genes_and_positions[gene_idx][1] == "+": 325 | # If the gene is on the sense-strand, do the same as before. 326 | for base_pileup in bam_pileup: 327 | base_freq = {'A':0,'C':0,'G':0,'T':0,'N':0} 328 | for matched_read in base_pileup.pileups: 329 | if not matched_read.is_del and not matched_read.is_refskip: 330 | b = matched_read.alignment.query_sequence[matched_read.query_position].upper() 331 | q = matched_read.alignment.query_qualities[matched_read.query_position] 332 | if q >= minqual and b in ATCG: base_freq[b] += 1 333 | if sum(base_freq.values()) > 0: 334 | gene_stats[pos_on_gene] = ((base_freq['A'],base_freq['C'],base_freq['G'],base_freq['T']), base_pileup.pos) 335 | pos_on_gene += 1 336 | base_stats.extend(gene_stats) 337 | else: 338 | # If the gene is on the anti-sense strand, effectively return the reverse complement by mapping positions on a gene to it's mirrored position (using rev_pos) 339 | # and then also converting each nucleotide to it's complement. 340 | for base_pileup in bam_pileup: 341 | base_freq = {'A':0,'C':0,'G':0,'T':0,'N':0} 342 | for matched_read in base_pileup.pileups: 343 | if not matched_read.is_del and not matched_read.is_refskip: 344 | b = matched_read.alignment.query_sequence[matched_read.query_position].upper() 345 | q = matched_read.alignment.query_qualities[matched_read.query_position] 346 | # We have to increment the COMPLEMENT of each base when gene calls are on the reverse strand. 
347 | if q >= minqual and b in ATCG: base_freq[rev_dict[b]] += 1 348 | if sum(base_freq.values()) > 0: 349 | out_pos = rev_pos(cur_pos = int(pos_on_gene), gene_start = 0, gene_end = len(gene_stats)) 350 | contig_pos = rev_pos(cur_pos = int(base_pileup.pos), gene_start = genes_and_positions[gene_idx][0][0], gene_end = genes_and_positions[gene_idx][0][1]) 351 | gene_stats[out_pos] = ((base_freq['A'],base_freq['C'],base_freq['G'],base_freq['T']), contig_pos) 352 | pos_on_gene += 1 353 | if len(gene_stats) % 3 != 0: 354 | print("One of your genes' length is not a multiple of three. Check your gff file / gene calls.") 355 | print("Contig name", self.name) 356 | print("Gene position", genes_and_positions[gene_idx]) 357 | sys.exit() 358 | base_stats.extend(gene_stats) 359 | 360 | return base_stats 361 | 362 | def easy_polymorphism_rate(self,mincov=CMSEQ_DEFAULTS.mincov,minqual=CMSEQ_DEFAULTS.minqual,dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh): 363 | 364 | from Bio.Seq import Seq 365 | #from Bio.Alphabet import IUPAC 366 | 367 | bases = self.get_base_stats_for_poly(minqual=minqual) 368 | 369 | #list N-long where N is the number of covered bases (N <= L(contig)) 370 | dominanceList = [] 371 | mutationStats={'DN':0,'DS':0,'D?':0} 372 | 373 | explainList=[] 374 | 375 | codon_f1 = [] 376 | codon_f2 = [] 377 | 378 | for positionData in bases: 379 | # positionData= ((A,C,G,T),position) if covered, None if not. 380 | bases = ['N'] 381 | 382 | if positionData: 383 | nuclAbundance,position = positionData 384 | base_sum=sum(nuclAbundance) 385 | base_max=float(max(nuclAbundance)) 386 | dominance = float(base_max) / float(base_sum) 387 | 388 | if base_sum > mincov: 389 | 390 | dominanceList.append(dominance) 391 | tmpDict = dict((k,v) for k,v in zip(['A','C','G','T'],nuclAbundance)) 392 | bases = [k for k,v in sorted(tmpDict.items(), key = lambda x: x[1], reverse=True) if v>0] 393 | else: 394 | dominanceList.append(np.nan) 395 | else: 396 | dominanceList.append(np.nan) 397 | 398 | first_base = bases[0] 399 | second_base = bases[1] if (len(bases) > 1 and dominance < dominant_frq_thrsh) else bases[0] 400 | 401 | codon_f1.append(first_base) 402 | codon_f2.append(second_base) 403 | 404 | if len(codon_f1) == 3 and len(codon_f2) == 3: 405 | 406 | codon_s1 = Seq(''.join(codon_f1)) 407 | codon_s2 = Seq(''.join(codon_f2)) 408 | codon_t1 = codon_s1.translate() 409 | codon_t2 = codon_s2.translate() 410 | 411 | positionLabel = positionData[1] if positionData else 'ND' 412 | RD=None 413 | if codon_t1 == "X" or codon_t2 == "X": 414 | mutationStats['D?'] +=1 415 | RD="D?" 
362 |     def easy_polymorphism_rate(self, mincov=CMSEQ_DEFAULTS.mincov, minqual=CMSEQ_DEFAULTS.minqual, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh):
363 |
364 |         from Bio.Seq import Seq
365 |         #from Bio.Alphabet import IUPAC
366 |
367 |         bases = self.get_base_stats_for_poly(minqual=minqual)
368 |
369 |         # list of length N, where N is the number of covered bases (N <= L(contig))
370 |         dominanceList = []
371 |         mutationStats = {'DN':0,'DS':0,'D?':0}
372 |
373 |         explainList = []
374 |
375 |         codon_f1 = []
376 |         codon_f2 = []
377 |
378 |         for positionData in bases:
379 |             # positionData = ((A,C,G,T), position) if covered, None if not.
380 |             bases = ['N']
381 |
382 |             if positionData:
383 |                 nuclAbundance, position = positionData
384 |                 base_sum = sum(nuclAbundance)
385 |                 base_max = float(max(nuclAbundance))
386 |                 dominance = float(base_max) / float(base_sum)
387 |
388 |                 if base_sum > mincov:
389 |
390 |                     dominanceList.append(dominance)
391 |                     tmpDict = dict((k,v) for k,v in zip(['A','C','G','T'], nuclAbundance))
392 |                     bases = [k for k,v in sorted(tmpDict.items(), key=lambda x: x[1], reverse=True) if v > 0]
393 |                 else:
394 |                     dominanceList.append(np.nan)
395 |             else:
396 |                 dominanceList.append(np.nan)
397 |
398 |             first_base = bases[0]
399 |             second_base = bases[1] if (len(bases) > 1 and dominance < dominant_frq_thrsh) else bases[0]
400 |
401 |             codon_f1.append(first_base)
402 |             codon_f2.append(second_base)
403 |
404 |             if len(codon_f1) == 3 and len(codon_f2) == 3:
405 |
406 |                 codon_s1 = Seq(''.join(codon_f1))
407 |                 codon_s2 = Seq(''.join(codon_f2))
408 |                 codon_t1 = codon_s1.translate()
409 |                 codon_t2 = codon_s2.translate()
410 |
411 |                 positionLabel = positionData[1] if positionData else 'ND'
412 |                 RD = None
413 |                 if codon_t1 == "X" or codon_t2 == "X":
414 |                     mutationStats['D?'] += 1
415 |                     RD = "D?"
416 |                 elif codon_t1 != codon_t2:
417 |                     mutationStats['DN'] += 1
418 |                     RD = "DN"
419 |                 elif (codon_t1 == codon_t2) and (codon_s1 != codon_s2):
420 |                     mutationStats['DS'] += 1
421 |                     RD = "DS"
422 |
423 |                 codon_f1 = []
424 |                 codon_f2 = []
425 |
426 |         return (dominanceList, mutationStats)
427 |
428 |
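# A minimal sketch (illustration only) of the codon classification above,
# assuming Biopython's standard translation table:
#
#   Seq("ATG").translate() -> "M",  Seq("ATA").translate() -> "I"
#     -> different amino acids from the two codons: counted as DN (non-synonymous).
#   Seq("ATT").translate() -> "I",  Seq("ATC").translate() -> "I"
#     -> same amino acid from different codons: counted as DS (synonymous).
#   Any codon containing an 'N' translates to "X" and is counted as D? (ambiguous).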
429 |     def polymorphism_rate(self, mincov=CMSEQ_DEFAULTS.mincov, minqual=CMSEQ_DEFAULTS.minqual, pvalue=CMSEQ_DEFAULTS.poly_pvalue_threshold, error_rate=CMSEQ_DEFAULTS.poly_error_rate, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh):
430 |
431 |         base_values = self.get_base_stats(min_read_depth=mincov, min_base_quality=minqual, error_rate=error_rate, dominant_frq_thrsh=dominant_frq_thrsh)
432 |
433 |
434 |         rv = {}
435 |         rv['total_covered_bases'] = len(base_values)
436 |         rv['total_polymorphic_bases'] = 0
437 |
438 |         if len(base_values) > 0:
439 |             pb = sum([(1 if (info['p'] < pvalue and info['ratio_max2all'] < dominant_frq_thrsh) else 0) for pox, info in base_values.items()])
440 |
441 |
442 |             rv['total_polymorphic_bases'] = pb
443 |             rv['total_polymorphic_rate'] = float(pb) / float(len(base_values))
444 |
445 |             # If we have at least one polymorphic site
446 |             if pb > 0:
447 |
448 |                 rv['ratios'] = [info['ratio_max2all'] for pox, info in base_values.items() if (info['p'] < pvalue and info['ratio_max2all'] < dominant_frq_thrsh)]
449 |                 rv['dominant_allele_distr_mean'] = np.mean(rv['ratios'])
450 |                 rv['dominant_allele_distr_sd'] = np.std(rv['ratios'])
451 |
452 |                 for i in [10,20,30,40,50,60,70,80,90,95,98,99]:
453 |                     rv['dominant_allele_distr_perc_'+str(i)] = np.percentile(rv['ratios'], i)
454 |
455 |         return rv
456 |
457 |
458 |     def breadth_and_depth_of_coverage(self, mincov=10, minqual=30, trunc=0):
459 |         coverage_positions = {}
460 |         if self.length > trunc*2:
461 |             # Check if the contig is long enough to be truncated.
462 |             consid_r = range(int(trunc), int(self.length - trunc))
463 |         else:
464 |             # If a contig is too short to be truncated, ignore the truncation.
465 |             # This is not nice and should be improved.
466 |             consid_r = range(0, int(self.length))
467 |
468 |         for pileupcolumn in self.bam_handle.pileup(self.name, stepper=self.stepper):
469 |             # for each position
470 |             if pileupcolumn.pos in consid_r:
471 |                 tCoverage = 0
472 |                 for pileupread in pileupcolumn.pileups:
473 |                     # for each base at the position
474 |                     if not pileupread.is_del and not pileupread.is_refskip and pileupread.alignment.query_qualities[pileupread.query_position] >= minqual and pileupread.alignment.query_sequence[pileupread.query_position].upper() in ('A','T','C','G'):
475 |                         tCoverage += 1
476 |
477 |                 if tCoverage >= mincov:
478 |                     coverage_positions[pileupcolumn.pos] = tCoverage
479 |
480 |
481 |         if len(coverage_positions.keys()) > 0:
482 |             breadth = float(len(coverage_positions.keys())) / len(consid_r)
483 |             vals = list(coverage_positions.values())
484 |             avgdepth = np.mean(vals)
485 |             mediandepth = np.median(vals)
486 |
487 |             return (breadth, avgdepth, mediandepth, coverage_positions.values())
488 |         else:
489 |             return (np.nan, np.nan, np.nan, [np.nan])
490 |
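# Worked example (hypothetical numbers): for a 1,000 bp contig with trunc=0,
# if 800 positions pass the mincov/minqual filters with depths averaging 12x
# (median 11x), the call below would return approximately:
#
#   contig.breadth_and_depth_of_coverage(mincov=10, minqual=30)
#   -> (0.8, 12.0, 11.0, dict_values([...]))
#
# breadth is the fraction of considered positions that are covered
# (800 / 1000), while depth is averaged only over those covered positions.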
491 |     def depth_of_coverage(self, mincov=10, minqual=30):
492 |         return self.breadth_and_depth_of_coverage(mincov,minqual)[1]
493 |         #coverage_positions = {}
494 |         #for pileupcolumn in self.bam_handle.pileup(self.name,stepper=self.stepper):
495 |         #    if pileupcolumn.n >= mincov: coverage_positions[pileupcolumn.pos] = len([1 for pileupread in pileupcolumn.pileups if not pileupread.is_del and not pileupread.is_refskip and pileupread.alignment.query_qualities[pileupread.query_position] >= args.minqual and pileupread.alignment.query_sequence[pileupread.query_position].upper() in ('A','T','C','G') ])
496 |         #
497 |         #return (np.mean(coverage_positions.values()),np.median(coverage_positions.values()))
498 |
499 |
500 |     def breadth_of_coverage(self, mincov=10, minqual=30):
501 |         return self.breadth_and_depth_of_coverage(mincov,minqual)[0]
502 |
503 |     #------------------------------------------------------------------------------
504 |
505 |
506 |     def get_base_stats(self, min_read_depth=CMSEQ_DEFAULTS.mincov, min_base_quality=CMSEQ_DEFAULTS.minqual, error_rate=CMSEQ_DEFAULTS.poly_error_rate, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh, BAM_tagFilter=None, trimReads=None):
507 |         '''
508 |         get base frequencies and quality stats,
509 |         to use in get_all_base_values() and other functions
510 |         '''
511 |
512 |
513 |         if trimReads:
514 |             mask_head_until = int(trimReads[0]) if (trimReads[0] is not None and trimReads[0] != '') else 0
515 |             mask_tail_before = int(trimReads[1]) if (trimReads[1] is not None and trimReads[1] != '') else 0
516 |
517 |         base_stats = defaultdict(dict)
518 |
519 |         ATCG = ('A','T','C','G')
520 |
521 |
522 |         # for each position (column)
523 |
524 |
525 |
526 |         for base_pileup in self.bam_handle.pileup(self.name, stepper=self.stepper, min_base_quality=min_base_quality):
527 |             base_freq = {'A':0,'T':0,'C':0,'G':0,'N':0}
528 |
529 |             pos = base_pileup.pos + 1  # 1-based
530 |
531 |
532 |             # for each read composing the pile
533 |             for matched_read in base_pileup.pileups:
534 |                 if not matched_read.is_del and not matched_read.is_refskip:
535 |
536 |                     b = matched_read.alignment.query_sequence[matched_read.query_position].upper()
537 |                     q = matched_read.alignment.query_qualities[matched_read.query_position]
538 |                     #print("I am get_base_stats and this is read", matched_read.alignment.query_name, " (L=",matched_read.alignment.query_length," ) at position", matched_read.query_position, "it's a ", b)
539 |
540 |                     thisPositionBase = 'N'
541 |
542 |                     if not trimReads or (trimReads and ((matched_read.query_position >= mask_head_until) and (matched_read.query_position <= (matched_read.alignment.query_length - mask_tail_before)))):
543 |                         if b in ATCG:
544 |                             if BAM_tagFilter is None or all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
545 |                                 thisPositionBase = b
546 |
547 |                     base_freq[thisPositionBase] += 1
548 |
549 |             # calculate quality stats, ignoring N's
550 |             base_sum = sum([base_freq[b] for b in ATCG])
551 |             base_max = float(max([base_freq[b] for b in ATCG]))
552 |
553 |             if base_sum >= min_read_depth:
554 |                 r = base_max / base_sum
555 |                 #print r, dominant_frq_thrsh
556 |                 if r < dominant_frq_thrsh:
557 |                     # it makes sense to calculate the p-value
558 |                     p = stats.binom.cdf(base_max, base_sum, 1.0 - error_rate)
559 |                 else:
560 |                     p = 1.0
561 |
562 |
563 |                 base_stats[pos]['p'] = p  # quality measure
564 |                 base_stats[pos]['ratio_max2all'] = r  # dominant base versus others
565 |                 base_stats[pos]['base_cov'] = base_sum  # number of reads covering the base, not counting N's
566 |                 base_stats[pos]['base_freq'] = base_freq  # dict: {'A':4,'T':1,'C':2,'G':0,'N':0}
567 |
568 |         return base_stats
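# Worked example of the binomial test above (hypothetical counts): with
# error_rate = 0.001 and a position covered by 18 As and 2 Ts,
#
#   p = stats.binom.cdf(18, 20, 1.0 - 0.001)   # ~= 1.9e-04
#
# i.e. observing at most 18 "correct" bases out of 20 is very unlikely under
# sequencing error alone, so the position is a candidate polymorphic site
# (it must then also have ratio_max2all below dominant_frq_thrsh).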
569 |
570 |
571 |     def get_all_base_values(self, stats_value, *f_args, **f_kwargs):
572 |         '''
573 |         get list of p values (or 'ratio_max2all' etc.) for all bases that pass argument thresholds
574 |         p_all = a.get_contig_by_label('CONTIGNAME').get_all_base_values('p', min_base_quality=30)
575 |         '''
576 |         base_stats = self.get_base_stats(*f_args, **f_kwargs)
577 |         return [base_stats[k].get(stats_value, 'NaN') for k in base_stats]
578 |
579 |
580 |
581 | def loc_gte(a, b):
582 |     return a >= b
583 |
584 | def loc_lte(a, b):
585 |     return a <= b
586 |
587 | def loc_gt(a, b):
588 |     return a > b
589 |
590 | def loc_lt(a, b):
591 |     return a < b
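# The loc_* helpers above are looked up by name (via globals()) when a
# BAM_tagFilter is applied in get_base_stats. A minimal usage sketch
# (hypothetical contig object and threshold): keep only reads whose NM tag
# (edit distance) is at most 2 at each pileup position:
#
#   contig.get_base_stats(BAM_tagFilter=[('NM', 'loc_lte', 2)])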
--------------------------------------------------------------------------------
/cmseq/consensus_aDNA.py:
--------------------------------------------------------------------------------
69 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning))
70 |             else:
71 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList))
72 |
73 |         else:
74 |             if minimumReadsAligning:
75 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning))
76 |             else:
77 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen))
78 |
79 |     def get_contigs_obj(self): return iter(self.contigs.values())
80 |
81 |
82 | class BamContigAncient(BamContig):
83 |     ## Get base stats code goes here
84 |
85 |     def reference_free_consensus(self, consensus_rule=BamContig.majority_rule, mincov=CMSEQ_DEFAULTS.mincov,
86 |                                  minqual=CMSEQ_DEFAULTS.minqual, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,
87 |                                  noneCharacter='-', BAM_tagFilter=None, trimReads=None, post_damage_prob=None,
88 |                                  pos_prob_db=CMSEQ_DEFAULTS_Ancient.position_specific_prob, refseq_idx=None):
89 |
90 |         consensus_positions = {}
91 |
92 |         for pileupcolumn, position_data in self.get_base_stats(min_read_depth=mincov, min_base_quality=minqual,
93 |                 dominant_frq_thrsh=dominant_frq_thrsh, BAM_tagFilter=BAM_tagFilter, trimReads=trimReads,
94 |                 post_damage_prob=post_damage_prob, pos_prob_db=pos_prob_db, refseq_idx=refseq_idx).items():
95 |             ref_base_idx = self.name + '__' + str(pileupcolumn)
96 |
97 |
98 |             if float(position_data['ratio_max2all']) >= float(dominant_frq_thrsh):
99 |                 consensus_positions[pileupcolumn] = consensus_rule(dict((k,v) for k,v in position_data['base_freq'].items() if k != 'N'))
100 |
101 |         if len(consensus_positions) > 0:
102 |             self.consensus = ''.join([(consensus_positions[position] if position in consensus_positions else noneCharacter) for position in range(1, self.length + 1)])
103 |         else:
104 |             self.consensus = noneCharacter * self.length
105 |
106 |         del consensus_positions
107 |         return self.consensus
108 |
109 |     def get_base_stats(self, min_read_depth=CMSEQ_DEFAULTS.mincov, min_base_quality=CMSEQ_DEFAULTS.minqual,
110 |                        error_rate=CMSEQ_DEFAULTS.poly_error_rate, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,
111 |                        BAM_tagFilter=None, trimReads=None, post_damage_prob=CMSEQ_DEFAULTS_Ancient.position_specific_prob_thrsh,
112 |                        pos_prob_db=CMSEQ_DEFAULTS_Ancient.position_specific_prob, refseq_idx=None):
113 |
114 |         '''
115 |         get base frequencies and quality stats,
116 |         to use in get_all_base_values() and other functions
117 |         '''
118 |
119 |         from scipy import stats
120 |         from collections import defaultdict
121 |         import pickle, os
122 |
123 |
124 |         base_stats = defaultdict(dict)
125 |
126 |         ATCG = ('A','T','C','G')
127 |
128 |         # for each position (column)
129 |         for base_pileup in self.bam_handle.pileup(self.name, stepper=self.stepper):
130 |             base_freq = {'A':0,'T':0,'C':0,'G':0,'N':0}
131 |
132 |
133 |             pos = base_pileup.pos + 1  # 1-based
134 |
135 |             # for each read composing the pile
136 |
137 |             for matched_read in base_pileup.pileups:
138 |                 if not matched_read.is_del and not matched_read.is_refskip:
139 |
140 |
141 |                     b = matched_read.alignment.query_sequence[matched_read.query_position].upper()
142 |                     q = matched_read.alignment.query_qualities[matched_read.query_position]
143 |
144 |                     thisPositionBase = 'N'
145 |
146 |                     if post_damage_prob and pos_prob_db and refseq_idx:  # Enter position-specific mode
147 |                         ref_base_key = self.name + '__' + str(pos)
148 |                         ref_base = refseq_idx[ref_base_key]
149 |                         if matched_read.query_position <= 11:  # Check if the position is on the left (5') end of the read
150 |                             left_pos = matched_read.query_position + 1  # 1-based
151 |                             sub = ref_base + b
152 |                             if sub == 'CT' or sub == 'GA':  # Check if the reference-to-read substitution is C->T or G->A
153 |                                 prob = pos_prob_db[left_pos][sub]
154 |                                 # print(sub, prob, matched_read.alignment.query_length, matched_read.query_position, left_pos)
155 |                                 if (q >= min_base_quality) and (b in ATCG) and (prob <= post_damage_prob):
156 |                                     if BAM_tagFilter is None:
157 |                                         thisPositionBase = b
158 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
159 |                                         thisPositionBase = b
160 |                             else:
161 |                                 if (q >= min_base_quality) and (b in ATCG):
162 |                                     if BAM_tagFilter is None:
163 |                                         thisPositionBase = b
164 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
165 |                                         thisPositionBase = b
166 |
167 |                         elif (matched_read.alignment.query_length - matched_read.query_position) <= 11:  # Check if the position is on the right (3') end of the read
168 |                             right_pos = matched_read.query_position - matched_read.alignment.query_length
169 |                             sub = ref_base + b
170 |                             if sub == 'CT' or sub == 'GA':
171 |                                 prob = pos_prob_db[right_pos][sub]
172 |                                 # print(sub, prob, matched_read.alignment.query_length, matched_read.query_position, right_pos)
173 |                                 if (q >= min_base_quality) and (b in ATCG) and (prob <= post_damage_prob):
174 |                                     if BAM_tagFilter is None:
175 |                                         thisPositionBase = b
176 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
177 |                                         thisPositionBase = b
178 |                             else:
179 |                                 if (q >= min_base_quality) and (b in ATCG):
180 |                                     if BAM_tagFilter is None:
181 |                                         thisPositionBase = b
182 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
183 |                                         thisPositionBase = b
184 |                         else:
185 |                             # print(sub, matched_read.alignment.query_length, matched_read.query_position, "X")
186 |                             if (q >= min_base_quality) and (b in ATCG):
187 |                                 if BAM_tagFilter is None:
188 |                                     thisPositionBase = b
189 |                                 elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
190 |                                     thisPositionBase = b
191 |                     base_freq[thisPositionBase] += 1
192 |
193 |             # calculate quality stats, ignoring N's
194 |             base_sum = sum([base_freq[b] for b in ATCG])
195 |             base_max = float(max([base_freq[b] for b in ATCG]))
196 |
197 |             if base_sum >= min_read_depth:
198 |                 r = base_max / base_sum
199 |                 #print r, dominant_frq_thrsh
200 |                 if r < dominant_frq_thrsh:
201 |                     # it makes sense to calculate the p-value
202 |                     p = stats.binom.cdf(base_max, base_sum, 1.0 - error_rate)
203 |                 else:
204 |                     p = 1.0
205 |
206 |
207 |                 base_stats[pos]['p'] = p  # quality measure
208 |                 base_stats[pos]['ratio_max2all'] = r  # dominant base versus others
209 |                 base_stats[pos]['base_cov'] = base_sum  # number of reads covering the base, not counting N's
210 |                 base_stats[pos]['base_freq'] = base_freq  # dict: {'A':4,'T':1,'C':2,'G':0,'N':0}
211 |
212 |         return base_stats
213 |
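# A minimal sketch (hypothetical values) of the damage-aware filtering above.
# pos_prob_db maps a read position to the probability that a C->T (or G->A)
# mismatch at that position is post-mortem damage, e.g.:
#
#   pos_prob_db = {1: {'CT': 0.32, 'GA': 0.29}, 2: {'CT': 0.18, 'GA': 0.15}, ...}
#
# With post_damage_prob = 0.05, a C->T observed at read position 1 has
# prob = 0.32 > 0.05 and is therefore masked to 'N', while the same
# substitution in the middle of the read is kept (damage concentrates at
# read ends, which is why only the first/last 11 read positions are checked).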
214 | def consensus_from_file():
215 |
216 |     parser = argparse.ArgumentParser(description="outputs the consensus in FASTA format. Non-covered positions (and quality-trimmed positions) are reported as: N")
217 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
218 |
219 |     parser.add_argument('BAMFILE', help='The file on which to operate')
220 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
221 |     parser.add_argument('-f', help='If set, unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) reads are excluded. If unset, ALL reads are considered (bedtools genomecov style). Default: unset', action='store_true')
222 |     parser.add_argument('-r', '--refseq', help='Input the reference genome sequence', type=str)
223 |     parser.add_argument('--sortindex', help='Sort and index the file', action='store_true')
224 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: '+str(CMSEQ_DEFAULTS.minqual), type=int, default=CMSEQ_DEFAULTS.minqual)
225 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual. Default: '+str(CMSEQ_DEFAULTS.mincov), type=int, default=CMSEQ_DEFAULTS.mincov)
226 |     parser.add_argument('--dominant_frq_thrsh', help='Cutoff for degree of `allele dominance` for a position to be considered polymorphic. Default: '+str(CMSEQ_DEFAULTS.poly_dominant_frq_thrsh), type=float, default=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh)
227 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered. Default: '+str(CMSEQ_DEFAULTS.minlen), default=CMSEQ_DEFAULTS.minlen, type=int)
228 |     parser.add_argument('--pos_specific_prob_tab', help='Stats_out_MCMC_correct_prob table produced by mapDamage2. It contains the position-specific probability of observing a C->T or G->A due to post-mortem damage.', default=CMSEQ_DEFAULTS_Ancient.position_specific_prob, type=str)
229 |     parser.add_argument('--pos_damage_prob_thrsh', help='Maximum post-mortem damage probability for a nucleotide on a read to be considered when building the consensus.', default=CMSEQ_DEFAULTS_Ancient.position_specific_prob_thrsh, type=float)
230 |
231 |     args = parser.parse_args()
232 |
233 |     si = True if args.sortindex else False
234 |     mode = 'all' if args.f else 'nofilter'
235 |
236 |     bf = BamFileAncient(args.BAMFILE, sort=si, index=si, stepper=mode, minlen=args.minlen, filterInputList=args.contig)
237 |     #tl = [bf.get_contig_by_label(contig) for contig in args.contig.split(',')] if args.contig is not None else list(bf.get_contigs_obj())
238 |
239 |     lst = []
240 |     if args.pos_specific_prob_tab and args.pos_damage_prob_thrsh and args.refseq:
241 |         pos_specific_prob_db = [i.rstrip().split(',') for i in open(args.pos_specific_prob_tab).readlines()][1:]
242 |         stats_db = {}
243 |         for i in pos_specific_prob_db:
244 |             pos_ = int(i[1])
245 |             CT_ = float(i[2])
246 |             GA_ = float(i[3])
247 |             stats_db[pos_] = {'CT': CT_, 'GA': GA_}
248 |         pos_stats_db = stats_db
249 |         pos_prob_thrsh = args.pos_damage_prob_thrsh
250 |
251 |         RefSeq_dict = SeqIO.to_dict(SeqIO.parse(open(args.refseq), "fasta"))
252 |         RefSeq_idx = {}
253 |         for i in RefSeq_dict:
254 |             seq = RefSeq_dict[i].seq
255 |             for b_idx in range(len(seq)):
256 |                 RefSeq_idx[i+'__'+str(b_idx+1)] = seq[b_idx]
257 |
258 |     else:
259 |         pos_stats_db, pos_prob_thrsh, RefSeq_idx = None, None, None
260 |         sys.exit("Please provide the position-specific probability table from mapDamage2, the reference sequence, and the damage probability cap!")
261 |
262 |
263 |     for i in bf.get_contigs_obj():
264 |
265 |
266 |         sq = i.reference_free_consensus(mincov=args.mincov, minqual=args.minqual,
267 |                 dominant_frq_thrsh=args.dominant_frq_thrsh, noneCharacter='N',
268 |                 trimReads=None, post_damage_prob=pos_prob_thrsh, pos_prob_db=pos_stats_db, refseq_idx=RefSeq_idx)
269 |
270 |         if sq is not None:
271 |             lst.append(SeqRecord(Seq(sq), id=i.name+"_consensus", description=''))
272 |     SeqIO.write(lst, sys.stdout, 'fasta')
273 |
274 |
275 | if __name__ == "__main__":
276 |     consensus_from_file()
277 |
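# Sketch of the expected --pos_specific_prob_tab layout, assumed from the
# parsing above (comma-separated, one header line; position in the second
# column, C->T and G->A damage probabilities in the third and fourth; the
# values and row labels here are hypothetical):
#
#   ,Position,C.T,G.A
#   1,1,0.32,0.29
#   2,2,0.18,0.15
#   ...
#
# which yields stats_db = {1: {'CT': 0.32, 'GA': 0.29}, 2: {'CT': 0.18, 'GA': 0.15}, ...}.
# mapDamage2 tables also carry negative positions, which index the 3' end of
# the read (matching the negative right_pos computed in get_base_stats).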
--------------------------------------------------------------------------------
/cmseq/filter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pysam, sys
3 | import argparse
4 | import numpy as np
5 | from cmseq import __version__
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
9 |
10 | parser.add_argument('--minlen', help='Minimum length of the alignment for a read to pass', type=int, default=70)
11 | parser.add_argument('--minqual', help='Minimum average quality for a read to pass. It is computed over the fastq Phred scores of each read', type=int, default=30)
12 | parser.add_argument('--maxsnps', help='Maximum edit-distance rate on the alignment (NM value / alignment length) for a read to pass. It is computed from the NM tag of the BAM', type=float, default=1.0)
13 | parser.add_argument('--exclude_targets', help='Exclude these entries (FASTA file to filter out)')
14 | parser.add_argument('--exclude_reads_bam', help='Exclude these entries (BAM file to filter out)')
15 |
16 | args = parser.parse_args()
17 | if args.exclude_targets:
18 |     to_exclude = list(set([rec.strip() for rec in open(args.exclude_targets)]))
19 | else:
20 |     to_exclude = []
21 |
22 | if args.exclude_reads_bam:
23 |     ex_samfile = pysam.AlignmentFile(args.exclude_reads_bam, "rb")
24 |     reads_to_exclude = list(set([''.join(i.query_name.split('_')[:-1]) for i in ex_samfile.fetch(until_eof=True)]))
25 |
26 | else:
27 |     reads_to_exclude = []
28 |
29 | samfile = pysam.AlignmentFile("-", "rb")
30 | passingReads = pysam.AlignmentFile("-", "wb", template=samfile)
31 |
32 | for read in samfile.fetch():
33 |     alignment_len = int(read.query_alignment_length)
34 |     snps = read.get_tag('NM')
35 |
36 |     qualities = read.query_qualities
37 |     refname = read.reference_name
38 |     readname = read.query_name
39 |     snps_rate = float(snps) / float(read.query_alignment_length)
40 |     meanqualities = np.mean(read.query_qualities)
41 |
42 |     if (not read.is_secondary) and (alignment_len >= args.minlen) and (snps_rate <= args.maxsnps) and (meanqualities >= args.minqual) and (refname not in to_exclude) and (readname not in reads_to_exclude):
43 |         passingReads.write(read)
44 |
45 |
46 | passingReads.close()
47 | samfile.close()
48 |
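# filter.py reads a BAM stream on stdin and writes the passing reads as BAM
# on stdout (note the "-" file handles above), so it is meant to sit inside a
# samtools pipe. A usage sketch with hypothetical file names and thresholds:
#
#   samtools view -b sample.bam | filter.py --minlen 70 --minqual 30 --maxsnps 0.05 > sample.filtered.bam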
--------------------------------------------------------------------------------
/cmseq/poly.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import argparse
4 | import sys
5 | from cmseq import __version__
6 |
7 | from .cmseq import CMSEQ_DEFAULTS
8 | from .cmseq import BamFile
9 |
10 | def poly_from_file():
11 |     parser = argparse.ArgumentParser(description="Reports the polymorphic rate of each reference (polymorphic bases / total bases). Focuses only on covered regions (i.e. depth >= 1)")
12 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
13 |
14 |     parser.add_argument('BAMFILE', help='The file on which to operate')
15 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
16 |     parser.add_argument('-f', help='If set, unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) reads are excluded. If unset, ALL reads are considered (bedtools genomecov style). Default: unset', action='store_true')
17 |     parser.add_argument('--sortindex', help='Sort and index the file', action='store_true')
18 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered. Default: '+str(CMSEQ_DEFAULTS.minlen), default=CMSEQ_DEFAULTS.minlen, type=int)
19 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: '+str(CMSEQ_DEFAULTS.minqual), type=int, default=CMSEQ_DEFAULTS.minqual)
20 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual. Default: '+str(CMSEQ_DEFAULTS.mincov), type=int, default=CMSEQ_DEFAULTS.mincov)
21 |     parser.add_argument('--pvalue', help='Binomial p-value threshold for the binomial-polymorphic test. Default: '+str(CMSEQ_DEFAULTS.poly_pvalue_threshold), type=float, default=CMSEQ_DEFAULTS.poly_pvalue_threshold)
22 |     parser.add_argument('--seq_err', help='Sequencing error rate. Default: '+str(CMSEQ_DEFAULTS.poly_error_rate), type=float, default=CMSEQ_DEFAULTS.poly_error_rate)
23 |     parser.add_argument('--dominant_frq_thrsh', help='Cutoff for degree of `allele dominance` for a position to be considered polymorphic. Default: '+str(CMSEQ_DEFAULTS.poly_dominant_frq_thrsh), type=float, default=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh)
24 |     args = parser.parse_args()
25 |
26 |     import pandas as pd
27 |
28 |     si = True if args.sortindex else False
29 |     mode = 'all' if args.f else 'nofilter'
30 |
31 |     bf = BamFile(args.BAMFILE, sort=si, index=si, stepper=mode, minlen=args.minlen, filterInputList=args.contig)
32 |
33 |     outputDF = []
34 |     allRatios = []
35 |     allGenomeCol = {'referenceID': '-GENOME-', 'total_covered_bases': 0, 'total_polymorphic_bases': 0, 'total_polymorphic_rate': np.nan}
36 |
37 |     for element in bf.get_contigs_obj():
38 |
39 |         tld = element.polymorphism_rate(minqual=args.minqual, mincov=args.mincov, error_rate=args.seq_err, dominant_frq_thrsh=args.dominant_frq_thrsh)
40 |         tld['referenceID'] = element.name
41 |
42 |         allGenomeCol['total_covered_bases'] += tld['total_covered_bases']
43 |         allGenomeCol['total_polymorphic_bases'] += tld['total_polymorphic_bases']
44 |         if 'ratios' in tld:
45 |             allRatios = allRatios + tld['ratios']
46 |             del tld['ratios']
47 |
48 |         outputDF.append(tld)
49 |         del tld
50 |
51 |
52 |     if float(allGenomeCol['total_covered_bases']) > 0 and float(allGenomeCol['total_polymorphic_bases']) > 0:
53 |
54 |         allGenomeCol['total_polymorphic_rate'] = float(allGenomeCol['total_polymorphic_bases']) / float(allGenomeCol['total_covered_bases'])
55 |         allGenomeCol['dominant_allele_distr_mean'] = np.mean(allRatios)
56 |         allGenomeCol['dominant_allele_distr_sd'] = np.std(allRatios)
57 |
58 |         for i in [10,20,30,40,50,60,70,80,90,95,98,99]:
59 |             allGenomeCol['dominant_allele_distr_perc_'+str(i)] = np.percentile(allRatios, i)
60 |
61 |
62 |     outputDF.append(allGenomeCol)
63 |
64 |     pd.DataFrame.from_dict(outputDF).set_index('referenceID').to_csv(sys.stdout, sep='\t')
65 |
66 | if __name__ == "__main__":
67 |     poly_from_file()
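# Output sketch (tab-separated; the numbers are hypothetical). Each reference
# gets one row, plus a final '-GENOME-' row aggregating all references:
#
#   referenceID    total_covered_bases    total_polymorphic_bases    total_polymorphic_rate    ...
#   contig_1       95422                  12                         0.000126                  ...
#   -GENOME-       188063                 27                         0.000144                  ...
#
# The dominant_allele_distr_* columns are filled only when at least one
# polymorphic site was found.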
--------------------------------------------------------------------------------
/cmseq/polymut.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import argparse
4 | import sys
5 | from cmseq import __version__
6 |
7 | from .cmseq import CMSEQ_DEFAULTS
8 | from .cmseq import BamFile
9 |
10 | def polymut_from_file():
11 |     parser = argparse.ArgumentParser(description="Reports the number of non-synonymous (DN), synonymous (DS) and ambiguous (D?) mutations over protein-coding genes. Focuses only on covered regions (i.e. depth >= 1)")
12 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
13 |
14 |     parser.add_argument('BAMFILE', help='The file on which to operate')
15 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
16 |     parser.add_argument('-f', help='If set, unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) reads are excluded. If unset, ALL reads are considered (bedtools genomecov style). Default: unset', action='store_true')
17 |     parser.add_argument('--sortindex', help='Sort and index the file', action='store_true')
18 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered. Default: '+str(CMSEQ_DEFAULTS.minlen), default=CMSEQ_DEFAULTS.minlen, type=int)
19 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: '+str(CMSEQ_DEFAULTS.minqual), type=int, default=CMSEQ_DEFAULTS.minqual)
20 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual. Default: '+str(CMSEQ_DEFAULTS.mincov), type=int, default=CMSEQ_DEFAULTS.mincov)
21 |     parser.add_argument('--dominant_frq_thrsh', help='Cutoff for degree of `allele dominance` for a position to be considered polymorphic. Default: '+str(CMSEQ_DEFAULTS.poly_dominant_frq_thrsh), type=float, default=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh)
22 |     parser.add_argument('--gff_file', help="GFF file used to extract protein-coding genes", default=None)
23 |     args = parser.parse_args()
24 |
25 |     import pandas as pd
26 |
27 |     outputDicts = []
28 |
29 |     si = True if args.sortindex else False
30 |     mode = 'all' if args.f else 'nofilter'
31 |
32 |     bf = BamFile(args.BAMFILE, sort=si, index=si, stepper=mode, minlen=args.minlen, filterInputList=args.contig)
33 |
34 |     if (args.gff_file):
35 |         bf.parse_gff(args.gff_file)
36 |
37 |     for i in bf.get_contigs_obj():
38 |         dominanceArray, mutationStats = i.easy_polymorphism_rate(minqual=args.minqual, mincov=args.mincov, dominant_frq_thrsh=args.dominant_frq_thrsh)
39 |         outputDicts.append({'Ref': i.name, 'DN': mutationStats['DN'], 'DS': mutationStats['DS'], 'D?': mutationStats['D?'], "consid_pos": len([x for x in dominanceArray if not np.isnan(x)])})
40 |     out_df = pd.DataFrame.from_dict(outputDicts).set_index('Ref')
41 |     print(float(np.sum(out_df["DN"])), float(np.sum(out_df["DS"])), float(sum(out_df["consid_pos"])))
42 |
43 | if __name__ == "__main__":
44 |     polymut_from_file()
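# Output sketch (hypothetical run and numbers): polymut.py prints three
# space-separated numbers on one line -- the non-synonymous (DN) sites, the
# synonymous (DS) sites, and the number of sufficiently covered positions,
# each summed over all references:
#
#   polymut.py sample.bam --gff_file genes.gff --mincov 10 --minqual 30
#   42.0 128.0 1830456.0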
--------------------------------------------------------------------------------
/recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "cmseq" %}
2 | {% set version = "1.0" %}
3 |
4 | package:
5 |   name: {{ name }}
6 |   version: {{ version }}
7 |
8 | source:
9 |   url: https://github.com/fbeghini/cmseq/archive/{{version}}.tar.gz
10 |   sha256: 37f202a6bf6668ebda5ccfdab878af277a46965e39fa1c4602dc12c0ca67b079
11 |
12 | build:
13 |   noarch: python
14 |   script: {{ PYTHON }} -m pip install . -vv
15 |
16 | requirements:
17 |   host:
18 |     - python
19 |     - pip
20 |     - numpy
21 |   run:
22 |     - python
23 |     - samtools >=1.0
24 |     - numpy
25 |     - scipy
26 |     - pysam
27 |     - pandas
28 |     - biopython
29 |     - bcbio-gff
30 |
31 | test:
32 |   commands:
33 |     - breadth_depth.py --help
34 |     - consensus.py --help
35 |     - polymut.py --help
36 |     - poly.py --help
37 | about:
38 |   home: https://github.com/SegataLab/cmseq
39 |   summary: Set of utilities on sequences and BAM files
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | import codecs
3 | from setuptools.command.install import install
4 | from io import open
5 | import os
6 |
7 | def read(rel_path):
8 |     here = os.path.abspath(os.path.dirname(__file__))
9 |     with codecs.open(os.path.join(here, rel_path), "r") as fp:
10 |         return fp.read()
11 |
12 |
13 | def get_version(rel_path):
14 |     for line in read(rel_path).splitlines():
15 |         if line.startswith("__version__"):
16 |             delim = '"' if '"' in line else "'"
17 |             return line.split(delim)[1]
18 |     else:
19 |         raise RuntimeError("Unable to find version string.")
20 |
21 | install_requires = ["numpy", "scipy", "pysam", "pandas", "biopython", "bcbio-gff"]
22 | setuptools.setup(
23 |     name='CMSeq',
24 |     version=get_version("cmseq/__init__.py"),
25 |     author='Moreno Zolfo',
26 |     author_email='moreno.zolfo@unitn.it',
27 |     url='http://github.com/SegataLab/cmseq/',
28 |     license='LICENSE.txt',
29 |     packages=setuptools.find_packages(),
30 |     entry_points={
31 |         'console_scripts': [
32 |             'breadth_depth.py = cmseq.breadth_depth:bd_from_file',
33 |             'consensus.py = cmseq.consensus:consensus_from_file',
34 |             'consensus_aDNA.py = cmseq.consensus_aDNA:consensus_from_file',
35 |             'polymut.py = cmseq.polymut:polymut_from_file',
36 |             'poly.py = cmseq.poly:poly_from_file'
37 |         ]
38 |     },
39 |     long_description_content_type='text/markdown',
40 |     long_description=open('README.md').read(),
41 |     description='Set of utilities on sequences and BAM files',
42 |     install_requires=install_requires
43 | )
--------------------------------------------------------------------------------