├── .github └── workflows │ ├── cmseq_ci.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── README_class.md ├── cmseq ├── __init__.py ├── breadth_depth.py ├── cmseq.py ├── consensus.py ├── consensus_aDNA.py ├── filter.py ├── poly.py └── polymut.py ├── recipe └── meta.yaml └── setup.py /.github/workflows/cmseq_ci.yml: -------------------------------------------------------------------------------- 1 | name: CMSeq_ci 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.6, 3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install flake8 pytest 23 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 24 | # - name: Lint with flake8 25 | # run: | 26 | # # stop the build if there are Python syntax errors or undefined names 27 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 28 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 29 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 30 | - name: Install CMSeq 31 | run: | 32 | pip install . 33 | - name: Check cmseq help message 34 | run: | 35 | breadth_depth.py --help 36 | consensus_aDNA.py --help 37 | consensus.py --help 38 | poly.py --help 39 | polymut.py --help -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Moreno Zolfo, Nicolai Karcher 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CMSeq #
2 |
3 | CMSeq is a set of commands to provide an interface to .bam files for coverage and sequence consensus.
4 |
5 | **Requires:**
6 |
7 | * samtools (> 1.x)
8 | * numpy
9 | * pysam
10 | * pandas
11 | * Biopython with bcbio-gff module _(warning: Biopython <= 1.76 is required for `polymut.py`)_
12 |
13 | ## Installation
14 |
15 | [![PyPi version](https://pypip.in/v/CMSeq/badge.png)](https://pypi.org/project/CMSeq/) [![Anaconda-Server Badge](https://anaconda.org/bioconda/cmseq/badges/version.svg)](https://anaconda.org/bioconda/cmseq) [![Anaconda-Server Badge](https://anaconda.org/bioconda/cmseq/badges/installer/conda.svg)](https://conda.anaconda.org/bioconda)
16 |
17 | ### Install via source and pip ###
18 |
19 | ```
20 | git clone https://github.com/SegataLab/cmseq.git
21 | pip install .
22 | ```
23 |
24 | ### Install via PyPi ###
25 |
26 | `pip install CMSeq`
27 |
28 | ### Install via Bioconda ###
29 |
30 | `conda install -c bioconda cmseq`
31 |
32 | ## Functions
33 |
34 | * [Breadth and Depth of Coverage](#breadth-and-depth-of-coverage-with-breadth_depthpy)
35 | * [Polymorphic Rate on CDS](#polymorphic-rate-over-protein-coding-genes-with-polymutpy)
36 | * [Polymorphic Rate on the whole genome](#polymorphic-rate-with-polypy)
37 | * [Reference free consensus](#reference-free-but-guided-consensus-with-consensuspy)
38 |
39 | **Note: CMSeq can be used [as python module](README_class.md) as well**
40 |
41 | ## Breadth and Depth of coverage with breadth_depth.py
42 |
43 | Provides breadth and depth of coverage of the references of a BAM alignment file, in tabular format. The file must be sorted and indexed (alternatively, --sortindex can be used).
44 |
45 |
46 | ```
47 | usage: breadth_depth.py [-h] [-c REFERENCE ID] [-f] [--sortindex]
48 |                         [--minlen MINLEN] [--minqual MINQUAL]
49 |                         [--mincov MINCOV] [--truncate TRUNCATE]
50 |                         BAMFILE
51 | ```
52 |
53 |
54 | ```
55 | positional arguments:
56 |   BAMFILE               The file on which to operate
57 |
58 | optional arguments:
59 |   -h, --help            show this help message and exit
60 |   -c REFERENCE ID, --contig REFERENCE ID
61 |                         Focus on a subset of references in the BAM file. Can
62 |                         be a list of references separated by commas or a FASTA
63 |                         file (the IDs are used to subset)
64 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
65 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
66 |                         unset ALL reads are considered (bedtools genomecov
67 |                         style). Default: unset
68 |   --sortindex           Sort and index the file
69 |   --minlen MINLEN       Minimum Reference Length for a reference to be
70 |                         considered
71 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
72 |                         than this will be discarded. This is performed BEFORE
73 |                         --mincov. Default: 30
74 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
75 |                         calculation. Positions with a lower depth of coverage
76 |                         will be discarded (i.e. considered as zero-coverage
77 |                         positions). This is calculated AFTER --minqual.
78 |                         Default: 1
79 |   --truncate TRUNCATE   Number of nucleotides that are truncated at either
80 |                         contigs end before calculating coverage values.
81 |
82 | ```
83 |
84 |
85 | Breadth and Depth of coverage outputs a table with the breadth of coverage, average and median depth-of-coverage of each reference.
Values are calculated only on the covered portion of the reference:
86 |
87 | |contig|Breadth|Depth (avg)|Depth (median)|
88 | |------|-------|-----------|--------------|
89 | |EF401177.1.1491|0.101274312542|1.0|1.0|
90 | |EF405039.1.1494|0.101070950469|2.69536423841|3.0|
91 | |all_contigs|-|1.84768211921|1.0|
92 |
93 | The last line is a summary line calculated as if all the reads were coming from the same (big) contig.
94 |
95 | ### Examples ###
96 |
97 | Extract breadth and depth of coverage for all the references within a sorted and indexed `BAM` file
98 |
99 |
100 | ```
101 | breadth_depth.py mybam.sorted.bam
102 | ```
103 |
104 | Extract breadth and depth of coverage for all the references within an unsorted `BAM` file
105 |
106 |
107 | ```
108 | breadth_depth.py --sortindex mybam.unsorted.bam
109 | ```
110 |
111 | Extract breadth and depth of coverage for all the references within a sorted `BAM` file, counting only bases with a minimum quality of 20 and positions with a minimum coverage of 10
112 |
113 |
114 | ```
115 | breadth_depth.py --mincov 10 --minqual 20 mybam.bam
116 | ```
117 |
118 | Extract breadth and depth of coverage for the references genome_1 and genome_2 within a sorted `BAM` file
119 |
120 |
121 | ```
122 | breadth_depth.py -c genome_1,genome_2 mybam.bam
123 | ```
124 |
125 | Extract breadth and depth of coverage for the references present in MYFASTA.fasta, within a sorted `BAM` file
126 |
127 |
128 | ```
129 | breadth_depth.py -c MYFASTA.fasta mybam.sorted.bam
130 | ```
131 |
132 | ## Polymorphic rate over protein-coding genes with polymut.py
133 |
134 | **Warning:** Biopython <= 1.76 is required for `polymut.py`
135 |
136 | This function calculates polymorphic site rates over protein-coding genes. It considers dominant and second-dominant alleles over protein-coding genes on the nucleotide level, translates the ORFs into proteins and then calculates and outputs the number of
137 | synonymous and non-synonymous mutations (on the protein level) between the dominant and second-dominant protein sequences.
138 | Positions with a ratio between second-dominant and dominant allele coverage smaller than dominant_frq_thrsh are considered non-variant.
139 |
140 | This function was used in [Pasolli et al., 2019](https://pubmed.ncbi.nlm.nih.gov/30661755/) as an ad-hoc measure to calculate strain heterogeneity in metagenomes.
141 |
142 | Since the likelihood of finding more than one strain in the same gut varies strongly across gut commensals (as does within-species genetic diversity), this function does not allow a rigorous classification of metagenomes into strain-mixed and non-strain-mixed. However, it can be shown that - considering polymorphic site rates over e.g. the core genes of any given species - samples with a higher polymorphic site rate are more likely to harbour more than one strain.
143 |
144 | Please supply a GFF file from Prokka and make sure that the contig names in the BAM file and the GFF file can be matched.
145 |
146 |
147 | ```
148 | usage: polymut.py [-h] [-c REFERENCE ID] [-f] [--sortindex] [--minlen MINLEN]
149 |                   [--minqual MINQUAL] [--mincov MINCOV]
150 |                   [--dominant_frq_thrsh DOMINANT_FRQ_THRSH]
151 |                   [--gff_file GFF_FILE]
152 |                   BAMFILE
153 |
154 | Reports the polymorphic rate of each reference (polymorphic bases / total
155 | bases). Focuses only on covered regions (i.e.
depth >= 1)
156 |
157 | positional arguments:
158 |   BAMFILE               The file on which to operate
159 |
160 | optional arguments:
161 |   -h, --help            show this help message and exit
162 |   -c REFERENCE ID, --contig REFERENCE ID
163 |                         Focus on a subset of references in the BAM file. Can
164 |                         be a list of references separated by commas or a FASTA
165 |                         file (the IDs are used to subset)
166 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
167 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
168 |                         unset ALL reads are considered (bedtools genomecov
169 |                         style). Default: unset
170 |   --sortindex           Sort and index the file
171 |   --minlen MINLEN       Minimum Reference Length for a reference to be
172 |                         considered. Default: 0
173 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
174 |                         than this will be discarded. This is performed BEFORE
175 |                         --mincov. Default: 30
176 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
177 |                         calculation. Positions with a lower depth of coverage
178 |                         will be discarded (i.e. considered as zero-coverage
179 |                         positions). This is calculated AFTER --minqual.
180 |                         Default: 1
181 |   --dominant_frq_thrsh DOMINANT_FRQ_THRSH
182 |                         Cutoff for degree of `allele dominance` for a position
183 |                         to be considered polymorphic. Default: 0.8
184 |   --gff_file GFF_FILE   GFF file used to extract protein-coding genes
185 |
186 | ```
187 |
188 | The function prints three values:
189 | * the total number of non-synonymous mutations
190 | * the total number of synonymous mutations
191 | * the total number of considered positions (the number of positions with coverage higher than the value specified with --mincov)
192 |
193 | Please note that this function is meant to be used on multi-contig genomes, so **polymut.py reports the sum of non-synonymous and synonymous positions** for all the contigs considered. If you specify a list of contigs with `-c`, only those will be considered.
194 |
195 |
196 | ### Examples ###
197 |
198 | Calculate the number of non-synonymous mutations, synonymous mutations and the total number of considered positions (on the nucleotide level!) over your contig of interest.
199 |
200 | ```
201 | python polymut.py -c "contig_of_interest" bam_of_interest.bam --mincov 10 --minqual 30 --dominant_frq_thrsh 0.8 --gff_file gff_from_prokka.gff
202 | ```
203 |
204 | ## Polymorphic Rate with poly.py
205 |
206 | Provides the polymorphic rate of each reference in a sorted and indexed BAMFILE. The polymorphic rate is defined as: number_of_polymorphic_sites / number_of_total_nucleotides. Beware that *number_of_total_nucleotides* depends on --minqual and --mincov: a position that is not covered (e.g. coverage = 0) is not counted in the denominator.
207 |
208 |
209 | ```
210 | usage: poly.py [-h] [-c REFERENCE ID] [-f] [--sortindex] [--minlen MINLEN]
211 |                [--minqual MINQUAL] [--mincov MINCOV] [--pvalue PVALUE]
212 |                [--seq_err SEQ_ERR] [--dominant_frq_thrsh DOMINANT_FRQ_THRSH]
213 |                BAMFILE
214 |
215 | Reports the polymorphic rate of each reference (polymorphic bases / total
216 | bases). Focuses only on covered regions (i.e. depth >= 1)
217 |
218 | positional arguments:
219 |   BAMFILE               The file on which to operate
220 |
221 | optional arguments:
222 |   -h, --help            show this help message and exit
223 |   -c REFERENCE ID, --contig REFERENCE ID
224 |                         Focus on a subset of references in the BAM file.
Can
225 |                         be a list of references separated by commas or a FASTA
226 |                         file (the IDs are used to subset)
227 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
228 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
229 |                         unset ALL reads are considered (bedtools genomecov
230 |                         style). Default: unset
231 |   --sortindex           Sort and index the file
232 |   --minlen MINLEN       Minimum Reference Length for a reference to be
233 |                         considered. Default: 0
234 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
235 |                         than this will be discarded. This is performed BEFORE
236 |                         --mincov. Default: 30
237 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
238 |                         calculation. Positions with a lower depth of coverage
239 |                         will be discarded (i.e. considered as zero-coverage
240 |                         positions). This is calculated AFTER --minqual.
241 |                         Default: 1
242 |   --pvalue PVALUE       Binomial p-value threshold for the binomial-polymorphic
243 |                         test. Default: 0.01
244 |   --seq_err SEQ_ERR     Sequencing error rate. Default: 0.001
245 |   --dominant_frq_thrsh DOMINANT_FRQ_THRSH
246 |                         Cutoff for degree of `allele dominance` for a position
247 |                         to be considered polymorphic. Default: 0.8
248 |
249 | ```
250 |
251 | The output is structured as follows:
252 |
253 | ```
254 | |referenceID|dominant_allele_distr_mean|dominant_allele_distr_perc_10|...|dominant_allele_distr_sd|tot_covered_bases|tot_polymorphic_bases|polymorphic_rate|
255 | |----|----|----|----|----|----|----|----|
256 | |EF401177.1.1491|-|-|...|-|151.00|0.00|0.00|
257 | |EF405039.1.1494|0.65|0.67|...|0.04|151.00|13.00|0.09|
258 | |-GENOME-|0.65|0.67|...|0.04|302.00|13.00|0.04|
259 | ```
260 |
261 | As for ``breadth_depth.py``, the polymorphic rate analysis is subject to ``mincov``, ``minqual``, and ``minlen``. Additionally, two parameters (``dominant_frq_thrsh`` and ``pvalue``) decide when a site is considered polymorphic:
262 |
263 |
264 | * ``dominant_frq_thrsh`` is a frequency threshold: if the majority (dominant) allele frequency at position x is greater than the threshold, x is considered non-polymorphic. Otherwise, a binomial test is performed to assess whether x is polymorphic (polymorphic if p < ``pvalue``)
265 |
266 |
267 | ### Examples ###
268 |
269 | Extract polymorphic rate from a sorted and indexed bam file
270 |
271 |
272 | ```
273 | poly.py mybam.sorted.bam
274 | ```
275 |
276 |
277 | Extract polymorphic rate from an unsorted bam file
278 |
279 |
280 | ```
281 | poly.py --sortindex mybam.unsorted.bam
282 | ```
283 |
284 |
285 | Extract polymorphic rate from an unsorted bam file, counting only bases with minimum quality of 30 and minimum position-coverage of 10
286 |
287 |
288 | ```
289 | poly.py --sortindex --mincov 10 --minqual 30 mybam.unsorted.bam
290 | ```
291 |
292 |
293 | Extract polymorphic rate from an unsorted bam file, only for reads aligning against genome_1 or genome_2. Consider polymorphic only sites with majority-allele frequency < 70%
294 |
295 |
296 | ```
297 | poly.py --sortindex -c genome_1,genome_2 --dominant_frq_thrsh 0.7 mybam.unsorted.bam
298 | ```
299 |
300 |
301 | ## Reference Free (but guided) consensus with consensus.py
302 |
303 | Provides the Reference Free consensus for the references in a BAM alignment file, reconstructing the sequence from the raw reads, in FASTA format to standard output. The file must be sorted and indexed (alternatively, --sortindex can be used). Note that the length of the reconstructed sequence is bounded by the original length of the reference. Over that length, not all the positions may be covered.
This can happen because:
304 |
305 | * there are no reads mapping to the position
306 | * there are too few reads (*i.e. < ``mincov``*) mapping to the position
307 | * the reads that map to the position have a low quality (*i.e. < ``minqual``*)
308 | * the distribution of nucleotides at that position is potentially problematic (*i.e. dominant_allele_frequency < ``dominant_frq_thrsh``*): in this case, the position is excluded to reduce noise.
309 |
310 |
311 | ```
312 | usage: consensus.py [-h] [-c REFERENCE ID] [-f] [--sortindex]
313 |                     [--minqual MINQUAL] [--mincov MINCOV]
314 |                     [--dominant_frq_thrsh DOMINANT_FRQ_THRSH]
315 |                     [--minlen MINLEN] [--trim TRIM]
316 |                     BAMFILE
317 |
318 | outputs the consensus in FASTA format. Non-covered positions (or quality-
319 | trimmed positions) are reported as dashes: -
320 |
321 | positional arguments:
322 |   BAMFILE               The file on which to operate
323 |
324 | optional arguments:
325 |   -h, --help            show this help message and exit
326 |   -c REFERENCE ID, --contig REFERENCE ID
327 |                         Focus on a subset of references in the BAM file. Can
328 |                         be a list of references separated by commas or a FASTA
329 |                         file (the IDs are used to subset)
330 |   -f                    If set unmapped (FUNMAP), secondary (FSECONDARY), qc-
331 |                         fail (FQCFAIL) and duplicate (FDUP) are excluded. If
332 |                         unset ALL reads are considered (bedtools genomecov
333 |                         style). Default: unset
334 |   --sortindex           Sort and index the file
335 |   --minqual MINQUAL     Minimum base quality. Bases with quality score lower
336 |                         than this will be discarded. This is performed BEFORE
337 |                         --mincov. Default: 30
338 |   --mincov MINCOV       Minimum position coverage to perform the polymorphism
339 |                         calculation. Positions with a lower depth of coverage
340 |                         will be discarded (i.e. considered as zero-coverage
341 |                         positions). This is calculated AFTER --minqual.
342 |                         Default: 0
343 |   --dominant_frq_thrsh DOMINANT_FRQ_THRSH
344 |                         Cutoff for degree of `allele dominance` for a position
345 |                         to be considered polymorphic. Default: 0.8
346 |   --minlen MINLEN       Minimum Reference Length for a reference to be
347 |                         considered. Default: 0
348 |   --trim TRIM           Trim the reads before computing the consensus.
349 |                         A value of 10:10 means that the first and last 10 positions
350 |                         of each read will be ignored. Default: None
351 | ```
352 |
353 |
354 |
355 | Note that positions with a majority allele frequency lower than dominant_frq_thrsh will be considered "problematic" and substituted with a "-", even with sufficient coverage and quality.
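To make the masking rule concrete, here is a minimal sketch (illustrative only, not CMSeq's internal API) of how a single pileup column is resolved under ``mincov`` and ``dominant_frq_thrsh``:

```
# Illustrative sketch, not CMSeq's API: resolve one pileup column into a
# consensus character following the rules described above.
def call_position(base_counts, mincov=1, dominant_frq_thrsh=0.8):
    # base_counts: per-base coverage at this position, e.g. {'A': 12, 'C': 1, 'G': 0, 'T': 0}
    depth = sum(base_counts.values())
    if depth < mincov:
        return '-'  # not enough (high-quality) coverage
    dominant = max(sorted(base_counts), key=base_counts.get)
    if base_counts[dominant] / depth < dominant_frq_thrsh:
        return '-'  # ambiguous ("problematic") position, masked
    return dominant

print(call_position({'A': 9, 'C': 1, 'G': 0, 'T': 0}))  # A (dominance 0.9 >= 0.8)
print(call_position({'A': 6, 'C': 4, 'G': 0, 'T': 0}))  # - (dominance 0.6 < 0.8)
```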
356 | 357 | 358 | 359 | ``` 360 | consensus.py ~/tmp.bam.sorted -c EF401177.1.1491,EF405039.1.1494 --mincov 1 --dominant_frq_thrsh 0.5 361 | >EF401177.1.1491_consensus 362 | ------------------------------------------------------------ 363 | ------------------------------------------------------------ 364 | ------------------------------------------------------------ 365 | ------------------------------------------------------------ 366 | ------------------------------------------------------------ 367 | ------------------------------------------------------------ 368 | ------------------------------------------------------------ 369 | ------------------------------------------------------------ 370 | -----------------------------------TACGTAGGGGGCAAGCGTTATCCGG 371 | ATTTACTGGGTGTAAAGGGAGCGTAGACGGCGAGACAAGTCTGAAGTGAAAGCCCGGGGC 372 | TCAACCCCGGGACTGCTTTGGAAACTGCCTTGCTAGAGTGCTGGAGAGGTAAGTGGAATT 373 | CCTAGT------------------------------------------------------ 374 | ------------------------------------------------------------ 375 | ------------------------------------------------------------ 376 | ------------------------------------------------------------ 377 | ------------------------------------------------------------ 378 | ------------------------------------------------------------ 379 | ------------------------------------------------------------ 380 | ------------------------------------------------------------ 381 | ------------------------------------------------------------ 382 | ------------------------------------------------------------ 383 | ------------------------------------------------------------ 384 | ------------------------------------------------------------ 385 | ------------------------------------------------------------ 386 | --------------------------------------------------- 387 | >EF405039.1.1494_consensus 388 | ------------------------------------------------------------ 389 | ------------------------------------------------------------ 390 | ------------------------------------------------------------ 391 | ------------------------------------------------------------ 392 | ------------------------------------------------------------ 393 | ------------------------------------------------------------ 394 | ------------------------------------------------------------ 395 | ------------------------------------------------------------ 396 | -------------------------------------TACGTAGGTGGCAAGCGTTATCC 397 | GGATTTACTGGGTGTAAAGGGCGTGCAGCCGGGTCTGCAAGTCAGATGTGAAATCCATGG 398 | GCTCAACCCATGAACTGCATTTGAAACTGTAGATCTTGAGTGTCGGAGGGGCAATCGGAA 399 | TTCCTAGT---------------------------------------------------- 400 | ------------------------------------------------------------ 401 | ------------------------------------------------------------ 402 | ------------------------------------------------------------ 403 | ------------------------------------------------------------ 404 | ------------------------------------------------------------ 405 | ------------------------------------------------------------ 406 | ------------------------------------------------------------ 407 | ------------------------------------------------------------ 408 | ------------------------------------------------------------ 409 | ------------------------------------------------------------ 410 | ------------------------------------------------------------ 411 | ------------------------------------------------------------ 412 | ------------------------------------------------------ 413 | ``` 414 | 
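Since non-reconstructed positions are padded with dashes, the fraction of each reference that was actually called can be computed from the FASTA output. A hypothetical post-processing snippet (not part of CMSeq), assuming the consensus above was redirected to `consensus.fasta`:

```
from Bio import SeqIO

for record in SeqIO.parse("consensus.fasta", "fasta"):
    called = sum(1 for base in record.seq if base != '-')
    print(record.id, called / len(record.seq))
```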
415 |
416 | ### Examples ###
417 |
418 |
419 | Extract the consensus of all the references from a sorted and indexed BAM file, in FASTA format:
420 |
421 |
422 | ```
423 | consensus.py mybam.sorted.bam
424 | ```
425 |
426 | Extract the consensus of all the references from an unsorted BAM file, in FASTA format:
427 |
428 |
429 | ```
430 | consensus.py --sortindex mybam.unsorted.bam
431 | ```
432 |
433 | Extract the consensus of genome_1 and genome_2 from a BAM file. Positions with coverage lower than 5 are ignored (- is reported instead of the base call):
434 |
435 |
436 | ```
437 | consensus.py --mincov 5 -c genome_1,genome_2 mybam.sorted.bam
438 | ```
439 |
440 | Extract the consensus of genome_1 and genome_2 from a BAM file. Positions covered by fewer than 5 "high quality" bases are ignored (- is reported instead of the base call). Additionally, positions where the dominant base accounts for less than 50% of the reads will be substituted by a "-":
441 |
442 |
443 | ```
444 | consensus.py --mincov 5 --minqual 30 -c genome_1,genome_2 --dominant_frq_thrsh 0.5 mybam.sorted.bam
445 | ```
446 |
447 | Same as above, but a FASTA file is used to filter references instead:
448 |
449 |
450 | ```
451 | consensus.py --mincov 5 --minqual 30 -c FILTER_FASTA.fasta --dominant_frq_thrsh 0.5 mybam.sorted.bam
452 | ```
453 |
454 | ### Ancient DNA consensus
455 |
456 | Extract the consensus of a genome from a BAM file in an ancient-metagenomics setting. Positions with coverage lower than 5 and damage probability (Stats_out_MCMC_correct_prob.csv from mapDamage2) higher than 0.95 are ignored.
457 |
458 |
459 | ```
460 | consensus_aDNA.py --mincov 5 -r reference.fna --pos_specific_prob_tab Stats_out_MCMC_correct_prob.csv --pos_damage_prob_thrsh 0.95 mybam.sorted.bam
461 | ```
462 |
--------------------------------------------------------------------------------
/README_class.md:
--------------------------------------------------------------------------------
1 | # CMSeq #
2 |
3 |
4 | * Provides an interface for .bam files
5 | * reference free consensus
6 | * Breadth and Depth of coverage
7 |
8 | Requires samtools (> 1.x), numpy, pysam, matplotlib and seaborn
9 |
10 | ## Use as Python Module ##
11 |
12 | ### class BamFile ###
13 |
14 | Represents the collection of contigs/references of a bam file
15 |
16 | To create a new BamFile from an unsorted BAM file:
17 | ```
18 | #!python
19 | collection = cmseq.BamFile(BAM_FILE_PATH,sort=True,index=True,minlen=0)
20 | ```
21 |
22 | An optional argument ``filterInputList`` can be passed to BamFile, to filter only some references. ``filterInputList`` can be:
23 | * a string of comma-separated IDs
24 | * the path to a FASTA file (its sequence IDs are used as the filter list)
25 |
26 | To start from a pre-sorted and indexed bam file:
27 | ```
28 | #!python
29 | collection = cmseq.BamFile(BAM_FILE_PATH)
30 | ```
31 |
32 | To set the pysam stepper to a custom value (e.g.
`all`, which avoids secondary alignments, or `nofilter`, which includes secondary alignments):
33 | ```
34 | #!python
35 | # Choose a custom stepper for all the contigs of the BAMFILE
36 | collection = cmseq.BamFile(BAM_FILE_PATH,stepper='all')
37 | ```
38 |
39 | To take into account only references (/contigs) longer than N, use `minlen`:
40 | ```
41 | #!python
42 | # Build the collection only on contigs / references longer than 5000
43 | collection = cmseq.BamFile(BAM_FILE_PATH,minlen=5000)
44 | ```
45 |
46 | ### class BamContig ###
47 |
48 | Represents a reference that reads map against
49 |
50 | To create a new BamContig:
51 | *Note*: this is NOT needed if a BamFile instance has been created before, as this is done automatically for each contig within the bamfile
52 |
53 | ```
54 | #!python
55 | contig = cmseq.BamContig(bamHandle,contigName,contigLength)
56 | ```
57 |
58 | * bamHandle: a pysam AlignmentFile instance, pointing to the original bam file (sorted and indexed)
59 | * contigName: the name of the contig/reference in the bam file
60 | * contigLength: the length of that contig/reference
61 |
62 | **Reference Free Consensus**
63 |
64 | reference_free_consensus(): returns a string, as long as the reference, with the consensus.
65 |
66 | The function can use the optional parameters:
67 |
68 | * `minqual`: the consensus will be based only on those nucleotides with a base quality of at least `minqual`. **Default: 30** (`CMSEQ_DEFAULTS.minqual`)
69 | * `mincov`: the consensus will be based only on those positions with at least MINCOV coverage (after the quality filtering of `minqual`). **Default: 1**, meaning everything is used.
70 |
71 | * `consensus_rule`: a custom consensus function that takes as input a python dictionary.
72 | The function is applied to each column of the samtools pileup.
73 | The dictionary has this structure: {'A':0,'T':0,'C':0,'G':0,'N':0} and stores the counts (coverages) of each nucleotide at that position ("N" = anything else). The function must return a char.
74 | The default function is: `lambda array: max(array, key=array.get)` (pure majority rule).
75 | The function is applied only to positions that meet the requirements of `minqual` and `mincov`. Other positions are reported as "-"
76 |
77 | * `trimReads`: a tuple specifying the range of each read to be skipped when computing the consensus. If set to (10,10), the first and last 10 positions of each read will not be used to compute the consensus. Default is None, which means nothing will be trimmed.
78 | Examples
79 | ```
80 | #!python
81 | # Get the simplest majority rule (default) consensus of REFERENCE_NAME:
82 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus())
83 |
84 | # Get the simplest majority rule (default) consensus of REFERENCE_NAME considering positions covered by at least 5 reads with qualities higher than 33:
85 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(mincov=5,minqual=33))
86 |
87 | # Use a custom consensus rule: return X for each position
88 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(consensus_rule=lambda array: 'X'))
89 | ```
90 |
91 | **Depth of Coverage**
92 |
93 | BamContig.**depth_of_coverage()**: returns a tuple, with the (mean_coverage,median_coverage) values, calculated over the positions that have a coverage of at least 1 (at least one mapping read on that position).
Optionally, can take:
94 |
95 | * `minqual`: the nucleotides considered are only those that have a quality score of at least `minqual`. **Default: 30**
96 | * `mincov`: the depth is based only on those positions with at least MINCOV coverage (after the quality filtering of `minqual`). **Default: 10**
97 |
98 | **Breadth of Coverage**
99 |
100 | BamContig.**breadth_of_coverage**: returns a float, with the percentage of the total reference length covered by reads. It takes the same optional parameters `mincov` and `minqual` as *depth_of_coverage*.
101 |
102 | **Polymorphic Rate**
103 |
104 | BamContig.**polymorphism_rate**: returns a DataFrame, with the statistics of polymorphic positions, over the total number of reconstructable positions. It takes the same optional parameters `mincov` and `minqual` as *depth_of_coverage*.
105 |
106 | **Set the Pysam stepper**
107 |
108 | BamContig.**set_stepper(VALUE)**: resets the pysam stepper for the reference. VALUE can be `all` or `nofilter`, as per the pysam specifications. By default the stepper is set to 'nofilter' (bedtools style).
109 |
110 | ### Examples ###
111 |
112 | Create a new instance of a BamFile. An unsorted, unindexed bam file can be provided and will be sorted and indexed within the module:
113 |
114 | ```
115 | #!python
116 | import cmseq
117 | collection = cmseq.BamFile(BAM_FILE_PATH,sort=True,index=True)
118 | ```
119 |
120 | Iterate over each contig represented in the BAM/SAM file:
121 |
122 | ```
123 | #!python
124 | for i in collection.get_contigs():
125 |     print(i,collection.get_contig_by_label(i).reference_free_consensus())
126 |     print(collection.get_contig_by_label(i).depth_of_coverage()) #(mean,median)
127 |     print(collection.get_contig_by_label(i).breadth_of_coverage())
128 | ```
129 | Select a custom contig and get its consensus sequence by majority rule:
130 | ```
131 | #!python
132 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus())
133 | ```
134 |
135 | Select a custom contig and plot its coverage:
136 | ```
137 | #!python
138 | collection.get_contig_by_label("REFERENCE_NAME").plot_coverage('out.pdf')
139 | ```
140 |
141 | Select a custom contig and get its consensus sequence by majority rule, only for positions covered by at least 10 high quality reads:
142 |
143 | ```
144 | #!python
145 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(mincov=10,minqual=33))
146 | ```
147 |
148 | Select a custom contig and get a custom consensus sequence, with "+" where coverage is higher than or equal to 2, "-" otherwise:
149 |
150 | ```
151 | #!python
152 | print(collection.get_contig_by_label("REFERENCE_NAME").reference_free_consensus(consensus_rule=lambda array: '+' if sum(array.values()) >= 2 else '-'))
153 | ```
154 |
155 | Do the same as before, without using the BamFile class, but with pysam only. The bam file needs to be sorted and indexed!
156 |
157 | ```
158 | #!python
159 | import pysam,cmseq
160 | bamHandle = pysam.AlignmentFile(BAM_PATH, "rb")
161 | lengths = dict((r,l) for r,l in zip(bamHandle.references,bamHandle.lengths))
162 | contig = cmseq.BamContig(bamHandle,TARGET_CONTIG,lengths[TARGET_CONTIG])
163 |
164 | print(contig.reference_free_consensus(consensus_rule=lambda array: '+' if sum(array.values()) >= 2 else '-'))
165 |
166 | ```
--------------------------------------------------------------------------------
/cmseq/__init__.py:
--------------------------------------------------------------------------------
1 | from cmseq.cmseq import CMSEQ_DEFAULTS
2 | from cmseq.cmseq import BamFile
3 | from cmseq.cmseq import BamContig
4 |
5 | __version__ = '1.0.4'
--------------------------------------------------------------------------------
/cmseq/breadth_depth.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | from cmseq.cmseq import CMSEQ_DEFAULTS
4 | from cmseq.cmseq import BamFile
5 | from cmseq import __version__
6 |
7 | import pandas as pd
8 | import numpy as np
9 | import argparse
10 |
11 |
12 |
13 | def bd_from_file():
14 |     parser = argparse.ArgumentParser(description="calculate the Breadth and Depth of coverage of BAMFILE.")
15 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
16 |
17 |     parser.add_argument('BAMFILE', help='The file on which to operate')
18 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
19 |     parser.add_argument('-f', help='If set unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) are excluded. If unset ALL reads are considered (bedtools genomecov style). Default: unset',action='store_true')
20 |     parser.add_argument('--sortindex', help='Sort and index the file',action='store_true')
21 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered',default=CMSEQ_DEFAULTS.minlen, type=int)
22 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: 30', type=int, default=CMSEQ_DEFAULTS.minqual)
23 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual.
Default: 1', type=int, default=CMSEQ_DEFAULTS.mincov) 24 | parser.add_argument('--truncate', help='Number of nucleotides that are truncated at either contigs end before calculating coverage values.', type=float, default=0) 25 | parser.add_argument('--combine', help='Combine all contigs into one giant contig and report it at the end', action='store_true') 26 | 27 | #print vars(args) 28 | args = parser.parse_args() 29 | si = True if args.sortindex else False 30 | mode = 'all' if args.f else 'nofilter' 31 | 32 | bf = BamFile(args.BAMFILE,sort=si,index=si,stepper=mode,minlen=args.minlen,filterInputList=args.contig,minimumReadsAligning=args.mincov) 33 | 34 | print('Contig\tBreadth\tDepth avg\tDepth median') 35 | 36 | all_coverage_values = [] 37 | for i in bf.get_contigs_obj(): 38 | bd_result = i.breadth_and_depth_of_coverage(minqual=args.minqual,mincov=args.mincov,trunc=args.truncate) 39 | 40 | if not all(np.isnan(x) for x in [bd_result[0],bd_result[1],bd_result[2]]): 41 | print (i.name+'\t'+str(bd_result[0])+'\t'+str(bd_result[1])+'\t'+str(bd_result[2])) 42 | 43 | if args.combine: 44 | all_coverage_values.extend(bd_result[3]) 45 | 46 | if args.combine: 47 | if np.all(np.isnan(all_coverage_values)): 48 | print ("all_contigs"+'\t-\t'+str("NaN")+'\t'+str("NaN")) 49 | else: 50 | print ("all_contigs"+'\t-\t'+str(np.nanmean(all_coverage_values)) + '\t'+str(np.nanmedian(all_coverage_values))) 51 | 52 | 53 | if __name__ == "__main__": 54 | bd_from_file() -------------------------------------------------------------------------------- /cmseq/cmseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | import os 4 | import pysam 5 | import numpy as np 6 | import math 7 | import sys 8 | from scipy import stats 9 | from collections import defaultdict 10 | import pickle,os 11 | 12 | def _initt(terminating_,_consensus_bamFile,_consensus_args): 13 | global terminating 14 | global consensus_args 15 | global consensus_bamFile 16 | terminating = terminating_ 17 | consensus_args = _consensus_args 18 | consensus_bamFile = _consensus_bamFile 19 | 20 | 21 | class CMSEQ_DEFAULTS: 22 | minqual = 30 23 | mincov = 1 24 | minlen = 0 25 | poly_error_rate = 0.001 26 | poly_pvalue_threshold = 0.01 27 | poly_dominant_frq_thrsh = 0.8 28 | trimReads = None 29 | 30 | 31 | class BamFile: 32 | bam_handle = None 33 | bamFile = None 34 | contigs = {} 35 | 36 | def __init__(self,bamFile,sort=False,index=False,stepper='nofilter',minlen=CMSEQ_DEFAULTS.minlen,filterInputList=None,minimumReadsAligning=None): 37 | if not os.path.isfile(bamFile): 38 | raise Exception(bamFile+' is not accessible, or is not a file') 39 | 40 | if sort: 41 | import subprocess 42 | fp = bamFile+'.sorted' 43 | subprocess.call(['samtools','sort',bamFile,'-o',bamFile+'.sorted']) 44 | else: fp = bamFile 45 | 46 | if index: pysam.index(fp) 47 | 48 | self.bamFile = fp 49 | 50 | bamHandle = pysam.AlignmentFile(fp, "rb") 51 | 52 | self.bam_handle = bamHandle 53 | 54 | if filterInputList is not None: 55 | 56 | toList=[] 57 | if isinstance(filterInputList, list): 58 | toList = filterInputList 59 | 60 | elif os.path.isfile(filterInputList): 61 | from Bio import SeqIO 62 | 63 | with open(filterInputList, "r") as infile: 64 | 65 | for record in SeqIO.parse(infile, "fasta"): 66 | 67 | toList.append(record.id) 68 | else: 69 | toList = [element for element in filterInputList.split(',')] 70 | 71 | if minimumReadsAligning: 72 | self.contigs = 
dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning)) 73 | else: 74 | self.contigs = dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList)) 75 | 76 | else: 77 | if minimumReadsAligning: 78 | self.contigs = dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning)) 79 | else: 80 | self.contigs = dict((r,BamContig(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen)) 81 | 82 | def get_contigs(self): return iter(self.contigs.keys()) 83 | def get_contigs_obj(self): return iter(self.contigs.values()) 84 | def get_contig_by_label(self,contigID): return (self.contigs[contigID] if contigID in self.contigs else None) 85 | 86 | def parse_gff(self, inputGFF): 87 | ''' 88 | get a list of contigs plus 0-indexed gene-coordinates and sense-ness of protein coding regions from a gff file. 89 | Only tested with prokka GFF files. 90 | ''' 91 | from BCBio import GFF 92 | import Bio 93 | import re 94 | import warnings 95 | 96 | def rev_comp(string): 97 | string = string.upper() 98 | complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N' : 'N'} 99 | bases = list(string) 100 | bases = [complement[base] for base in bases] 101 | bases.reverse() 102 | return ''.join(bases) 103 | 104 | try: 105 | with open(inputGFF) as in_handle: 106 | _ = next(GFF.parse(in_handle)) 107 | except: 108 | print ('Parsing of GFF failed. This is probably because your biopython version is too new. Try downgrading to 1.76 or older') 109 | sys.exit(1) 110 | 111 | with open(inputGFF) as in_handle: 112 | 113 | for rec in GFF.parse(in_handle): 114 | tmp = [] 115 | for r in rec.features: 116 | if "minced" in r.qualifiers['source'][0] or "Minced" in r.qualifiers['source'][0]: 117 | # This catches CRISPR repeats. 118 | continue 119 | if r.sub_features: 120 | prodigal_bool = 'Prodigal' in r.sub_features[0].qualifiers['source'][0] or 'prodigal' in r.sub_features[0].qualifiers['source'][0] 121 | else: 122 | prodigal_bool = 'Prodigal' in r.qualifiers['source'][0] or 'prodigal' in r.qualifiers['source'][0] 123 | 124 | if prodigal_bool: 125 | # Prokka not only finds protein sequences, but also t-/r-RNA sequences. In order to only parse protein coding sequences, 126 | # I search for Prodigal/Prodigal in the source entry of the sub_features attribute. 127 | 128 | # the sub_features attribute of a seq_record object is apparently deprecated. I couldn't find any other way to access 129 | # the required information, though. Should probably be fixed when I can. 130 | indices = str(r.location).split('[')[1].split(']')[0].split(':') 131 | indices = [int(x) for x in indices] 132 | sense = str(r.location).split('(')[1].split(')')[0] 133 | if sense == "-": 134 | gene_seq = rev_comp(rec.seq[indices[0]:indices[1]]) 135 | else: 136 | gene_seq = rec.seq[indices[0]:indices[1]] 137 | 138 | if (str(gene_seq[0:3]) == "ATG" or str(gene_seq[0:3]) == "GTG" or str(gene_seq[0:3]) == "TTG"): 139 | pass 140 | else: 141 | warnings.warn(str(r.id) + " doesn't start with a common start codon. Beware. 
Continuing.") 142 | 143 | if (str(gene_seq[-3:]) == "TAG" or str(gene_seq[-3:]) == "TAA" or str(gene_seq[-3:]) == "TGA"): 144 | pass 145 | else: 146 | warnings.warn(str(r.id) + " doesn't stop with a usual stop codon. Beware. Continuing.") 147 | tmp.append((indices, sense)) 148 | 149 | if str(rec.id) in self.contigs: 150 | self.contigs[str(rec.id)].annotations.append(tmp) 151 | else: 152 | warnings.warn(str(rec.id) + " is not tracked by the BAMFile.") 153 | 154 | 155 | 156 | 157 | def parallel_reference_free_consensus(self,ncores=4,**kwargs): 158 | import multiprocessing as mp 159 | 160 | terminating = mp.Event() 161 | 162 | with mp.Pool(initializer=_initt, initargs=(terminating,self.bamFile,kwargs),processes=ncores) as pool: 163 | res= [x for x in pool.imap_unordered(BamFile._parallel_consensus_worker, self.contigs.keys())] 164 | return res 165 | 166 | @staticmethod 167 | def _parallel_consensus_worker(contigName): 168 | 169 | if not terminating.is_set(): 170 | try: 171 | t=BamFile(consensus_bamFile,filterInputList=[contigName]) 172 | return (contigName,t.get_contig_by_label(contigName).reference_free_consensus(**consensus_args)) 173 | except Exception as e: 174 | terminating.set() 175 | raise 176 | else: 177 | terminating.set() 178 | 179 | class BamContig: 180 | 181 | coverage = None 182 | consensus = '' 183 | name = None 184 | length = None 185 | stepper = 'nofilter' 186 | annotations = None 187 | 188 | def __init__(self,bamHandle,contigName,contigLength,stepper='nofilter'): 189 | 190 | self.name = contigName 191 | self.length = contigLength 192 | self.bam_handle = bamHandle 193 | self.stepper=stepper 194 | self.annotations = [] 195 | 196 | 197 | def set_stepper(self,ns): 198 | if ns in ['all','nofilter']: self.stepper = ns 199 | 200 | 201 | def majority_rule(data_array): 202 | freq_array= data_array['base_freq'] 203 | 204 | 205 | if any([v>0 for v in freq_array.values()]): 206 | return max(sorted(freq_array), key=freq_array.get) 207 | else: 208 | return 'N' 209 | 210 | def majority_rule_polymorphicLoci(data_array): 211 | 212 | # Masks the consensus sequence with "*" when a polymorphic locus is found according 213 | # to dominant_frq_thrsh defined p-value 214 | 215 | freq_array= data_array['base_freq'] 216 | poly_pvalue= data_array['p'] 217 | 218 | if poly_pvalue <= 0.05: 219 | return "*" 220 | elif any([v>0 for k,v in freq_array.items() if k != 'N']): 221 | return max(sorted(freq_array), key=freq_array.get) 222 | else: 223 | return 'N' 224 | 225 | def reference_free_consensus(self,consensus_rule=majority_rule,mincov=CMSEQ_DEFAULTS.mincov,minqual=CMSEQ_DEFAULTS.minqual,dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,noneCharacter='-',BAM_tagFilter=None, trimReads=None): 226 | 227 | consensus_positions = {} 228 | 229 | #print("A",mincov,minqual,dominant_frq_thrsh) 230 | for pileupcolumn,position_data in self.get_base_stats(min_read_depth=mincov, min_base_quality=minqual,dominant_frq_thrsh=dominant_frq_thrsh,BAM_tagFilter=BAM_tagFilter,trimReads=trimReads,error_rate=CMSEQ_DEFAULTS.poly_error_rate).items(): 231 | consensus_positions[pileupcolumn] = consensus_rule(position_data) 232 | 233 | if len(consensus_positions) > 0 : 234 | self.consensus = ''.join([(consensus_positions[position] if position in consensus_positions else noneCharacter) for position in range(1,self.length+1)]) 235 | else: 236 | self.consensus = noneCharacter*self.length 237 | 238 | #del consensus_positions 239 | 240 | return self.consensus 241 | 242 | 243 | 244 | def 
baseline_PSR(self,mincov=10,minqual=30,pvalue=0.01,error_rate=0.001,dominant_frq_thrsh=0.8,binom=None):
245 |         # This function estimates the polymorphic site rate over the input contig assuming that there are no truly polymorphic sites
246 |         # (meaning that all observed polymorphisms are due to random sequencing error). The test also puts a threshold on the "dominance"
247 |         # of the allele, meaning that it only reports a polymorphic base if the binomial test indicates significance AND the position is NOT sufficiently
248 |         # dominated by the dominant base. Defaults to 0.8 dominance (dominant / all).
249 |         from scipy import stats
250 |
251 |         polymorphic_empirical_loci = 0
252 |
253 |         # Get coverage values of the contig
254 |         depthsList = self.get_all_base_values('base_cov', min_base_quality=minqual,min_read_depth=mincov)
255 |
256 |         # Also get the dominant allele frequency of the contig
257 |         dominantFreq = self.get_all_base_values('ratio_max2all', min_base_quality=minqual,min_read_depth=mincov)
258 |
259 |         # For each position, draw depth-times from a bernoulli with success rate 1-error_rate.
260 |         # Determine significance based on a binomial test, as in the regular test for polymorphism.
261 |         for depth, da_freq in zip(depthsList, dominantFreq):
262 |             base_max = sum(stats.bernoulli.rvs(1-error_rate, size=depth))
263 |             if binom and base_max in binom and depth in binom[base_max]:
264 |                 p = binom[base_max][depth]
265 |             else:
266 |                 p = stats.binom.cdf(base_max, depth, 1.0-error_rate)
267 |             if p < pvalue and da_freq < dominant_frq_thrsh:
268 |                 polymorphic_empirical_loci+=1
269 |         PSR = float(polymorphic_empirical_loci) / float(len(depthsList))
270 |         return PSR
271 |
272 |     def get_base_stats_for_poly(self,minqual=CMSEQ_DEFAULTS.minqual):
273 |
274 |         from scipy import stats
275 |         import numpy
276 |         from collections import defaultdict
277 |         import pickle,os
278 |         from itertools import chain
279 |         import sys
280 |         import pandas as pd
281 |
282 |         ATCG=('A','C','G','T')
283 |         rev_dict={'A':'T', 'T':'A', 'G':'C', 'C':'G'}
284 |         def rev_pos(cur_pos, gene_start, gene_end):
285 |             # This function mirrors nucleotide positions in a gene on the gene's 'mid-part'
286 |             # (so nucleotide 1 in a gene is mapped to the last nucleotide in the gene, nucleotide 2 is mapped to the one before the last nucleotide on the gene, and so forth)
287 |             # The gene_length variable is misnamed. It should be called gene_length_minus_one..
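            # Worked example of the mirroring below: for a gene spanning [100, 110),
            # gene_length = (110 - 1) - 100 = 9, so cur_pos 100 maps to 109,
            # 101 maps to 108, ..., and 109 maps back to 100.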
288 | gene_length = ((gene_end - 1) - gene_start) 289 | distance_from_start = cur_pos - gene_start 290 | return(cur_pos + (gene_length - 2 * distance_from_start)) 291 | 292 | if not self.annotations: 293 | base_stats = [None] * self.length 294 | for base_pileup in self.bam_handle.pileup(self.name,stepper=self.stepper): 295 | 296 | base_freq = {'A':0,'C':0,'G':0,'T':0,'N':0} 297 | for matched_read in base_pileup.pileups: 298 | 299 | if not matched_read.is_del and not matched_read.is_refskip: 300 | b = matched_read.alignment.query_sequence[matched_read.query_position].upper() 301 | q = matched_read.alignment.query_qualities[matched_read.query_position] 302 | 303 | #print self.name,matched_read.query_position, b,q 304 | 305 | if q >= minqual and b in ATCG: base_freq[b] += 1 306 | #else: print "Q",q,"B",b 307 | #print "Filling",base_pileup.pos,"with", base_freq 308 | if sum(base_freq.values()) > 0: 309 | base_stats[base_pileup.pos] = ((base_freq['A'],base_freq['C'],base_freq['G'],base_freq['T']),base_pileup.pos) 310 | else: 311 | base_stats = [] 312 | # Generate pileups gene-wise 313 | # I use the 'truncate' parameter to only obtain the parsed start and stop positions. Without truncate, all positions with reads covering the parsed positions are returned. 314 | # I wrote a function that reverses a given gene position, which is used to effectively revert genes on the anti-sense strand. 315 | # Furthermore, for each read's nucleotide over a given position I write out the complement 316 | genes_and_positions = dict() 317 | for gene_idx in range(0, len(self.annotations[0])): 318 | genes_and_positions[gene_idx] = self.annotations[0][gene_idx] 319 | 320 | for gene_idx in genes_and_positions: 321 | gene_stats = [None] * (genes_and_positions[gene_idx][0][1] - genes_and_positions[gene_idx][0][0]) 322 | pos_on_gene = 0 323 | bam_pileup = self.bam_handle.pileup(self.name, int(genes_and_positions[gene_idx][0][0]), int(genes_and_positions[gene_idx][0][1]), stepper=self.stepper, truncate = True) 324 | if genes_and_positions[gene_idx][1] == "+": 325 | # If the gene is on the sense-strand, do the same as before. 326 | for base_pileup in bam_pileup: 327 | base_freq = {'A':0,'C':0,'G':0,'T':0,'N':0} 328 | for matched_read in base_pileup.pileups: 329 | if not matched_read.is_del and not matched_read.is_refskip: 330 | b = matched_read.alignment.query_sequence[matched_read.query_position].upper() 331 | q = matched_read.alignment.query_qualities[matched_read.query_position] 332 | if q >= minqual and b in ATCG: base_freq[b] += 1 333 | if sum(base_freq.values()) > 0: 334 | gene_stats[pos_on_gene] = ((base_freq['A'],base_freq['C'],base_freq['G'],base_freq['T']), base_pileup.pos) 335 | pos_on_gene += 1 336 | base_stats.extend(gene_stats) 337 | else: 338 | # If the gene is on the anti-sense strand, effectively return the reverse complement by mapping positions on a gene to it's mirrored position (using rev_pos) 339 | # and then also converting each nucleotide to it's complement. 340 | for base_pileup in bam_pileup: 341 | base_freq = {'A':0,'C':0,'G':0,'T':0,'N':0} 342 | for matched_read in base_pileup.pileups: 343 | if not matched_read.is_del and not matched_read.is_refskip: 344 | b = matched_read.alignment.query_sequence[matched_read.query_position].upper() 345 | q = matched_read.alignment.query_qualities[matched_read.query_position] 346 | # We have to increment the COMPLEMENT of each base when gene calls are on the reverse strand. 
347 | if q >= minqual and b in ATCG: base_freq[rev_dict[b]] += 1 348 | if sum(base_freq.values()) > 0: 349 | out_pos = rev_pos(cur_pos = int(pos_on_gene), gene_start = 0, gene_end = len(gene_stats)) 350 | contig_pos = rev_pos(cur_pos = int(base_pileup.pos), gene_start = genes_and_positions[gene_idx][0][0], gene_end = genes_and_positions[gene_idx][0][1]) 351 | gene_stats[out_pos] = ((base_freq['A'],base_freq['C'],base_freq['G'],base_freq['T']), contig_pos) 352 | pos_on_gene += 1 353 | if len(gene_stats) % 3 != 0: 354 | print("One of your genes' length is not a multiple of three. Check your gff file / gene calls.") 355 | print("Contig name", self.name) 356 | print("Gene position", genes_and_positions[gene_idx]) 357 | sys.exit() 358 | base_stats.extend(gene_stats) 359 | 360 | return base_stats 361 | 362 | def easy_polymorphism_rate(self,mincov=CMSEQ_DEFAULTS.mincov,minqual=CMSEQ_DEFAULTS.minqual,dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh): 363 | 364 | from Bio.Seq import Seq 365 | #from Bio.Alphabet import IUPAC 366 | 367 | bases = self.get_base_stats_for_poly(minqual=minqual) 368 | 369 | #list N-long where N is the number of covered bases (N <= L(contig)) 370 | dominanceList = [] 371 | mutationStats={'DN':0,'DS':0,'D?':0} 372 | 373 | explainList=[] 374 | 375 | codon_f1 = [] 376 | codon_f2 = [] 377 | 378 | for positionData in bases: 379 | # positionData= ((A,C,G,T),position) if covered, None if not. 380 | bases = ['N'] 381 | 382 | if positionData: 383 | nuclAbundance,position = positionData 384 | base_sum=sum(nuclAbundance) 385 | base_max=float(max(nuclAbundance)) 386 | dominance = float(base_max) / float(base_sum) 387 | 388 | if base_sum > mincov: 389 | 390 | dominanceList.append(dominance) 391 | tmpDict = dict((k,v) for k,v in zip(['A','C','G','T'],nuclAbundance)) 392 | bases = [k for k,v in sorted(tmpDict.items(), key = lambda x: x[1], reverse=True) if v>0] 393 | else: 394 | dominanceList.append(np.nan) 395 | else: 396 | dominanceList.append(np.nan) 397 | 398 | first_base = bases[0] 399 | second_base = bases[1] if (len(bases) > 1 and dominance < dominant_frq_thrsh) else bases[0] 400 | 401 | codon_f1.append(first_base) 402 | codon_f2.append(second_base) 403 | 404 | if len(codon_f1) == 3 and len(codon_f2) == 3: 405 | 406 | codon_s1 = Seq(''.join(codon_f1)) 407 | codon_s2 = Seq(''.join(codon_f2)) 408 | codon_t1 = codon_s1.translate() 409 | codon_t2 = codon_s2.translate() 410 | 411 | positionLabel = positionData[1] if positionData else 'ND' 412 | RD=None 413 | if codon_t1 == "X" or codon_t2 == "X": 414 | mutationStats['D?'] +=1 415 | RD="D?" 
362 |     def easy_polymorphism_rate(self, mincov=CMSEQ_DEFAULTS.mincov, minqual=CMSEQ_DEFAULTS.minqual, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh):
363 |
364 |         from Bio.Seq import Seq
365 |         #from Bio.Alphabet import IUPAC
366 |
367 |         bases = self.get_base_stats_for_poly(minqual=minqual)
368 |
369 |         # list of length N, where N is the number of covered bases (N <= L(contig))
370 |         dominanceList = []
371 |         mutationStats = {'DN':0,'DS':0,'D?':0}
372 |
373 |         explainList = []
374 |
375 |         codon_f1 = []
376 |         codon_f2 = []
377 |
378 |         for positionData in bases:
379 |             # positionData = ((A,C,G,T), position) if covered, None if not.
380 |             bases = ['N']
381 |
382 |             if positionData:
383 |                 nuclAbundance, position = positionData
384 |                 base_sum = sum(nuclAbundance)
385 |                 base_max = float(max(nuclAbundance))
386 |                 dominance = float(base_max) / float(base_sum)
387 |
388 |                 if base_sum > mincov:
389 |
390 |                     dominanceList.append(dominance)
391 |                     tmpDict = dict((k,v) for k,v in zip(['A','C','G','T'], nuclAbundance))
392 |                     bases = [k for k,v in sorted(tmpDict.items(), key=lambda x: x[1], reverse=True) if v > 0]
393 |                 else:
394 |                     dominanceList.append(np.nan)
395 |             else:
396 |                 dominanceList.append(np.nan)
397 |
398 |             first_base = bases[0]
399 |             second_base = bases[1] if (len(bases) > 1 and dominance < dominant_frq_thrsh) else bases[0]
400 |
401 |             codon_f1.append(first_base)
402 |             codon_f2.append(second_base)
403 |
404 |             if len(codon_f1) == 3 and len(codon_f2) == 3:
405 |
406 |                 codon_s1 = Seq(''.join(codon_f1))
407 |                 codon_s2 = Seq(''.join(codon_f2))
408 |                 codon_t1 = codon_s1.translate()
409 |                 codon_t2 = codon_s2.translate()
410 |
411 |                 positionLabel = positionData[1] if positionData else 'ND'
412 |                 RD = None
413 |                 if codon_t1 == "X" or codon_t2 == "X":
414 |                     mutationStats['D?'] += 1
415 |                     RD = "D?"
416 |                 elif codon_t1 != codon_t2:
417 |                     mutationStats['DN'] += 1
418 |                     RD = "DN"
419 |                 elif (codon_t1 == codon_t2) and (codon_s1 != codon_s2):
420 |                     mutationStats['DS'] += 1
421 |                     RD = "DS"
422 |
423 |                 codon_f1 = []
424 |                 codon_f2 = []
425 |
426 |         return (dominanceList, mutationStats)
427 |
428 |
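# A minimal sketch (illustration only) of the codon classification above,
# assuming Biopython's standard translation table:
#
#   Seq("ATG").translate() -> "M",  Seq("ATA").translate() -> "I"
#     -> different amino acids from the two codons: counted as DN (non-synonymous).
#   Seq("ATT").translate() -> "I",  Seq("ATC").translate() -> "I"
#     -> same amino acid from different codons: counted as DS (synonymous).
#   Any codon containing an 'N' translates to "X" and is counted as D? (ambiguous).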
429 |     def polymorphism_rate(self, mincov=CMSEQ_DEFAULTS.mincov, minqual=CMSEQ_DEFAULTS.minqual, pvalue=CMSEQ_DEFAULTS.poly_pvalue_threshold, error_rate=CMSEQ_DEFAULTS.poly_error_rate, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh):
430 |
431 |         base_values = self.get_base_stats(min_read_depth=mincov, min_base_quality=minqual, error_rate=error_rate, dominant_frq_thrsh=dominant_frq_thrsh)
432 |
433 |
434 |         rv = {}
435 |         rv['total_covered_bases'] = len(base_values)
436 |         rv['total_polymorphic_bases'] = 0
437 |
438 |         if len(base_values) > 0:
439 |             pb = sum([(1 if (info['p'] < pvalue and info['ratio_max2all'] < dominant_frq_thrsh) else 0) for pox, info in base_values.items()])
440 |
441 |
442 |             rv['total_polymorphic_bases'] = pb
443 |             rv['total_polymorphic_rate'] = float(pb) / float(len(base_values))
444 |
445 |             # If we have at least one polymorphic site
446 |             if pb > 0:
447 |
448 |                 rv['ratios'] = [info['ratio_max2all'] for pox, info in base_values.items() if (info['p'] < pvalue and info['ratio_max2all'] < dominant_frq_thrsh)]
449 |                 rv['dominant_allele_distr_mean'] = np.mean(rv['ratios'])
450 |                 rv['dominant_allele_distr_sd'] = np.std(rv['ratios'])
451 |
452 |                 for i in [10,20,30,40,50,60,70,80,90,95,98,99]:
453 |                     rv['dominant_allele_distr_perc_'+str(i)] = np.percentile(rv['ratios'], i)
454 |
455 |         return rv
456 |
457 |
458 |     def breadth_and_depth_of_coverage(self, mincov=10, minqual=30, trunc=0):
459 |         coverage_positions = {}
460 |         if self.length > trunc*2:
461 |             # Check if the contig is long enough to be truncated.
462 |             consid_r = range(int(trunc), int(self.length - trunc))
463 |         else:
464 |             # If a contig is too short to be truncated, ignore the truncation.
465 |             # This is not nice and should be improved.
466 |             consid_r = range(0, int(self.length))
467 |
468 |         for pileupcolumn in self.bam_handle.pileup(self.name, stepper=self.stepper):
469 |             # for each position
470 |             if pileupcolumn.pos in consid_r:
471 |                 tCoverage = 0
472 |                 for pileupread in pileupcolumn.pileups:
473 |                     # for each base at the position
474 |                     if not pileupread.is_del and not pileupread.is_refskip and pileupread.alignment.query_qualities[pileupread.query_position] >= minqual and pileupread.alignment.query_sequence[pileupread.query_position].upper() in ('A','T','C','G'):
475 |                         tCoverage += 1
476 |
477 |                 if tCoverage >= mincov:
478 |                     coverage_positions[pileupcolumn.pos] = tCoverage
479 |
480 |
481 |         if len(coverage_positions.keys()) > 0:
482 |             breadth = float(len(coverage_positions.keys())) / len(consid_r)
483 |             vals = list(coverage_positions.values())
484 |             avgdepth = np.mean(vals)
485 |             mediandepth = np.median(vals)
486 |
487 |             return (breadth, avgdepth, mediandepth, coverage_positions.values())
488 |         else:
489 |             return (np.nan, np.nan, np.nan, [np.nan])
490 |
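# Worked example (hypothetical numbers): for a 1,000 bp contig with trunc=0,
# if 800 positions pass the mincov/minqual filters with depths averaging 12x
# (median 11x), the call below would return approximately:
#
#   contig.breadth_and_depth_of_coverage(mincov=10, minqual=30)
#   -> (0.8, 12.0, 11.0, dict_values([...]))
#
# breadth is the fraction of considered positions that are covered
# (800 / 1000), while depth is averaged only over those covered positions.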
491 |     def depth_of_coverage(self, mincov=10, minqual=30):
492 |         return self.breadth_and_depth_of_coverage(mincov,minqual)[1]
493 |         #coverage_positions = {}
494 |         #for pileupcolumn in self.bam_handle.pileup(self.name,stepper=self.stepper):
495 |         #    if pileupcolumn.n >= mincov: coverage_positions[pileupcolumn.pos] = len([1 for pileupread in pileupcolumn.pileups if not pileupread.is_del and not pileupread.is_refskip and pileupread.alignment.query_qualities[pileupread.query_position] >= args.minqual and pileupread.alignment.query_sequence[pileupread.query_position].upper() in ('A','T','C','G') ])
496 |         #
497 |         #return (np.mean(coverage_positions.values()),np.median(coverage_positions.values()))
498 |
499 |
500 |     def breadth_of_coverage(self, mincov=10, minqual=30):
501 |         return self.breadth_and_depth_of_coverage(mincov,minqual)[0]
502 |
503 |     #------------------------------------------------------------------------------
504 |
505 |
506 |     def get_base_stats(self, min_read_depth=CMSEQ_DEFAULTS.mincov, min_base_quality=CMSEQ_DEFAULTS.minqual, error_rate=CMSEQ_DEFAULTS.poly_error_rate, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh, BAM_tagFilter=None, trimReads=None):
507 |         '''
508 |         get base frequencies and quality stats,
509 |         to use in get_all_base_values() and other functions
510 |         '''
511 |
512 |
513 |         if trimReads:
514 |             mask_head_until = int(trimReads[0]) if (trimReads[0] is not None and trimReads[0] != '') else 0
515 |             mask_tail_before = int(trimReads[1]) if (trimReads[1] is not None and trimReads[1] != '') else 0
516 |
517 |         base_stats = defaultdict(dict)
518 |
519 |         ATCG = ('A','T','C','G')
520 |
521 |
522 |         # for each position (column)
523 |
524 |
525 |
526 |         for base_pileup in self.bam_handle.pileup(self.name, stepper=self.stepper, min_base_quality=min_base_quality):
527 |             base_freq = {'A':0,'T':0,'C':0,'G':0,'N':0}
528 |
529 |             pos = base_pileup.pos + 1  # 1-based
530 |
531 |
532 |             # for each read composing the pile
533 |             for matched_read in base_pileup.pileups:
534 |                 if not matched_read.is_del and not matched_read.is_refskip:
535 |
536 |                     b = matched_read.alignment.query_sequence[matched_read.query_position].upper()
537 |                     q = matched_read.alignment.query_qualities[matched_read.query_position]
538 |                     #print("I am get_base_stats and this is read", matched_read.alignment.query_name, " (L=",matched_read.alignment.query_length," ) at position", matched_read.query_position, "it's a ", b)
539 |
540 |                     thisPositionBase = 'N'
541 |
542 |                     if not trimReads or (trimReads and ((matched_read.query_position >= mask_head_until) and (matched_read.query_position <= (matched_read.alignment.query_length - mask_tail_before)))):
543 |                         if b in ATCG:
544 |                             if BAM_tagFilter is None or all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
545 |                                 thisPositionBase = b
546 |
547 |                     base_freq[thisPositionBase] += 1
548 |
549 |             # calculate quality stats, ignoring N's
550 |             base_sum = sum([base_freq[b] for b in ATCG])
551 |             base_max = float(max([base_freq[b] for b in ATCG]))
552 |
553 |             if base_sum >= min_read_depth:
554 |                 r = base_max / base_sum
555 |                 #print r, dominant_frq_thrsh
556 |                 if r < dominant_frq_thrsh:
557 |                     # it makes sense to calculate the p-value
558 |                     p = stats.binom.cdf(base_max, base_sum, 1.0 - error_rate)
559 |                 else:
560 |                     p = 1.0
561 |
562 |
563 |                 base_stats[pos]['p'] = p  # quality measure
564 |                 base_stats[pos]['ratio_max2all'] = r  # dominant base versus others
565 |                 base_stats[pos]['base_cov'] = base_sum  # number of reads covering the base, not counting N's
566 |                 base_stats[pos]['base_freq'] = base_freq  # dict: {'A':4,'T':1,'C':2,'G':0,'N':0}
567 |
568 |         return base_stats
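# Worked example of the binomial test above (hypothetical counts): with
# error_rate = 0.001 and a position covered by 18 As and 2 Ts,
#
#   p = stats.binom.cdf(18, 20, 1.0 - 0.001)   # ~= 1.9e-04
#
# i.e. observing at most 18 "correct" bases out of 20 is very unlikely under
# sequencing error alone, so the position is a candidate polymorphic site
# (it must then also have ratio_max2all below dominant_frq_thrsh).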
569 |
570 |
571 |     def get_all_base_values(self, stats_value, *f_args, **f_kwargs):
572 |         '''
573 |         get list of p values (or 'ratio_max2all' etc.) for all bases that pass argument thresholds
574 |         p_all = a.get_contig_by_label('CONTIGNAME').get_all_base_values('p', min_base_quality=30)
575 |         '''
576 |         base_stats = self.get_base_stats(*f_args, **f_kwargs)
577 |         return [base_stats[k].get(stats_value, 'NaN') for k in base_stats]
578 |
579 |
580 |
581 | def loc_gte(a, b):
582 |     return a >= b
583 |
584 | def loc_lte(a, b):
585 |     return a <= b
586 |
587 | def loc_gt(a, b):
588 |     return a > b
589 |
590 | def loc_lt(a, b):
591 |     return a < b
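# The loc_* helpers above are looked up by name (via globals()) when a
# BAM_tagFilter is applied in get_base_stats. A minimal usage sketch
# (hypothetical contig object and threshold): keep only reads whose NM tag
# (edit distance) is at most 2 at each pileup position:
#
#   contig.get_base_stats(BAM_tagFilter=[('NM', 'loc_lte', 2)])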
--------------------------------------------------------------------------------
/cmseq/consensus_aDNA.py:
--------------------------------------------------------------------------------
69 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning))
70 |             else:
71 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and r in toList))
72 |
73 |         else:
74 |             if minimumReadsAligning:
75 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen and bamHandle.count(contig=r,read_callback=stepper) >= minimumReadsAligning))
76 |             else:
77 |                 self.contigs = dict((r,BamContigAncient(self.bam_handle,r,l,stepper)) for r,l in zip(bamHandle.references,bamHandle.lengths) if (l > minlen))
78 |
79 |     def get_contigs_obj(self): return iter(self.contigs.values())
80 |
81 |
82 | class BamContigAncient(BamContig):
83 |     ## Get base stats code goes here
84 |
85 |     def reference_free_consensus(self, consensus_rule=BamContig.majority_rule, mincov=CMSEQ_DEFAULTS.mincov,
86 |                                  minqual=CMSEQ_DEFAULTS.minqual, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,
87 |                                  noneCharacter='-', BAM_tagFilter=None, trimReads=None, post_damage_prob=None,
88 |                                  pos_prob_db=CMSEQ_DEFAULTS_Ancient.position_specific_prob, refseq_idx=None):
89 |
90 |         consensus_positions = {}
91 |
92 |         for pileupcolumn, position_data in self.get_base_stats(min_read_depth=mincov, min_base_quality=minqual,
93 |                 dominant_frq_thrsh=dominant_frq_thrsh, BAM_tagFilter=BAM_tagFilter, trimReads=trimReads,
94 |                 post_damage_prob=post_damage_prob, pos_prob_db=pos_prob_db, refseq_idx=refseq_idx).items():
95 |             ref_base_idx = self.name + '__' + str(pileupcolumn)
96 |
97 |
98 |             if float(position_data['ratio_max2all']) >= float(dominant_frq_thrsh):
99 |                 consensus_positions[pileupcolumn] = consensus_rule(dict((k,v) for k,v in position_data['base_freq'].items() if k != 'N'))
100 |
101 |         if len(consensus_positions) > 0:
102 |             self.consensus = ''.join([(consensus_positions[position] if position in consensus_positions else noneCharacter) for position in range(1, self.length + 1)])
103 |         else:
104 |             self.consensus = noneCharacter * self.length
105 |
106 |         del consensus_positions
107 |         return self.consensus
108 |
109 |     def get_base_stats(self, min_read_depth=CMSEQ_DEFAULTS.mincov, min_base_quality=CMSEQ_DEFAULTS.minqual,
110 |                        error_rate=CMSEQ_DEFAULTS.poly_error_rate, dominant_frq_thrsh=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh,
111 |                        BAM_tagFilter=None, trimReads=None, post_damage_prob=CMSEQ_DEFAULTS_Ancient.position_specific_prob_thrsh,
112 |                        pos_prob_db=CMSEQ_DEFAULTS_Ancient.position_specific_prob, refseq_idx=None):
113 |
114 |         '''
115 |         get base frequencies and quality stats,
116 |         to use in get_all_base_values() and other functions
117 |         '''
118 |
119 |         from scipy import stats
120 |         from collections import defaultdict
121 |         import pickle, os
122 |
123 |
124 |         base_stats = defaultdict(dict)
125 |
126 |         ATCG = ('A','T','C','G')
127 |
128 |         # for each position (column)
129 |         for base_pileup in self.bam_handle.pileup(self.name, stepper=self.stepper):
130 |             base_freq = {'A':0,'T':0,'C':0,'G':0,'N':0}
131 |
132 |
133 |             pos = base_pileup.pos + 1  # 1-based
134 |
135 |             # for each read composing the pile
136 |
137 |             for matched_read in base_pileup.pileups:
138 |                 if not matched_read.is_del and not matched_read.is_refskip:
139 |
140 |
141 |                     b = matched_read.alignment.query_sequence[matched_read.query_position].upper()
142 |                     q = matched_read.alignment.query_qualities[matched_read.query_position]
143 |
144 |                     thisPositionBase = 'N'
145 |
146 |                     if post_damage_prob and pos_prob_db and refseq_idx:  # Enter position-specific mode
147 |                         ref_base_key = self.name + '__' + str(pos)
148 |                         ref_base = refseq_idx[ref_base_key]
149 |                         if matched_read.query_position <= 11:  # Check if the position is on the left (5') end of the read
150 |                             left_pos = matched_read.query_position + 1  # 1-based
151 |                             sub = ref_base + b
152 |                             if sub == 'CT' or sub == 'GA':  # Check if the reference-to-read substitution is C->T or G->A
153 |                                 prob = pos_prob_db[left_pos][sub]
154 |                                 # print(sub, prob, matched_read.alignment.query_length, matched_read.query_position, left_pos)
155 |                                 if (q >= min_base_quality) and (b in ATCG) and (prob <= post_damage_prob):
156 |                                     if BAM_tagFilter is None:
157 |                                         thisPositionBase = b
158 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
159 |                                         thisPositionBase = b
160 |                             else:
161 |                                 if (q >= min_base_quality) and (b in ATCG):
162 |                                     if BAM_tagFilter is None:
163 |                                         thisPositionBase = b
164 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
165 |                                         thisPositionBase = b
166 |
167 |                         elif (matched_read.alignment.query_length - matched_read.query_position) <= 11:  # Check if the position is on the right (3') end of the read
168 |                             right_pos = matched_read.query_position - matched_read.alignment.query_length
169 |                             sub = ref_base + b
170 |                             if sub == 'CT' or sub == 'GA':
171 |                                 prob = pos_prob_db[right_pos][sub]
172 |                                 # print(sub, prob, matched_read.alignment.query_length, matched_read.query_position, right_pos)
173 |                                 if (q >= min_base_quality) and (b in ATCG) and (prob <= post_damage_prob):
174 |                                     if BAM_tagFilter is None:
175 |                                         thisPositionBase = b
176 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
177 |                                         thisPositionBase = b
178 |                             else:
179 |                                 if (q >= min_base_quality) and (b in ATCG):
180 |                                     if BAM_tagFilter is None:
181 |                                         thisPositionBase = b
182 |                                     elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
183 |                                         thisPositionBase = b
184 |                         else:
185 |                             # print(sub, matched_read.alignment.query_length, matched_read.query_position, "X")
186 |                             if (q >= min_base_quality) and (b in ATCG):
187 |                                 if BAM_tagFilter is None:
188 |                                     thisPositionBase = b
189 |                                 elif BAM_tagFilter and all(globals()[func](matched_read.alignment.get_tag(tag), limitValue) for (tag, func, limitValue) in BAM_tagFilter):
190 |                                     thisPositionBase = b
191 |                     base_freq[thisPositionBase] += 1
192 |
193 |             # calculate quality stats, ignoring N's
194 |             base_sum = sum([base_freq[b] for b in ATCG])
195 |             base_max = float(max([base_freq[b] for b in ATCG]))
196 |
197 |             if base_sum >= min_read_depth:
198 |                 r = base_max / base_sum
199 |                 #print r, dominant_frq_thrsh
200 |                 if r < dominant_frq_thrsh:
201 |                     # it makes sense to calculate the p-value
202 |                     p = stats.binom.cdf(base_max, base_sum, 1.0 - error_rate)
203 |                 else:
204 |                     p = 1.0
205 |
206 |
207 |                 base_stats[pos]['p'] = p  # quality measure
208 |                 base_stats[pos]['ratio_max2all'] = r  # dominant base versus others
209 |                 base_stats[pos]['base_cov'] = base_sum  # number of reads covering the base, not counting N's
210 |                 base_stats[pos]['base_freq'] = base_freq  # dict: {'A':4,'T':1,'C':2,'G':0,'N':0}
211 |
212 |         return base_stats
213 |
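# A minimal sketch (hypothetical values) of the damage-aware filtering above.
# pos_prob_db maps a read position to the probability that a C->T (or G->A)
# mismatch at that position is post-mortem damage, e.g.:
#
#   pos_prob_db = {1: {'CT': 0.32, 'GA': 0.29}, 2: {'CT': 0.18, 'GA': 0.15}, ...}
#
# With post_damage_prob = 0.05, a C->T observed at read position 1 has
# prob = 0.32 > 0.05 and is therefore masked to 'N', while the same
# substitution in the middle of the read is kept (damage concentrates at
# read ends, which is why only the first/last 11 read positions are checked).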
214 | def consensus_from_file():
215 |
216 |     parser = argparse.ArgumentParser(description="outputs the consensus in FASTA format. Non-covered positions (and quality-trimmed positions) are reported as: N")
217 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
218 |
219 |     parser.add_argument('BAMFILE', help='The file on which to operate')
220 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
221 |     parser.add_argument('-f', help='If set, unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) reads are excluded. If unset, ALL reads are considered (bedtools genomecov style). Default: unset', action='store_true')
222 |     parser.add_argument('-r', '--refseq', help='Input the reference genome sequence', type=str)
223 |     parser.add_argument('--sortindex', help='Sort and index the file', action='store_true')
224 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: '+str(CMSEQ_DEFAULTS.minqual), type=int, default=CMSEQ_DEFAULTS.minqual)
225 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual. Default: '+str(CMSEQ_DEFAULTS.mincov), type=int, default=CMSEQ_DEFAULTS.mincov)
226 |     parser.add_argument('--dominant_frq_thrsh', help='Cutoff for degree of `allele dominance` for a position to be considered polymorphic. Default: '+str(CMSEQ_DEFAULTS.poly_dominant_frq_thrsh), type=float, default=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh)
227 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered. Default: '+str(CMSEQ_DEFAULTS.minlen), default=CMSEQ_DEFAULTS.minlen, type=int)
228 |     parser.add_argument('--pos_specific_prob_tab', help='Stats_out_MCMC_correct_prob table produced by mapDamage2. It contains the position-specific probability of observing a C->T or G->A due to post-mortem damage.', default=CMSEQ_DEFAULTS_Ancient.position_specific_prob, type=str)
229 |     parser.add_argument('--pos_damage_prob_thrsh', help='Maximum post-mortem damage probability for a nucleotide on a read to be considered when building the consensus.', default=CMSEQ_DEFAULTS_Ancient.position_specific_prob_thrsh, type=float)
230 |
231 |     args = parser.parse_args()
232 |
233 |     si = True if args.sortindex else False
234 |     mode = 'all' if args.f else 'nofilter'
235 |
236 |     bf = BamFileAncient(args.BAMFILE, sort=si, index=si, stepper=mode, minlen=args.minlen, filterInputList=args.contig)
237 |     #tl = [bf.get_contig_by_label(contig) for contig in args.contig.split(',')] if args.contig is not None else list(bf.get_contigs_obj())
238 |
239 |     lst = []
240 |     if args.pos_specific_prob_tab and args.pos_damage_prob_thrsh and args.refseq:
241 |         pos_specific_prob_db = [i.rstrip().split(',') for i in open(args.pos_specific_prob_tab).readlines()][1:]
242 |         stats_db = {}
243 |         for i in pos_specific_prob_db:
244 |             pos_ = int(i[1])
245 |             CT_ = float(i[2])
246 |             GA_ = float(i[3])
247 |             stats_db[pos_] = {'CT': CT_, 'GA': GA_}
248 |         pos_stats_db = stats_db
249 |         pos_prob_thrsh = args.pos_damage_prob_thrsh
250 |
251 |         RefSeq_dict = SeqIO.to_dict(SeqIO.parse(open(args.refseq), "fasta"))
252 |         RefSeq_idx = {}
253 |         for i in RefSeq_dict:
254 |             seq = RefSeq_dict[i].seq
255 |             for b_idx in range(len(seq)):
256 |                 RefSeq_idx[i+'__'+str(b_idx+1)] = seq[b_idx]
257 |
258 |     else:
259 |         pos_stats_db, pos_prob_thrsh, RefSeq_idx = None, None, None
260 |         sys.exit("Please provide the position-specific probability table from mapDamage2, the reference sequence, and the damage probability cap!")
261 |
262 |
263 |     for i in bf.get_contigs_obj():
264 |
265 |
266 |         sq = i.reference_free_consensus(mincov=args.mincov, minqual=args.minqual,
267 |                 dominant_frq_thrsh=args.dominant_frq_thrsh, noneCharacter='N',
268 |                 trimReads=None, post_damage_prob=pos_prob_thrsh, pos_prob_db=pos_stats_db, refseq_idx=RefSeq_idx)
269 |
270 |         if sq is not None:
271 |             lst.append(SeqRecord(Seq(sq), id=i.name+"_consensus", description=''))
272 |     SeqIO.write(lst, sys.stdout, 'fasta')
273 |
274 |
275 | if __name__ == "__main__":
276 |     consensus_from_file()
277 |
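# Sketch of the expected --pos_specific_prob_tab layout, assumed from the
# parsing above (comma-separated, one header line; position in the second
# column, C->T and G->A damage probabilities in the third and fourth; the
# values and row labels here are hypothetical):
#
#   ,Position,C.T,G.A
#   1,1,0.32,0.29
#   2,2,0.18,0.15
#   ...
#
# which yields stats_db = {1: {'CT': 0.32, 'GA': 0.29}, 2: {'CT': 0.18, 'GA': 0.15}, ...}.
# mapDamage2 tables also carry negative positions, which index the 3' end of
# the read (matching the negative right_pos computed in get_base_stats).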
--------------------------------------------------------------------------------
/cmseq/filter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pysam, sys
3 | import argparse
4 | import numpy as np
5 | from cmseq import __version__
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
9 |
10 | parser.add_argument('--minlen', help='Minimum length of the alignment for a read to pass', type=int, default=70)
11 | parser.add_argument('--minqual', help='Minimum average quality for a read to pass. It is computed over the fastq Phred scores of each read', type=int, default=30)
12 | parser.add_argument('--maxsnps', help='Maximum edit-distance rate on the alignment (NM value / alignment length) for a read to pass. It is computed from the NM tag of the BAM', type=float, default=1.0)
13 | parser.add_argument('--exclude_targets', help='Exclude these entries (FASTA file to filter out)')
14 | parser.add_argument('--exclude_reads_bam', help='Exclude these entries (BAM file to filter out)')
15 |
16 | args = parser.parse_args()
17 | if args.exclude_targets:
18 |     to_exclude = list(set([rec.strip() for rec in open(args.exclude_targets)]))
19 | else:
20 |     to_exclude = []
21 |
22 | if args.exclude_reads_bam:
23 |     ex_samfile = pysam.AlignmentFile(args.exclude_reads_bam, "rb")
24 |     reads_to_exclude = list(set([''.join(i.query_name.split('_')[:-1]) for i in ex_samfile.fetch(until_eof=True)]))
25 |
26 | else:
27 |     reads_to_exclude = []
28 |
29 | samfile = pysam.AlignmentFile("-", "rb")
30 | passingReads = pysam.AlignmentFile("-", "wb", template=samfile)
31 |
32 | for read in samfile.fetch():
33 |     alignment_len = int(read.query_alignment_length)
34 |     snps = read.get_tag('NM')
35 |
36 |     qualities = read.query_qualities
37 |     refname = read.reference_name
38 |     readname = read.query_name
39 |     snps_rate = float(snps) / float(read.query_alignment_length)
40 |     meanqualities = np.mean(read.query_qualities)
41 |
42 |     if (not read.is_secondary) and (alignment_len >= args.minlen) and (snps_rate <= args.maxsnps) and (meanqualities >= args.minqual) and (refname not in to_exclude) and (readname not in reads_to_exclude):
43 |         passingReads.write(read)
44 |
45 |
46 | passingReads.close()
47 | samfile.close()
48 |
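# filter.py reads a BAM stream on stdin and writes the passing reads as BAM
# on stdout (note the "-" file handles above), so it is meant to sit inside a
# samtools pipe. A usage sketch with hypothetical file names and thresholds:
#
#   samtools view -b sample.bam | filter.py --minlen 70 --minqual 30 --maxsnps 0.05 > sample.filtered.bam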
--------------------------------------------------------------------------------
/cmseq/poly.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import argparse
4 | import sys
5 | from cmseq import __version__
6 |
7 | from .cmseq import CMSEQ_DEFAULTS
8 | from .cmseq import BamFile
9 |
10 | def poly_from_file():
11 |     parser = argparse.ArgumentParser(description="Reports the polymorphic rate of each reference (polymorphic bases / total bases). Focuses only on covered regions (i.e. depth >= 1)")
12 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
13 |
14 |     parser.add_argument('BAMFILE', help='The file on which to operate')
15 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
16 |     parser.add_argument('-f', help='If set, unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) reads are excluded. If unset, ALL reads are considered (bedtools genomecov style). Default: unset', action='store_true')
17 |     parser.add_argument('--sortindex', help='Sort and index the file', action='store_true')
18 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered. Default: '+str(CMSEQ_DEFAULTS.minlen), default=CMSEQ_DEFAULTS.minlen, type=int)
19 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: '+str(CMSEQ_DEFAULTS.minqual), type=int, default=CMSEQ_DEFAULTS.minqual)
20 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual. Default: '+str(CMSEQ_DEFAULTS.mincov), type=int, default=CMSEQ_DEFAULTS.mincov)
21 |     parser.add_argument('--pvalue', help='Binomial p-value threshold for the binomial-polymorphic test. Default: '+str(CMSEQ_DEFAULTS.poly_pvalue_threshold), type=float, default=CMSEQ_DEFAULTS.poly_pvalue_threshold)
22 |     parser.add_argument('--seq_err', help='Sequencing error rate. Default: '+str(CMSEQ_DEFAULTS.poly_error_rate), type=float, default=CMSEQ_DEFAULTS.poly_error_rate)
23 |     parser.add_argument('--dominant_frq_thrsh', help='Cutoff for degree of `allele dominance` for a position to be considered polymorphic. Default: '+str(CMSEQ_DEFAULTS.poly_dominant_frq_thrsh), type=float, default=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh)
24 |     args = parser.parse_args()
25 |
26 |     import pandas as pd
27 |
28 |     si = True if args.sortindex else False
29 |     mode = 'all' if args.f else 'nofilter'
30 |
31 |     bf = BamFile(args.BAMFILE, sort=si, index=si, stepper=mode, minlen=args.minlen, filterInputList=args.contig)
32 |
33 |     outputDF = []
34 |     allRatios = []
35 |     allGenomeCol = {'referenceID': '-GENOME-', 'total_covered_bases': 0, 'total_polymorphic_bases': 0, 'total_polymorphic_rate': np.nan}
36 |
37 |     for element in bf.get_contigs_obj():
38 |
39 |         tld = element.polymorphism_rate(minqual=args.minqual, mincov=args.mincov, error_rate=args.seq_err, dominant_frq_thrsh=args.dominant_frq_thrsh)
40 |         tld['referenceID'] = element.name
41 |
42 |         allGenomeCol['total_covered_bases'] += tld['total_covered_bases']
43 |         allGenomeCol['total_polymorphic_bases'] += tld['total_polymorphic_bases']
44 |         if 'ratios' in tld:
45 |             allRatios = allRatios + tld['ratios']
46 |             del tld['ratios']
47 |
48 |         outputDF.append(tld)
49 |         del tld
50 |
51 |
52 |     if float(allGenomeCol['total_covered_bases']) > 0 and float(allGenomeCol['total_polymorphic_bases']) > 0:
53 |
54 |         allGenomeCol['total_polymorphic_rate'] = float(allGenomeCol['total_polymorphic_bases']) / float(allGenomeCol['total_covered_bases'])
55 |         allGenomeCol['dominant_allele_distr_mean'] = np.mean(allRatios)
56 |         allGenomeCol['dominant_allele_distr_sd'] = np.std(allRatios)
57 |
58 |         for i in [10,20,30,40,50,60,70,80,90,95,98,99]:
59 |             allGenomeCol['dominant_allele_distr_perc_'+str(i)] = np.percentile(allRatios, i)
60 |
61 |
62 |     outputDF.append(allGenomeCol)
63 |
64 |     pd.DataFrame.from_dict(outputDF).set_index('referenceID').to_csv(sys.stdout, sep='\t')
65 |
66 | if __name__ == "__main__":
67 |     poly_from_file()
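# Output sketch (tab-separated; the numbers are hypothetical). Each reference
# gets one row, plus a final '-GENOME-' row aggregating all references:
#
#   referenceID    total_covered_bases    total_polymorphic_bases    total_polymorphic_rate    ...
#   contig_1       95422                  12                         0.000126                  ...
#   -GENOME-       188063                 27                         0.000144                  ...
#
# The dominant_allele_distr_* columns are filled only when at least one
# polymorphic site was found.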
--------------------------------------------------------------------------------
/cmseq/polymut.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import argparse
4 | import sys
5 | from cmseq import __version__
6 |
7 | from .cmseq import CMSEQ_DEFAULTS
8 | from .cmseq import BamFile
9 |
10 | def polymut_from_file():
11 |     parser = argparse.ArgumentParser(description="Reports the number of non-synonymous (DN), synonymous (DS) and ambiguous (D?) mutations over protein-coding genes. Focuses only on covered regions (i.e. depth >= 1)")
12 |     parser.add_argument('--version', action='version', version=f"CMSeq {__version__}")
13 |
14 |     parser.add_argument('BAMFILE', help='The file on which to operate')
15 |     parser.add_argument('-c','--contig', help='Focus on a subset of references in the BAM file. Can be a list of references separated by commas or a FASTA file (the IDs are used to subset)', metavar="REFERENCE ID", default=None)
16 |     parser.add_argument('-f', help='If set, unmapped (FUNMAP), secondary (FSECONDARY), qc-fail (FQCFAIL) and duplicate (FDUP) reads are excluded. If unset, ALL reads are considered (bedtools genomecov style). Default: unset', action='store_true')
17 |     parser.add_argument('--sortindex', help='Sort and index the file', action='store_true')
18 |     parser.add_argument('--minlen', help='Minimum Reference Length for a reference to be considered. Default: '+str(CMSEQ_DEFAULTS.minlen), default=CMSEQ_DEFAULTS.minlen, type=int)
19 |     parser.add_argument('--minqual', help='Minimum base quality. Bases with quality score lower than this will be discarded. This is performed BEFORE --mincov. Default: '+str(CMSEQ_DEFAULTS.minqual), type=int, default=CMSEQ_DEFAULTS.minqual)
20 |     parser.add_argument('--mincov', help='Minimum position coverage to perform the polymorphism calculation. Positions with a lower depth of coverage will be discarded (i.e. considered as zero-coverage positions). This is calculated AFTER --minqual. Default: '+str(CMSEQ_DEFAULTS.mincov), type=int, default=CMSEQ_DEFAULTS.mincov)
21 |     parser.add_argument('--dominant_frq_thrsh', help='Cutoff for degree of `allele dominance` for a position to be considered polymorphic. Default: '+str(CMSEQ_DEFAULTS.poly_dominant_frq_thrsh), type=float, default=CMSEQ_DEFAULTS.poly_dominant_frq_thrsh)
22 |     parser.add_argument('--gff_file', help="GFF file used to extract protein-coding genes", default=None)
23 |     args = parser.parse_args()
24 |
25 |     import pandas as pd
26 |
27 |     outputDicts = []
28 |
29 |     si = True if args.sortindex else False
30 |     mode = 'all' if args.f else 'nofilter'
31 |
32 |     bf = BamFile(args.BAMFILE, sort=si, index=si, stepper=mode, minlen=args.minlen, filterInputList=args.contig)
33 |
34 |     if (args.gff_file):
35 |         bf.parse_gff(args.gff_file)
36 |
37 |     for i in bf.get_contigs_obj():
38 |         dominanceArray, mutationStats = i.easy_polymorphism_rate(minqual=args.minqual, mincov=args.mincov, dominant_frq_thrsh=args.dominant_frq_thrsh)
39 |         outputDicts.append({'Ref': i.name, 'DN': mutationStats['DN'], 'DS': mutationStats['DS'], 'D?': mutationStats['D?'], "consid_pos": len([x for x in dominanceArray if not np.isnan(x)])})
40 |     out_df = pd.DataFrame.from_dict(outputDicts).set_index('Ref')
41 |     print(float(np.sum(out_df["DN"])), float(np.sum(out_df["DS"])), float(sum(out_df["consid_pos"])))
42 |
43 | if __name__ == "__main__":
44 |     polymut_from_file()
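# Output sketch (hypothetical run and numbers): polymut.py prints three
# space-separated numbers on one line -- the non-synonymous (DN) sites, the
# synonymous (DS) sites, and the number of sufficiently covered positions,
# each summed over all references:
#
#   polymut.py sample.bam --gff_file genes.gff --mincov 10 --minqual 30
#   42.0 128.0 1830456.0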
--------------------------------------------------------------------------------
/recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "cmseq" %}
2 | {% set version = "1.0" %}
3 |
4 | package:
5 |   name: {{ name }}
6 |   version: {{ version }}
7 |
8 | source:
9 |   url: https://github.com/fbeghini/cmseq/archive/{{version}}.tar.gz
10 |   sha256: 37f202a6bf6668ebda5ccfdab878af277a46965e39fa1c4602dc12c0ca67b079
11 |
12 | build:
13 |   noarch: python
14 |   script: {{ PYTHON }} -m pip install . -vv
15 |
16 | requirements:
17 |   host:
18 |     - python
19 |     - pip
20 |     - numpy
21 |   run:
22 |     - python
23 |     - samtools >=1.0
24 |     - numpy
25 |     - scipy
26 |     - pysam
27 |     - pandas
28 |     - biopython
29 |     - bcbio-gff
30 |
31 | test:
32 |   commands:
33 |     - breadth_depth.py --help
34 |     - consensus.py --help
35 |     - polymut.py --help
36 |     - poly.py --help
37 | about:
38 |   home: https://github.com/SegataLab/cmseq
39 |   summary: Set of utilities on sequences and BAM files
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | import codecs
3 | from setuptools.command.install import install
4 | from io import open
5 | import os
6 |
7 | def read(rel_path):
8 |     here = os.path.abspath(os.path.dirname(__file__))
9 |     with codecs.open(os.path.join(here, rel_path), "r") as fp:
10 |         return fp.read()
11 |
12 |
13 | def get_version(rel_path):
14 |     for line in read(rel_path).splitlines():
15 |         if line.startswith("__version__"):
16 |             delim = '"' if '"' in line else "'"
17 |             return line.split(delim)[1]
18 |     else:
19 |         raise RuntimeError("Unable to find version string.")
20 |
21 | install_requires = ["numpy", "scipy", "pysam", "pandas", "biopython", "bcbio-gff"]
22 | setuptools.setup(
23 |     name='CMSeq',
24 |     version=get_version("cmseq/__init__.py"),
25 |     author='Moreno Zolfo',
26 |     author_email='moreno.zolfo@unitn.it',
27 |     url='http://github.com/SegataLab/cmseq/',
28 |     license='LICENSE.txt',
29 |     packages=setuptools.find_packages(),
30 |     entry_points={
31 |         'console_scripts': [
32 |             'breadth_depth.py = cmseq.breadth_depth:bd_from_file',
33 |             'consensus.py = cmseq.consensus:consensus_from_file',
34 |             'consensus_aDNA.py = cmseq.consensus_aDNA:consensus_from_file',
35 |             'polymut.py = cmseq.polymut:polymut_from_file',
36 |             'poly.py = cmseq.poly:poly_from_file'
37 |         ]
38 |     },
39 |     long_description_content_type='text/markdown',
40 |     long_description=open('README.md').read(),
41 |     description='Set of utilities on sequences and BAM files',
42 |     install_requires=install_requires
43 | )
--------------------------------------------------------------------------------