├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.rst └── release_notes.rst ├── flake8.cfg ├── mypy.ini ├── poetry.lock ├── pyproject.toml └── samwell ├── __init__.py ├── dnautils.py ├── itertools.py ├── overlap_detector.py ├── sam ├── __init__.py ├── bwa_mem.py ├── clipping.py ├── sambuilder.py └── tests │ ├── __init__.py │ ├── data │ └── valid.sam │ ├── test_bwa_mem.py │ ├── test_clipping.py │ ├── test_sam.py │ └── test_sambuilder.py └── tests ├── __init__.py ├── test_dnautils.py ├── test_itertools.py └── test_overlap_detector.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | env: 5 | POETRY_VERSION: 1.0 6 | 7 | 8 | jobs: 9 | testing: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | PYTHON_VERSION: [3.6, 3.7, 3.8] 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python ${{matrix.PYTHON_VERSION}} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{matrix.PYTHON_VERSION}} 20 | - name: Install bwa 21 | env: 22 | ACTIONS_ALLOW_UNSECURE_COMMANDS: 'true' 23 | run: | 24 | wget https://github.com/lh3/bwa/releases/download/v0.7.17/bwa-0.7.17.tar.bz2 25 | tar -jxvf bwa-0.7.17.tar.bz2 26 | cd bwa-0.7.17 27 | make -j$(nproc) 28 | cd .. 
29 | echo "${GITHUB_WORKSPACE}/bwa-0.7.17/" >> $GITHUB_PATH 30 | - name: Install poetry 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install poetry==${{env.POETRY_VERSION}} 34 | - name: Install cython 35 | run: | 36 | poetry run pip install cython==0.29.15 37 | - name: Install setuptools-scm for py3.6 38 | run: | 39 | poetry run pip install setuptools-scm==6.4.2 40 | - name: Install deps 41 | run: | 42 | poetry install --extras docs 43 | - name: Run pytest 44 | run: | 45 | poetry run python -m pytest --cov=samwell --cov-branch 46 | - name: Run lint 47 | run: | 48 | poetry run flake8 --config=flake8.cfg samwell 49 | - name: Run mypy 50 | run: | 51 | poetry run mypy -p samwell --config=mypy.ini 52 | - name: Run docs 53 | shell: bash 54 | run: | 55 | set -euo pipefail 56 | pushd docs 57 | poetry run make html 58 | popd 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm 2 | .idea 3 | 4 | # Python compiled & optimized files 5 | *.pyc 6 | *.pyo 7 | 8 | # MyPy Cache directory 9 | .mypy_cache 10 | 11 | # for develop installs 12 | *.egg-info 13 | 14 | # venv set up 15 | .venv 16 | dist/ 17 | 18 | # Sphinx documentation 19 | html/_build 20 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | version: 2 4 | sphinx: 5 | configuration: docs/conf.py 6 | python: 7 | version: 3.6 8 | install: 9 | - method: pip 10 | path: . 11 | extra_requirements: 12 | - docs 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Myriad Genetics, Inc. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Language][language-badge]][language-link] 2 | [![Code Style][code-style-badge]][code-style-link] 3 | [![Type Checked][type-checking-badge]][type-checking-link] 4 | [![PEP8][pep-8-badge]][pep-8-link] 5 | [![License][license-badge]][license-link] 6 | 7 | --- 8 | 9 | [![Python package][python-package-badge]][python-package-link] 10 | [![PyPI version][pypi-badge]][pypi-link] 11 | [![PyPI download total][pypi-downloads-badge]][pypi-downloads-link] 12 | 13 | --- 14 | 15 | [language-badge]: http://img.shields.io/badge/language-python-brightgreen.svg 16 | [language-link]: http://www.python.org/ 17 | [code-style-badge]: https://img.shields.io/badge/code%20style-black-000000.svg 18 | [code-style-link]: https://black.readthedocs.io/en/stable/ 19 | [type-checking-badge]: http://www.mypy-lang.org/static/mypy_badge.svg 20 | [type-checking-link]: http://mypy-lang.org/ 21 | [pep-8-badge]: https://img.shields.io/badge/code%20style-pep8-brightgreen.svg 22 | [pep-8-link]: https://www.python.org/dev/peps/pep-0008/ 23 | [license-badge]: http://img.shields.io/badge/license-MIT-blue.svg 24 | [license-link]: https://github.com/myriad-opensource/samwell/blob/master/LICENSE 25 | [python-package-badge]: https://github.com/myriad-opensource/samwell/workflows/Python%20package/badge.svg 26 | [python-package-link]: https://github.com/myriad-opensource/samwell/actions?query=workflow%3A%22Python+package%22 27 | [pypi-badge]: https://badge.fury.io/py/samwell.svg 28 | [pypi-link]: https://pypi.python.org/pypi/samwell 29 | [pypi-downloads-badge]: https://img.shields.io/pypi/dm/samwell 30 | [pypi-downloads-link]: https://pypi.python.org/pypi/samwell 31 | 32 | # Samwell: a python package for using genomic files... well 33 | 34 | Samwell provides elegant utilities for managing biological data. 
35 | 36 | See: [samwell.readthedocs.io](https://samwell.readthedocs.io/en/latest/) 37 | 38 | ## Quickstart 39 | 40 | First install samwell: 41 | 42 | ``` 43 | pip install samwell 44 | ``` 45 | 46 | ### Reading/Writing BAMs with automatic inference of filetype 47 | 48 | Samwell provides easy utilities for reading/writing BAMs: 49 | 50 | ```python 51 | from samwell import sam 52 | with sam.reader("myfile.bam") as in_bam: 53 | with sam.writer("my-output-file.bam", header=in_bam.header) as out_bam: 54 | for read in in_bam: 55 | if read.is_paired: 56 | out_bam.write(read) 57 | ``` 58 | 59 | 60 | ### Realigning fastqs with bwa 61 | 62 | You can use `samwell` to easily realign fastq records as necessary: 63 | 64 | ```python 65 | from pathlib import Path 66 | from samwell import sam 67 | from samwell.sam import bwa_mem 68 | from samwell.sam import clipping 69 | from samwell.sam.bwa_mem import FastqRecord 70 | with sam.reader("myfile.bam") as in_bam: 71 | with sam.writer("outfile.bam", header=in_bam.header) as out_bam: 72 | fastq_gen = iter(FastqRecord.build(read) for read in in_bam) 73 | for read in bwa_mem.align(fastq_gen, Path("genome.fasta")): 74 | out_bam.write(read) 75 | ``` 76 | 77 | See the `samwell.sam.bwa_mem` module for more detail. 78 | 79 | 80 | ## Developing with samwell 81 | 82 | Samwell uses [`poetry`](https://github.com/python-poetry/poetry#installation) for dependency management. 83 | 84 | Please install `poetry` using the instructions in the above link. 
85 | Then simply execute: 86 | 87 | ```bash 88 | poetry install 89 | ``` 90 | 91 | ## Checking the Build 92 | 93 | ### Linting 94 | 95 | ```bash 96 | poetry run flake8 --config=flake8.cfg samwell 97 | ``` 98 | 99 | ### Type Checking 100 | 101 | ```bash 102 | poetry run mypy -p samwell --config=mypy.ini 103 | ``` 104 | 105 | ### Unit Tests 106 | 107 | ```bash 108 | poetry run python -m pytest --cov=samwell --cov-branch 109 | ``` 110 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | 9 | # Internal variables. 10 | PAPEROPT_a4 = -D latex_paper_size=a4 11 | PAPEROPT_letter = -D latex_paper_size=letter 12 | ALLSPHINXOPTS = -d _build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 13 | 14 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 15 | 16 | help: 17 | @echo "Please use \`make ' where is one of" 18 | @echo " html to make standalone HTML files" 19 | @echo " dirhtml to make HTML files named index.html in directories" 20 | @echo " pickle to make pickle files" 21 | @echo " json to make JSON files" 22 | @echo " htmlhelp to make HTML files and a HTML help project" 23 | @echo " qthelp to make HTML files and a qthelp project" 24 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 25 | @echo " changes to make an overview of all changed/added/deprecated items" 26 | @echo " linkcheck to check all external links for integrity" 27 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 28 | 29 | clean: 30 | -rm -rf _build/* 31 | 32 | html: 33 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) _build/html 34 | @echo 35 | @echo "Build finished. The HTML pages are in _build/html." 
36 | 37 | dirhtml: 38 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) _build/dirhtml 39 | @echo 40 | @echo "Build finished. The HTML pages are in _build/dirhtml." 41 | 42 | pickle: 43 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) _build/pickle 44 | @echo 45 | @echo "Build finished; now you can process the pickle files." 46 | 47 | json: 48 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) _build/json 49 | @echo 50 | @echo "Build finished; now you can process the JSON files." 51 | 52 | htmlhelp: 53 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) _build/htmlhelp 54 | @echo 55 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 56 | ".hhp project file in _build/htmlhelp." 57 | 58 | qthelp: 59 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) _build/qthelp 60 | @echo 61 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 62 | ".qhcp project file in _build/qthelp, like this:" 63 | @echo "# qcollectiongenerator _build/qthelp/samtools.qhcp" 64 | @echo "To view the help file:" 65 | @echo "# assistant -collectionFile _build/qthelp/samtools.qhc" 66 | 67 | latex: 68 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) _build/latex 69 | @echo 70 | @echo "Build finished; the LaTeX files are in _build/latex." 71 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 72 | "run these through (pdf)latex." 73 | 74 | changes: 75 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) _build/changes 76 | @echo 77 | @echo "The overview file is in _build/changes." 78 | 79 | linkcheck: 80 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) _build/linkcheck 81 | @echo 82 | @echo "Link check complete; look for any errors in the above output " \ 83 | "or in _build/linkcheck/output.txt." 84 | 85 | doctest: 86 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) _build/doctest 87 | @echo "Testing of doctests in the sources finished, look at the " \ 88 | "results in _build/doctest/output.txt." 
89 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | 5 | ============ 6 | 7 | .. automodule:: samwell.dnautils 8 | :members: 9 | 10 | .. automodule:: samwell.itertools 11 | :members: 12 | 13 | .. automodule:: samwell.overlap_detector 14 | :members: 15 | 16 | .. automodule:: samwell.sam 17 | :members: 18 | 19 | .. automodule:: samwell.sam.bwa_mem 20 | :members: 21 | 22 | .. automodule:: samwell.sam.clipping 23 | :members: 24 | 25 | .. automodule:: samwell.sam.sambuilder 26 | :members: 27 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # samwell documentation build configuration file 4 | # 5 | # This file is execfile()d with the current directory set to its containing dir. 6 | # 7 | # Note that not all possible configuration values are present in this 8 | # autogenerated file. 9 | # 10 | # All configuration values have a default; values that are commented out 11 | # serve to show the default. 12 | 13 | import sys, os, glob 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | #_libdir = "../build/lib.%s-%s-%s.%s" % (os.uname()[0].lower(), os.uname()[4], 19 | # sys.version_info[0], sys.version_info[1]) 20 | _libdir = "../build/lib" 21 | if os.path.exists(_libdir): 22 | sys.path.insert(0, os.path.abspath(_libdir)) 23 | 24 | # -- General configuration ----------------------------------------------------- 25 | 26 | # Add any Sphinx extension module names here, as strings. 
They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 29 | 'sphinx.ext.autosummary', 30 | 'sphinx.ext.todo', 31 | 'sphinx.ext.ifconfig', 32 | 'sphinx.ext.intersphinx', 33 | 'sphinx.ext.napoleon'] 34 | 35 | intersphinx_mapping = {'python': ('http://docs.python.org/3.6', None)} 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix of source filenames. 41 | source_suffix = '.rst' 42 | 43 | # The encoding of source files. 44 | #source_encoding = 'utf-8' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'samwell' 51 | copyright = u'2021, Myriad Genetics, Inc.' 52 | 53 | # Included at the end of each rst file 54 | rst_epilog = ''' 55 | .. _samwell: https://github.com/fulcrumgenomics/samwell 56 | .. _python: http://python.org/ 57 | .. _conda: https://conda.io/docs/ 58 | ''' 59 | 60 | autosummary_generate = True 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | from pathlib import Path 67 | import os 68 | toml_path = Path(os.path.realpath(__file__)).parent.parent / 'pyproject.toml' 69 | with toml_path.open("r") as reader: 70 | for line in reader: 71 | if line.startswith("version"): 72 | version = line.rstrip("\r\n").split(" = ")[1] 73 | version = version[1:-1] 74 | break 75 | 76 | # The full version, including alpha/beta/rc tags. 77 | release = version 78 | 79 | # The language for content autogenerated by Sphinx. Refer to documentation 80 | # for a list of supported languages. 
81 | # language = None 82 | 83 | # There are two options for replacing |today|: either, you set today to some 84 | # non-false value, then it is used: 85 | # today = '' 86 | # Else, today_fmt is used as the format for a strftime call. 87 | # today_fmt = '%B %d, %Y' 88 | 89 | # List of documents that shouldn't be included in the build. 90 | # unused_docs = [] 91 | 92 | # List of directories, relative to source directory, that shouldn't be searched 93 | # for source files. 94 | exclude_trees = ['_build'] 95 | 96 | # The reST default role (used for this markup: `text`) to use for all documents. 97 | # default_role = None 98 | 99 | # If true, '()' will be appended to :func: etc. cross-reference text. 100 | # add_function_parentheses = True 101 | 102 | # If true, the current module name will be prepended to all description 103 | # unit titles (such as .. function::). 104 | # add_module_names = True 105 | 106 | # If true, sectionauthor and moduleauthor directives will be shown in the 107 | # output. They are ignored by default. 108 | # show_authors = False 109 | 110 | # The name of the Pygments (syntax highlighting) style to use. 111 | pygments_style = 'sphinx' 112 | 113 | # A list of ignored prefixes for module index sorting. 114 | #modindex_common_prefix = [] 115 | 116 | 117 | # -- Options for HTML output --------------------------------------------------- 118 | 119 | # The theme to use for HTML and HTML Help pages. Major themes that come with 120 | # Sphinx are currently 'default' and 'sphinxdoc'. 121 | html_theme = 'default' 122 | 123 | # Theme options are theme-specific and customize the look and feel of a theme 124 | # further. For a list of options available for each theme, see the 125 | # documentation. 126 | # html_theme_options = {} 127 | 128 | # Add any paths that contain custom themes here, relative to this directory. 129 | # html_theme_path = [] 130 | 131 | # The name for this set of Sphinx documents. If None, it defaults to 132 | # " v documentation". 
133 | # html_title = None 134 | 135 | # A shorter title for the navigation bar. Default is the same as html_title. 136 | # html_short_title = None 137 | 138 | # The name of an image file (relative to this directory) to place at the top 139 | # of the sidebar. 140 | # html_logo = None 141 | 142 | # The name of an image file (within the static path) to use as favicon of the 143 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 144 | # pixels large. 145 | # html_favicon = None 146 | 147 | # Add any paths that contain custom static files (such as style sheets) here, 148 | # relative to this directory. They are copied after the builtin static files, 149 | # so a file named "default.css" will overwrite the builtin "default.css". 150 | html_static_path = [] 151 | 152 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 153 | # using the given strftime format. 154 | # html_last_updated_fmt = '%b %d, %Y' 155 | 156 | # If true, SmartyPants will be used to convert quotes and dashes to 157 | # typographically correct entities. 158 | # html_use_smartypants = True 159 | 160 | # Custom sidebar templates, maps document names to template names. 161 | # html_sidebars = {} 162 | 163 | # Additional templates that should be rendered to pages, maps page names to 164 | # template names. 165 | # html_additional_pages = {} 166 | 167 | # If false, no module index is generated. 168 | # html_use_modindex = True 169 | 170 | # If false, no index is generated. 171 | # html_use_index = True 172 | 173 | # If true, the index is split into individual pages for each letter. 174 | # html_split_index = False 175 | 176 | # If true, links to the reST sources are added to the pages. 177 | # html_show_sourcelink = True 178 | 179 | # If true, an OpenSearch description file will be output, and all pages will 180 | # contain a tag referring to it. The value of this option must be the 181 | # base URL from which the finished HTML is served. 
182 | # html_use_opensearch = '' 183 | 184 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 185 | # html_file_suffix = '' 186 | 187 | # Output file base name for HTML help builder. 188 | htmlhelp_basename = 'samwelldoc' 189 | 190 | 191 | # -- Options for LaTeX output -------------------------------------------------- 192 | 193 | # The paper size ('letter' or 'a4'). 194 | # latex_paper_size = 'letter' 195 | 196 | # The font size ('10pt', '11pt' or '12pt'). 197 | # latex_font_size = '10pt' 198 | 199 | # Grouping the document tree into LaTeX files. List of tuples 200 | # (source start file, target name, title, author, documentclass [howto/manual]). 201 | latex_documents = [ 202 | ('index', 'samwell.tex', u'samwell documentation', u'Nils Homer, Tim Fennell, et al.', 'manual'), 203 | ] 204 | 205 | # The name of an image file (relative to this directory) to place at the top of 206 | # the title page. 207 | # latex_logo = None 208 | 209 | # For "manual" documents, if this is true, then toplevel headings are parts, 210 | # not chapters. 211 | # latex_use_parts = False 212 | 213 | # Additional stuff for the LaTeX preamble. 214 | # latex_preamble = '' 215 | 216 | # Documents to append as an appendix to all manuals. 217 | # latex_appendices = [] 218 | 219 | # If false, no module index is generated. 220 | # latex_use_modindex = True 221 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ========================================================== 2 | Samwell: a python package for using genomic files... well. 3 | ========================================================== 4 | 5 | :Date: |today| 6 | :Version: |version| 7 | 8 | Samwell provides elegant utilities for managing biological data. 9 | 10 | 11 | Documentation Contents 12 | ====================== 13 | 14 | .. 
toctree:: 15 | :maxdepth: 2 16 | 17 | index.rst 18 | api.rst 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | 23 | release_notes.rst 24 | 25 | 26 | Quickstart 27 | ========== 28 | 29 | First install samwell:: 30 | 31 | pip install samwell 32 | 33 | Reading/Writing BAMs with automatic inference of filetype 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | Samwell provides easy utilities for reading/writing BAMs:: 37 | 38 | from samwell import sam 39 | with sam.reader("myfile.bam") as in_bam: 40 | with sam.writer("my-output-file.bam", header=in_bam.header) as out_bam: 41 | for read in in_bam: 42 | if read.is_paired: 43 | out_bam.write(read) 44 | 45 | See :mod:`~samwell.sam` module for more detail. 46 | 47 | Realigning fastqs with bwa 48 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 49 | 50 | You can use :mod:`~samwell` to easily realign fastq records as necessary:: 51 | 52 | from pathlib import Path 53 | from samwell import sam 54 | from samwell.sam import bwa_mem 55 | from samwell.sam import clipping 56 | from samwell.sam.bwa_mem import FastqRecord 57 | with sam.reader("myfile.bam") as in_bam: 58 | with sam.writer("outfile.bam", header=in_bam.header) as out_bam: 59 | fastq_gen = iter(FastqRecord.build(read) for read in in_bam) 60 | for read in bwa_mem.align(fastq_gen, Path("genome.fasta")): 61 | out_bam.write(read) 62 | 63 | See :mod:`~samwell.sam.bwa_mem` module for more detail. 64 | 65 | Developing with samwell 66 | ======================= 67 | 68 | Samwell uses `poetry <https://github.com/python-poetry/poetry#installation>`_ for dependency management. 69 | 70 | Please install `poetry` using the instructions in the above link. 
71 | Then simply execute:: 72 | 73 | poetry install 74 | 75 | Checking the Build 76 | ~~~~~~~~~~~~~~~~~~ 77 | 78 | Linting:: 79 | 80 | poetry run flake8 --config=flake8.cfg samwell 81 | 82 | Type Checking:: 83 | 84 | poetry run mypy -p samwell --config=mypy.ini 85 | 86 | Unit Tests:: 87 | 88 | poetry run python -m pytest --cov=samwell --cov-branch 89 | -------------------------------------------------------------------------------- /docs/release_notes.rst: -------------------------------------------------------------------------------- 1 | Release Notes 2 | ============= 3 | 4 | .. contents:: Table of Contents 5 | :depth: 2 6 | :local: 7 | :backlinks: none 8 | 9 | v0.0.2 10 | ------ 11 | Add in some missing typing to dnautils and sam.bwa_mem 12 | 13 | v0.0.1 14 | ------ 15 | Initial release 16 | -------------------------------------------------------------------------------- /flake8.cfg: -------------------------------------------------------------------------------- 1 | # flake8 config file for pysg 2 | 3 | [flake8] 4 | max_line_length = 99 5 | show-source = true 6 | ignore = E701 W504 7 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | strict_optional = False 3 | ignore_missing_imports = True 4 | disallow_untyped_decorators = False 5 | follow_imports = silent 6 | disallow_untyped_defs = True 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "samwell" 3 | version = "v0.0.4" 4 | description = "Useful utilities for biological data formats and analyses" 5 | authors = ["Jeff Tratner ", "Nils Homer", "Tim Fennell", "Nathan Roach"] 6 | 7 | [tool.poetry.dependencies] 8 | python = ">=3.6" 9 | mypy-extensions = ">=0.4.3" 10 | defopt = ">=5.1.0" 11 | attrs = ">=19.3.0" 12 | 
intervaltree = ">=3.0.2" 13 | pysam = ">=0.15.3" 14 | pybedlite = ">=0.0.1" 15 | sphinx = {version = "4.3.1", optional = true} 16 | 17 | [tool.poetry.dev-dependencies] 18 | pytest = ">=5.3.5" 19 | pytest-vcr = ">=1.0.2" 20 | flake8 = ">=3.7.9" 21 | mypy = ">=0.761" 22 | pytest-cov = ">=2.8.1" 23 | 24 | [tool.poetry.extras] 25 | docs = ["sphinx"] 26 | 27 | [build-system] 28 | requires = ["poetry>=0.12"] 29 | build-backend = "poetry.masonry.api" 30 | -------------------------------------------------------------------------------- /samwell/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myriad-opensource/samwell/47c4d809e3a228cf2be7af09871ab70e706763a1/samwell/__init__.py -------------------------------------------------------------------------------- /samwell/dnautils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility Functions for Manipulating DNA sequences. 3 | ------------------------------------------------- 4 | 5 | This module contains utility functions for manipulating DNA sequences. 6 | 7 | """ 8 | 9 | from typing import Dict 10 | 11 | _RC_DICT: Dict[str, str] = ( 12 | dict(A='T', C='G', G='C', T='A', a='t', c='g', g='c', t='a', N='N') 13 | ) 14 | 15 | 16 | def reverse_complement(bases: str) -> str: 17 | """Reverse complements a base sequence. 18 | 19 | Arguments: 20 | bases: the bases to reverse complement; any base not in _RC_DICT raises KeyError. 21 | 22 | Returns: 23 | the reverse complement of the provided base string 24 | """ 25 | return ''.join([_RC_DICT[b] for b in bases[::-1]]) 26 | 27 | 28 | def mask_long_homopolymers(bases: str, min_long_hp_length: int, mask_base: str = 'N') -> str: 29 | """Returns the bases with long homopolymer runs replaced by the mask base. 30 | 31 | Args: 32 | bases: the bases to mask; must be non-empty (an empty string raises IndexError). 33 | min_long_hp_length: the minimum homopolymer length (inclusive) to mask. 
34 | mask_base: the base to use when masking; use one character to preserve length. 35 | """ 36 | masked = list(bases) 37 | count = 1 38 | last_base = bases[0] 39 | for i in range(1, len(bases)): 40 | cur_base = bases[i] 41 | if last_base == cur_base: 42 | count += 1 43 | else: 44 | if count >= min_long_hp_length: 45 | masked[i - count:i] = mask_base * count 46 | last_base = cur_base 47 | count = 1 48 | if count >= min_long_hp_length: 49 | masked[-count:] = mask_base * count 50 | return ''.join(masked) 51 | 52 | 53 | def has_long_homopolymer(bases: str, max_hp_length: int) -> bool: 54 | '''Returns true if the given bases contain a homopolymer longer than the given length. 55 | 56 | Args: 57 | bases: the bases to examine; must be non-empty (an empty string raises IndexError). 58 | max_hp_length: the maximum homopolymer length to allow (inclusive). 59 | ''' 60 | count = 1 61 | last_base = bases[0] 62 | for i in range(1, len(bases)): 63 | cur_base = bases[i] 64 | if last_base == cur_base: 65 | count += 1 66 | if count > max_hp_length: 67 | return True 68 | else: 69 | last_base = cur_base 70 | count = 1 71 | return count > max_hp_length 72 | -------------------------------------------------------------------------------- /samwell/itertools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for Creating Useful Iterators 3 | --------------------------------------- 4 | 5 | This module contains classes and functions for creating useful iterators. 6 | 7 | Examples of a "Peekable" Iterator 8 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 9 | 10 | "Peekable" iterators are useful to "peek" at the next item in an iterator without consuming it. 11 | For example, this is useful when consuming items in iterator while a predicate is true, and not 12 | consuming the first element where the element is not true. See the 13 | :func:`~samwell.itertools.PeekableIterator.takewhile` and 14 | :func:`~samwell.itertools.PeekableIterator.dropwhile` methods. 15 | 16 | An empty peekable iterator throws StopIteration: 17 | 18 | .. 
code-block:: python 19 | 20 | >>> from samwell.itertools import peekable 21 | >>> piter = peekable(iter([])) 22 | >>> piter.peek() 23 | StopIteration 24 | 25 | A peekable iterator will return the next item before consuming it. 26 | 27 | .. code-block:: python 28 | 29 | >>> piter = peekable(iter([1, 2, 3])) 30 | >>> piter.peek() 31 | 1 32 | >>> next(piter) 33 | 1 34 | >>> [j for j in piter] 35 | [2, 3] 36 | 37 | The `can_peek()` function can be used to determine if the iterator can be peeked without 38 | StopIteration being thrown: 39 | 40 | >>> piter = peekable([1]) 41 | >>> piter.peek() if piter.can_peek() else -1 42 | 1 43 | >>> next(piter) 44 | 1 45 | >>> piter.peek() if piter.can_peek() else -1 46 | -1 47 | >>> next(piter) 48 | StopIteration 49 | 50 | The `peekable()` function should be preferred to calling `PeekableIterator`'s constructor 51 | directly as it supports creation from iterable objects as well as iterators, while the constructor 52 | requires an iterator. 53 | 54 | Examples of a "Merging" Iterator 55 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 56 | 57 | A "merging" iterator can merge two iterators in order based on a given ordering function. This is 58 | useful for merging two iterators that are already in order. 59 | 60 | .. 
code-block:: python 61 | 62 | >>> from samwell.itertools import MergingIterator 63 | >>> even = iter([2, 4, 6, 8]) 64 | >>> odd = iter([1, 3, 5, 9]) 65 | >>> merging = MergingIterator(even, odd, lambda x: x) 66 | >>> list(merging) 67 | [1, 2, 3, 4, 5, 6, 8, 9] 68 | 69 | Module Contents 70 | ~~~~~~~~~~~~~~~ 71 | 72 | The module contains the following public classes: 73 | 74 | - :class:`~samwell.itertools.PeekableIterator` -- Iterator that allows you to peek at the 75 | next value before calling next 76 | 77 | - :class:`samwell.itertools.MergingIterator` -- Iterator that allows merging of two 78 | iterators using a keyfunc to decide from which iterator to draw the next item 79 | 80 | The module contains the following methods: 81 | 82 | - :func:`~samwell.itertools.peekable` -- Creates an iterator that allows you to peek at 83 | the next value before calling next 84 | """ 85 | 86 | from typing import Any 87 | from typing import Optional 88 | from typing import Callable 89 | from typing import Generic 90 | from typing import Iterable 91 | from typing import Iterator 92 | from typing import List 93 | from typing import TypeVar 94 | from typing import Union 95 | 96 | 97 | IterType = TypeVar('IterType') 98 | 99 | 100 | class PeekableIterator(Generic[IterType], Iterator[IterType]): 101 | """A peekable iterator wrapping another iterator. 102 | 103 | This allows returning the next item without consuming it. 
104 | 105 | Args: 106 | source: an iterator over the objects 107 | """ 108 | 109 | def __init__(self, source: Iterator[IterType]) -> None: 110 | self._iter: Iterator[IterType] = source 111 | self._sentinel: Any = object() 112 | self.__update_peek() 113 | 114 | def __iter__(self) -> Iterator[IterType]: 115 | return self 116 | 117 | def __next__(self) -> IterType: 118 | to_return = self.peek() 119 | self.__update_peek() 120 | return to_return 121 | 122 | def __update_peek(self) -> None: 123 | self._peek = next(self._iter, self._sentinel) 124 | 125 | def can_peek(self) -> bool: 126 | """Returns true if there is a value that can be peeked at, false otherwise.""" 127 | return self._peek is not self._sentinel 128 | 129 | def peek(self) -> IterType: 130 | """Returns the next element without consuming it, or raises StopIteration if exhausted.""" 131 | if self.can_peek(): 132 | return self._peek 133 | else: 134 | raise StopIteration 135 | 136 | def maybe_peek(self) -> Optional[IterType]: 137 | """Returns the next element without consuming it, or None if exhausted.""" 138 | return self._peek if self.can_peek() else None 139 | 140 | def takewhile(self, pred: Callable[[IterType], bool]) -> List[IterType]: 141 | """Consumes from the iterator while pred is true, and returns the result as a List. 142 | 143 | The iterator is left pointing at the first non-matching item, or if all items match 144 | then the iterator will be exhausted. 145 | 146 | Args: 147 | pred: a function that takes the next value from the iterator and returns 148 | true or false. 149 | 150 | Returns: 151 | List[IterType]: A list of the values from the iterator, in order, up to and excluding 152 | the first value that does not match the predicate. 
153 | """ 154 | xs: List[IterType] = [] 155 | while self.can_peek() and pred(self._peek): 156 | xs.append(next(self)) 157 | return xs 158 | 159 | def dropwhile(self, pred: Callable[[IterType], bool]) -> "PeekableIterator[IterType]": 160 | """Drops elements from the iterator while the predicate is true. 161 | 162 | Updates the iterator to point at the first non-matching element, or exhausts the 163 | iterator if all elements match the predicate. 164 | 165 | Args: 166 | pred (Callable[[IterType], bool]): a function that takes a value from the iterator 167 | and returns true or false. 168 | 169 | Returns: 170 | PeekableIterator[IterType]: a reference to this iterator, so calls can be chained 171 | """ 172 | while self.can_peek() and pred(self._peek): 173 | self.__update_peek() 174 | return self 175 | 176 | 177 | def peekable(source: Union[Iterator[IterType], Iterable[IterType]]) -> PeekableIterator[IterType]: 178 | """Creates a peekable iterator that allows you to peek at the next value before calling next 179 | 180 | The peek method returns the next element without consuming it, or raises StopIteration. 181 | 182 | Args: 183 | source: either an iterator over the objects, or an iterable from which an iterator is 184 | obtained by calling ``iter()``. 185 | 186 | Returns: 187 | a :class:`~samwell.itertools.PeekableIterator` 188 | """ 189 | return PeekableIterator(source=iter(source)) 190 | 191 | 192 | class MergingIterator(Generic[IterType], Iterator[IterType]): 193 | """An iterator that merges two iterators; if both are already sorted by keyfunc, yields 194 | results in order (on equal keys the item from iter1 is yielded first). 
195 | 196 | Args: 197 | iter1: an iterator 198 | iter2: an iterator 199 | keyfunc: a function that extracts a key from an item that is used to order items 200 | """ 201 | 202 | def __init__(self, 203 | iter1: Iterator[IterType], 204 | iter2: Iterator[IterType], 205 | keyfunc: Callable[[IterType], Any]) -> None: 206 | self._iter1 = peekable(iter1) 207 | self._iter2 = peekable(iter2) 208 | self._keyfunc = keyfunc 209 | 210 | def __iter__(self) -> Iterator[IterType]: 211 | return self 212 | 213 | def __next__(self) -> IterType: 214 | if self._iter1.can_peek() and self._iter2.can_peek(): 215 | k1 = self._keyfunc(self._iter1.peek()) 216 | k2 = self._keyfunc(self._iter2.peek()) 217 | return next(self._iter1 if k1 <= k2 else self._iter2) 218 | elif self._iter1.can_peek(): 219 | return next(self._iter1) 220 | elif self._iter2.can_peek(): 221 | return next(self._iter2) 222 | else: 223 | raise StopIteration 224 | -------------------------------------------------------------------------------- /samwell/overlap_detector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility Classes for Querying Overlaps with Genomic Regions 3 | ---------------------------------------------------------- 4 | 5 | DEPRECATED - if you have the option use `~pybedlite.overlap_detector` in favor of this. 6 | 7 | Examples of Detecting Overlaps 8 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 9 | 10 | .. 
code-block:: python 11 | 12 | >>> from samwell.overlap_detector import Interval, OverlapDetector 13 | >>> detector = OverlapDetector() 14 | >>> query = Interval("chr1", 2, 20) 15 | >>> detector.overlaps_any(query) 16 | False 17 | >>> detector.add(Interval("chr2", 1, 100)) 18 | >>> detector.add(Interval("chr1", 21, 100)) 19 | >>> detector.overlaps_any(query) 20 | False 21 | >>> detector.add(Interval("chr1", 1, 1)) 22 | >>> detector.overlaps_any(query) 23 | True 24 | >>> detector.get_overlaps(query) 25 | [Interval("chr1", 1, 1)] 26 | >>> detector.add(Interval("chr1", 3, 10)) 27 | >>> detector.overlaps_any(query) 28 | True 29 | >>> detector.get_overlaps(query) 30 | [Interval("chr1", 1, 1), Interval("chr1", 3, 10)] 31 | 32 | Module Contents 33 | ~~~~~~~~~~~~~~~ 34 | 35 | The module contains the following public classes: 36 | 37 | - :class:`~samwell.overlap_detector.Interval` -- Represents a region mapping to the genome 38 | that is 0-based and open-ended 39 | - :class:`~samwell.overlap_detector.OverlapDetector` -- Detects and returns overlaps between 40 | a set of genomic regions and another genomic region 41 | """ 42 | 43 | 44 | from pybedlite.overlap_detector import Interval 45 | from pybedlite.overlap_detector import OverlapDetector 46 | 47 | __all__ = ["Interval", "OverlapDetector"] 48 | -------------------------------------------------------------------------------- /samwell/sam/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility Classes and Methods for SAM/BAM 3 | --------------------------------------- 4 | 5 | This module contains utility classes for reading and writing SAM/BAM files, as well as for 6 | manipulating Cigars. It is recommended to use the :func:`~samwell.sam.reader` and 7 | :func:`~samwell.sam.writer` methods rather than :class:`pysam.AlignmentFile` directly (see 8 | below for motivation).
9 | 10 | Motivation for Reader and Writer methods 11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12 | 13 | The following are the reasons for choosing to implement methods to open a SAM/BAM file for 14 | reading and writing, rather than relying on :class:`pysam.AlignmentFile` directly: 15 | 16 | 1. Provides a centralized place for the implementation of opening a SAM/BAM for reading and 17 | writing. This is useful if any additional parameters are added, or changes to standards or 18 | defaults are made. 19 | 2. Makes the requirement to provide a header when opening a file for writing more explicit. 20 | 3. Adds support for :class:`~pathlib.Path`. 21 | 4. Remove the reliance on specifying the mode correctly, including specifying the file type (i.e. 22 | SAM, BAM, or CRAM), as well as additional options (ex. compression level). This makes the 23 | code more explicit and easier to read. 24 | 5. An explicit check is performed to ensure the file type is specified when writing using a 25 | file-like object rather than a path to a file. 26 | 27 | Examples of Opening a SAM/BAM for Reading or Writing 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | 30 | Opening a SAM/BAM file for reading, auto-recognizing the file-type by the file extension. See 31 | :class:`~samwell.sam.SamFileType` for the supported file types. 32 | 33 | .. code-block:: python 34 | 35 | >>> from samwell.sam import reader 36 | >>> with reader("/path/to/sample.sam") as fh: 37 | ... for record in fh: 38 | ... print(record.name) # do something 39 | >>> with reader("/path/to/sample.bam") as fh: 40 | ... for record in fh: 41 | ... print(record.name) # do something 42 | 43 | Opening a SAM/BAM file for reading, explicitly passing the file type. 44 | 45 | >>> from samwell.sam import SamFileType 46 | >>> with reader(path="/path/to/sample.ext1", file_type=SamFileType.SAM) as fh: 47 | ... for record in fh: 48 | ... 
print(record.name) # do something 49 | >>> with reader(path="/path/to/sample.ext2", file_type=SamFileType.BAM) as fh: 50 | ... for record in fh: 51 | ... print(record.name) # do something 52 | 53 | Opening a SAM/BAM file for reading, using an existing file-like object. 54 | 55 | >>> with open("/path/to/sample.bam", "rb") as file_object: 56 | ... with reader(path=file_object, file_type=SamFileType.BAM) as fh: 57 | ... for record in fh: 58 | ... print(record.name) # do something 59 | 60 | Opening a SAM/BAM file for writing works similarly to the :func:`~samwell.sam.reader` method, 61 | but the SAM file header object is required. 62 | 63 | >>> from samwell.sam import writer 64 | >>> header: Dict[str, Any] = { 65 | ... "HD": {"VN": "1.5", "SO": "coordinate"}, 66 | ... "RG": [{"ID": "1", "SM": "1_AAAAAA", "LB": "lib", "PL": "ILLUMINA", "PU": "xxx.1"}], 67 | ... "SQ": [ 68 | ... {"SN": "chr1", "LN": 249250621}, 69 | ... {"SN": "chr2", "LN": 243199373} 70 | ... ] 71 | ... } 72 | >>> with writer(path="/path/to/sample.bam", header=header) as fh: 73 | ... pass # do something 74 | 75 | Examples of Manipulating Cigars 76 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 77 | 78 | Creating a :class:`~samwell.sam.Cigar` from a :class:`pysam.AlignedSegment`. 79 | 80 | >>> from samwell.sam import Cigar 81 | >>> with reader("/path/to/sample.sam") as fh: 82 | ... record = next(fh) 83 | ... cigar = Cigar.from_cigartuples(record.cigartuples) 84 | ... print(str(cigar)) 85 | 50M2D5M10S 86 | 87 | Creating a :class:`~samwell.sam.Cigar` from a :class:`str`. 88 | 89 | >>> cigar = Cigar.from_cigarstring("50M2D5M10S") 90 | >>> print(str(cigar)) 91 | 50M2D5M10S 92 | 93 | If the cigar string is invalid, the exception message will show you the problem character(s) in 94 | square brackets. 95 | 96 | >>> cigar = Cigar.from_cigarstring("10M5U") 97 | ... CigarParsingException("Malformed cigar: 10M5[U]") 98 | 99 | The cigar contains a tuple of :class:`~samwell.sam.CigarElement`s.
Each element contains the 100 | cigar operator (:class:`~samwell.sam.CigarOp`) and associated operator length. A number of 101 | useful methods are part of both classes. 102 | 103 | The number of bases aligned on the query (i.e. the number of bases consumed by the cigar from 104 | the query): 105 | 106 | >>> cigar = Cigar.from_cigarstring("50M2D5M2I10S") 107 | >>> [e.length_on_query for e in cigar.elements] 108 | [50, 0, 5, 2, 10] 109 | >>> [e.length_on_target for e in cigar.elements] 110 | [50, 2, 5, 0, 0] 111 | >>> [e.operator.is_indel for e in cigar.elements] 112 | [False, True, False, True, False] 113 | 114 | Module Contents 115 | ~~~~~~~~~~~~~~~ 116 | 117 | The module contains the following public classes: 118 | 119 | - :class:`~samwell.sam.SamFileType` -- Enumeration of valid SAM/BAM/CRAM file types. 120 | - :class:`~samwell.sam.SamOrder` -- Enumeration of possible SAM/BAM/CRAM sort orders. 121 | - :class:`~samwell.sam.CigarOp` -- Enumeration of operators that can appear in a Cigar string. 122 | - :class:`~samwell.sam.CigarElement` -- Class representing an element in a Cigar string. 123 | - :class:`~samwell.sam.CigarParsingException` -- The exception raised specific to parsing a 124 | cigar 125 | - :class:`~samwell.sam.Cigar` -- Class representing a cigar string. 126 | 127 | The module contains the following methods: 128 | 129 | - :func:`~samwell.sam.reader` -- opens a SAM/BAM/CRAM file for reading. 130 | - :func:`~samwell.sam.writer` -- opens a SAM/BAM/CRAM file for writing 131 | - :func:`~samwell.sam.set_qc_fail` -- sets the QC fail flag in a 132 | :class:`pysam.AlignedSegment` record and sets additional SAM tags giving the tool name and 133 | reason for why the QC fail flag was set. 134 | - :func:`~samwell.sam.get_qc_fail` -- gets the tool name and reason for why the QC fail flag 135 | was set, or None if it is not set. 
# (The samwell/sam/__init__.py module docstring, which opens before this block, closes
# here with a line containing only three double quotes.)

import enum
import io
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Dict
from typing import IO
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
from typing import TYPE_CHECKING
import attr
import pysam
import sys
from pysam import AlignmentFile as SamFile
from pysam import AlignmentHeader as SamHeader
from pysam import AlignedSegment

if TYPE_CHECKING or sys.version_info < (3, 8, 0):
    from typing_extensions import Final
else:
    from typing import Final

#: The valid base classes for opening a SAM/BAM/CRAM file.
SamPath = Union[IO[Any], Path, str]

#: The reference index to use to indicate no reference in SAM/BAM.
NO_REF_INDEX: int = -1

#: The reference name to use to indicate no reference in SAM/BAM.
NO_REF_NAME: str = "*"

#: The reference position to use to indicate no position in SAM/BAM.
NO_REF_POS: int = -1


@enum.unique
class SamFileType(enum.Enum):
    """Enumeration of valid SAM/BAM/CRAM file types.

    Attributes:
        mode (str): The additional mode character to add when opening this file type.
        ext (str): The standard file extension for this file type.
    """

    def __init__(self, mode: str, ext: str) -> None:
        self.mode: Final[str] = mode
        self.ext: Final[str] = ext

    SAM = ("", ".sam")
    BAM = ("b", ".bam")
    CRAM = ("c", ".cram")

    @classmethod
    def from_path(cls, path: Union[Path, str]) -> 'SamFileType':
        """Infers the file type based on the file extension.

        The comparison is exact (case-sensitive) against the final suffix of the path.

        Args:
            path: the path to the SAM/BAM/CRAM to read or write.

        Raises:
            ValueError: if the extension does not match any known file type.
        """
        ext = Path(path).suffix
        for tpe in SamFileType:
            if tpe.ext == ext:
                return tpe
        raise ValueError(f"Could not infer file type from {path}")


#: The classes that should be treated as file-like classes
_IOClasses = (
    io.TextIOBase,
    io.BufferedIOBase,
    io.RawIOBase,
    io.IOBase
)


def _pysam_open(path: SamPath,
                open_for_reading: bool,
                file_type: Optional[SamFileType] = None,
                **kwargs: Any) -> SamFile:
    """Opens a SAM/BAM/CRAM for reading or writing.

    Args:
        path: a file handle or path to the SAM/BAM/CRAM to read or write.
        open_for_reading: True to open for reading, false otherwise.
        file_type: the file type to assume when opening the file.  If None, then the file type
            will be auto-detected for reading and must be a path-like object for writing.
        kwargs: any keyword arguments to be passed to
            :class:`~pysam.AlignmentFile`; may not include "mode".

    Raises:
        TypeError: if ``path`` is neither a str/Path nor a file-like object.
        ValueError: if writing to a file-like object without a ``file_type``, or if the file
            type cannot be inferred from the extension of a path.
    """

    if isinstance(path, (str, Path)):  # type: ignore
        file_type = file_type or SamFileType.from_path(path)
        path = str(path)
    elif not isinstance(path, _IOClasses):  # type: ignore
        open_type = "reading" if open_for_reading else "writing"
        raise TypeError(f"Cannot open '{type(path)}' for {open_type}.")

    if file_type is None and not open_for_reading:
        raise ValueError("file_type must be given when writing to a file-like object")

    # file_type must be set when writing, so if file_type is None, then we must be opening it
    # for reading.  Hence, only set mode in kwargs to pysam when file_type is set and when
    # writing since we can let pysam auto-recognize the file type when reading.  See discussion:
    # https://github.com/pysam-developers/pysam/issues/655
    if file_type is not None:
        # NB: the file-type character is only appended for writing; reading always uses "r"
        kwargs["mode"] = "r" if open_for_reading else ("w" + file_type.mode)
    else:
        assert open_for_reading, "Bug: file_type was None but open_for_reading was False"

    # Open it!
    return pysam.AlignmentFile(path, **kwargs)


def reader(path: SamPath,
           file_type: Optional[SamFileType] = None
           ) -> SamFile:
    """Opens a SAM/BAM/CRAM for reading.

    Args:
        path: a file handle or path to the SAM/BAM/CRAM to read or write.
        file_type: the file type to assume when opening the file.  If None, then the file
            type will be auto-detected.
    """
    return _pysam_open(path=path, open_for_reading=True, file_type=file_type)


def writer(path: SamPath,
           header: Union[str, Dict[str, Any], SamHeader],
           file_type: Optional[SamFileType] = None) -> SamFile:
    """Opens a SAM/BAM/CRAM for writing.

    Args:
        path: a file handle or path to the SAM/BAM/CRAM to read or write.
        header: Either a string to use for the header or a multi-level dictionary.  The
            multi-level dictionary should be given as follows.  The first level are the four
            types (‘HD’, ‘SQ’, ...). The second level are a list of lines, with each line being
            a list of tag-value pairs. The header is constructed first from all the defined
            fields, followed by user tags in alphabetical order.
        file_type: the file type to assume when opening the file.  If None, then the
            filetype will be auto-detected and must be a path-like object.
    """
    # Set the header for pysam's AlignmentFile: a string header is passed as "text", anything
    # else (a dictionary or a SamHeader) as "header".
    key = "text" if isinstance(header, str) else "header"
    kwargs = {key: header}

    return _pysam_open(path=path, open_for_reading=False, file_type=file_type, **kwargs)


class _CigarOpUtil:
    """Some useful constants to speed up methods on CigarOp"""

    # A dictionary from the cigar op code to the *name* of the corresponding CigarOp member
    # (note that code 7 maps to "EQ", the member name, not to the "=" character).
    #
    # This is to speed up the translation of cigar op code to CigarOp in CigarOp, so needs to
    # be declared beforehand.
    CODE_TO_CHARACTER: Dict[int, str] = {0: "M", 1: "I", 2: "D", 3: "N", 4: "S", 5: "H", 6: "P",
                                         7: "EQ", 8: "X"}


@enum.unique
class CigarOp(enum.Enum):
    """Enumeration of operators that can appear in a Cigar string.

    Attributes:
        code (int): The :py:mod:`~pysam` cigar operator code.
        character (str): The single character cigar operator.
        consumes_query (bool): True if this operator consumes query bases, False otherwise.
        consumes_reference (bool): True if this operator consumes reference (target) bases,
            False otherwise.
    """

    M = (0, 'M', True, True)  #: Match or Mismatch the reference
    I = (1, 'I', True, False)  #: Insertion versus the reference  # noqa: E741
    D = (2, 'D', False, True)  #: Deletion versus the reference
    N = (3, 'N', False, True)  #: Skipped region from the reference
    S = (4, 'S', True, False)  #: Soft clip
    H = (5, 'H', False, False)  #: Hard clip
    P = (6, 'P', False, False)  #: Padding
    EQ = (7, '=', True, True)  #: Matches the reference
    X = (8, 'X', True, True)  #: Mismatches the reference

    def __init__(self,
                 code: int,
                 character: str,
                 consumes_query: bool,
                 consumes_reference: bool) -> None:
        self.code = code
        self.character = character
        self.consumes_query = consumes_query
        self.consumes_reference = consumes_reference

    @staticmethod
    def from_character(character: str) -> 'CigarOp':
        """Returns the operator from the single character.

        Raises:
            KeyError: if the character is not a valid cigar operator.
        """
        # "=" is the only operator whose character differs from its member name
        if CigarOp.EQ.character == character:
            return CigarOp.EQ
        else:
            return CigarOp[character]

    @staticmethod
    def from_code(code: int) -> 'CigarOp':
        """Returns the operator from the given operator code.

        Note: this is mainly used to get the operator from :py:mod:`~pysam`.

        Raises:
            KeyError: if the code is not a valid cigar operator code.
        """
        return CigarOp[_CigarOpUtil.CODE_TO_CHARACTER[code]]

    @property
    def is_indel(self) -> bool:
        """Returns true if the operator is an indel, false otherwise. """
        return self == CigarOp.I or self == CigarOp.D


@attr.s(frozen=True, slots=True)
class CigarElement:
    """ Represents an element in a Cigar

    Attributes:
        - length (int): the length of the element
        - operator (CigarOp): the operator of the element
    """

    length: int = attr.ib()
    operator: CigarOp = attr.ib()

    @length.validator
    def _validate_length(self, attribute: Any, value: int) -> None:
        """Validates the length attribute is greater than zero."""
        if value <= 0:
            raise ValueError(f"Cigar element must have a length > 0, found {value}")

    @property
    def length_on_query(self) -> int:
        """Returns the length of the element on the query sequence."""
        return self.length if self.operator.consumes_query else 0

    @property
    def length_on_target(self) -> int:
        """Returns the length of the element on the target (often reference) sequence."""
        return self.length if self.operator.consumes_reference else 0

    def __str__(self) -> str:
        return f"{self.length}{self.operator.character}"


class CigarParsingException(Exception):
    """The exception raised specific to parsing a cigar."""
    pass


@attr.s(frozen=True, slots=True)
class Cigar:
    """Class representing a cigar string.

    Attributes:
        - elements (Tuple[CigarElement, ...]): zero or more cigar elements
    """

    elements: Tuple[CigarElement, ...] = attr.ib(default=())

    @classmethod
    def from_cigartuples(cls, cigartuples: Optional[List[Tuple[int, int]]]) -> 'Cigar':
        """Returns a Cigar from a list of tuples returned by pysam.

        Each tuple denotes the operation and length.  See
        :class:`~samwell.sam.CigarOp` for more information on the
        various operators.  If None or an empty list is given, returns an empty Cigar.

        Raises:
            CigarParsingException: if any tuple cannot be converted to a cigar element (e.g.
                an unknown operator code or a non-positive length).
        """
        if cigartuples is None or cigartuples == []:
            return Cigar()
        try:
            elements = []
            for code, length in cigartuples:
                operator = CigarOp.from_code(code)
                elements.append(CigarElement(length, operator))
            return Cigar(tuple(elements))
        except Exception as ex:
            raise CigarParsingException(f"Malformed cigar tuples: {cigartuples}") from ex

    @classmethod
    def _pretty_cigarstring_exception(cls,
                                      cigarstring: str,
                                      index: int) -> CigarParsingException:
        """Builds (but does not raise) an exception highlighting the malformed character.

        The character at ``index`` is wrapped in square brackets; if ``index`` is at the end
        of the string then an empty pair of brackets is appended.
        """
        prefix = cigarstring[:index]
        character = cigarstring[index] if index < len(cigarstring) else ""
        suffix = cigarstring[index + 1:]
        pretty_cigarstring = f"{prefix}[{character}]{suffix}"
        message = f"Malformed cigar: {pretty_cigarstring}"
        return CigarParsingException(message)

    @classmethod
    def from_cigarstring(cls, cigarstring: str) -> 'Cigar':
        """Constructs a Cigar from a string returned by pysam.

        If "*" is given, returns an empty Cigar.

        Raises:
            CigarParsingException: if the string is empty, an operator is missing its length,
                a length is missing its operator, or an operator is not valid.
            ValueError: if an element has a length of zero (raised by
                :class:`~samwell.sam.CigarElement`).
        """
        if cigarstring == "*":
            return Cigar()

        cigarstring_length = len(cigarstring)
        if cigarstring_length == 0:
            raise CigarParsingException("Cigar string was empty")

        elements = []
        i = 0
        while i < cigarstring_length:
            # every element must start with at least one digit
            if not cigarstring[i].isdigit():
                raise cls._pretty_cigarstring_exception(cigarstring, i)  # type: ignore
            length = int(cigarstring[i])
            i += 1
            while i < cigarstring_length and cigarstring[i].isdigit():
                length = (length * 10) + int(cigarstring[i])
                i += 1
            # missing cigar operator: the string ended right after the length.  Because of
            # this check, indexing cigarstring[i] below can never raise IndexError.
            if i == cigarstring_length:
                raise cls._pretty_cigarstring_exception(cigarstring, i)  # type: ignore
            try:
                operator = CigarOp.from_character(cigarstring[i])
                elements.append(CigarElement(length, operator))
            except KeyError as ex:
                # cigar operator was not valid
                raise cls._pretty_cigarstring_exception(cigarstring, i) from ex  # type: ignore
            i += 1
        return Cigar(tuple(elements))

    def __str__(self) -> str:
        if self.elements:
            return "".join([str(e) for e in self.elements])
        else:
            return "*"

    def reversed(self) -> "Cigar":
        """Returns a copy of the Cigar with the elements in reverse order."""
        return Cigar(tuple(reversed(self.elements)))

    def length_on_query(self) -> int:
        """Returns the length of the alignment on the query sequence."""
        return sum([elem.length_on_query for elem in self.elements])

    def length_on_target(self) -> int:
        """Returns the length of the alignment on the target sequence."""
        return sum([elem.length_on_target for elem in self.elements])

    def coalesce(self) -> "Cigar":
        """Returns a copy of the cigar with adjacent operators of the same type coalesced into
        single operators."""
        new_elements: List[CigarElement] = []
        element_index: int = 0
        while element_index < len(self.elements):
            cur_element: CigarElement = self.elements[element_index]
            op_length: int = cur_element.length
            element_index += 1
            # absorb every following element that has the same operator
            while (element_index < len(self.elements) and
                   cur_element.operator == self.elements[element_index].operator):
                op_length += self.elements[element_index].length
                element_index += 1
            new_elements.append(CigarElement(operator=cur_element.operator, length=op_length))
        return Cigar(tuple(new_elements))


# The SAM tag to store which tool caused the QC fail flag to be set
QcFailToolTag = 'qt'


# The SAM tag to store the reason why the tool caused the QC flag to be set
QcFailReasonTag = 'qr'


def set_qc_fail(rec: pysam.AlignedSegment, tool: Callable[..., Any], reason: str) -> None:
    """Sets the QC fail flag, and adds tags containing the tool name and reason for failing.

    Args:
        rec: the record to fail
        tool: the tool (as a callable) that failed this record
        reason: the reason for failing; may not contain tabs
    """
    assert '\t' not in reason, f"Reason may not contain tabs: {reason}"
    rec.is_qcfail = True
    rec.set_tag(QcFailToolTag, tool.__name__)
    rec.set_tag(QcFailReasonTag, reason)


def get_qc_fail(rec: pysam.AlignedSegment) -> Optional[Tuple[str, str]]:
    """Gets the tool and reason for why the QC fail flag is set, otherwise None if not set.

    If the QC fail flag is set, but the tool and filter reason SAM tags are not both set, None
    will be returned.  Use the ``is_qcfail`` attribute of the record to check if the record is
    simply QC failed.

    Args:
        rec: the record to examine
    """
    if not rec.is_qcfail:
        return None
    # Require both tags: reading a missing tag would raise rather than return None
    if not rec.has_tag(QcFailToolTag) or not rec.has_tag(QcFailReasonTag):
        return None
    tool_value = rec.get_tag(QcFailToolTag)
    reason_value = rec.get_tag(QcFailReasonTag)
    return (tool_value, reason_value)


def get_qc_fail_by_tool(rec: pysam.AlignedSegment,
                        tool: Optional[Callable[..., Any]] = None
                        ) -> Optional[Tuple[str, str]]:
    """Gets the tool and reason for why the QC fail flag was set, if the flag was set by the
    passed tool.

    None will be returned in the following cases:
        - The QC fail flag is not set
        - The QC fail flag is set, but the tool and filter reason SAM tags are not set
        - The tool and filter reason SAM tags were set by a different tool

    Use the ``is_qcfail`` attribute of the record to check if the record is simply QC failed.

    Args:
        rec: the record to examine
        tool: the tool that must have set the QC fail flag.  If None (the default), no
            restriction on the tool is applied and the result is the same as
            :func:`~samwell.sam.get_qc_fail`.
    """
    maybe_tool_and_reason = get_qc_fail(rec)
    if maybe_tool_and_reason is None or tool is None:
        # Previously a None tool raised AttributeError on `tool.__name__`
        return maybe_tool_and_reason
    else:
        tool_value = maybe_tool_and_reason[0]
        return maybe_tool_and_reason if tool.__name__ == tool_value else None


def isize(r1: AlignedSegment, r2: AlignedSegment) -> int:
    """Computes the insert size for a pair of records.

    Returns zero if either record is unmapped or the records map to different references.
    Otherwise the result is the signed distance from the 5' position of ``r1`` to the 5'
    position of ``r2``, where the 5' position is ``reference_end`` for reverse-strand reads
    and ``reference_start`` otherwise.
    """
    if r1.is_unmapped or r2.is_unmapped or r1.reference_id != r2.reference_id:
        return 0
    else:
        r1_pos = r1.reference_end if r1.is_reverse else r1.reference_start
        r2_pos = r2.reference_end if r2.is_reverse else r2.reference_start
        return r2_pos - r1_pos


def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = True) -> None:
    """Resets mate pair information between reads in a pair. Requires that both r1
    and r2 are mapped.  Can be handed reads that already have pairing flags setup or
    independent R1 and R2 records that are currently flagged as SE reads.

    Args:
        r1: read 1
        r2: read 2 with the same queryname as r1
        proper_pair: the value to set for the proper-pair flag on both reads
    """
    assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
    assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
    assert r1.query_name == r2.query_name, (
        f"Attempting to pair reads with different qnames {r1.query_name} vs {r2.query_name}."
    )

    for r in [r1, r2]:
        r.is_paired = True
        r.is_proper_pair = proper_pair

    r1.is_read1 = True
    r1.is_read2 = False
    r2.is_read2 = True
    r2.is_read1 = False

    # Copy each read's alignment information onto its mate
    for src, dest in [(r1, r2), (r2, r1)]:
        dest.next_reference_id = src.reference_id
        dest.next_reference_start = src.reference_start
        dest.mate_is_reverse = src.is_reverse
        dest.mate_is_unmapped = False
        dest.set_tag("MC", src.cigarstring)

    insert_size = isize(r1, r2)
    r1.template_length = insert_size
    r2.template_length = - insert_size


@enum.unique
class SamOrder(enum.Enum):
    """
    Enumerations of possible sort orders for a SAM file.
    """

    Unsorted = "unsorted"  #: the SAM / BAM / CRAM is unsorted
    Coordinate = "coordinate"  #: coordinate sorted
    QueryName = "queryname"  #: queryname sorted
    Unknown = "unknown"  #: Unknown SAM / BAM / CRAM sort order


# --------------------------------------------------------------------------------
# /samwell/sam/bwa_mem.py:
# --------------------------------------------------------------------------------
# The module docstring of the next file opens here and continues past this block; its
# first lines are carried as comments:
#
# """
# Utility methods for running BWA
# -------------------------------
#
# This module contains methods for running BWA.  Currently only the "mem" algorithm is supported:
# - :func:`~samwell.bwa_mem.align` -- Aligns the given reads with BWA mem.
7 | 8 | 9 | The options for running BWA can be customized via the three Options classes: 10 | - :class:`~samwell.bwa_mem.AlgorithmOptions` -- Bwa mem algorithm options. 11 | - :class:`~samwell.bwa_mem.ScoringOptions` -- Bwa mem scoring options 12 | - :class:`~samwell.bwa_mem.InputOutputOptions` -- Bwa mem input and output options 13 | 14 | An input read for alignment must be minimally transformed into a FASTQ-like record: 15 | 16 | - :class:`~samwell.bwa_mem.FastqRecord` -- Fastq record used as input to alignment. 17 | 18 | 19 | Implementation 20 | ~~~~~~~~~~~~~~ 21 | 22 | Alignment of reads is performed asynchronously. 23 | 24 | This is achieved by creating three sub-processes in :func:`~samwell.bwa_mem.align`: 25 | 1. A process to consume the input iterable of reads and write them (a) to the stdin of 26 | BWA mem, and (b) to a queue of reads that are awaiting alignment results from BWA mem. 27 | 2. A process to run BWA mem, where FASTQ records are written to the process' stdin, 28 | alignment results are returned to stdout, and any error/logging information from BWA mem is 29 | returned to stderr. 30 | 3. A process to route the stderr of BWA mem to the given stderr handle (stderr_out). 31 | 32 | Then :func:`~samwell.bwa_mem.align` method consumes the stdout of the BWA mem process and collates 33 | that with the queue of reads that have been written/given to BWA mem (from process 1). For each 34 | input read, one or more alignments is expected to be returned by BWA mem. The order in which 35 | alignments of reads are returned by BWA mem is the same order as the order of reads given to BWA 36 | mem. The :func:`~samwell.bwa_mem.align` method then returns an iterable over the alignment 37 | results. 38 | 39 | Exceptions may occur in the thread to input FASTQ records to BWA mem, which are propagated 40 | to the caller. Furthermore, an exception is returned if the # of reads given to BWA mem is 41 | not the same as the # of reads returned by BWA mem. 
42 | 43 | Some specific handling occurs around reading the BWA mem output with :py:mod:`~pysam`, since the 44 | latter blocks waiting for at least some reads from BWA mem, which may not happen if there was an 45 | issue in the various upstream processes (input to BWA mem or BWA mem itself). This would 46 | otherwise cause a deadlock. 47 | 48 | 49 | Examples 50 | ~~~~~~~~ 51 | 52 | Typically, we have :class:`~pysam.AlignedSegment` records obtained from reading from a SAM or BAM 53 | file. These must first be converted into :class:`~samwell.bwa_mem.FastqRecord` objects. 54 | 55 | .. code-block:: python 56 | 57 | >>> from samwell.sam.bwa_mem import FastqRecord 58 | >>> from samwell.sam import reader 59 | >>> reads = reader("/path/to/sample.sam") 60 | >>> fastq_reads = map(lambda read: FastqRecord.build(read), reads) 61 | 62 | Next, those :class:`~samwell.bwa_mem.FastqRecord`s can be aligned with the 63 | :func:`~samwell.bwa_mem.align` method. 64 | 65 | .. code-block:: python 66 | 67 | >>> from samwell.sam.bwa_mem import align 68 | >>> results = map(lambda read: align(read), fastq_reads) 69 | 70 | This returns an iterable over the alignment results. An alignment result is a tuple 71 | consisting of the original :class:`~samwell.bwa_mem.FastqRecord` and an iterator over the 72 | alignments (see :class:`~pysam.AlignedSegment`). 73 | 74 | ..
@attr.s(frozen=True, auto_attribs=True)
class FastqRecord:
    """Fastq record used as input to alignment.

    Attributes:
        name: the name of the read
        bases: the read bases
        quals: the base qualities, encoded as a FASTQ quality string
        source: optionally the object (typically a :class:`~pysam.AlignedSegment`) from
            which this record was built
        needs_alignment: True if the read needs alignment, False otherwise
        read_number: optionally the read number; should be set to 1 or 2 for paired end
            reads.
    """

    name: str = attr.ib()
    bases: str = attr.ib()
    quals: str = attr.ib()
    source: Optional[FastqRecordSourceType] = None  # type: ignore
    needs_alignment: bool = True
    read_number: Optional[int] = None

    # Offset added to a numeric base quality to obtain its FASTQ character code
    _BASE_QUALITY_OFFSET: ClassVar[int] = 33

    @classmethod
    def build(cls,
              read: pysam.AlignedSegment,
              needs_alignment: bool = True,
              aligned_bases_only: bool = False,
              clip_three_prime: int = 0
              ) -> 'FastqRecord':
        """Builds a :class:`~samwell.bwa_mem.FastqRecord` from a :class:`~pysam.AlignedSegment`

        Args:
            read: the read to convert
            needs_alignment: True if the read should be aligned, False otherwise
            aligned_bases_only: only align the aligned bases (excludes soft-clipped bases)
            clip_three_prime: the number of bases to clip on the three-prime end of the read
                relative to the original direction of sequencing.  This will be applied after
                extracting the bases based on ``aligned_bases_only``.

        Returns:
            A new record whose ``source`` is the given read.  When ``needs_alignment`` is
            False the bases and qualities are left empty.
        """
        # Get the bases and qualities
        if needs_alignment:
            if aligned_bases_only:
                bases = read.query_alignment_sequence
                quals = read.query_alignment_qualities
            else:
                bases = read.query_sequence
                quals = read.query_qualities

            # reverse complement if necessary, so bases/quals are in sequencing order
            if read.is_reverse:
                bases = reverse_complement(bases)
                quals = quals[::-1]

            # clip after re-orienting, so the clipped end is the sequencing three-prime end
            if clip_three_prime > 0:
                index_from_end = -1 * clip_three_prime
                bases = bases[:index_from_end]
                quals = quals[:index_from_end]

            # convert the numeric qualities to a FASTQ quality string
            quals = "".join([chr(q + FastqRecord._BASE_QUALITY_OFFSET) for q in quals])
        else:
            # If we're not going to align it, no need to muck with bases and quals
            bases = ""
            quals = ""

        # Get the read number
        if read.is_paired:
            read_number = 1 if read.is_read1 else 2
        else:
            read_number = None

        return FastqRecord(name=read.query_name,
                           bases=bases,
                           quals=quals,
                           source=read,
                           needs_alignment=needs_alignment,
                           read_number=read_number)

    def __hash__(self) -> int:
        """Returns a hash value for this record given the inputs.

        If source is defined and is a :class:`~pysam.AlignedSegment`, then the source's hash
        will be returned.  Otherwise, the hash of the FASTQ text of this record (name with
        the read number appended, bases, and qualities) will be returned.
        """
        # NOTE(review): with ``frozen=True`` and default ``eq``/``hash`` settings, attrs may
        # generate its own ``__hash__`` and replace this method -- confirm which one is used.
        # ``source`` is an instance, so it must be checked with isinstance; issubclass raises
        # TypeError when its first argument is not a class.
        if self.source is not None and isinstance(self.source, pysam.AlignedSegment):
            return hash(self.source)
        else:
            return hash(self.str_with_read_number())

    def __str__(self) -> str:
        """Returns the record in four-line FASTQ format."""
        return f"@{self.name}\n{self.bases}\n+\n{self.quals}\n"

    def str_with_read_number(self) -> str:
        """Returns the record in FASTQ format, with the read number appended (colon delimited).

        A read number of ``None`` is written as ``0``.
        """
        name = self.name + ":" + (str(self.read_number) if self.read_number is not None else "0")
        return f"@{name}\n{self.bases}\n+\n{self.quals}\n"
@attr.s(frozen=True)
class AlgorithmOptions(_CommandLineOptionGroup):
    """Options that tune the BWA mem alignment algorithm.

    Each attribute carries the corresponding ``bwa mem`` command line flag in its ``flag``
    metadata entry.  All options default to ``None``.

    Attributes:
        threads: number of threads (``-t``)
        min_seed_len: minimum seed length (``-k``)
        band_width: band width for banded alignment (``-w``)
        off_diagonal_dropoff: off-diagonal X-dropoff (``-d``)
        internal_seeds_length_factor: look for internal seeds inside a seed longer than
            min_seed_len * internal_seeds_length_factor (``-r``)
        max_third_seed_occurrence: seed occurrence for the 3rd round seeding (``-y``)
        max_seed_occurrence: skip seeds with more than INT occurrences (``-c``)
        drop_ratio: drop chains shorter than this fraction of the longest overlapping
            chain (``-D``)
        min_chain_weight: discard a chain if seeded bases shorter than this value (``-W``)
        max_mate_rescue_rounds: perform at most INT rounds of mate rescues for each
            read (``-m``)
        skip_mate_rescue: skip mate rescue (``-S``)
        skip_pairing: skip pairing; mate rescue performed unless skip_mate_rescue also in
            use (``-P``)
    """

    threads: Optional[int] = attr.ib(
        metadata={'flag': '-t'}, default=None)
    min_seed_len: Optional[int] = attr.ib(
        metadata={'flag': '-k'}, default=None)
    band_width: Optional[int] = attr.ib(
        metadata={'flag': '-w'}, default=None)
    off_diagonal_dropoff: Optional[int] = attr.ib(
        metadata={'flag': '-d'}, default=None)
    internal_seeds_length_factor: Optional[float] = attr.ib(
        metadata={'flag': '-r'}, default=None)
    max_third_seed_occurrence: Optional[int] = attr.ib(
        metadata={'flag': '-y'}, default=None)
    max_seed_occurrence: Optional[int] = attr.ib(
        metadata={'flag': '-c'}, default=None)
    drop_ratio: Optional[float] = attr.ib(
        metadata={'flag': '-D'}, default=None)
    min_chain_weight: Optional[int] = attr.ib(
        metadata={'flag': '-W'}, default=None)
    max_mate_rescue_rounds: Optional[int] = attr.ib(
        metadata={'flag': '-m'}, default=None)
    skip_mate_rescue: Optional[bool] = attr.ib(
        metadata={'flag': '-S'}, default=None)
    skip_pairing: Optional[bool] = attr.ib(
        metadata={'flag': '-P'}, default=None)
@attr.s(frozen=True)
class InputOutputOptions(_CommandLineOptionGroup):
    """Options that control how BWA mem reads its input and writes its output.

    Each attribute carries the corresponding ``bwa mem`` command line flag in its ``flag``
    metadata entry.  Every option defaults to ``None`` except ``bases_per_batch``.

    Attributes:
        interleaved_pairs: read pairs are consecutive (r1 then r2), otherwise fragment
            reads (``-p``)
        read_group: read group header line such as '@RG\\tID:foo\\tSM:bar' (``-R``)
        header_insert: insert STR to header if it starts with @; or insert lines in
            FILE (``-H``)
        alts_as_primary: treat ALT contigs as part of the primary assembly (i.e. ignore
            the .alt file) (``-j``)
        verbosity: verbose level: 1=error, 2=warning, 3=message, 4+=debugging (``-v``)
        min_alignment_score: minimum score to output (``-T``)
        max_hits_within_max_score: if there are fewer than INT hits with a score above 80%
            of the max score, output all of them in XA (``-h``)
        all_alignments: output all alignments for SE or unpaired PE (``-a``)
        append_fastq_comment: append FASTA/FASTQ comment to SAM output (``-C``)
        add_fasta_header_to_xr: output the reference FASTA header in the XR tag (``-V``)
        softclip_supplementary: use soft clipping for supplementary alignments (``-Y``)
        split_hits_are_secondary: mark shorter split hits as secondary (``-M``)
        insert_size_params: specify the mean, standard deviation (10% of the mean if
            absent), max (4 sigma from the mean if absent) and min of the insert size
            distribution.  FR orientation only. (``-I``)
        bases_per_batch: how many bases of sequence data bwa should read from the input
            before triggering a batch of alignments; defaults to 115000 (``-K``)
    """

    interleaved_pairs: Optional[bool] = attr.ib(
        metadata={'flag': '-p'}, default=None)
    read_group: Optional[str] = attr.ib(
        metadata={'flag': '-R'}, default=None)
    header_insert: Optional[Union[str, Path]] = attr.ib(
        metadata={'flag': '-H'}, default=None)
    alts_as_primary: Optional[bool] = attr.ib(
        metadata={'flag': '-j'}, default=None)
    verbosity: Optional[int] = attr.ib(
        metadata={'flag': '-v'}, default=None)
    min_alignment_score: Optional[int] = attr.ib(
        metadata={'flag': '-T'}, default=None)
    max_hits_within_max_score: Optional[Union[int, Tuple[int, int]]] = attr.ib(
        metadata={'flag': '-h'}, default=None)
    all_alignments: Optional[bool] = attr.ib(
        metadata={'flag': '-a'}, default=None)
    append_fastq_comment: Optional[bool] = attr.ib(
        metadata={'flag': '-C'}, default=None)
    add_fasta_header_to_xr: Optional[bool] = attr.ib(
        metadata={'flag': '-V'}, default=None)
    softclip_supplementary: Optional[bool] = attr.ib(
        metadata={'flag': '-Y'}, default=None)
    split_hits_are_secondary: Optional[bool] = attr.ib(
        metadata={'flag': '-M'}, default=None)
    insert_size_params: Optional[InsertSizeParamsType] = attr.ib(
        metadata={'flag': '-I'}, default=None)
    bases_per_batch: Optional[int] = attr.ib(
        metadata={'flag': '-K'}, default=115000)
417 | 418 | Args: 419 | source: the source iterator from which to consume 420 | sink_add_func: the method to use to add an element to the sink 421 | sink_close_func: the method to call when all elements have been added to the sink 422 | """ 423 | super().__init__(daemon=True) 424 | self.num_added: int = 0 425 | self._source = source 426 | self._sink_add_method = sink_add_func 427 | self._sink_close_method = sink_close_func 428 | self.exception: Optional[Exception] = None 429 | self.done = False 430 | 431 | def run(self) -> None: 432 | """Runs the source to sink transfer""" 433 | try: 434 | for item in self._source: 435 | self._sink_add_method(item) 436 | self.num_added += 1 437 | except Exception as e: 438 | self.exception = e 439 | finally: 440 | if self._sink_close_method is not None: 441 | self._sink_close_method() 442 | self.done = True 443 | 444 | 445 | def _same_read(read: FastqRecord, alignment: pysam.AlignedSegment) -> bool: 446 | """True if an alignment for the given read, False otherwise. 447 | 448 | For the alignment to be considered as an alignment for this read, the read name and read number 449 | must match. The read number is appended to the alignment's query name. 
450 | """ 451 | if alignment.is_paired: 452 | assert read.read_number is not None, f"Paired alignment but the read has no read #: {read}" 453 | if read.name != alignment.query_name: 454 | return False 455 | else: 456 | alignment_read_number = 1 if alignment.is_read1 else 2 457 | return read.read_number == alignment_read_number 458 | else: 459 | alignment_name, alignment_read_number = alignment.query_name.rsplit(':', 1) 460 | if read.name != alignment_name: 461 | return False 462 | elif read.read_number is None: 463 | return int(alignment_read_number) == 0 464 | else: 465 | return read.read_number == int(alignment_read_number) 466 | 467 | 468 | def _collate_alignments(reads_queue: queue.Queue, 469 | alignments_reader: pysam.AlignmentFile, 470 | suppress_secondaries: bool = False) -> Iterable[AlignmentResult]: 471 | """Collates the alignments for each read in the given queue. 472 | 473 | Alignments for reads in the alignments reader are in the same order as the reads in the reads 474 | queue. This allows traversal of both once and in a step-wise fashion. In fact, there exist 475 | alignments for reads in the read queue that need alignment (see the corresponding property). 476 | 477 | This method may block waiting for (1) reads to be written for BWA to consume and thus added 478 | to the queue that will be consumed by this method, or (2) the alignments returned by BWA for a 479 | given read. The former (1) will not block indefinitely since at some point the BWA mem 480 | input process will write the sentinel value and reads_queue.get will return None. The 481 | latter will not block indefinitely since the stdout will be closed when the BWA mem process is 482 | terminated. 483 | 484 | Args: 485 | reads_queue: the queue of reads 486 | alignments_reader: the reader of sam records 487 | suppress_secondaries: true to discard all secondary alignments, false otherwise 488 | 489 | Returns: 490 | An iterable over the alignment results. 
An alignment result is a tuple consisting of the 491 | original :class:`~samwell.bwa_mem.FastqRecord` and an iterator over the alignments (see 492 | :class:`~pysam.AlignedSegment`). 493 | """ 494 | alignments_iter: PeekableIterator = PeekableIterator(alignments_reader) 495 | reads_iterator = cast(Iterator[FastqRecord], iter(reads_queue.get, None)) 496 | for read in reads_iterator: 497 | results: List[pysam.AlignedSegment] = [] 498 | 499 | if read.needs_alignment: 500 | result = alignments_iter.peek() if alignments_iter.can_peek() else None 501 | while result is not None and _same_read(read, result): 502 | next(alignments_iter) # consume the current record 503 | if not suppress_secondaries or not result.is_secondary: 504 | # Update the query name since we may have originally appended the read number 505 | result.query_name = read.name 506 | results.append(result) 507 | result = alignments_iter.peek() if alignments_iter.can_peek() else None 508 | 509 | yield (read, results) 510 | assert not alignments_iter.can_peek(), 'Alignments exist but no more reads in the queue' 511 | 512 | 513 | def _build_command_line(idxbase: Path, 514 | executable_path: Path = Path('bwa'), 515 | algo_opts: Optional[AlgorithmOptions] = None, 516 | scoring_opts: Optional[ScoringOptions] = None, 517 | io_opts: Optional[InputOutputOptions] = None) -> List[str]: 518 | """Builds the command line for bwa mem. 
519 | 520 | Args: 521 | idxbase: the path prefix for all the BWA-specific index files 522 | executable_path: the path to the BWA executable 523 | algo_opts: the algorithm options 524 | scoring_opts: the scoring options 525 | io_opts: the input and output options 526 | """ 527 | # Start with the path to BWA, then the mem command 528 | cmd: List[Any] = [executable_path, 'mem'] 529 | # Add any options 530 | for opts in [algo_opts, scoring_opts, io_opts]: 531 | if opts is not None: 532 | cmd.extend(opts.args()) 533 | # Now the reference genome index basename 534 | cmd.append(idxbase) 535 | # Now set the input to be from standard input 536 | cmd.append('/dev/stdin') 537 | # Convert all args to strings 538 | args = [str(arg) for arg in cmd] 539 | return args 540 | 541 | 542 | def _build_bwa_input_process(reads: Iterable[FastqRecord], 543 | to_bwa_handle: Any, 544 | to_output_queue: queue.Queue, 545 | interleaved_pairs: Optional[bool] = None 546 | ) -> _SourceToSinkThread: 547 | """Builds and starts a process to write the given FASTQ records for BWA mem and the given queue 548 | 549 | Args: 550 | reads: the reads to input to the BWA mem subprocess' stdin 551 | to_bwa_handle: the IO handle to which to write the FASTQ records for BWA mem 552 | to_output_queue: the queue to also write to after writing a FASTQ record for BWA mem; 553 | this queue is mainly used for collating FASTQ reads and SAM alignments. 
def align(reads: Iterable[FastqRecord],
          idxbase: Path,
          executable_path: Path = Path('bwa'),
          algo_opts: Optional[AlgorithmOptions] = None,
          scoring_opts: Optional[ScoringOptions] = None,
          io_opts: Optional[InputOutputOptions] = None,
          suppress_secondaries: bool = False,
          stderr_out: Any = sys.stderr
          ) -> Iterable[AlignmentResult]:
    """Aligns the given reads with BWA mem.

    See :py:mod:`~samwell.bwa_mem` for a detailed explanation for the implementation approach.

    Args:
        reads: the reads to align
        idxbase: the path prefix for all the BWA-specific index files
        executable_path: the path to the BWA executable
        algo_opts: the algorithm options
        scoring_opts: the scoring options
        io_opts: the input and output options
        suppress_secondaries: true to discard all secondary alignments, false otherwise
        stderr_out: the handle to which the stderr of BWA mem is written

    Returns:
        An iterable over the alignment results.  An alignment result is a tuple consisting of the
        original :class:`~samwell.bwa_mem.FastqRecord` and the list of its alignments (see
        :class:`~pysam.AlignedSegment`)

    Raises:
        Exception: any exception captured by the thread that writes reads to BWA mem
        RuntimeError: if the thread that writes reads to BWA mem is still alive after joining
        ValueError: if the number of reads written to BWA mem differs from the number of
            alignment results yielded
    """

    # Build the command line used to run BWA MEM
    command_line = _build_command_line(idxbase=idxbase,
                                       executable_path=executable_path,
                                       algo_opts=algo_opts,
                                       scoring_opts=scoring_opts,
                                       io_opts=io_opts)

    # Create a sub-process in which to run BWA mem.  This process will read FASTQ records from
    # stdin, write SAM records to stdout, and write any error/logging information to stderr.
    bwa_mem_process = subprocess.Popen(args=command_line,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)

    # Create a thread in which we read the stderr of the BWA mem subprocess and write it to
    # the given stderr_out handle.
    bwa_mem_stderr_process = _SourceToSinkThread(source=iter(bwa_mem_process.stderr),
                                                 sink_add_func=stderr_out.write,
                                                 sink_close_func=None)
    bwa_mem_stderr_process.start()

    # Create a queue of FASTQ records that the thread who will write to BWA mem's stdin
    # will also write.  This is so we can collate/join the input FASTQ records with the output SAM
    # (or alignment) records.  A sentinel value (None) will be written to indicate no more reads
    # will be placed in the queue.
    reads_queue: queue.Queue = queue.Queue()

    # Create a thread to consume the input FASTQ records and write them to BWA mem's stdin.  We
    # write in a separate thread to avoid any deadlock with waiting for output from BWA mem's
    # stdout.  This can happen in a synchronous implementation where BWA mem is buffering reads and
    # we are waiting for some results from BWA mem's stdout, but really BWA mem is waiting for
    # either more reads from stdin or for stdin to be closed.
    interleaved_pairs = io_opts.interleaved_pairs if io_opts is not None else None
    bwa_input_process = _build_bwa_input_process(reads=reads,
                                                 to_bwa_handle=bwa_mem_process.stdin,
                                                 to_output_queue=reads_queue,
                                                 interleaved_pairs=interleaved_pairs)

    # Go through the output
    num_aligned = 0
    try:
        # Wait for some reads to be written.  pysam will block opening the input file until some
        # data is available, or the stream is closed.  If no data is added, don't even try opening
        # the stream.
        while bwa_input_process.num_added == 0 and not bwa_input_process.done:
            # the input process is still running but no reads have been added
            time.sleep(.1)
        if bwa_input_process.num_added == 0 and bwa_input_process.done:
            # the input process is done (error or success) and no reads have been added, so skip
            # opening pysam.  A plain return ends the generator; raising StopIteration inside a
            # generator is converted to a RuntimeError (PEP 479).  The finally block still runs.
            return
        # Read through the output of BWA mem, and collate that with the queue of reads given to
        # BWA mem
        with sam.reader(path=bwa_mem_process.stdout, file_type=SamFileType.SAM) as reader:
            alignment_results = _collate_alignments(reads_queue=reads_queue,
                                                    alignments_reader=reader,
                                                    suppress_secondaries=suppress_secondaries)
            # A simple loop with its only purpose to count the number of alignment results
            for result in alignment_results:
                num_aligned += 1
                yield result
    finally:
        # Close the stdin of the BWA mem process.  This should signal BWA mem to shut down, and
        # for the input thread to stop.
        bwa_mem_process.stdin.close()

        # Join the input thread as now stdin of the BWA mem process is closed.
        bwa_input_process.join(timeout=1.0)

        # Check if the inputting reads to BWA had an exception
        if bwa_input_process.exception is not None:
            raise bwa_input_process.exception
        elif bwa_input_process.is_alive():
            raise RuntimeError("BWA process encountered no errors but did not terminate.")

        # Check that the number of reads given to BWA mem was the same # returned by BWA mem
        num_left = bwa_input_process.num_added - num_aligned
        if num_left != 0:
            raise ValueError(f"Still had {num_left:,d} remaining reads from BWA")

        # Shut down the BWA mem process.  If it fails to shutdown, log a warning and continue on
        try:
            bwa_mem_process.wait(timeout=5.0)
        except subprocess.TimeoutExpired as ex:
            logger = logging.getLogger(__name__)
            logger.warning("Could not shutdown BWA, ignoring error: %s", str(ex))

        # Shut down the stderr thread
        bwa_mem_stderr_process.join(timeout=1.0)
class ClippingInfo(NamedTuple):
    """Result of a clipping operation: how much was clipped on the query and on the reference.

    Attributes:
        query_bases_clipped (int): count of aligned query (read) bases that were clipped.
        ref_bases_clipped (int): count of aligned reference bases that were clipped.
    """
    query_bases_clipped: int
    ref_bases_clipped: int
106 | """ 107 | if rec.is_unmapped or bases_to_clip < 1: 108 | return ClippingInfo(0, 0) 109 | 110 | num_clippable_bases = rec.query_alignment_length 111 | 112 | if bases_to_clip >= num_clippable_bases: 113 | return _clip_whole_read(rec, tags_to_invalidate) 114 | 115 | cigar = Cigar.from_cigartuples(rec.cigartuples) 116 | quals = rec.query_qualities 117 | new_cigar, clipping_info = _clip(cigar, quals, bases_to_clip, clipped_base_quality) 118 | rec.query_qualities = quals 119 | 120 | rec.reference_start += clipping_info.ref_bases_clipped 121 | rec.cigarstring = str(new_cigar) 122 | _cleanup(rec, tags_to_invalidate) 123 | return clipping_info 124 | 125 | 126 | def softclip_end_of_alignment_by_query(rec: AlignedSegment, 127 | bases_to_clip: int, 128 | clipped_base_quality: Optional[int] = None, 129 | tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE 130 | ) -> ClippingInfo: 131 | """ 132 | Adds soft-clipping to the end of a read's alignment. 133 | 134 | Clipping is applied before any existing hard or soft clipping. E.g. a read with cigar 100M5S 135 | that is clipped with bases_to_clip=10 will yield a cigar of 90M15S. 136 | 137 | If the read is unmapped or bases_to_clip < 1 then nothing is done. 138 | 139 | If the read has fewer clippable bases than requested the read will be unmapped. 140 | 141 | Args: 142 | rec: the BAM record to clip 143 | bases_to_clip: the number of additional bases of clipping desired in the read/query 144 | clipped_base_quality: if not None, set bases in the clipped region to this quality 145 | tags_to_invalidate: the set of extended attributes to remove upon clipping 146 | 147 | Returns: 148 | ClippingInfo: a named tuple containing the number of query/read bases and the number 149 | of target/reference bases clipped. 
150 | """ 151 | if rec.is_unmapped or bases_to_clip < 1: 152 | return ClippingInfo(0, 0) 153 | 154 | num_clippable_bases = rec.query_alignment_length 155 | 156 | if bases_to_clip >= num_clippable_bases: 157 | return _clip_whole_read(rec, tags_to_invalidate) 158 | 159 | # Reverse the cigar and qualities so we can clip from the start 160 | cigar = Cigar.from_cigartuples(rec.cigartuples).reversed() 161 | quals = rec.query_qualities 162 | quals.reverse() 163 | new_cigar, clipping_info = _clip(cigar, quals, bases_to_clip, clipped_base_quality) 164 | 165 | # Then reverse everything back again 166 | quals.reverse() 167 | rec.query_qualities = quals 168 | rec.cigarstring = str(new_cigar.reversed()) 169 | 170 | _cleanup(rec, tags_to_invalidate) 171 | return clipping_info 172 | 173 | 174 | def softclip_start_of_alignment_by_ref(rec: AlignedSegment, 175 | bases_to_clip: int, 176 | clipped_base_quality: Optional[int] = None, 177 | tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE 178 | ) -> ClippingInfo: 179 | """Soft-clips the start of an alignment by bases_to_clip bases on the reference. 180 | 181 | Clipping is applied after any existing hard or soft clipping. E.g. a read with cigar 5S100M 182 | that is clipped with bases_to_clip=10 will yield a cigar of 15S90M. 183 | 184 | If the read is unmapped or bases_to_clip < 1 then nothing is done. 185 | 186 | If the read has fewer clippable bases than requested the read will be unmapped. 187 | 188 | Args: 189 | rec: the BAM record to clip 190 | bases_to_clip: the number of additional bases of clipping desired on the reference 191 | clipped_base_quality: if not None, set bases in the clipped region to this quality 192 | tags_to_invalidate: the set of extended attributes to remove upon clipping 193 | 194 | Returns: 195 | ClippingInfo: a named tuple containing the number of query/read bases and the number 196 | of target/reference bases clipped. 
197 | """ 198 | if rec.reference_length <= bases_to_clip: 199 | return _clip_whole_read(rec, tags_to_invalidate) 200 | 201 | new_start = rec.reference_start + bases_to_clip 202 | new_query_start = _read_pos_at_ref_pos(rec, new_start, previous=False) 203 | query_bases_to_clip = new_query_start - rec.query_alignment_start 204 | return softclip_start_of_alignment_by_query(rec, 205 | query_bases_to_clip, 206 | clipped_base_quality, 207 | tags_to_invalidate) 208 | 209 | 210 | def softclip_end_of_alignment_by_ref(rec: AlignedSegment, 211 | bases_to_clip: int, 212 | clipped_base_quality: Optional[int] = None, 213 | tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE 214 | ) -> ClippingInfo: 215 | """Soft-clips the end of an alignment by bases_to_clip bases on the reference. 216 | 217 | Clipping is applied before any existing hard or soft clipping. E.g. a read with cigar 100M5S 218 | that is clipped with bases_to_clip=10 will yield a cigar of 90M15S. 219 | 220 | If the read is unmapped or bases_to_clip < 1 then nothing is done. 221 | 222 | If the read has fewer clippable bases than requested the read will be unmapped. 223 | 224 | Args: 225 | rec: the BAM record to clip 226 | bases_to_clip: the number of additional bases of clipping desired on the reference 227 | clipped_base_quality: if not None, set bases in the clipped region to this quality 228 | tags_to_invalidate: the set of extended attributes to remove upon clipping 229 | 230 | Returns: 231 | ClippingInfo: a named tuple containing the number of query/read bases and the number 232 | of target/reference bases clipped.
233 | """ 234 | if rec.reference_length <= bases_to_clip: 235 | return _clip_whole_read(rec, tags_to_invalidate) 236 | 237 | new_end = rec.reference_end - bases_to_clip 238 | new_query_end = _read_pos_at_ref_pos(rec, new_end, previous=False) 239 | query_bases_to_clip = rec.query_alignment_end - new_query_end 240 | return softclip_end_of_alignment_by_query(rec, 241 | query_bases_to_clip, 242 | clipped_base_quality, 243 | tags_to_invalidate) 244 | 245 | 246 | def _clip_whole_read(rec: AlignedSegment, tags_to_invalidate: Iterable[str]) -> ClippingInfo: 247 | """Private method that unmaps a read and returns an appropriate ClippingInfo.""" 248 | retval = ClippingInfo(rec.query_alignment_length, rec.reference_length) 249 | _cleanup(rec, tags_to_invalidate) 250 | _make_read_unmapped(rec) 251 | return retval 252 | 253 | 254 | def _make_read_unmapped(rec: AlignedSegment) -> None: 255 | """Removes mapping information from a read.""" 256 | if rec.is_reverse: 257 | quals = rec.query_qualities 258 | quals.reverse() 259 | rec.query_sequence = dnautils.reverse_complement(rec.query_sequence) 260 | rec.query_qualities = quals 261 | rec.is_reverse = False 262 | 263 | rec.reference_id = sam.NO_REF_INDEX 264 | rec.reference_start = sam.NO_REF_POS 265 | rec.cigar = None 266 | rec.mapping_quality = 0 267 | rec.template_length = 0 268 | rec.is_duplicate = False 269 | rec.is_secondary = False 270 | rec.is_supplementary = False 271 | rec.is_proper_pair = False 272 | rec.is_unmapped = True 273 | 274 | 275 | def _cleanup(rec: AlignedSegment, tags_to_invalidate: Iterable[str]) -> None: 276 | """Removes extended tags from a record that may have become invalid after clipping.""" 277 | for tag in tags_to_invalidate: 278 | rec.set_tag(tag, None) 279 | 280 | 281 | def _read_pos_at_ref_pos(rec: AlignedSegment, 282 | ref_pos: int, 283 | previous: Optional[bool] = None) -> Optional[int]: 284 | """ 285 | Returns the read or query position at the reference position. 
286 | 287 | If the reference position is not within the span of reference positions to which the 288 | read is aligned an exception will be raised. If the reference position is within the span 289 | but is not aligned (i.e. it is deleted in the read) behavior is controlled by the 290 | "previous" argument. 291 | 292 | Args: 293 | rec: the AlignedSegment within which to find the read position 294 | ref_pos: the reference position to be found 295 | previous: Controls behavior when the reference position is not aligned to any 296 | read position. True indicates to return the previous read position, False 297 | indicates to return the next read position and None indicates to return None. 298 | 299 | Returns: 300 | The read position at the reference position, or None. 301 | """ 302 | if ref_pos < rec.reference_start or ref_pos >= rec.reference_end: 303 | raise ValueError(f"{ref_pos} is not within the reference span for read {rec.query_name}") 304 | 305 | pairs = rec.get_aligned_pairs() 306 | index = 0 307 | read_pos = None 308 | for read, ref in pairs: 309 | if ref == ref_pos: 310 | read_pos = read 311 | break 312 | else: 313 | index += 1 314 | 315 | if not read_pos and previous is not None: 316 | if previous: 317 | while read_pos is None and index > 0: 318 | index -= 1 319 | read_pos = pairs[index][0] 320 | else: 321 | while read_pos is None and index < len(pairs): 322 | read_pos = pairs[index][0] 323 | index += 1 324 | 325 | return read_pos 326 | 327 | 328 | def _clip(cigar: Cigar, 329 | quals: array, 330 | bases_to_clip: int, 331 | clipped_base_quality: Optional[int]) -> Tuple[Cigar, ClippingInfo]: 332 | """Workhorse private clipping method that clips the start of cigars. 333 | 334 | Always works on the start of the cigars/quals; end-clipping is accomplished by 335 | reversing value before calling this function. Since the function is private it 336 | makes the following assumptions: 337 | 338 | 1. 
There are at least bases_to_clip bases available for clipping in the read 339 | 2. The cigar and quals agree on the query length 340 | 3. clipped_base_quality is either None or a valid integer base quality 341 | """ 342 | 343 | if any(cig.operator == CigarOp.P for cig in cigar.elements): 344 | raise ValueError(f"Cannot handle cigars that contain padding: {cigar}") 345 | 346 | elems = peekable(cigar.elements) 347 | existing_hard_clips = elems.takewhile(lambda c: c.operator == CigarOp.H) 348 | existing_soft_clips = elems.takewhile(lambda c: c.operator == CigarOp.S) 349 | read_bases_clipped = 0 350 | ref_bases_clipped = 0 351 | new_elems: List[CigarElement] = [] # buffer of cigar elements used to make the returned cigar 352 | 353 | # Returns true if the operator immediately after the clipping point is a deletion 354 | def is_trailing_deletion() -> bool: 355 | # Four conditions must be met: 356 | # 1. The number of bases _to_ clip equals the number of bases _already_ clipped 357 | # 2. The clipping point falls between operators (i.e. new_elems is empty) 358 | # 3. There is at least one more element to consider. 359 | # 4. The next element is a deletion. 360 | return read_bases_clipped == bases_to_clip \ 361 | and not new_elems \ 362 | and elems.peek() is not None \ 363 | and elems.peek().operator == CigarOp.D 364 | 365 | # The loop skips over all operators that are getting turned into clipping, while keeping track 366 | # of how many reference bases and how many read bases are skipped over. If the clipping point 367 | # falls between existing operators then the new_elems buffer is empty at the end of the while 368 | # loop. If the clip point falls within: 369 | # a) an alignment operator then the operator is split and the remainder added to the buffer 370 | # b) an insertion: the remainder of the insertion is also clipped 371 | # If the operator immediately after the clip is a deletion, it is also discarded.
372 | # 373 | # At the end of the while loop new_elems is either: 374 | # a) Empty 375 | # b) Contains a single element which is the remainder of an element that had to be split 376 | while read_bases_clipped < bases_to_clip or is_trailing_deletion(): 377 | elem = next(elems) 378 | op: CigarOp = elem.operator 379 | length: int = elem.length 380 | remaining_to_clip = bases_to_clip - read_bases_clipped 381 | 382 | if op.consumes_query and length > remaining_to_clip: 383 | if op == CigarOp.I: 384 | read_bases_clipped += length 385 | else: 386 | remaining_length = length - remaining_to_clip 387 | read_bases_clipped += remaining_to_clip 388 | ref_bases_clipped += remaining_to_clip 389 | new_elems.append(CigarElement(remaining_length, op)) 390 | else: 391 | read_bases_clipped += elem.length_on_query 392 | ref_bases_clipped += elem.length_on_target 393 | 394 | # Add in the remainder of the elements post-clipping 395 | new_elems.extend(elems) 396 | 397 | # Add in the clips 398 | clip_elems = [] 399 | hard_clip_length = sum(map(lambda e: e.length, existing_hard_clips)) 400 | soft_clip_length = sum(map(lambda e: e.length, existing_soft_clips)) + read_bases_clipped 401 | if hard_clip_length > 0: 402 | clip_elems.append(CigarElement(hard_clip_length, CigarOp.H)) 403 | if soft_clip_length > 0: 404 | clip_elems.append(CigarElement(soft_clip_length, CigarOp.S)) 405 | 406 | # Touch up the qualities if requested 407 | if clipped_base_quality is not None: 408 | for index in range(0, soft_clip_length): 409 | quals[index] = clipped_base_quality 410 | 411 | new_cigar = Cigar(tuple(clip_elems + new_elems)) 412 | return new_cigar, ClippingInfo(read_bases_clipped, ref_bases_clipped) 413 | -------------------------------------------------------------------------------- /samwell/sam/sambuilder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for generating SAM and BAM files and records for testing 3 | 
---------------------------------------------------------------- 4 | 5 | This module contains utility classes for the generation of SAM and BAM files and 6 | alignment records, for use in testing. 7 | 8 | The module contains the following public classes: 9 | 10 | - :class:`~samwell.sam.sambuilder.SamBuilder` -- A builder class that allows the accumulation 11 | of alignment records and access as a list and writing to file. 12 | """ 13 | 14 | from pathlib import Path 15 | from random import Random 16 | from tempfile import NamedTemporaryFile 17 | from typing import Any 18 | from typing import Callable 19 | from typing import Dict 20 | from typing import IO 21 | from typing import List 22 | from typing import Optional 23 | from typing import Tuple 24 | 25 | import pysam 26 | from pysam import AlignmentHeader, AlignedSegment 27 | 28 | from samwell import sam 29 | from samwell.sam import SamOrder 30 | 31 | 32 | class SamBuilder: 33 | """Builder for constructing one or more sam records (`AlignedSegment`s in pysam terms). 34 | 35 | Provides the ability to manufacture records from minimal arguments, while generating 36 | any remaining attributes to ensure a valid record. 37 | 38 | A builder is constructed with a handful of defaults including lengths for generated R1s 39 | and R2s, the default base quality score to use, a sequence dictionary and a single read group. 40 | 41 | Records are then added using the :func:`~samwell.sambuilder.SamBuilder.add_pair` method. 42 | Once accumulated the records can be accessed in the order in which they were created through 43 | the :func:`~samwell.sambuilder.SamBuilder.to_unsorted_list` function, or in a list sorted 44 | by coordinate order via :func:`~samwell.sambuilder.SamBuilder.to_sorted_list`. The latter 45 | creates a temporary file to do the sorting and is somewhat slower as a result.
46 | 47 | Records can be further modified after being returned from 48 | :func:`~samwell.sambuilder.SamBuilder.add_pair`, 49 | :func:`~samwell.sambuilder.SamBuilder.to_unsorted_list`, or 50 | :func:`~samwell.sambuilder.SamBuilder.to_sorted_list` by directly accessing their attributes 51 | through the 52 | [AlignedSegment](https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment) API. 53 | 54 | Lastly, the records can be written to a temporary file using 55 | :func:`~samwell.sambuilder.SamBuilder.to_path`. 56 | """ 57 | 58 | # The default read one length 59 | DEFAULT_R1_LENGTH: int = 100 60 | 61 | # The default read two length 62 | DEFAULT_R2_LENGTH: int = 100 63 | 64 | @staticmethod 65 | def default_sd() -> List[Dict[str, Any]]: 66 | """Generates the sequence dictionary that is used by default by SamBuilder. 67 | 68 | Matches the names and lengths of the HG19 reference in use in production. 69 | 70 | Returns: 71 | A new copy of the sequence dictionary as a list of dictionaries, one per chromosome. 
72 | """ 73 | return [ 74 | {"SN": "chr1", "LN": 249250621}, 75 | {"SN": "chr2", "LN": 243199373}, 76 | {"SN": "chr3", "LN": 198022430}, 77 | {"SN": "chr4", "LN": 191154276}, 78 | {"SN": "chr5", "LN": 180915260}, 79 | {"SN": "chr6", "LN": 171115067}, 80 | {"SN": "chr7", "LN": 159138663}, 81 | {"SN": "chr8", "LN": 146364022}, 82 | {"SN": "chr9", "LN": 141213431}, 83 | {"SN": "chr10", "LN": 135534747}, 84 | {"SN": "chr11", "LN": 135006516}, 85 | {"SN": "chr12", "LN": 133851895}, 86 | {"SN": "chr13", "LN": 115169878}, 87 | {"SN": "chr14", "LN": 107349540}, 88 | {"SN": "chr15", "LN": 102531392}, 89 | {"SN": "chr16", "LN": 90354753}, 90 | {"SN": "chr17", "LN": 81195210}, 91 | {"SN": "chr18", "LN": 78077248}, 92 | {"SN": "chr19", "LN": 59128983}, 93 | {"SN": "chr20", "LN": 63025520}, 94 | {"SN": "chr21", "LN": 48129895}, 95 | {"SN": "chr22", "LN": 51304566}, 96 | {"SN": "chrX", "LN": 155270560}, 97 | {"SN": "chrY", "LN": 59373566}, 98 | {"SN": "chrM", "LN": 16571} 99 | ] 100 | 101 | @staticmethod 102 | def default_rg() -> Dict[str, str]: 103 | """Returns the default read group used by the SamBuilder, as a dictionary.""" 104 | return {"ID": "1", "SM": "1_AAAAAA", "LB": "default", "PL": "ILLUMINA", "PU": "xxx.1"} 105 | 106 | def __init__(self, 107 | r1_len: Optional[int] = None, 108 | r2_len: Optional[int] = None, 109 | base_quality: int = 30, 110 | mapping_quality: int = 60, 111 | sd: Optional[List[Dict[str, Any]]] = None, 112 | rg: Optional[Dict[str, str]] = None, 113 | extra_header: Optional[Dict[str, Any]] = None, 114 | seed: int = 42, 115 | sort_order: Optional[SamOrder] = SamOrder.Coordinate, 116 | ) -> None: 117 | """Initializes a new SamBuilder for generating alignment records and SAM/BAM files. 
118 | 119 | Args: 120 | r1_len: The length of R1s to create unless otherwise specified 121 | r2_len: The length of R2s to create unless otherwise specified 122 | base_quality: The base quality of bases to create unless otherwise specified 123 | sd: a sequence dictionary as a list of dicts; defaults to calling default_sd() if None 124 | rg: a single read group as a dict; defaults to calling default_rg() if None 125 | extra_header: a dictionary of extra values to add to the header, None otherwise. See 126 | `::class::~pysam.AlignmentHeader` for more details. 127 | seed: a seed value for random number/string generation 128 | sort_order: optional sort order, if `None` reads will be output in the same order as 129 | they were appended. If `SamOrder.Coordinate`, reads will be ordered by reference 130 | index and coordinate order. If `SamOrder.QueryName`, reads will be ordered by 131 | query name. 132 | """ 133 | 134 | self.r1_len: int = r1_len if r1_len is not None else self.DEFAULT_R1_LENGTH 135 | self.r2_len: int = r2_len if r2_len is not None else self.DEFAULT_R2_LENGTH 136 | self.base_quality: int = base_quality 137 | self.mapping_quality: int = mapping_quality 138 | 139 | sort_order = ( 140 | SamOrder.Unsorted 141 | if sort_order is None 142 | else sort_order 143 | ) 144 | assert sort_order in [SamOrder.Coordinate, SamOrder.QueryName, SamOrder.Unsorted], ( 145 | "`sort_order for `SamBuilder` must be one of `Coordinate` `QueryName` or `Unsorted`" 146 | ) 147 | self.sort_order: SamOrder = sort_order 148 | 149 | self._header: Dict[str, Any] = { 150 | "HD": {"VN": "1.5", "SO": sort_order.value}, 151 | "SQ": (sd if sd is not None else SamBuilder.default_sd()), 152 | "RG": [(rg if rg is not None else SamBuilder.default_rg())] 153 | } 154 | if extra_header is not None: 155 | self._header = {**self._header, **extra_header} 156 | self._samheader = AlignmentHeader.from_dict(self._header) 157 | self._seq_lookup = dict([(s["SN"], s) for s in self._header["SQ"]]) 158 | 159 |
self._random: Random = Random(seed) 160 | self._records: List[AlignedSegment] = [] 161 | self._counter: int = 0 162 | 163 | def _next_name(self) -> str: 164 | """Returns the next available query/template name.""" 165 | n = self._counter 166 | self._counter += 1 167 | return f"q{n:>04}" 168 | 169 | def _bases(self, length: int) -> str: 170 | """Returns a random string of bases of the length requested.""" 171 | return "".join(self._random.choices("ACGT", k=length)) # type: ignore 172 | 173 | def _new_rec(self, 174 | name: str, 175 | chrom: str, 176 | start: int, 177 | attrs: Optional[Dict[str, Any]]) -> AlignedSegment: 178 | """Generates a new AlignedSegment. Sets the segment up with the correct 179 | header and adds the RG attribute if not contained in attrs. 180 | 181 | Args: 182 | name: the name of the read/template 183 | chrom: the chromosome to which the read is mapped 184 | start: the start position of the read on the chromosome 185 | attrs: an optional dictionary of SAM attributes with two-char keys 186 | 187 | Returns: 188 | AlignedSegment: an aligned segment with name, chrom, pos, attributes the 189 | read group, and the unmapped flag all set appropriately. 190 | """ 191 | if chrom is not sam.NO_REF_NAME and chrom not in self._seq_lookup: 192 | raise ValueError(f"{chrom} is not a valid chromosome name in this builder.") 193 | 194 | rec = AlignedSegment(header=self._samheader) 195 | rec.query_name = name 196 | rec.reference_name = chrom 197 | rec.reference_start = start 198 | rec.mapping_quality = self.mapping_quality 199 | 200 | if chrom == sam.NO_REF_NAME or start == sam.NO_REF_POS: 201 | rec.is_unmapped = True 202 | 203 | attrs = attrs if attrs else dict() 204 | if "RG" not in attrs: 205 | attrs["RG"] = self.rg_id 206 | rec.set_tags(list(attrs.items())) 207 | return rec 208 | 209 | def _set_flags(self, rec: pysam.AlignedSegment, is_r1: bool, strand: str) -> None: 210 | """Appropriately sets most flag fields on the given read. 
211 | 212 | Args: 213 | rec: the read to set the flags on 214 | is_r1: True if the read is a R1, False if it is an R2 215 | strand: Either "+" or "-" to indicate strand of the read 216 | """ 217 | rec.is_paired = True 218 | rec.is_read1 = is_r1 219 | rec.is_read2 = not is_r1 220 | rec.is_qcfail = False 221 | rec.is_duplicate = False 222 | rec.is_secondary = False 223 | rec.is_supplementary = False 224 | if not rec.is_unmapped: 225 | rec.is_reverse = strand != "+" 226 | 227 | def _set_length_dependent_fields(self, 228 | rec: pysam.AlignedSegment, 229 | length: int, 230 | bases: Optional[str] = None, 231 | quals: Optional[List[int]] = None, 232 | cigar: Optional[str] = None, 233 | ) -> None: 234 | """Fills in bases, quals and cigar on a record. 235 | 236 | If any of bases, quals or cigar are defined, they must all have the same length/query 237 | length. If none are defined then the length parameter is used. Undefined values are 238 | synthesized at the inferred length. 239 | 240 | Args: 241 | rec: a SAM record 242 | length: the length to use if all of bases/quals/cigar are None 243 | bases: an optional string of bases for the read 244 | quals: an optional list of qualities for the read 245 | cigar: an optional cigar string for the read 246 | """ 247 | 248 | # Do some validation to make sure all defined things have the same lengths 249 | lengths = set() 250 | if bases is not None: 251 | lengths.add(len(bases)) 252 | if quals is not None: 253 | lengths.add(len(quals)) 254 | if cigar is not None: 255 | cig = sam.Cigar.from_cigarstring(cigar) 256 | lengths.add(sum([elem.length_on_query for elem in cig.elements])) 257 | 258 | if not lengths: 259 | lengths.add(length) 260 | 261 | if len(lengths) != 1: 262 | raise ValueError("Provided bases/quals/cigar are not length compatible.") 263 | 264 | # Fill in the record, making any parts that were not defined as params 265 | length = lengths.pop() 266 | rec.query_sequence = bases if bases else self._bases(length) 267 |
rec.query_qualities = quals if quals else [self.base_quality] * length 268 | if not rec.is_unmapped: 269 | rec.cigarstring = cigar if cigar else f"{length}M" 270 | 271 | def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None: 272 | """Sets the mate information on a pair of sam records. 273 | 274 | Handles cases where both reads are mapped, one of the two reads is unmapped or both reads 275 | are unmapped. 276 | 277 | Args: 278 | r1: the first read in the pair 279 | r2: the second read in the pair 280 | """ 281 | for rec in r1, r2: 282 | rec.template_length = 0 283 | rec.is_proper_pair = False 284 | 285 | if r1.is_unmapped and r2.is_unmapped: 286 | # If they're both unmapped just clean the records up 287 | for rec, other in [(r1, r2), (r2, r1)]: 288 | rec.reference_id = sam.NO_REF_INDEX 289 | rec.next_reference_id = sam.NO_REF_INDEX 290 | rec.reference_start = sam.NO_REF_POS 291 | rec.next_reference_start = sam.NO_REF_POS 292 | rec.is_unmapped = True 293 | rec.mate_is_unmapped = True 294 | rec.is_proper_pair = False 295 | rec.mate_is_reverse = other.is_reverse 296 | 297 | elif r1.is_unmapped or r2.is_unmapped: 298 | # If only one is mapped/unmapped copy over the relevant stuff 299 | (m, u) = (r1, r2) if r2.is_unmapped else (r2, r1) 300 | u.reference_id = m.reference_id 301 | u.reference_start = m.reference_start 302 | u.next_reference_id = m.reference_id 303 | u.next_reference_start = m.reference_start 304 | u.mate_is_reverse = m.is_reverse 305 | u.mate_is_unmapped = False 306 | u.set_tag("MC", m.cigarstring) 307 | 308 | m.next_reference_id = u.reference_id 309 | m.next_reference_start = u.reference_start 310 | m.mate_is_reverse = u.is_reverse 311 | m.mate_is_unmapped = True 312 | 313 | else: 314 | # Else they are both mapped 315 | for rec, other in [(r1, r2), (r2, r1)]: 316 | rec.next_reference_id = other.reference_id 317 | rec.next_reference_start = other.reference_start 318 | rec.mate_is_reverse = other.is_reverse 319 |
rec.mate_is_unmapped = False 320 | rec.set_tag("MC", other.cigarstring) 321 | 322 | if r1.reference_id == r2.reference_id: 323 | r1p = r1.reference_end if r1.is_reverse else r1.reference_start 324 | r2p = r2.reference_end if r2.is_reverse else r2.reference_start 325 | r1.template_length = r2p - r1p 326 | r2.template_length = r1p - r2p 327 | 328 | # Arbitrarily set proper pair if we have an FR pair with isize <= 1000 329 | if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000: 330 | fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p) 331 | if fpos < rpos: 332 | r1.is_proper_pair = True 333 | r2.is_proper_pair = True 334 | 335 | @property 336 | def rg(self) -> Dict[str, Any]: 337 | """Returns the single read group that is defined in the header.""" 338 | rgs = self._header["RG"] 339 | assert len(rgs) == 1, "Header did not contain exactly one read group!" 340 | return rgs[0] 341 | 342 | @property 343 | def rg_id(self) -> str: 344 | """Returns the ID of the single read group that is defined in the header.""" 345 | return self.rg["ID"] 346 | 347 | def add_pair(self, *, 348 | name: Optional[str] = None, 349 | bases1: Optional[str] = None, 350 | bases2: Optional[str] = None, 351 | quals1: Optional[List[int]] = None, 352 | quals2: Optional[List[int]] = None, 353 | chrom: str = sam.NO_REF_NAME, 354 | start1: int = sam.NO_REF_POS, 355 | start2: int = sam.NO_REF_POS, 356 | cigar1: Optional[str] = None, 357 | cigar2: Optional[str] = None, 358 | strand1: str = "+", 359 | strand2: str = "-", 360 | attrs: Optional[Dict[str, Any]] = None) -> Tuple[AlignedSegment, AlignedSegment]: 361 | """Generates a new pair of reads, adds them to the internal collection, and returns them. 362 | 363 | Most fields are optional. 364 | 365 | An unmapped pair can be created by calling the method with no parameters (specifically, 366 | not setting chrom, start1 or start2). If either cigar is provided, it will be ignored.
367 | 368 | A pair with only one of the two reads mapped is created by setting e.g. chrom and start1. 369 | The values will be automatically transferred to the unmapped mate, and flags set correctly. 370 | 371 | A mapped pair is created by providing all three of chrom, start1 and start2. 372 | 373 | For a given read (i.e. R1 or R2) the length of the read is determined based on the presence 374 | or absence of bases, quals, and cigar. If values are provided for one or more of these 375 | parameters, the lengths must match, and the length will be used to generate any 376 | unsupplied values. If none of bases, quals, and cigar are provided, all three will be 377 | synthesized based on either the r1_len or r2_len stored on the class as appropriate. 378 | 379 | When synthesizing, bases are always a random sequence of bases, quals are all the default 380 | base quality (supplied when constructing a SamBuilder) and the cigar is always a single M 381 | operator of the read length. 382 | 383 | Alignment attributes not exposed through the method parameters can be modified directly on 384 | the returned AlignedSegment objects. Modifications will be reflected when records are 385 | written to a temporary file with :func:`~samwell.sambuilder.SamBuilder.to_path`. 386 | 387 | Args: 388 | name: The name of the template. If None is given a unique name will be auto-generated. 389 | bases1: The bases for R1. If None is given a random sequence is generated. 390 | bases2: The bases for R2. If None is given a random sequence is generated. 391 | quals1: The list of int qualities for R1. If None, the default base quality is used. 392 | quals2: The list of int qualities for R2. If None, the default base quality is used. 393 | chrom: The chromosome to which both reads are mapped. Defaults to the unmapped value. 394 | start1: The start position of R1. Defaults to the unmapped value. 395 | start2: The start position of R2. Defaults to the unmapped value. 396 | cigar1: The cigar string for R1.
Defaults to None for unmapped reads, otherwise all M. 397 | cigar2: The cigar string for R2. Defaults to None for unmapped reads, otherwise all M. 398 | strand1: The strand for R1, either "+" or "-". Defaults to "+". 399 | strand2: The strand for R2, either "+" or "-". Defaults to "-". 400 | attrs: An optional dictionary of SAM attribute to place on both R1 and R2. 401 | 402 | Raises: 403 | ValueError: if either strand field is not "+" or "-" 404 | ValueError: if bases/quals/cigar are set in a way that is not self-consistent 405 | 406 | Returns: 407 | Tuple[AlignedSegment, AlignedSegment]: The pair of records created, R1 then R2. 408 | """ 409 | 410 | if strand1 not in ["+", "-"]: raise ValueError(f"Invalid value for strand1: {strand1}") 411 | if strand2 not in ["+", "-"]: raise ValueError(f"Invalid value for strand2: {strand2}") 412 | 413 | name = name if name is not None else self._next_name() 414 | 415 | # Setup R1 416 | r1 = self._new_rec(name=name, chrom=chrom, start=start1, attrs=attrs) 417 | self._set_flags(r1, is_r1=True, strand=strand1) 418 | self._set_length_dependent_fields( 419 | rec=r1, length=self.r1_len, bases=bases1, quals=quals1, cigar=cigar1) 420 | 421 | # Setup R2 422 | r2 = self._new_rec(name=name, chrom=chrom, start=start2, attrs=attrs) 423 | self._set_flags(r2, is_r1=False, strand=strand2) 424 | self._set_length_dependent_fields( 425 | rec=r2, length=self.r2_len, bases=bases2, quals=quals2, cigar=cigar2) 426 | 427 | # Sync up mate info and we're done! 428 | self._set_mate_info(r1, r2) 429 | self._records.append(r1) 430 | self._records.append(r2) 431 | return r1, r2 432 | 433 | def to_path(self, 434 | path: Optional[Path] = None, 435 | index: bool = True, 436 | pred: Callable[[AlignedSegment], bool] = lambda r: True) -> Path: 437 | """Write the accumulated records to a file, sorts & indexes it, and returns the Path. 438 | If a path is provided, it will be written to, otherwise a temporary file is created 439 | and returned. 
440 | 441 | Args: 442 | path: a path at which to write the file, otherwise a temp file is used. 443 | index: if True and `sort_order` is `Coordinate` index is generated, otherwise not. 444 | pred: optional predicate to specify which reads should be output 445 | 446 | Returns: 447 | Path: The path to the sorted (and possibly indexed) file. 448 | """ 449 | 450 | if path is None: 451 | with NamedTemporaryFile(suffix=".bam", delete=False) as fp: 452 | path = Path(fp.name) 453 | 454 | with NamedTemporaryFile(suffix=".bam", delete=True) as fp: 455 | file_handle: IO 456 | if self.sort_order is SamOrder.Unsorted: 457 | file_handle = path.open('w') 458 | else: 459 | file_handle = fp.file 460 | 461 | with sam.writer(file_handle, # type: ignore 462 | header=self._samheader, 463 | file_type=sam.SamFileType.BAM) as writer: 464 | for rec in self._records: 465 | if pred(rec): 466 | writer.write(rec) 467 | 468 | default_samtools_opt_list = ["-o", str(path), fp.name] 469 | 470 | file_handle.close() 471 | if self.sort_order == SamOrder.QueryName: 472 | pysam.sort(*(["-n"] + default_samtools_opt_list)) 473 | elif self.sort_order == SamOrder.Coordinate: 474 | pysam.sort(*default_samtools_opt_list) 475 | if index: 476 | pysam.index(str(path)) 477 | return path 478 | 479 | def __len__(self) -> int: 480 | """Returns the number of records accumulated so far.""" 481 | return len(self._records) 482 | 483 | def to_unsorted_list(self) -> List[pysam.AlignedSegment]: 484 | """Returns the accumulated records in the order they were created.""" 485 | return list(self._records) 486 | 487 | def to_sorted_list(self) -> List[pysam.AlignedSegment]: 488 | """Returns the accumulated records in coordinate order.""" 489 | with NamedTemporaryFile(suffix=".bam", delete=True) as fp: 490 | filename = fp.name 491 | path = self.to_path(path=Path(filename), index=False) 492 | bam = sam.reader(path) 493 | return list(bam) 494 | 495 | @property 496 | def header(self) -> AlignmentHeader: 497 | """Returns a copy of 
the alignmentt header used by this builder""" 498 | return AlignmentHeader.from_dict(self._header) 499 | -------------------------------------------------------------------------------- /samwell/sam/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/myriad-opensource/samwell/47c4d809e3a228cf2be7af09871ab70e706763a1/samwell/sam/tests/__init__.py -------------------------------------------------------------------------------- /samwell/sam/tests/data/valid.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.0 SO:coordinate 2 | @SQ SN:chr1 LN:101 3 | @SQ SN:chr2 LN:101 4 | @SQ SN:chr3 LN:101 5 | @SQ SN:chr4 LN:101 6 | @SQ SN:chr5 LN:101 7 | @SQ SN:chr6 LN:101 8 | @SQ SN:chr7 LN:404 9 | @SQ SN:chr8 LN:202 10 | @RG ID:0 SM:Hi,Mom! LB:my-library PL:ILLUMINA 11 | @RG ID:1 SM:Hi,Mom! LB:my-library PL:ILLUMINA 12 | @RG ID:2 SM:Hi,Mom! LB:my-library PL:Illumina 13 | @PG ID:1 PN:Hey! 
VN:2.0 14 | both_reads_align_clip_marked 1107 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 PG:Z:1 NM:i:0 MQ:i:255 XT:Z:foo OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 15 | both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:1 PG:Z:1 NM:i:3 MQ:i:255 XT:Z:foo OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 16 | read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:2 PG:Z:1 NM:i:8 MQ:i:255 XT:Z:foo2 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 17 | both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:1 PG:Z:1 NM:i:1 MQ:i:255 XT:Z:foo2 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 18 | both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:1 PG:Z:1 NM:i:1 MQ:i:255 XT:Z:foo2 
OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 19 | both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 PG:Z:1 NM:i:5 MQ:i:255 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 20 | read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:2 PG:Z:1 NM:i:6 MQ:i:255 OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 21 | both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:1 PG:Z:1 22 | -------------------------------------------------------------------------------- /samwell/sam/tests/test_bwa_mem.py: -------------------------------------------------------------------------------- 1 | import distutils.spawn 2 | import subprocess 3 | from pathlib import Path 4 | from tempfile import NamedTemporaryFile as NamedTemp 5 | from typing import List 6 | from typing import Optional 7 | from typing import Tuple 8 | 9 | import attr 10 | import pytest 11 | from py._path.local import LocalPath as TmpDir 12 | from pysam import AlignedSegment 13 | 14 | from samwell.sam.bwa_mem import FastqRecord 15 | from samwell.sam.bwa_mem import InputOutputOptions 16 | from samwell.sam.bwa_mem import align 17 | 18 | 19 | BwaExecutable: Optional[str] = distutils.spawn.find_executable("bwa") 20 | 
def _assert_alignment_for_fastq_record(read: FastqRecord,
                                       results: Tuple[FastqRecord, List[AlignedSegment]]) -> None:
    """Asserts that ``results`` holds exactly one alignment and that it matches ``read``.

    The expected reference name, start and cigar are fixed ("1", 0, "60M"), so this helper
    is only meaningful for a read expected to align end-to-end at the start of contig "1".

    Args:
        read: the FASTQ record the alignment is compared against.
        results: a ``(fastq, alignments)`` tuple; only the alignments are inspected.
    """
    _, alignments = results
    assert len(alignments) == 1
    alignment = alignments[0]

    assert alignment.query_name == read.name
    assert alignment.query_sequence == read.bases
    # The qualities are numeric; shift each by 33 and turn it into a character so the result
    # can be compared with the quality string on the FASTQ record.
    assert "".join(chr(q + 33) for q in alignment.query_qualities) == read.quals
    assert alignment.reference_name == "1"
    assert alignment.reference_start == 0
    assert alignment.cigarstring == "60M"
    if read.read_number is not None:
        # A read number is only present for paired reads, and must agree with the R1/R2 flag.
        assert read.read_number in (1, 2)
        assert alignment.is_paired
        assert read.read_number == (1 if alignment.is_read1 else 2)
@pytest.mark.skipif(BwaExecutable is None, reason="requires bwa 0.7.17")
def test_needs_alignment(fastq_record: FastqRecord, ref_fasta: Path) -> None:
    """A read created with ``needs_alignment=False`` comes back with no alignments."""
    to_align = fastq_record
    skipped = FastqRecord(
        name="needs_alignment=False",
        bases=to_align.bases,
        quals=to_align.quals,
        needs_alignment=False
    )

    # Run BWA over both reads
    outcomes = list(align(reads=[to_align, skipped], idxbase=ref_fasta))

    # One result is expected per input read
    assert len(outcomes) == 2
    for outcome in outcomes:
        returned_read, hits = outcome
        if not returned_read.needs_alignment:
            assert len(hits) == 0
            assert returned_read.name == "needs_alignment=False"
            assert returned_read == skipped
            continue
        assert returned_read == to_align
        _assert_alignment_for_fastq_record(read=fastq_record, results=outcome)
def r(start: Optional[int], cigar: Optional[str], strand: Optional[str] = "+") -> AlignedSegment:
    """Constructs a read for testing.

    Args:
        start: the start position of the read on "chr1", or None to build the pair with
            ``SamBuilder.add_pair()`` defaults (the tests use this for unmapped reads).
        cigar: the cigar string for the read; only used when ``start`` is given.
        strand: the strand for the read; only used when ``start`` is given.

    Returns:
        The first record (R1) of the pair added to a fresh builder.
    """
    builder = SamBuilder()
    # Compare against None explicitly: 0 is falsy, but an explicit start of 0 must not be
    # silently treated as "no start was given".
    if start is not None:
        r1, _ = builder.add_pair(chrom="chr1", start1=start, cigar1=cigar, strand1=strand)
    else:
        r1, _ = builder.add_pair()
    return r1
def test_read_pos_at_ref_pos_with_indels_nearby() -> None:
    """Checks read positions before, between and after a deletion and an insertion."""
    rec = r(100, "25M1D25M1I25M")
    # (reference position, expected read position)
    expectations = [
        (100, 0),
        (110, 10),
        (120, 20),
        (130, 29),
        (140, 39),
        (150, 49),
        (160, 60),
    ]
    for ref_pos, read_pos in expectations:
        assert clipping._read_pos_at_ref_pos(rec, ref_pos) == read_pos
def test_soft_clip_start_of_alignment_by_query_clips_10_aligned_and_inserted_bases() -> None:
    """Clipping 10 query bases from the start of ``4M2I44M`` clips only 8 reference bases."""
    for strand in ("+", "-"):
        read = r(10, "4M2I44M", strand)
        clip_info = clipping.softclip_start_of_alignment_by_query(read, 10)

        # What was clipped ...
        assert clip_info.query_bases_clipped == 10
        assert clip_info.ref_bases_clipped == 8
        # ... and what the record looks like afterwards
        assert read.reference_start == 18
        assert read.cigarstring == "10S40M"
def test_softclip_start_of_alignment_by_query_consumes_rest_of_insertion() -> None:
    """Asking for 10 bases from the start of ``8M4I38M`` clips 12: the whole insertion goes."""
    for strand in ("+", "-"):
        read = r(10, "8M4I38M", strand)
        clip_info = clipping.softclip_start_of_alignment_by_query(read, 10)
        observed = (
            clip_info.query_bases_clipped,
            clip_info.ref_bases_clipped,
            read.reference_start,
            read.cigarstring,
        )
        assert observed == (12, 8, 18, "12S38M")
def test_softclip_end_of_alignment_by_query_masks_qualities_when_softclipping() -> None:
    """The last 10 qualities take ``clipped_base_quality`` when given, else stay at 30."""
    for masked_quality in (None, 2):
        read = r(10, "50M", "+")
        clipping.softclip_end_of_alignment_by_query(read, 10, clipped_base_quality=masked_quality)
        qualities = read.query_qualities

        # With no replacement quality the clipped bases are expected to still be 30
        # (presumably the builder's default base quality -- confirm against SamBuilder).
        expected = 30 if masked_quality is None else masked_quality
        for offset in range(40, 50):
            assert qualities[offset] == expected
def test_softclip_end_of_alignment_by_query_with_complicated_cigar() -> None:
    """Clipping 10 more bases at the end keeps the hard clip and extends the existing soft clip."""
    original_cigar = "10M5I5M10I16M4S2H"
    clipped_cigar = "10M5I5M10I6M14S2H"
    for strand in ("+", "-"):
        read = r(10, original_cigar, strand)
        clip_info = clipping.softclip_end_of_alignment_by_query(read, 10)
        assert (clip_info.query_bases_clipped, clip_info.ref_bases_clipped) == (10, 10)
        # Clipping the end of the alignment leaves the start position untouched
        assert read.reference_start == 10
        assert read.cigarstring == clipped_cigar
def test_softclip_end_of_alignment_by_query_removes_deletion_following_clipping() -> None:
    """A deletion directly before the clipped bases is dropped and counted as clipped reference."""
    for strand in ("+", "-"):
        read = r(10, "40M4D10M", strand)
        clip_info = clipping.softclip_end_of_alignment_by_query(read, 10)
        observed = {
            "query_bases_clipped": clip_info.query_bases_clipped,
            "ref_bases_clipped": clip_info.ref_bases_clipped,
            "reference_start": read.reference_start,
            "cigarstring": read.cigarstring,
        }
        assert observed == {
            "query_bases_clipped": 10,
            "ref_bases_clipped": 14,
            "reference_start": 10,
            "cigarstring": "40M10S",
        }
def test_softclip_start_of_alignment_by_ref_with_deletion() -> None:
    """Clipping 10 reference bases from the start of ``5M5D45M`` clips only 5 query bases."""
    read = r(10, "5M5D45M")
    clip_info = clipping.softclip_start_of_alignment_by_ref(read, 10)

    assert (clip_info.query_bases_clipped, clip_info.ref_bases_clipped) == (5, 10)
    assert (read.reference_start, read.cigarstring) == (20, "5S45M")
def test_sam_file_type_invalid_path() -> None:
    """``SamFileType.from_path`` raises ``ValueError`` for a path with an ``.xls`` extension."""
    path = "/path/to/excel.xls"
    with pytest.raises(ValueError) as ex:
        SamFileType.from_path(path=path)
    # Check the message on the raised exception itself (``ex.value``). ``str()`` of the
    # ``ExceptionInfo`` wrapper is a pytest-version-dependent representation, not the message.
    # The check sits after the ``with`` block because nothing following the raising call
    # inside the block would run.
    assert "Could not infer file type from " + path in str(ex.value)
@pytest.fixture
def valid_bam(valid_sam: Path) -> Generator[Path, None, None]:
    """Yields the path to a BAM copy of the ``valid_sam`` records, removing it afterwards.

    The BAM is written to ``data/valid.bam`` next to this test module. It is removed both
    after the test has used it and when building it fails part-way, so no generated file is
    left behind in the source tree.

    Yields:
        Path: the path to the freshly written BAM file.
    """
    bam: Path = Path(__file__).parent / 'data' / 'valid.bam'
    try:
        num_read = 0
        with sam.reader(valid_sam) as fh_in:
            with sam.writer(bam, fh_in.header, file_type=SamFileType.BAM) as fh_out:
                for rec in fh_in:
                    num_read += 1
                    fh_out.write(rec)
        # Sanity check that every record of the SAM fixture was copied
        assert num_read == 8
        yield bam
    finally:
        # Guard the removal: if opening the writer failed the file may never have been created,
        # and a FileNotFoundError here would hide the original error.
        if bam.exists():
            bam.unlink()
@pytest.fixture
def header_text(valid_sam: Path) -> str:
    """Returns the raw header text of the valid_sam.

    NOTE(review): annotated as ``str`` to match the "raw text" description and the
    ``header_text: str`` parameter of the (currently disabled) test that consumes this
    fixture; confirm against the type of ``text`` on the reader returned by ``sam.reader``.
    """
    with sam.reader(valid_sam) as fh:
        return fh.text
def test_cigar_op_util_from_character() -> None:
    """Every ``CigarOp`` can be looked up again from its own character."""
    # Collect the (character, operator) pairs up front, then look each one up
    pairs = [(op.character, op) for op in CigarOp]
    for character, expected_op in pairs:
        assert CigarOp.from_character(character) == expected_op
@pytest.mark.parametrize("character,operator_length,length_on_query,length_on_target", [
    ("M", 10, 10, 10),
    ("I", 10, 10, 0),
    ("D", 10, 0, 10),
    ("S", 10, 10, 0)
])
def test_cigar_element_length_on(character: str,
                                 operator_length: int,
                                 length_on_query: int,
                                 length_on_target: int) -> None:
    """Checks the raw, query and target lengths reported by a single cigar element."""
    element = CigarElement(operator_length, CigarOp.from_character(character))
    assert element.length == operator_length
    assert element.length_on_query == length_on_query
    assert element.length_on_target == length_on_target


@pytest.mark.parametrize("in_cigar,out_cigar", [
    ("75M", "75M"),
    ("10M10M", "20M"),
    ("10M10I10M", "10M10I10M"),
    ("10S10S10S10S10M", "40S10M")
])
def test_cigar_coalesce(in_cigar: str, out_cigar: str) -> None:
    """Coalescing must merge adjacent elements that share an operator."""
    coalesced = Cigar.from_cigarstring(in_cigar).coalesce()
    assert str(coalesced) == out_cigar


@pytest.mark.parametrize("cigartuples,cigarstring", [
    ([], "*"),  # Empty cigar
    ([(0, 10), (1, 5), (0, 1)], "10M5I1M"),  # A simple example
    ([(0, 10), (1, 5), (1, 5)], "10M5I5I"),  # do not join adjacent operators of the same type
    ([(op.code, op.code + 1) for op in CigarOp], "1M2I3D4N5S6H7P8=9X")  # all operators
])
def test_cigar_from_cigartuples(cigartuples: List[Tuple[int, int]], cigarstring: str) -> None:
    """Building a cigar from (code, length) tuples must render the expected string."""
    assert str(Cigar.from_cigartuples(cigartuples)) == cigarstring


def test_cigar_from_cigartuples_malformed() -> None:
    """An operator code that does not exist (22) must be rejected."""
    cigartuples = [(0, 10), (1, 5), (22, 1)]
    with pytest.raises(CigarParsingException, match=r'.*Malformed cigar tuples.*'):
        Cigar.from_cigartuples(cigartuples)


def test_pretty_cigarstring_exception() -> None:
    """The exception message must bracket the offending character of the cigar string.

    The message assertions are placed *after* each ``pytest.raises`` block. Any statement
    that follows the raising line inside the block is never executed, so asserting there
    silently checks nothing. The message is read from ``ex.value`` (the exception itself).
    """
    cigar = "10M5U4M"

    # offending character in the middle of the string
    with pytest.raises(CigarParsingException, match=r'.*Malformed cigar') as ex:
        raise Cigar._pretty_cigarstring_exception(cigar, 4)
    assert "10M5[U]4M" in str(ex.value)

    # index one past the end: empty brackets are appended
    with pytest.raises(CigarParsingException, match=r'.*Malformed cigar') as ex:
        raise Cigar._pretty_cigarstring_exception(cigar, len(cigar))
    assert cigar + "[]" in str(ex.value)
def test_from_cigarstring() -> None:
    """Round-trips the empty cigar and a cigar that uses every operator."""
    # Empty cigar
    assert str(Cigar.from_cigarstring("*")) == "*"

    # one element per operator, with lengths 1, 2, 3, ... in enum order
    elements = tuple(CigarElement(length, operator)
                     for length, operator in enumerate(CigarOp, start=1))
    cigarstring = str(Cigar(elements))
    assert str(Cigar.from_cigarstring(cigarstring)) == cigarstring


# NOTE: in the four tests below the message assertion sits after the ``pytest.raises``
# block on purpose. Code following the raising call inside the block never runs, so an
# assertion placed there would never be evaluated.

def test_from_cigarstring_op_should_start_with_digit() -> None:
    """An element that does not begin with a digit (or an empty string) must be rejected."""
    cigars = ["", "M", "10MI", "10M5SU"]
    errors = ["", "[M]", "10M[I]", "10M5S[U]"]
    for cigar, error in zip(cigars, errors):
        # the empty string has its own dedicated message
        match = "Malformed cigar: " + error if cigar else 'Cigar string was empty'
        with pytest.raises(CigarParsingException) as ex:
            Cigar.from_cigarstring(cigar)
        assert match in str(ex.value)


def test_from_cigarstring_no_length() -> None:
    """An operator without a preceding length must be rejected."""
    cigars = ["M", "10MS"]
    errors = ["", "10M[S]"]
    for cigar, error in zip(cigars, errors):
        with pytest.raises(CigarParsingException) as ex:
            Cigar.from_cigarstring(cigar)
        assert "Malformed cigar: " + error in str(ex.value)


def test_from_cigarstring_invalid_operator() -> None:
    """A character that is not a cigar operator must be rejected."""
    cigars = ["10U", "10M5U"]
    errors = ["10[U]", "10M5[U]"]
    for cigar, error in zip(cigars, errors):
        with pytest.raises(CigarParsingException) as ex:
            Cigar.from_cigarstring(cigar)
        assert "Malformed cigar: " + error in str(ex.value)


def test_from_cigarstring_missing_operator() -> None:
    """A trailing length with no operator must be rejected."""
    cigars = ["10", "10M5"]
    errors = ["10[]", "10M5[]"]
    for cigar, error in zip(cigars, errors):
        with pytest.raises(CigarParsingException) as ex:
            Cigar.from_cigarstring(cigar)
        assert "Malformed cigar: " + error in str(ex.value)


def test_is_indel() -> None:
    """Only the I and D operators are flagged as indels."""
    indels = [op for op in CigarOp if op.is_indel]
    assert indels == [CigarOp.I, CigarOp.D]
def test_get_and_set_qc_fail() -> None:
    """Exercises the QC-fail getters before and after ``sam.set_qc_fail`` is applied."""
    read, _ = SamBuilder().add_pair()
    this_tool = test_get_and_set_qc_fail

    def unrelated_tool() -> None:
        pass

    # the record isn't qc failed, so both getters should return None
    assert sam.get_qc_fail(read) is None
    assert sam.get_qc_fail_by_tool(read) is None

    # the flag alone (no tags set) is still not enough for the getters to return a value
    read.is_qcfail = True
    assert sam.get_qc_fail(read) is None
    assert sam.get_qc_fail_by_tool(read) is None

    # once a tool and reason are recorded, both getters return them
    sam.set_qc_fail(read, this_tool, "some reason")
    tool, reason = sam.get_qc_fail(read)
    assert (tool, reason) == (this_tool.__name__, "some reason")
    tool, reason = sam.get_qc_fail_by_tool(read, tool=this_tool)
    assert (tool, reason) == (this_tool.__name__, "some reason")

    # asking on behalf of a different tool yields None
    assert sam.get_qc_fail_by_tool(read, tool=unrelated_tool) is None


def test_isize() -> None:
    """``sam.isize`` is signed by argument order and zero once a mate is unmapped."""
    first, second = SamBuilder().add_pair(
        chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M"
    )
    assert sam.isize(first, second) == 190
    assert sam.isize(second, first) == -190

    second.is_unmapped = True
    assert sam.isize(first, second) == 0


# --------------------------------------------------------------------------------
# /samwell/sam/tests/test_sambuilder.py:
# --------------------------------------------------------------------------------
"""Basic tests of the sambuilder module."""

import pytest

from pathlib import Path
from py._path.local import LocalPath as TmpDir
from samwell import sam
from samwell.sam import SamOrder
from samwell.sam.sambuilder import SamBuilder
from typing import Optional
from typing import List
def test_add_pair_all_fields() -> None:
    """A pair built with every field given explicitly must carry those values."""
    builder = SamBuilder()
    builder.add_pair(
        name="q1",
        chrom="chr1",
        bases1="ACGTG",
        quals1=[20, 21, 22, 23, 24],
        start1=10000,
        cigar1="5M",
        strand1="+",
        bases2="GCGC",
        quals2=[30, 31, 32, 33],
        start2=10200,
        cigar2="4M",
        strand2="-",
        attrs={"aa": "Hello", "bb": 42}
    )
    records = builder.to_sorted_list()
    assert len(records) == 2

    # per-read expectations: (start, on reverse strand, bases, qualities, cigar)
    expected_r1 = (10000, False, "ACGTG", [20, 21, 22, 23, 24], "5M")
    expected_r2 = (10200, True, "GCGC", [30, 31, 32, 33], "4M")

    for record in records:
        # fields shared by both reads of the template
        assert record.query_name == "q1"
        assert record.reference_name == "chr1"
        assert record.is_paired
        assert abs(record.template_length) == 204
        assert record.get_tag("aa") == "Hello"
        assert record.get_tag("bb") == 42

        # read-specific fields
        start, reverse, bases, quals, cigar = expected_r1 if record.is_read1 else expected_r2
        assert record.reference_start == start
        assert bool(record.is_reverse) is reverse
        assert record.query_sequence == bases
        assert list(record.query_qualities) == quals
        assert record.cigarstring == cigar


def test_add_pair_minimal() -> None:
    """With only positions given, read lengths and read group come from the builder."""
    builder = SamBuilder(r1_len=10, r2_len=5, base_quality=25)
    r1, r2 = builder.add_pair(chrom="chr1", start1=1000, start2=1200)

    assert r1.query_name == r2.query_name
    assert r1.reference_name == r2.reference_name == "chr1"

    # (record, start, on reverse strand, read length)
    for record, start, reverse, length in ((r1, 1000, False, 10), (r2, 1200, True, 5)):
        assert record.reference_start == start
        assert bool(record.is_reverse) is reverse
        assert len(record.query_sequence) == len(record.query_qualities) == length
        assert record.cigarstring == str(length) + "M"
        assert record.get_tag("RG") == builder.rg_id
def test_add_pair_mix_and_match() -> None:
    """Read length is derived from whichever of cigar / bases / quals is supplied."""
    builder = SamBuilder(r1_len=100, r2_len=100, base_quality=30)

    # length taken from the cigars
    r1, r2 = builder.add_pair(chrom="chr1", start1=500, start2=700, cigar1="75M", cigar2="9M1I30M")
    for record, length in ((r1, 75), (r2, 40)):
        assert len(record.query_sequence) == len(record.query_qualities) == length

    # length taken from the bases
    r1, r2 = builder.add_pair(chrom="chr1", start1=500, start2=700,
                              bases1="ACGTGCATGC", bases2="ACGAC")
    for record, length, cigar in ((r1, 10, "10M"), (r2, 5, "5M")):
        assert len(record.query_sequence) == len(record.query_qualities) == length
        assert record.cigarstring == cigar

    # length taken from the qualities
    r1, r2 = builder.add_pair(chrom="chr1", start1=500, start2=700,
                              quals1=[30] * 20, quals2=[20] * 10)
    for record, length, cigar in ((r1, 20, "20M"), (r2, 10, "10M")):
        assert len(record.query_sequence) == len(record.query_qualities) == length
        assert record.cigarstring == cigar

    # Supplying several values whose lengths disagree must be rejected
    incompatible = pytest.raises(ValueError, match="not length compatible")
    with incompatible:
        builder.add_pair(chrom="chr1", start1=10, start2=99, bases1="ACGTG", cigar1="10M")

    incompatible = pytest.raises(ValueError, match="not length compatible")
    with incompatible:
        builder.add_pair(chrom="chr1", start1=10, start2=99, bases1="ACGTG", quals1=[2, 2])

    incompatible = pytest.raises(ValueError, match="not length compatible")
    with incompatible:
        builder.add_pair(chrom="chr1", start1=10, start2=99, quals1=[2, 2], cigar1="5M")
def test_unmapped_reads() -> None:
    """Checks mapped/unmapped flags and coordinates when one or both starts are omitted."""
    builder = SamBuilder()

    # only read 1 has a start
    r1, r2 = builder.add_pair(chrom="chr1", start1=1000)
    assert (bool(r1.is_unmapped), bool(r1.mate_is_unmapped)) == (False, True)
    assert (bool(r2.is_unmapped), bool(r2.mate_is_unmapped)) == (True, False)
    for record in (r1, r2):
        assert record.reference_name == "chr1"
        assert record.reference_start == 1000
        assert record.next_reference_name == "chr1"
        assert record.next_reference_start == 1000

    # only read 2 has a start
    r1, r2 = builder.add_pair(chrom="chr1", start2=2000)
    assert (bool(r1.is_unmapped), bool(r1.mate_is_unmapped)) == (True, False)
    assert (bool(r2.is_unmapped), bool(r2.mate_is_unmapped)) == (False, True)
    for record in (r1, r2):
        assert record.reference_name == "chr1"
        assert record.reference_start == 2000
        assert record.next_reference_name == "chr1"
        assert record.next_reference_start == 2000

    # no reference at all
    r1, r2 = builder.add_pair(chrom=sam.NO_REF_NAME)
    assert (bool(r1.is_unmapped), bool(r1.mate_is_unmapped)) == (True, True)
    assert (bool(r2.is_unmapped), bool(r2.mate_is_unmapped)) == (True, True)
    for record in (r1, r2):
        assert record.reference_name is None
        assert record.reference_start == sam.NO_REF_POS
        assert record.next_reference_name is None
        assert record.next_reference_start == sam.NO_REF_POS


def test_invalid_strand() -> None:
    """Strand values other than the accepted ones must raise."""
    builder = SamBuilder()
    with pytest.raises(ValueError, match="strand"):
        builder.add_pair(chrom="chr1", start1=100, start2=200, strand1="F", strand2="R")


def test_proper_pair() -> None:
    """Only inward-facing, fully mapped pairs are flagged as proper pairs."""
    builder = SamBuilder()

    # Regular innies
    innies = (
        builder.add_pair(chrom="chr1", start1=5000, start2=5200, strand1="+", strand2="-"),
        builder.add_pair(chrom="chr1", start1=5200, start2=5000, strand1="-", strand2="+"),
    )
    for pair in innies:
        for record in pair:
            assert record.is_proper_pair

    # Outies
    outies = (
        builder.add_pair(chrom="chr1", start1=5000, start2=5200, strand1="-", strand2="+"),
        builder.add_pair(chrom="chr1", start1=5200, start2=5000, strand1="+", strand2="-"),
    )
    for pair in outies:
        for record in pair:
            assert not record.is_proper_pair

    # Unmapped (one mate, the other mate, then both)
    unmapped = (
        builder.add_pair(chrom="chr1", start1=5000, strand1="+"),
        builder.add_pair(chrom="chr1", start2=5000, strand2="+"),
        builder.add_pair(),
    )
    for pair in unmapped:
        for record in pair:
            assert not record.is_proper_pair
def test_sorting() -> None:
    """``to_sorted_list`` must yield records ordered by (reference id, start)."""
    builder = SamBuilder()
    builder.add_pair(chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+")
    for chrom in ("chr1", "chr5", "chr2"):
        builder.add_pair(chrom=chrom, start1=4000, start2=4300)

    # compare (ref_id, start) pairs lexicographically against the previous record
    previous = (-1, -1)
    for record in builder.to_sorted_list():
        current = (record.reference_id, record.reference_start)
        assert current >= previous
        previous = current


def make_sort_order_builder(tmpdir: TmpDir, sort_order: SamOrder) -> Path:
    """Writes four named pairs with the given sort order to ``test.bam`` under ``tmpdir``.

    Returns:
        the path of the file that was written.
    """
    builder = SamBuilder(sort_order=sort_order)
    builder.add_pair(
        name="test3",
        chrom="chr1",
        start1=5000,
        start2=4700,
        strand1="-",
        strand2="+"
    )
    # the remaining pairs differ only in name and chromosome
    for name, chrom in (("test2", "chr1"), ("test1", "chr5"), ("test4", "chr2")):
        builder.add_pair(name=name, chrom=chrom, start1=4000, start2=4300)

    out_path = Path(str(tmpdir)) / "test.bam"
    builder.to_path(out_path)
    return out_path


@pytest.mark.parametrize(
    argnames=["sort_order", "expected_name_order"],
    argvalues=[
        (SamOrder.Coordinate, ["test2", "test3", "test4", "test1"]),
        (SamOrder.QueryName, ["test1", "test2", "test3", "test4"]),
        (SamOrder.Unsorted, ["test3", "test2", "test1", "test4"]),
        (None, ["test3", "test2", "test1", "test4"])
    ],
    ids=["Coordinate sorting", "Query name sorting", "Unsorted output", "Unsorted output - None"]
)
def test_sort_types(
    tmpdir: TmpDir,
    sort_order: Optional[SamOrder],
    expected_name_order: List[str]
) -> None:
    """Records read back from disk must appear in the name order implied by the sort order."""
    bam_path = make_sort_order_builder(tmpdir=tmpdir, sort_order=sort_order)
    with sam.reader(bam_path) as in_bam:
        for expected_name in expected_name_order:
            # two consecutive records are read and checked per expected name
            for _ in range(2):
                record = next(in_bam)
                assert expected_name == record.query_name, (
                    "Position based read sort order did not match expectation"
                )
def test_custom_sd() -> None:
    """A builder only accepts chromosome names present in its own sequence dictionary."""
    default_builder = SamBuilder()
    custom_builder = SamBuilder(sd=[{"SN": "hi", "LN": 999}, {"SN": "bye", "LN": 888}])

    # each builder accepts a name from its own dictionary ...
    default_builder.add_pair(chrom="chr1", start1=200, start2=400)
    custom_builder.add_pair(chrom="hi", start1=200, start2=400)

    # ... and rejects a name that only the other builder knows
    for builder, foreign_chrom in ((default_builder, "hi"), (custom_builder, "chr1")):
        with pytest.raises(ValueError, match="not a valid chromosome name"):
            builder.add_pair(chrom=foreign_chrom, start1=200, start2=400)


def test_custom_rg() -> None:
    """Records carry the ID of a custom read group in their RG tag."""
    read_group = {"ID": "novel", "SM": "custom_rg", "LB": "foo", "PL": "ILLUMINA"}
    builder = SamBuilder(rg=read_group)
    for record in builder.add_pair(chrom="chr1", start1=100, start2=200):
        assert record.get_tag("RG") == "novel"


# --------------------------------------------------------------------------------
# /samwell/tests/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/myriad-opensource/samwell/47c4d809e3a228cf2be7af09871ab70e706763a1/samwell/tests/__init__.py

# --------------------------------------------------------------------------------
# /samwell/tests/test_dnautils.py:
# --------------------------------------------------------------------------------
"""Tests for :py:mod:`~samwell.dnautils`"""

import pytest

from samwell import dnautils


def test_reverse_complement() -> None:
    """Reverse-complements valid sequences and raises ``KeyError`` on unsupported characters."""
    valid_cases = (
        ("", ""),
        ("AATTCCGGaattccgg", "ccggaattCCGGAATT"),
        ("ACGTN", "NACGT"),
    )
    for bases, expected in valid_cases:
        assert dnautils.reverse_complement(bases) == expected

    for unsupported in ("ACGT.GAT", "RMNY"):
        with pytest.raises(KeyError):
            dnautils.reverse_complement(unsupported)
def test_mask_long_homopolymers() -> None:
    """Homopolymer runs at or above the given length are replaced by N; shorter runs are kept."""
    seq = "ACCGGGTTTTAAAAATTTT"
    # (input bases, length threshold, expected masked output)
    cases = (
        ("A", 0, "N"),
        ("A", 1, "N"),
        ("A", 2, "A"),
        (seq, 1, "NNNNNNNNNNNNNNNNNNN"),
        (seq, 2, "ANNNNNNNNNNNNNNNNNN"),
        (seq, 3, "ACCNNNNNNNNNNNNNNNN"),
        (seq, 4, "ACCGGGNNNNNNNNNNNNN"),
        (seq, 5, "ACCGGGTTTTNNNNNTTTT"),
        (seq, 6, "ACCGGGTTTTAAAAATTTT"),
    )
    for bases, threshold, expected in cases:
        assert dnautils.mask_long_homopolymers(bases, threshold) == expected


def test_has_long_homopolymer() -> None:
    """Checks the truthiness of ``has_long_homopolymer`` for several thresholds."""
    seq = "ACCGGGTTTTAAAAATTTT"
    # (input bases, threshold, whether a truthy result is expected)
    cases = (
        ("A", 0, True),
        ("A", 1, False),
        (seq, 4, True),
        (seq, 5, False),
        (seq, 10, False),
    )
    for bases, threshold, expected in cases:
        assert bool(dnautils.has_long_homopolymer(bases, threshold)) is expected


# --------------------------------------------------------------------------------
# /samwell/tests/test_itertools.py:
# --------------------------------------------------------------------------------
"""Tests for :py:mod:`~samwell.itertools`"""

import pytest

from samwell.itertools import PeekableIterator
from samwell.itertools import peekable
from samwell.itertools import MergingIterator


def test_peekable_iterator_empty() -> None:
    """An iterator over nothing cannot be peeked and raises on ``peek``/``next``."""
    empty_iter: PeekableIterator[None] = peekable([])
    assert not empty_iter.can_peek()
    assert empty_iter.maybe_peek() is None, "maybe_peek was not None for empty iterator"
    # both ways of demanding an element must signal exhaustion
    with pytest.raises(StopIteration):
        empty_iter.peek()
    with pytest.raises(StopIteration):
        next(empty_iter)
def test_peekable_iterator_nonempty() -> None:
    """Peeking shows the next element without consuming it, until the iterator is exhausted."""
    nonempty_iter = peekable(range(10))
    for expected in range(10):
        assert nonempty_iter.can_peek()
        assert nonempty_iter.peek() == expected
        assert nonempty_iter.maybe_peek() == expected, "maybe_peek value didn't match expectation"
        assert next(nonempty_iter) == expected

    # after the last element the iterator behaves like an empty one
    assert nonempty_iter.maybe_peek() is None, "maybe_peek was not None for exhausted iterator"
    with pytest.raises(StopIteration):
        nonempty_iter.peek()
    with pytest.raises(StopIteration):
        next(nonempty_iter)


def test_peekable_with_nones() -> None:
    """``None`` elements inside the underlying iterable are returned like any other value."""
    xs = [1, 2, None, 4, None, 6]
    iterator = peekable(xs)

    for expected in xs:
        assert iterator.peek() is expected
        assert iterator.maybe_peek() is expected
        assert next(iterator) is expected


def test_takewhile() -> None:
    """``takewhile`` returns the leading run matching the predicate and leaves the rest."""

    def is_even(x: int) -> bool:
        return x % 2 == 0

    def is_odd(x: int) -> bool:
        return x % 2 == 1

    iterator = peekable([2, 4, 6, 8, 11, 13, 15, 17, 19, 20, 22, 24])
    assert iterator.takewhile(is_even) == [2, 4, 6, 8]
    assert iterator.takewhile(is_odd) == [11, 13, 15, 17, 19]
    # the next element (20) is even, so nothing is taken here
    assert iterator.takewhile(is_odd) == []
    assert iterator.takewhile(is_even) == [20, 22, 24]


def test_dropwhile() -> None:
    """``dropwhile`` discards the leading run matching the predicate."""
    iterator = peekable([2, 4, 6, 8, 11, 13, 15, 17, 19, 20, 22, 24])
    iterator.dropwhile(lambda value: value % 2 == 0)
    iterator.dropwhile(lambda value: value <= 20)
    remaining = list(iterator)
    assert remaining == [22, 24]


def test_merging_iterator() -> None:
    """Two iterators are merged into a single stream ordered by the key function."""
    odds = [1, 3, 5, 7, 9]
    evens_then_nine = [2, 4, 6, 8, 9]
    merged_numbers = MergingIterator(iter(odds), iter(evens_then_nine), keyfunc=lambda x: x)
    assert list(merged_numbers) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 9]

    # strings ordered by their length
    first_words = ["one", "enormous", "hippopotamus"]
    second_words = ["a", "little", "diplodocus"]
    merged_words = MergingIterator(iter(first_words), iter(second_words), keyfunc=lambda x: len(x))
    assert list(merged_words) == [
        "a", "one", "little", "enormous", "diplodocus", "hippopotamus"
    ]
# --------------------------------------------------------------------------------
# /samwell/tests/test_overlap_detector.py:
# --------------------------------------------------------------------------------
"""Tests for :py:mod:`~samwell.overlap_detector`"""

from typing import List

from samwell.overlap_detector import Interval
from samwell.overlap_detector import OverlapDetector


def run_test(targets: List[Interval], query: Interval, results: List[Interval]) -> None:
    """Loads ``targets`` into a fresh detector and checks what it reports for ``query``.

    Args:
        targets: the intervals to add to the detector.
        query: the interval to look up.
        results: the intervals ``get_overlaps`` is expected to return for ``query``.
    """
    detector = OverlapDetector()
    # Use add_all() to cover both itself and add()
    detector.add_all(intervals=targets)

    # overlaps_any() must be True exactly when at least one result is expected
    expect_any = len(results) > 0
    assert detector.overlaps_any(query) == expect_any

    # get_overlaps() must return the expected intervals
    assert detector.get_overlaps(query) == results


def test_same_interval() -> None:
    """An interval overlaps itself."""
    same = Interval("1", 10, 100)
    run_test(targets=[same], query=same, results=[same])


def test_query_wholly_contained_in_target() -> None:
    """A query lying strictly inside the target overlaps it."""
    outer = Interval("1", 10, 100)
    run_test(targets=[outer], query=Interval("1", 11, 99), results=[outer])


def test_target_wholly_contained_in_query() -> None:
    """A query that strictly contains the target overlaps it."""
    inner = Interval("1", 10, 100)
    run_test(targets=[inner], query=Interval("1", 9, 101), results=[inner])


def test_target_overlaps_first_base_of_query() -> None:
    """A query covering only the final position of the target (99-100) overlaps it."""
    target = Interval("1", 10, 100)
    run_test(targets=[target], query=Interval("1", 99, 100), results=[target])


def test_target_overlaps_last_base_of_query() -> None:
    """A query covering only the first position of the target (10-11) overlaps it."""
    target = Interval("1", 10, 100)
    run_test(targets=[target], query=Interval("1", 10, 11), results=[target])


def test_query_before_target() -> None:
    """A query ending where the target starts (9-10 vs 10-100) does not overlap it."""
    target = Interval("1", 10, 100)
    run_test(targets=[target], query=Interval("1", 9, 10), results=[])
def test_query_after_target() -> None:
    """A query starting where the target ends (100-101 vs 10-100) does not overlap it."""
    target = Interval("1", 10, 100)
    run_test(targets=[target], query=Interval("1", 100, 101), results=[])


def test_different_references() -> None:
    """Identical coordinates on different references do not overlap."""
    on_ref_1 = Interval("1", 10, 100)
    on_ref_2 = Interval("2", 10, 100)
    run_test(targets=[on_ref_1], query=on_ref_2, results=[])


def test_multiple_overlaps() -> None:
    """A query may overlap several targets; only the overlapping ones are returned."""
    a, b, c, d = (
        Interval("1", 10, 20),
        Interval("1", 15, 25),
        Interval("1", 19, 30),
        Interval("1", 24, 35),
    )

    # B overlaps both A and C
    run_test(targets=[a, c], query=b, results=[a, c])
    # C overlaps both A and B
    run_test(targets=[a, b], query=c, results=[a, b])
    # D overlaps only B and C (is after A)
    run_test(targets=[a, b, c], query=d, results=[b, c])


def test_multiple_references() -> None:
    """Targets on several references are looked up independently per reference."""
    targets = [Interval("1", 10, 20), Interval("2", 10, 20)]
    for target in targets:
        run_test(targets=targets, query=target, results=[target])


def test_same_interval_twice() -> None:
    """Adding the same interval twice yields it only once in the results."""
    duplicate = Interval("1", 10, 100)
    run_test(targets=[duplicate, duplicate], query=duplicate, results=[duplicate])


def test_wholly_contained_target() -> None:
    """Nested targets are both reported, the outer one first."""
    inner = Interval("1", 50, 60)
    outer = Interval("1", 40, 80)

    run_test(targets=[inner, outer], query=inner, results=[outer, inner])
def test_get_enclosing_intervals() -> None:
    """``get_enclosing_intervals`` returns the targets that wholly contain the query."""
    a = Interval("1", 1, 250)
    b = Interval("1", 5, 30)
    c = Interval("1", 10, 99)
    d = Interval("1", 15, 19)
    e = Interval("1", 16, 20)

    detector = OverlapDetector()
    detector.add_all([a, b, c, d, e])

    # (query, targets expected to enclose it)
    cases = (
        (Interval("1", 10, 100), [a]),
        (Interval("1", 15, 20), [a, b, c]),
        (Interval("1", 18, 19), [a, b, c, d, e]),
        (Interval("1", 50, 99), [a, c]),
    )
    for query, enclosing in cases:
        assert detector.get_enclosing_intervals(query) == enclosing


def test_get_enclosed() -> None:
    """``get_enclosed`` returns the targets that lie wholly inside the query."""
    a = Interval("1", 10, 100)
    b = Interval("1", 15, 20)
    c = Interval("1", 18, 19)
    d = Interval("1", 50, 99)

    detector = OverlapDetector()
    detector.add_all([a, b, c, d])

    # (query, targets expected to be enclosed by it)
    cases = (
        (Interval("1", 1, 250), [a, b, c, d]),
        (Interval("1", 5, 30), [b, c]),
        (Interval("1", 16, 20), [c]),
        (Interval("1", 15, 19), [c]),
        (Interval("1", 10, 99), [b, c, d]),
    )
    for query, enclosed in cases:
        assert detector.get_enclosed(query) == enclosed