├── .github
    └── workflows
    │   └── pythonpackage.yml
├── .gitignore
├── .readthedocs.yml
├── LICENSE
├── README.md
├── docs
    ├── Makefile
    ├── api.rst
    ├── conf.py
    ├── index.rst
    └── release_notes.rst
├── flake8.cfg
├── mypy.ini
├── poetry.lock
├── pyproject.toml
└── samwell
    ├── __init__.py
    ├── dnautils.py
    ├── itertools.py
    ├── overlap_detector.py
    ├── sam
        ├── __init__.py
        ├── bwa_mem.py
        ├── clipping.py
        ├── sambuilder.py
        └── tests
        │   ├── __init__.py
        │   ├── data
        │       └── valid.sam
        │   ├── test_bwa_mem.py
        │   ├── test_clipping.py
        │   ├── test_sam.py
        │   └── test_sambuilder.py
    └── tests
        ├── __init__.py
        ├── test_dnautils.py
        ├── test_itertools.py
        └── test_overlap_detector.py


/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
 1 | name: Python package
 2 | 
 3 | on: [push]
 4 | env:
 5 |   POETRY_VERSION: 1.0
 6 | 
 7 | 
 8 | jobs:
 9 |   testing:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         PYTHON_VERSION: [3.6, 3.7, 3.8]
14 |     steps:
15 |     - uses: actions/checkout@v2
16 |     - name: Set up Python ${{matrix.PYTHON_VERSION}}
17 |       uses: actions/setup-python@v1
18 |       with:
19 |         python-version: ${{matrix.PYTHON_VERSION}}
20 |     - name: Install bwa
21 |       env:
22 |         ACTIONS_ALLOW_UNSECURE_COMMANDS: 'true'
23 |       run: |
24 |         wget https://github.com/lh3/bwa/releases/download/v0.7.17/bwa-0.7.17.tar.bz2
25 |         tar -jxvf bwa-0.7.17.tar.bz2
26 |         cd bwa-0.7.17
27 |         make -j$(nproc)
28 |         cd ..
29 |         echo "${GITHUB_WORKSPACE}/bwa-0.7.17/" >> $GITHUB_PATH
30 |     - name: Install poetry
31 |       run: |
32 |         python -m pip install --upgrade pip
33 |         pip install poetry==${{env.POETRY_VERSION}}
34 |     - name: Install cython
35 |       run: |
36 |         poetry run pip install cython==0.29.15
37 |     - name: Install setuptools-scm for py3.6
38 |       run: |
39 |         poetry run pip install setuptools-scm==6.4.2
40 |     - name: Install deps
41 |       run: |
42 |         poetry install --extras docs
43 |     - name: Run pytest
44 |       run: |
45 |         poetry run python -m pytest --cov=samwell --cov-branch
46 |     - name: Run lint
47 |       run: |
48 |         poetry run flake8 --config=flake8.cfg samwell
49 |     - name: Run mypy
50 |       run: |
51 |         poetry run mypy -p samwell --config=mypy.ini
52 |     - name: Run docs
53 |       shell: bash
54 |       run: |
55 |         set -euo pipefail
56 |         pushd docs
57 |         poetry run make html
58 |         popd
59 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # PyCharm
 2 | .idea
 3 | 
 4 | # Python compiled & optimized files
 5 | *.pyc
 6 | *.pyo
 7 | 
 8 | # MyPy Cache directory
 9 | .mypy_cache
10 | 
11 | # for develop installs
12 | *.egg-info
13 | 
14 | # venv set up
15 | .venv
16 | dist/
17 | 
18 | # Sphinx documentation
19 | html/_build
20 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | build:
 2 |     image: latest
 3 | version: 2
 4 | sphinx:
 5 |   configuration: docs/conf.py
 6 | python:
 7 |     version: 3.6
 8 |     install:
 9 |     - method: pip
10 |       path: .
11 |       extra_requirements:
12 |         - docs
13 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Myriad Genetics, Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Language][language-badge]][language-link]
  2 | [![Code Style][code-style-badge]][code-style-link]
  3 | [![Type Checked][type-checking-badge]][type-checking-link]
  4 | [![PEP8][pep-8-badge]][pep-8-link]
  5 | [![License][license-badge]][license-link]
  6 | 
  7 | ---
  8 | 
  9 | [![Python package][python-package-badge]][python-package-link]
 10 | [![PyPI version][pypi-badge]][pypi-link]
 11 | [![PyPI download total][pypi-downloads-badge]][pypi-downloads-link]
 12 | 
 13 | ---
 14 | 
 15 | [language-badge]:       http://img.shields.io/badge/language-python-brightgreen.svg
 16 | [language-link]:        http://www.python.org/
 17 | [code-style-badge]:     https://img.shields.io/badge/code%20style-black-000000.svg
 18 | [code-style-link]:      https://black.readthedocs.io/en/stable/ 
 19 | [type-checking-badge]:  http://www.mypy-lang.org/static/mypy_badge.svg
 20 | [type-checking-link]:   http://mypy-lang.org/
 21 | [pep-8-badge]:          https://img.shields.io/badge/code%20style-pep8-brightgreen.svg
 22 | [pep-8-link]:           https://www.python.org/dev/peps/pep-0008/
 23 | [license-badge]:        http://img.shields.io/badge/license-MIT-blue.svg
 24 | [license-link]:         https://github.com/myriad-opensource/samwell/blob/master/LICENSE
 25 | [python-package-badge]: https://github.com/myriad-opensource/samwell/workflows/Python%20package/badge.svg
 26 | [python-package-link]:  https://github.com/myriad-opensource/samwell/actions?query=workflow%3A%22Python+package%22
 27 | [pypi-badge]:           https://badge.fury.io/py/samwell.svg
 28 | [pypi-link]:            https://pypi.python.org/pypi/samwell
 29 | [pypi-downloads-badge]: https://img.shields.io/pypi/dm/samwell
 30 | [pypi-downloads-link]:  https://pypi.python.org/pypi/samwell
 31 | 
 32 | # Samwell: a python package for using genomic files... well
 33 | 
 34 | Samwell provides elegant utilities for managing biological data.
 35 | 
 36 | See: [samwell.readthedocs.io](https://samwell.readthedocs.io/en/latest/)
 37 | 
 38 | ## Quickstart
 39 | 
 40 | First install samwell:
 41 | 
 42 | ```
 43 | pip install samwell
 44 | ```
 45 | 
 46 | ### Reading/Writing BAMs with automatic inference of filetype
 47 | 
 48 | Samwell provides easy utilities for reading/writing BAMs:
 49 | 
 50 | ```python
 51 | from samwell import sam
 52 | with sam.reader("myfile.bam") as in_bam:
 53 |     with sam.writer("my-output-file.bam", header=in_bam.header) as out_bam:
 54 |         for read in in_bam:
 55 |             if read.is_paired:
 56 |                 out_bam.write(read)
 57 | ```
 58 | 
 59 | 
 60 | ### Realigning fastqs with bwa
 61 | 
 62 | You can use `samwell` to easily realign fastq records as necessary
 63 | 
 64 | ```python
 65 | from pathlib import Path
 66 | from samwell import sam
 67 | from samwell.sam import bwa_mem
 68 | from samwell.sam import clipping
 69 | from samwell.sam.bwa_mem import FastqRecord
 70 | with sam.reader("myfile.bam") as in_bam:
 71 |     with sam.writer("outfile.bam", header=in_bam.header) as out_bam:
 72 |          fastq_gen = iter(FastqRecord.build(read) for read in in_bam)
 73 |          for read in bwa_mem.align(fastq_gen, Path("genome.fasta")):
 74 |              out_bam.write(read)
 75 | ```
 76 | 
 77 | See the `samwell.sam.bwa_mem` module for more detail.
 78 | 
 79 | 
 80 | ## Developing with samwell
 81 | 
 82 | Samwell uses [`poetry`](https://github.com/python-poetry/poetry#installation) for dependency management.
 83 | 
 84 | Please install `poetry` using the instructions in the above link.
 85 | Then simply execute:
 86 | 
 87 | ```bash
 88 | poetry install
 89 | ```
 90 | 
 91 | ## Checking the Build
 92 | 
 93 | ### Linting 
 94 | 
 95 | ```bash
 96 | poetry run flake8 --config=flake8.cfg samwell
 97 | ```
 98 | 
 99 | ### Type Checking
100 | 
101 | ```bash
102 | poetry run mypy -p samwell --config=mypy.ini
103 | ```
104 | 
105 | ### Unit Tests
106 | 
107 | ```bash
108 | poetry run python -m pytest --cov=samwell --cov-branch
109 | ```
110 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | PAPER         =
 8 | 
 9 | # Internal variables.
10 | PAPEROPT_a4     = -D latex_paper_size=a4
11 | PAPEROPT_letter = -D latex_paper_size=letter
12 | ALLSPHINXOPTS   = -d _build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
13 | 
14 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
15 | 
16 | help:
17 | 	@echo "Please use \`make <target>' where <target> is one of"
18 | 	@echo "  html      to make standalone HTML files"
19 | 	@echo "  dirhtml   to make HTML files named index.html in directories"
20 | 	@echo "  pickle    to make pickle files"
21 | 	@echo "  json      to make JSON files"
22 | 	@echo "  htmlhelp  to make HTML files and a HTML help project"
23 | 	@echo "  qthelp    to make HTML files and a qthelp project"
24 | 	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
25 | 	@echo "  changes   to make an overview of all changed/added/deprecated items"
26 | 	@echo "  linkcheck to check all external links for integrity"
27 | 	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
28 | 
29 | clean:
30 | 	-rm -rf _build/*
31 | 
32 | html:
33 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) _build/html
34 | 	@echo
35 | 	@echo "Build finished. The HTML pages are in _build/html."
36 | 
37 | dirhtml:
38 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) _build/dirhtml
39 | 	@echo
40 | 	@echo "Build finished. The HTML pages are in _build/dirhtml."
41 | 
42 | pickle:
43 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) _build/pickle
44 | 	@echo
45 | 	@echo "Build finished; now you can process the pickle files."
46 | 
47 | json:
48 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) _build/json
49 | 	@echo
50 | 	@echo "Build finished; now you can process the JSON files."
51 | 
52 | htmlhelp:
53 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) _build/htmlhelp
54 | 	@echo
55 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
56 | 	      ".hhp project file in _build/htmlhelp."
57 | 
# Build the Qt help project; the generated files are named after the Sphinx
# project ("samwell").
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) _build/qthelp
	@echo
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in _build/qthelp, like this:"
	@echo "# qcollectiongenerator _build/qthelp/samwell.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile _build/qthelp/samwell.qhc"
66 | 
67 | latex:
68 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) _build/latex
69 | 	@echo
70 | 	@echo "Build finished; the LaTeX files are in _build/latex."
71 | 	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
72 | 	      "run these through (pdf)latex."
73 | 
74 | changes:
75 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) _build/changes
76 | 	@echo
77 | 	@echo "The overview file is in _build/changes."
78 | 
79 | linkcheck:
80 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) _build/linkcheck
81 | 	@echo
82 | 	@echo "Link check complete; look for any errors in the above output " \
83 | 	      "or in _build/linkcheck/output.txt."
84 | 
85 | doctest:
86 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) _build/doctest
87 | 	@echo "Testing of doctests in the sources finished, look at the " \
88 | 	      "results in _build/doctest/output.txt."
89 | 


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
 1 | ===
 2 | API
 3 | ===
 4 | 
 5 | 
 6 | 
 7 | .. automodule:: samwell.dnautils
 8 |    :members:
 9 | 
10 | .. automodule:: samwell.itertools
11 |    :members:
12 | 
13 | .. automodule:: samwell.overlap_detector
14 |    :members:
15 | 
16 | .. automodule:: samwell.sam
17 |    :members:
18 | 
19 | .. automodule:: samwell.sam.bwa_mem
20 |    :members:
21 | 
22 | .. automodule:: samwell.sam.clipping
23 |    :members:
24 | 
25 | .. automodule:: samwell.sam.sambuilder
26 |    :members:
27 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # samwell documentation build configuration file
  4 | #
  5 | # This file is execfile()d with the current directory set to its containing dir.
  6 | #
  7 | # Note that not all possible configuration values are present in this
  8 | # autogenerated file.
  9 | #
 10 | # All configuration values have a default; values that are commented out
 11 | # serve to show the default.
 12 | 
 13 | import sys, os, glob
 14 | 
 15 | # If extensions (or modules to document with autodoc) are in another directory,
 16 | # add these directories to sys.path here. If the directory is relative to the
 17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 18 | #_libdir = "../build/lib.%s-%s-%s.%s" % (os.uname()[0].lower(), os.uname()[4],
 19 | #                                        sys.version_info[0], sys.version_info[1])
 20 | _libdir = "../build/lib"
 21 | if os.path.exists(_libdir):
 22 |     sys.path.insert(0, os.path.abspath(_libdir))
 23 | 
 24 | # -- General configuration -----------------------------------------------------
 25 | 
 26 | # Add any Sphinx extension module names here, as strings. They can be extensions
 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 28 | extensions = ['sphinx.ext.autodoc',
 29 |               'sphinx.ext.autosummary',
 30 |               'sphinx.ext.todo', 
 31 |               'sphinx.ext.ifconfig',
 32 |               'sphinx.ext.intersphinx',
 33 |               'sphinx.ext.napoleon']
 34 | 
 35 | intersphinx_mapping = {'python': ('http://docs.python.org/3.6', None)}
 36 | 
 37 | # Add any paths that contain templates here, relative to this directory.
 38 | templates_path = ['_templates']
 39 | 
 40 | # The suffix of source filenames.
 41 | source_suffix = '.rst'
 42 | 
 43 | # The encoding of source files.
 44 | #source_encoding = 'utf-8'
 45 | 
 46 | # The master toctree document.
 47 | master_doc = 'index'
 48 | 
 49 | # General information about the project.
 50 | project = u'samwell'
 51 | copyright = u'2021, Myriad Genetics, Inc.'
 52 | 
 53 | # Included at the end of each rst file
 54 | rst_epilog = '''
 55 | .. _samwell: https://github.com/fulcrumgenomics/samwell
 56 | .. _python: http://python.org/
 57 | .. _conda: https://conda.io/docs/
 58 | '''
 59 | 
 60 | autosummary_generate = True
 61 | 
 62 | # The version info for the project you're documenting, acts as replacement for
 63 | # |version| and |release|, also used in various other places throughout the
 64 | # built documents.
 65 | #
 66 | from pathlib import Path
 67 | import os
 68 | toml_path = Path(os.path.realpath(__file__)).parent.parent / 'pyproject.toml'
 69 | with toml_path.open("r") as reader:
 70 |     for line in reader:
 71 |         if line.startswith("version"):
 72 |             version = line.rstrip("\r\n").split(" = ")[1]
 73 |             version = version[1:-1]
 74 |             break
 75 | 
 76 | # The full version, including alpha/beta/rc tags.
 77 | release = version
 78 | 
 79 | # The language for content autogenerated by Sphinx. Refer to documentation
 80 | # for a list of supported languages.
 81 | # language = None
 82 | 
 83 | # There are two options for replacing |today|: either, you set today to some
 84 | # non-false value, then it is used:
 85 | # today = ''
 86 | # Else, today_fmt is used as the format for a strftime call.
 87 | # today_fmt = '%B %d, %Y'
 88 | 
 89 | # List of documents that shouldn't be included in the build.
 90 | # unused_docs = []
 91 | 
 92 | # List of directories, relative to source directory, that shouldn't be searched
 93 | # for source files.
 94 | exclude_trees = ['_build']
 95 | 
 96 | # The reST default role (used for this markup: `text`) to use for all documents.
 97 | # default_role = None
 98 | 
 99 | # If true, '()' will be appended to :func: etc. cross-reference text.
100 | # add_function_parentheses = True
101 | 
102 | # If true, the current module name will be prepended to all description
103 | # unit titles (such as .. function::).
104 | # add_module_names = True
105 | 
106 | # If true, sectionauthor and moduleauthor directives will be shown in the
107 | # output. They are ignored by default.
108 | # show_authors = False
109 | 
110 | # The name of the Pygments (syntax highlighting) style to use.
111 | pygments_style = 'sphinx'
112 | 
113 | # A list of ignored prefixes for module index sorting.
114 | #modindex_common_prefix = []
115 | 
116 | 
117 | # -- Options for HTML output ---------------------------------------------------
118 | 
119 | # The theme to use for HTML and HTML Help pages.  Major themes that come with
120 | # Sphinx are currently 'default' and 'sphinxdoc'.
121 | html_theme = 'default'
122 | 
123 | # Theme options are theme-specific and customize the look and feel of a theme
124 | # further.  For a list of options available for each theme, see the
125 | # documentation.
126 | # html_theme_options = {}
127 | 
128 | # Add any paths that contain custom themes here, relative to this directory.
129 | # html_theme_path = []
130 | 
131 | # The name for this set of Sphinx documents.  If None, it defaults to
132 | # "<project> v<release> documentation".
133 | # html_title = None
134 | 
135 | # A shorter title for the navigation bar.  Default is the same as html_title.
136 | # html_short_title = None
137 | 
138 | # The name of an image file (relative to this directory) to place at the top
139 | # of the sidebar.
140 | # html_logo = None
141 | 
142 | # The name of an image file (within the static path) to use as favicon of the
143 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
144 | # pixels large.
145 | # html_favicon = None
146 | 
147 | # Add any paths that contain custom static files (such as style sheets) here,
148 | # relative to this directory. They are copied after the builtin static files,
149 | # so a file named "default.css" will overwrite the builtin "default.css".
150 | html_static_path = []
151 | 
152 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
153 | # using the given strftime format.
154 | # html_last_updated_fmt = '%b %d, %Y'
155 | 
156 | # If true, SmartyPants will be used to convert quotes and dashes to
157 | # typographically correct entities.
158 | # html_use_smartypants = True
159 | 
160 | # Custom sidebar templates, maps document names to template names.
161 | # html_sidebars = {}
162 | 
163 | # Additional templates that should be rendered to pages, maps page names to
164 | # template names.
165 | # html_additional_pages = {}
166 | 
167 | # If false, no module index is generated.
168 | # html_use_modindex = True
169 | 
170 | # If false, no index is generated.
171 | # html_use_index = True
172 | 
173 | # If true, the index is split into individual pages for each letter.
174 | # html_split_index = False
175 | 
176 | # If true, links to the reST sources are added to the pages.
177 | # html_show_sourcelink = True
178 | 
179 | # If true, an OpenSearch description file will be output, and all pages will
180 | # contain a <link> tag referring to it.  The value of this option must be the
181 | # base URL from which the finished HTML is served.
182 | # html_use_opensearch = ''
183 | 
184 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
185 | # html_file_suffix = ''
186 | 
187 | # Output file base name for HTML help builder.
188 | htmlhelp_basename = 'samwelldoc'
189 | 
190 | 
191 | # -- Options for LaTeX output --------------------------------------------------
192 | 
193 | # The paper size ('letter' or 'a4').
194 | # latex_paper_size = 'letter'
195 | 
196 | # The font size ('10pt', '11pt' or '12pt').
197 | # latex_font_size = '10pt'
198 | 
199 | # Grouping the document tree into LaTeX files. List of tuples
200 | # (source start file, target name, title, author, documentclass [howto/manual]).
201 | latex_documents = [
202 |     ('index', 'samwell.tex', u'samwell documentation', u'Nils Homer, Tim Fennell, et al.', 'manual'),
203 | ]
204 | 
205 | # The name of an image file (relative to this directory) to place at the top of
206 | # the title page.
207 | # latex_logo = None
208 | 
209 | # For "manual" documents, if this is true, then toplevel headings are parts,
210 | # not chapters.
211 | # latex_use_parts = False
212 | 
213 | # Additional stuff for the LaTeX preamble.
214 | # latex_preamble = ''
215 | 
216 | # Documents to append as an appendix to all manuals.
217 | # latex_appendices = []
218 | 
219 | # If false, no module index is generated.
220 | # latex_use_modindex = True
221 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | ==========================================================
 2 | Samwell: a python package for using genomic files... well.
 3 | ==========================================================
 4 | 
 5 | :Date: |today|
 6 | :Version: |version|
 7 | 
 8 | Samwell provides elegant utilities for managing biological data.
 9 | 
10 | 
11 | Documentation Contents
12 | ======================
13 | 
14 | .. toctree::
15 |    :maxdepth: 2
16 | 
17 |    index.rst
18 |    api.rst
19 | 
20 | .. toctree::
21 |    :maxdepth: 1
22 | 
23 |    release_notes.rst
24 | 
25 | 
26 | Quickstart
27 | ==========
28 | 
29 | First install samwell::
30 | 
31 |     pip install samwell
32 | 
33 | Reading/Writing BAMs with automatic inference of filetype
34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
35 | 
36 | Samwell provides easy utilities for reading/writing BAMs::
37 | 
38 |     from samwell import sam
39 |     with sam.reader("myfile.bam") as in_bam:
40 |         with sam.writer("my-output-file.bam", header=in_bam.header) as out_bam:
41 |             for read in in_bam:
42 |                 if read.is_paired:
43 |                     out_bam.write(read)
44 | 
45 | See :mod:`~samwell.sam` module for more detail.
46 | 
47 | Realigning fastqs with bwa
48 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
49 | 
50 | You can use :mod:`~samwell` to easily realign fastq records as necessary::
51 | 
52 |     from pathlib import Path
53 |     from samwell import sam
54 |     from samwell.sam import bwa_mem
55 |     from samwell.sam import clipping
56 |     from samwell.sam.bwa_mem import FastqRecord
57 |     with sam.reader("myfile.bam") as in_bam:
58 |         with sam.writer("outfile.bam", header=in_bam.header) as out_bam:
59 |              fastq_gen = iter(FastqRecord.build(read) for read in in_bam)
60 |              for read in bwa_mem.align(fastq_gen, Path("genome.fasta")):
61 |                  out_bam.write(read)
62 | 
63 | See :mod:`~samwell.sam.bwa_mem` module for more detail.
64 | 
65 | Developing with samwell
66 | =======================
67 | 
68 | Samwell uses `poetry <https://github.com/python-poetry/poetry#installation>`_ for dependency management.
69 | 
70 | Please install `poetry` using the instructions in the above link.
71 | Then simply execute::
72 | 
73 |     poetry install
74 | 
75 | Checking the Build
76 | ~~~~~~~~~~~~~~~~~~
77 | 
78 | Linting::
79 | 
80 |     poetry run flake8 --config=flake8.cfg samwell
81 | 
82 | Type Checking::
83 | 
84 |     poetry run mypy -p samwell --config=mypy.ini
85 | 
86 | Unit Tests::
87 | 
88 |     poetry run python -m pytest --cov=samwell --cov-branch
89 | 


--------------------------------------------------------------------------------
/docs/release_notes.rst:
--------------------------------------------------------------------------------
 1 | Release Notes
 2 | =============
 3 | 
 4 | .. contents:: Table of Contents
 5 |    :depth: 2
 6 |    :local:
 7 |    :backlinks: none
 8 | 
 9 | v0.0.2
10 | ------
11 | Add in some missing typing to dnautils and sam.bwa_mem
12 | 
13 | v0.0.1
14 | ------
15 | Initial release
16 | 


--------------------------------------------------------------------------------
/flake8.cfg:
--------------------------------------------------------------------------------
1 | # flake8 config file for samwell
2 | 
3 | [flake8]
4 | max_line_length = 99
5 | show-source = true
6 | ignore = E701 W504
7 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | strict_optional = False
3 | ignore_missing_imports = True
4 | disallow_untyped_decorators = False
5 | follow_imports = silent
6 | disallow_untyped_defs = True
7 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "samwell"
 3 | version = "v0.0.4"
 4 | description = "Useful utilities for biological data formats and analyses"
 5 | authors = ["Jeff Tratner <jeffrey.tratner@myriad.com>", "Nils Homer", "Tim Fennell", "Nathan Roach"]
 6 | 
 7 | [tool.poetry.dependencies]
 8 | python = ">=3.6"
 9 | mypy-extensions = ">=0.4.3"
10 | defopt = ">=5.1.0"
11 | attrs = ">=19.3.0"
12 | intervaltree = ">=3.0.2"
13 | pysam = ">=0.15.3"
14 | pybedlite = ">=0.0.1"
15 | sphinx = {version = "4.3.1", optional = true}
16 | 
17 | [tool.poetry.dev-dependencies]
18 | pytest = ">=5.3.5"
19 | pytest-vcr = ">=1.0.2"
20 | flake8 = ">=3.7.9"
21 | mypy = ">=0.761"
22 | pytest-cov = ">=2.8.1"
23 | 
24 | [tool.poetry.extras]
25 | docs = ["sphinx"]
26 | 
27 | [build-system]
28 | requires = ["poetry>=0.12"]
29 | build-backend = "poetry.masonry.api"
30 | 


--------------------------------------------------------------------------------
/samwell/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myriad-opensource/samwell/47c4d809e3a228cf2be7af09871ab70e706763a1/samwell/__init__.py


--------------------------------------------------------------------------------
/samwell/dnautils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utility Functions for Manipulating DNA sequences.
 3 | -------------------------------------------------
 4 | 
 5 | This module contains utility functions for manipulating DNA sequences.
 6 | 
 7 | """
 8 | 
 9 | from typing import Dict
10 | 
# Maps each supported base to its complement.  Both cases are supported for
# every base, including the ambiguous base N/n, and case is preserved.
_RC_DICT: Dict[str, str] = dict(
    A='T', C='G', G='C', T='A', N='N',
    a='t', c='g', g='c', t='a', n='n',
)


def reverse_complement(bases: str) -> str:
    """Reverse complements a base sequence.

    The case of each base is preserved, so ``"AAcg"`` becomes ``"cgTT"``.

    Arguments:
        bases: the bases to be reverse complemented.

    Returns:
        the reverse complement of the provided base string

    Raises:
        KeyError: if ``bases`` contains a character other than A, C, G, T or N (in either case).
    """
    return ''.join(_RC_DICT[base] for base in reversed(bases))
26 | 
27 | 
def mask_long_homopolymers(bases: str, min_long_hp_length: int, mask_base: str = 'N') -> str:
    """Returns the bases masked for regions with long homopolymers

    Every run of identical, adjacent bases whose length is at least ``min_long_hp_length`` is
    replaced with the same number of copies of ``mask_base``.  Comparison is case-sensitive.

    Args:
        bases: the bases to mask.
        min_long_hp_length: the minimum homopolymer length (inclusive) to mask.
        mask_base: the base to use when masking

    Returns:
        the masked bases; an empty input yields an empty string.
    """
    if not bases:
        # Nothing to mask, and the scan below needs at least one base to start from.
        return ''
    masked = list(bases)
    count = 1  # length of the homopolymer run that ends at the current position
    last_base = bases[0]
    for i in range(1, len(bases)):
        cur_base = bases[i]
        if last_base == cur_base:
            count += 1
        else:
            # The run ended just before position i; mask it if it was long enough.
            if count >= min_long_hp_length:
                masked[i - count:i] = mask_base * count
            last_base = cur_base
            count = 1
    # The final run is never closed inside the loop, so handle it here.
    if count >= min_long_hp_length:
        masked[-count:] = mask_base * count
    return ''.join(masked)
51 | 
52 | 
def has_long_homopolymer(bases: str, max_hp_length: int) -> bool:
    """Returns true if the given bases has a homopolymer length longer than the given length.

    A homopolymer is a run of identical, adjacent bases; comparison is case-sensitive.  An empty
    sequence has no homopolymer, so it is only "too long" for a negative ``max_hp_length``.

    Args:
        bases: the bases to examine.
        max_hp_length: the maximum homopolymer length to allow.

    Returns:
        True if any homopolymer run is strictly longer than ``max_hp_length``, False otherwise.
    """
    count = 0  # length of the homopolymer run that ends at the current base
    last_base = None
    for cur_base in bases:
        if cur_base == last_base:
            count += 1
        else:
            last_base = cur_base
            count = 1
        if count > max_hp_length:
            # No need to scan further once one run exceeds the limit.
            return True
    return count > max_hp_length
72 | 


--------------------------------------------------------------------------------
/samwell/itertools.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Functions for Creating Useful Iterators
  3 | ---------------------------------------
  4 | 
  5 | This module contains classes and functions for creating useful iterators.
  6 | 
  7 | Examples of a "Peekable" Iterator
  8 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  9 | 
 10 | "Peekable" iterators are useful to "peek" at the next item in an iterator without consuming it.
 11 | For example, this is useful when consuming items from an iterator while a predicate is true,
 12 | without consuming the first item for which the predicate is false.  See the
 13 | :func:`~samwell.itertools.PeekableIterator.takewhile` and
 14 | :func:`~samwell.itertools.PeekableIterator.dropwhile` methods.
 15 | 
 16 | An empty peekable iterator throws StopIteration:
 17 | 
 18 | .. code-block:: python
 19 | 
 20 |     >>> from samwell.itertools import peekable
 21 |     >>> piter = peekable(iter([]))
 22 |     >>> piter.peek()
 23 |     StopIteration
 24 | 
 25 | A peekable iterator will return the next item before consuming it.
 26 | 
 27 | .. code-block:: python
 28 | 
 29 |     >>> piter = peekable(iter([1, 2, 3]))
 30 |     >>> piter.peek()
 31 |     1
 32 |     >>> next(piter)
 33 |     1
 34 |     >>> [j for j in piter]
 35 |     [2, 3]
 36 | 
 37 | The `can_peek()` function can be used to determine if the iterator can be peeked without
 38 | StopIteration being thrown:
 39 | 
 40 |     >>> piter = peekable([1])
 41 |     >>> piter.peek() if piter.can_peek() else -1
 42 |     1
 43 |     >>> next(piter)
 44 |     1
 45 |     >>> piter.peek() if piter.can_peek() else -1
 46 |     -1
 47 |     >>> next(piter)
 48 |     StopIteration
 49 | 
 50 | The `peekable()` function should be preferred to calling `PeekableIterator`'s constructor
 51 | directly as it supports creation from iterable objects as well as iterators, while the constructor
 52 | requires an iterator.
 53 | 
 54 | Examples of a "Merging" Iterator
 55 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 56 | 
 57 | A "merging" iterator can merge two iterators in order based on a given ordering function.  This is
 58 | useful for merging two iterators that are already in order.
 59 | 
 60 | .. code-block:: python
 61 | 
 62 |     >>> from samwell.itertools import MergingIterator
 63 |     >>> even = iter([2, 4, 6, 8])
 64 |     >>> odd = iter([1, 3, 5, 7, 9])
 65 |     >>> merging = MergingIterator(even, odd, lambda x: x)
 66 |     >>> list(merging)
 67 |     [1, 2, 3, 4, 5, 6, 7, 8, 9]
 68 | 
 69 | Module Contents
 70 | ~~~~~~~~~~~~~~~
 71 | 
 72 | The module contains the following public classes:
 73 | 
 74 |     - :class:`~samwell.itertools.PeekableIterator` -- Iterator that allows you to peek at the
 75 |         next value before calling next
 76 | 
 77 |     - :class:`~samwell.itertools.MergingIterator` -- Iterator that allows merging of two
 78 |         iterators using a keyfunc to decide from which iterator to draw the next item
 79 | 
 80 | The module contains the following methods:
 81 | 
 82 |     - :func:`~samwell.itertools.peekable` -- Creates an iterator that allows you to peek at
 83 |         the next value before calling next
 84 | """
 85 | 
 86 | from typing import Any
 87 | from typing import Optional
 88 | from typing import Callable
 89 | from typing import Generic
 90 | from typing import Iterable
 91 | from typing import Iterator
 92 | from typing import List
 93 | from typing import TypeVar
 94 | from typing import Union
 95 | 
 96 | 
 97 | IterType = TypeVar('IterType')
 98 | 
 99 | 
100 | class PeekableIterator(Generic[IterType], Iterator[IterType]):
101 |     """A peekable iterator wrapping an iterable.
102 | 
103 |     This allows returning the next item without consuming it.
104 | 
105 |     Args:
106 |         source: an iterator over the objects
107 |     """
108 | 
109 |     def __init__(self, source: Iterator[IterType]) -> None:
110 |         self._iter: Iterator[IterType] = source
111 |         self._sentinel: Any = object()
112 |         self.__update_peek()
113 | 
114 |     def __iter__(self) -> Iterator[IterType]:
115 |         return self
116 | 
117 |     def __next__(self) -> IterType:
118 |         to_return = self.peek()
119 |         self.__update_peek()
120 |         return to_return
121 | 
122 |     def __update_peek(self) -> None:
123 |         self._peek = next(self._iter, self._sentinel)
124 | 
125 |     def can_peek(self) -> bool:
126 |         """Returns true if there is a value that can be peeked at, false otherwise."""
127 |         return self._peek is not self._sentinel
128 | 
129 |     def peek(self) -> IterType:
130 |         """Returns the next element without consuming it, or StopIteration otherwise."""
131 |         if self.can_peek():
132 |             return self._peek
133 |         else:
134 |             raise StopIteration
135 | 
136 |     def maybe_peek(self) -> Optional[IterType]:
137 |         """Returns the next element without consuming it, or None otherwise."""
138 |         return self._peek if self.can_peek() else None
139 | 
140 |     def takewhile(self, pred: Callable[[IterType], bool]) -> List[IterType]:
141 |         """Consumes from the iterator while pred is true, and returns the result as a List.
142 | 
143 |         The iterator is left pointing at the first non-matching item, or if all items match
144 |         then the iterator will be exhausted.
145 | 
146 |         Args:
147 |             pred: a function that takes the next value from the iterator and returns
148 |                   true or false.
149 | 
150 |         Returns:
151 |             List[V]: A list of the values from the iterator, in order, up until and excluding
152 |             the first value that does not match the predicate.
153 |         """
154 |         xs: List[IterType] = []
155 |         while self.can_peek() and pred(self._peek):
156 |             xs.append(next(self))
157 |         return xs
158 | 
159 |     def dropwhile(self, pred: Callable[[IterType], bool]) -> "PeekableIterator[IterType]":
160 |         """Drops elements from the iterator while the predicate is true.
161 | 
162 |         Updates the iterator to point at the first non-matching element, or exhausts the
163 |         iterator if all elements match the predicate.
164 | 
165 |         Args:
166 |             pred (Callable[[V], bool]): a function that takes a value from the iterator
167 |             and returns true or false.
168 | 
169 |         Returns:
170 |             PeekableIterator[V]: a reference to this iterator, so calls can be chained
171 |         """
172 |         while self.can_peek() and pred(self._peek):
173 |             self.__update_peek()
174 |         return self
175 | 
176 | 
def peekable(source: Union[Iterator[IterType], Iterable[IterType]]) -> PeekableIterator[IterType]:
    """Wraps an iterator or iterable so the next value can be inspected before it is consumed.

    Calling ``peek`` on the result returns the next element without consuming it, or raises
    StopIteration when nothing is left.

    Args:
        source: either an iterator over the objects, or an iterable from which an iterator is
            obtained with :func:`iter`.

    Returns:
        a :class:`~samwell.itertools.PeekableIterator`
    """
    # iter() is a no-op for iterators and yields a fresh iterator for plain iterables.
    iterator: Iterator[IterType] = iter(source)
    return PeekableIterator(source=iterator)
190 | 
191 | 
class MergingIterator(Generic[IterType], Iterator[IterType]):
    """An iterator that interleaves two iterators according to a key function.

    When both inputs are already sorted by ``keyfunc``, the merged output is sorted too.  On
    equal keys the item from ``iter1`` is yielded first.

    Args:
        iter1: an iterator
        iter2: an iterator
        keyfunc: a function that extracts a key from an item that is used to order items
    """

    def __init__(self,
                 iter1: Iterator[IterType],
                 iter2: Iterator[IterType],
                 keyfunc: Callable[[IterType], Any]) -> None:
        self._keyfunc = keyfunc
        self._iter1 = peekable(iter1)
        self._iter2 = peekable(iter2)

    def __iter__(self) -> Iterator[IterType]:
        return self

    def __next__(self) -> IterType:
        first, second = self._iter1, self._iter2

        # Once one side is exhausted, simply drain the other.
        if not first.can_peek():
            if second.can_peek():
                return next(second)
            raise StopIteration
        if not second.can_peek():
            return next(first)

        # Both sides have an item: take the one with the smaller key, preferring the first
        # iterator on ties.
        first_key = self._keyfunc(first.peek())
        second_key = self._keyfunc(second.peek())
        if first_key <= second_key:
            return next(first)
        return next(second)
224 | 


--------------------------------------------------------------------------------
/samwell/overlap_detector.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utility Classes for Querying Overlaps with Genomic Regions
 3 | ----------------------------------------------------------
 4 | 
 5 | DEPRECATED - if you have the option use `~pybedlite.overlap_detector` in favor of this.
 6 | 
 7 | Examples of Detecting Overlaps
 8 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 9 | 
10 | .. code-block:: python
11 | 
12 |     >>> from samwell.overlap_detector import Interval, OverlapDetector
13 |     >>> detector = OverlapDetector()
14 |     >>> query = Interval("chr1", 2, 20)
15 |     >>> detector.overlaps_any(query)
16 |     False
17 |     >>> detector.add(Interval("chr2", 1, 100))
18 |     >>> detector.add(Interval("chr1", 21, 100))
19 |     >>> detector.overlaps_any(query)
20 |     False
21 |     >>> detector.add(Interval("chr1", 1, 1))
22 |     >>> detector.overlaps_any(query)
23 |     True
24 |     >>> detector.get_overlaps(query)
25 |     [Interval("chr1", 1, 1)]
26 |     >>> detector.add(Interval("chr1", 3, 10))
27 |     >>> detector.overlaps_any(query)
28 |     True
29 |     >>> detector.get_overlaps(query)
30 |     [Interval("chr1", 1, 1), Interval("chr1", 3, 10)]
31 | 
32 | Module Contents
33 | ~~~~~~~~~~~~~~~
34 | 
35 | The module contains the following public classes:
36 | 
37 |     - :class:`~samwell.overlap_detector.Interval` -- Represents a region mapping to the genome
38 |         that is 0-based and open-ended
39 |     - :class:`~samwell.overlap_detector.OverlapDetector` -- Detects and returns overlaps between
40 |         a set of genomic regions and another genomic region
41 | """
42 | 
43 | 
44 | from pybedlite.overlap_detector import Interval
45 | from pybedlite.overlap_detector import OverlapDetector
46 | 
# The public API of this module: both names are imported from pybedlite.overlap_detector
# above and re-exported here.
__all__ = ["Interval", "OverlapDetector"]
48 | 


--------------------------------------------------------------------------------
/samwell/sam/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utility Classes and Methods for SAM/BAM
  3 | ---------------------------------------
  4 | 
  5 | This module contains utility classes for reading and writing SAM/BAM files, as well as for
  6 | manipulating Cigars.  It is recommended to use the :func:`~samwell.sam.reader` and
  7 | :func:`~samwell.sam.writer` methods rather than :class:`pysam.AlignmentFile` directly (see
  8 | below for motivation).
  9 | 
 10 | Motivation for Reader and Writer methods
 11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 12 | 
 13 | The following are the reasons for choosing to implement methods to open a SAM/BAM file for
 14 | reading and writing, rather than relying on :class:`pysam.AlignmentFile` directly:
 15 | 
 16 | 1. Provides a centralized place for the implementation of opening a SAM/BAM for reading and
 17 |    writing.  This is useful if any additional parameters are added, or changes to standards or
 18 |    defaults are made.
 19 | 2. Makes the requirement to provide a header when opening a file for writing more explicit.
 20 | 3. Adds support for :class:`~pathlib.Path`.
 21 | 4. Remove the reliance on specifying the mode correctly, including specifying the file type (i.e.
 22 |    SAM, BAM, or CRAM), as well as additional options (ex. compression level).  This makes the
 23 |    code more explicit and easier to read.
 24 | 5. An explicit check is performed to ensure the file type is specified when writing using a
 25 |    file-like object rather than a path to a file.
 26 | 
 27 | Examples of Opening a SAM/BAM for Reading or Writing
 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 29 | 
 30 | Opening a SAM/BAM file for reading, auto-recognizing the file-type by the file extension.  See
 31 | :class:`~samwell.sam.SamFileType` for the supported file types.
 32 | 
 33 | .. code-block:: python
 34 | 
 35 |     >>> from samwell.sam import reader
 36 |     >>> with reader("/path/to/sample.sam") as fh:
 37 |     ...     for record in fh:
 38 |     ...         print(record.name)  # do something
 39 |     >>> with reader("/path/to/sample.bam") as fh:
 40 |     ...     for record in fh:
 41 |     ...         print(record.name)  # do something
 42 | 
 43 | Opening a SAM/BAM file for reading, explicitly passing the file type.
 44 | 
 45 |     >>> from samwell.sam import SamFileType
 46 |     >>> with reader(path="/path/to/sample.ext1", file_type=SamFileType.SAM) as fh:
 47 |     ...     for record in fh:
 48 |     ...         print(record.name)  # do something
 49 |     >>> with reader(path="/path/to/sample.ext2", file_type=SamFileType.BAM) as fh:
 50 |     ...     for record in fh:
 51 |     ...         print(record.name)  # do something
 52 | 
 53 | Opening a SAM/BAM file for reading, using an existing file-like object
 54 | 
 55 |     >>> with open("/path/to/sample.bam", "rb") as file_object:
 56 |     ...     with reader(path=file_object, file_type=SamFileType.BAM) as fh:
 57 |     ...         for record in fh:
 58 |     ...             print(record.name)  # do something
 59 | 
 60 | Opening a SAM/BAM file for writing follows similar to the :func:`~samwell.sam.reader` method,
 61 | but the SAM file header object is required.
 62 | 
 63 |     >>> from samwell.sam import writer
 64 |     >>> header: Dict[str, Any] = {
 65 |     ...     "HD": {"VN": "1.5", "SO": "coordinate"},
 66 |     ...     "RG": [{"ID": "1", "SM": "1_AAAAAA", "LB": "lib", "PL": "ILLUMINA", "PU": "xxx.1"}],
 67 |     ...     "SQ":  [
 68 |     ...         {"SN": "chr1", "LN": 249250621},
 69 |     ...         {"SN": "chr2", "LN": 243199373}
 70 |     ...     ]
 71 |     ... }
 72 |     >>> with writer(path="/path/to/sample.bam", header=header) as fh:
 73 |     ...     pass  # do something
 74 | 
 75 | Examples of Manipulating Cigars
 76 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 77 | 
 78 | Creating a :class:`~samwell.sam.Cigar` from a :class:`pysam.AlignedSegment`.
 79 | 
 80 |     >>> from samwell.sam import Cigar
 81 |     >>> with reader("/path/to/sample.sam") as fh:
 82 |     ...     record = next(fh)
 83 |     ...     cigar = Cigar.from_cigartuples(record.cigartuples)
 84 |     ...     print(str(cigar))
 85 |     50M2D5M10S
 86 | 
 87 | Creating a :class:`~samwell.sam.Cigar` from a :class:`str`.
 88 | 
 89 |     >>> cigar = Cigar.from_cigarstring("50M2D5M10S")
 90 |     >>> print(str(cigar))
 91 |     50M2D5M10S
 92 | 
 93 | If the cigar string is invalid, the exception message will show you the problem character(s) in
 94 | square brackets.
 95 | 
 96 |     >>> cigar = Cigar.from_cigarstring("10M5U")
 97 |     ... CigarParsingException("Malformed cigar: 10M5[U]")
 98 | 
 99 | The cigar contains a tuple of :class:`~samwell.sam.CigarElement`s.  Each element contains the
100 | cigar operator (:class:`~samwell.sam.CigarOp`) and associated operator length.  A number of
101 | useful methods are part of both classes.
102 | 
103 | The number of bases aligned on the query (i.e. the number of bases consumed by the cigar from
104 | the query):
105 | 
106 |     >>> cigar = Cigar.from_cigarstring("50M2D5M2I10S")
107 |     >>> [e.length_on_query for e in cigar.elements]
108 |     [50, 0, 5, 2, 10]
109 |     >>> [e.length_on_target for e in cigar.elements]
110 |     [50, 2, 5, 0, 0]
111 |     >>> [e.operator.is_indel for e in cigar.elements]
112 |     [False, True, False, True, False]
113 | 
114 | Module Contents
115 | ~~~~~~~~~~~~~~~
116 | 
117 | The module contains the following public classes:
118 | 
119 |     - :class:`~samwell.sam.SamFileType` -- Enumeration of valid SAM/BAM/CRAM file types.
120 |     - :class:`~samwell.sam.SamOrder` -- Enumeration of possible SAM/BAM/CRAM sort orders.
121 |     - :class:`~samwell.sam.CigarOp` -- Enumeration of operators that can appear in a Cigar string.
122 |     - :class:`~samwell.sam.CigarElement` -- Class representing an element in a Cigar string.
123 |     - :class:`~samwell.sam.CigarParsingException` -- The exception raised specific to parsing a
124 |         cigar
125 |     - :class:`~samwell.sam.Cigar` -- Class representing a cigar string.
126 | 
127 | The module contains the following methods:
128 | 
129 |     - :func:`~samwell.sam.reader` -- opens a SAM/BAM/CRAM file for reading.
130 |     - :func:`~samwell.sam.writer` -- opens a SAM/BAM/CRAM file for writing
131 |     - :func:`~samwell.sam.set_qc_fail` -- sets the QC fail flag in a
132 |         :class:`pysam.AlignedSegment` record and sets additional SAM tags giving the tool name and
133 |         reason for why the QC fail flag was set.
134 |     - :func:`~samwell.sam.get_qc_fail` -- gets the tool name and reason for why the QC fail flag
135 |         was set, or None if it is not set.
136 | """
137 | 
138 | import enum
139 | import io
140 | from pathlib import Path
141 | from typing import Any
142 | from typing import Callable
143 | from typing import Dict
144 | from typing import IO
145 | from typing import List
146 | from typing import Optional
147 | from typing import Tuple
148 | from typing import Union
149 | from typing import TYPE_CHECKING
150 | import attr
151 | import pysam
152 | import sys
153 | from pysam import AlignmentFile as SamFile
154 | from pysam import AlignmentHeader as SamHeader
155 | from pysam import AlignedSegment
156 | 
157 | if TYPE_CHECKING or sys.version_info < (3, 8, 0):
158 |     from typing_extensions import Final
159 | else:
160 |     from typing import Final
161 | 
# NOTE: the descriptions below are written as ``#:`` comments placed directly above each
# assignment.  A bare string literal *before* an assignment is only a no-op expression
# statement and is not associated with the name that follows it.

#: The valid base classes for opening a SAM/BAM/CRAM file.
SamPath = Union[IO[Any], Path, str]

#: The reference index to use to indicate no reference in SAM/BAM.
NO_REF_INDEX: int = -1

#: The reference name to use to indicate no reference in SAM/BAM.
NO_REF_NAME: str = "*"

#: The reference position to use to indicate no position in SAM/BAM.
NO_REF_POS: int = -1
173 | 
174 | 
@enum.unique
class SamFileType(enum.Enum):
    """The SAM/BAM/CRAM file types that can be opened.

    Each member's value is a ``(mode, ext)`` pair that is unpacked into the attributes below.

    Attributes:
        mode (str): The additional mode character to add when opening this file type.
        ext (str): The standard file extension for this file type.
    """

    SAM = ("", ".sam")
    BAM = ("b", ".bam")
    CRAM = ("c", ".cram")

    def __init__(self, mode: str, ext: str) -> None:
        self.mode: Final[str] = mode
        self.ext: Final[str] = ext

    @classmethod
    def from_path(cls, path: Union[Path, str]) -> 'SamFileType':
        """Infers the file type from the extension of the given path.

        The comparison against each member's ``ext`` is exact (case-sensitive).

        Args:
            path: the path to the SAM/BAM/CRAM to read or write.

        Raises:
            ValueError: if the extension does not match any known file type.
        """
        suffix = Path(path).suffix
        try:
            # next() on an exhausted generator raises StopIteration when nothing matches.
            return next(kind for kind in SamFileType if kind.ext == suffix)
        except StopIteration:
            raise ValueError(f"Could not infer file type from {path}")
204 | 
205 | 
206 | """The classes that should be treated as file-like classes"""
207 | _IOClasses = (
208 |     io.TextIOBase,
209 |     io.BufferedIOBase,
210 |     io.RawIOBase,
211 |     io.IOBase
212 | )
213 | 
214 | 
def _pysam_open(path: SamPath,
                open_for_reading: bool,
                file_type: Optional[SamFileType] = None,
                **kwargs: Any) -> SamFile:
    """Opens a SAM/BAM/CRAM for reading or writing.

    When a file type is known, the "mode" keyword passed to pysam is "r" for reading (no
    file-type character is appended) and "w" plus the file type's mode character for writing.
    When no file type is known, which is only permitted for reading, "mode" is not set here.

    Args:
        path: a file handle or path to the SAM/BAM/CRAM to read or write.
        open_for_reading: True to open for reading, false otherwise.
        file_type: the file type to assume when opening the file.  If None, then the file type
            will be auto-detected for reading and must be a path-like object for writing.
        kwargs: any keyword arguments to be passed to
        :class:`~pysam.AlignmentFile`; may not include "mode".

    Raises:
        TypeError: if path is neither a str/Path nor one of the recognised file-like classes.
        ValueError: if writing to a file-like object without a file type, or if the file type
            cannot be inferred from the path's extension.
    """
    if isinstance(path, (str, Path)):  # type: ignore
        # Paths can supply the file type through their extension when none was given.
        if not file_type:
            file_type = SamFileType.from_path(path)
        path = str(path)
    elif not isinstance(path, _IOClasses):  # type: ignore
        if open_for_reading:
            open_type = "reading"
        else:
            open_type = "writing"
        raise TypeError(f"Cannot open '{type(path)}' for {open_type}.")

    # Only reading may proceed without a file type, in which case the mode is left for pysam to
    # work out.  See discussion: https://github.com/pysam-developers/pysam/issues/655
    if file_type is None:
        if not open_for_reading:
            raise ValueError("file_type must be given when writing to a file-like object")
    elif open_for_reading:
        kwargs["mode"] = "r"
    else:
        kwargs["mode"] = "w" + file_type.mode

    return pysam.AlignmentFile(path, **kwargs)
251 | 
252 | 
def reader(path: SamPath,
           file_type: Optional[SamFileType] = None
           ) -> SamFile:
    """Opens a SAM/BAM/CRAM for reading.

    Args:
        path: a file handle or path to the SAM/BAM/CRAM to read.
        file_type: the file type to assume when opening the file.  If None, then the file
            type will be auto-detected.

    Returns:
        the opened file, as returned by :func:`_pysam_open`.
    """
    return _pysam_open(path, True, file_type)
264 | 
265 | 
def writer(path: SamPath,
           header: Union[str, Dict[str, Any], SamHeader],
           file_type: Optional[SamFileType] = None) -> SamFile:
    """Opens a SAM/BAM/CRAM for writing.

    Args:
        path: a file handle or path to the SAM/BAM/CRAM to write.
        header: Either a string to use for the header or a multi-level dictionary.  The
            multi-level dictionary should be given as follows.  The first level are the four
            types ('HD', 'SQ', ...). The second level are a list of lines, with each line being
            a list of tag-value pairs. The header is constructed first from all the defined
            fields, followed by user tags in alphabetical order.
        file_type: the file type to assume when opening the file.  If None, then the
            filetype will be auto-detected and must be a path-like object.

    Returns:
        the opened file, as returned by :func:`_pysam_open`.
    """
    # A string header is handed to pysam under the "text" keyword; anything else goes under
    # the "header" keyword.
    header_kwargs: Dict[str, Any]
    if isinstance(header, str):
        header_kwargs = {"text": header}
    else:
        header_kwargs = {"header": header}

    return _pysam_open(path=path, open_for_reading=False, file_type=file_type, **header_kwargs)
286 | 
287 | 
288 | class _CigarOpUtil:
289 |     """Some useful constants to speed up methods on CigarOp"""
290 | 
291 |     """A dictionary from the cigar op code to the cigar op char.
292 | 
293 |     This is to speed up the translation of cigar op code to CigarOp in CigarOp, so needs to be
294 |     declared beforehand.
295 |     """
296 |     CODE_TO_CHARACTER: Dict[int, str] = {0: "M", 1: "I", 2: "D", 3: "N", 4: "S", 5: "H", 6: "P",
297 |                                          7: "EQ", 8: "X"}
298 | 
299 | 
@enum.unique
class CigarOp(enum.Enum):
    """The operators that may appear in a Cigar string.

    Each member's value is a 4-tuple that is unpacked into the attributes below.

    Attributes:
        code (int): The :py:mod:`~pysam` cigar operator code.
        character (str): The single character cigar operator.
        consumes_query (bool): True if this operator consumes query bases, False otherwise.
        consumes_reference (bool): True if this operator consumes target (reference) bases,
            False otherwise.
    """

    M = (0, 'M', True, True)  #: Match or Mismatch the reference
    I = (1, 'I', True, False)  #: Insertion versus the reference  # noqa: E741
    D = (2, 'D', False, True)  #: Deletion versus the reference
    N = (3, 'N', False, True)  #: Skipped region from the reference
    S = (4, 'S', True, False)  #: Soft clip
    H = (5, 'H', False, False)  #: Hard clip
    P = (6, 'P', False, False)  #: Padding
    EQ = (7, '=', True, True)  #: Matches the reference
    X = (8, 'X', True, True)  #: Mismatches the reference

    def __init__(self,
                 code: int,
                 character: str,
                 consumes_query: bool,
                 consumes_reference: bool) -> None:
        self.code = code
        self.character = character
        self.consumes_query = consumes_query
        self.consumes_reference = consumes_reference

    @staticmethod
    def from_character(character: str) -> 'CigarOp':
        """Returns the operator for the given cigar character.

        "=" is special-cased because its member is named ``EQ``; every other character is
        looked up by member name.

        Raises:
            KeyError: if the character does not name an operator.
        """
        equals = CigarOp.EQ
        return equals if equals.character == character else CigarOp[character]

    @staticmethod
    def from_code(code: int) -> 'CigarOp':
        """Returns the operator for the given operator code.

        Note: this is mainly used to get the operator from :py:mod:`~pysam`.

        Raises:
            KeyError: if the code is not a known operator code.
        """
        member_name = _CigarOpUtil.CODE_TO_CHARACTER[code]
        return CigarOp[member_name]

    @property
    def is_indel(self) -> bool:
        """Returns true if the operator is an insertion or a deletion, false otherwise."""
        return self in (CigarOp.I, CigarOp.D)
351 | 
352 | 
@attr.s(frozen=True, slots=True)
class CigarElement:
    """A single element of a Cigar: an operator together with its length.

    Attributes:
        - length (int): the length of the element
        - operator (CigarOp): the operator of the element
    """

    length: int = attr.ib()
    operator: CigarOp = attr.ib()

    @length.validator
    def _validate_length(self, attribute: Any, value: int) -> None:
        """Rejects any length that is not strictly greater than zero.

        Raises:
            ValueError: if the length is zero or negative.
        """
        if value <= 0:
            raise ValueError(f"Cigar element must have a length > 0, found {value}")

    @property
    def length_on_query(self) -> int:
        """Returns the number of query bases covered: the length, or 0 if none are consumed."""
        if self.operator.consumes_query:
            return self.length
        return 0

    @property
    def length_on_target(self) -> int:
        """Returns the number of target (often reference) bases covered: the length, or 0 if
        none are consumed."""
        if self.operator.consumes_reference:
            return self.length
        return 0

    def __str__(self) -> str:
        # Rendered as it appears in a cigar string, e.g. "10M".
        return str(self.length) + self.operator.character
383 | 
384 | 
class CigarParsingException(Exception):
    """Raised when a cigar (string or tuples) cannot be parsed.

    Adds no behavior beyond :class:`Exception`; it exists so callers can catch cigar parsing
    failures specifically.
    """
388 | 
389 | 
@attr.s(frozen=True, slots=True)
class Cigar:
    """An immutable cigar, held as an ordered tuple of cigar elements.

    A cigar with no elements is rendered as "*".

    Attributes:
        - elements (Tuple[CigarElement, ...]): zero or more cigar elements
    """

    elements: Tuple[CigarElement, ...] = attr.ib(default=())

    @classmethod
    def from_cigartuples(cls, cigartuples: Optional[List[Tuple[int, int]]]) -> 'Cigar':
        """Builds a Cigar from the ``(operator code, length)`` tuples returned by pysam.

        See :class:`~samwell.sam.CigarOp` for more information on the various operators.  If
        None or an empty list is given, returns an empty Cigar.

        Raises:
            CigarParsingException: if any tuple cannot be converted into a cigar element.
        """
        if cigartuples is None or cigartuples == []:
            return Cigar()
        try:
            return Cigar(tuple(
                CigarElement(length, CigarOp.from_code(code)) for code, length in cigartuples
            ))
        except Exception as ex:
            raise CigarParsingException(f"Malformed cigar tuples: {cigartuples}") from ex

    @classmethod
    def _pretty_cigarstring_exception(cls,
                                      cigarstring: str,
                                      index: int) -> CigarParsingException:
        """Builds (but does not raise) an exception whose message puts square brackets around
        the offending character of the cigar string.

        If the index is at or past the end of the string the brackets are empty.
        """
        if index < len(cigarstring):
            offending = cigarstring[index]
        else:
            offending = ""
        highlighted = f"{cigarstring[:index]}[{offending}]{cigarstring[index + 1:]}"
        return CigarParsingException(f"Malformed cigar: {highlighted}")

    @classmethod
    def from_cigarstring(cls, cigarstring: str) -> 'Cigar':
        """Constructs a Cigar from a string returned by pysam.

        If "*" is given, returns an empty Cigar.

        Raises:
            CigarParsingException: if the string is empty, an element does not start with a
                digit, an operator is missing, or an operator character is not recognised.
        """
        if cigarstring == "*":
            return Cigar()

        total = len(cigarstring)
        if total == 0:
            raise CigarParsingException("Cigar string was empty")

        parsed = []
        pos = 0
        while pos < total:
            # Every element must begin with at least one digit.
            if not cigarstring[pos].isdigit():
                raise cls._pretty_cigarstring_exception(cigarstring, pos)  # type: ignore

            # Accumulate the run of digits into the element's length.
            length = 0
            while pos < total and cigarstring[pos].isdigit():
                length = (10 * length) + int(cigarstring[pos])
                pos += 1

            # The digits ran to the end of the string, so the operator is missing.
            if pos == total:
                raise cls._pretty_cigarstring_exception(cigarstring, pos)  # type: ignore

            try:
                parsed.append(CigarElement(length, CigarOp.from_character(cigarstring[pos])))
            except (KeyError, IndexError) as ex:
                # KeyError: the operator character is not valid.  IndexError is kept for a
                # missing operator, although that case is already rejected above.
                raise cls._pretty_cigarstring_exception(cigarstring, pos) from ex  # type: ignore
            pos += 1
        return Cigar(tuple(parsed))

    def __str__(self) -> str:
        if not self.elements:
            return "*"
        return "".join(map(str, self.elements))

    def reversed(self) -> "Cigar":
        """Returns a copy of the Cigar with the elements in reverse order."""
        return Cigar(tuple(self.elements[::-1]))

    def length_on_query(self) -> int:
        """Returns the length of the alignment on the query sequence."""
        return sum(element.length_on_query for element in self.elements)

    def length_on_target(self) -> int:
        """Returns the length of the alignment on the target sequence."""
        return sum(element.length_on_target for element in self.elements)

    def coalesce(self) -> "Cigar":
        """Returns a copy of the cigar in which adjacent elements that share an operator are
        merged into a single element whose length is their sum."""
        # Each run is a mutable [operator, total length] pair; a new run starts whenever the
        # operator differs from that of the run currently being extended.
        runs: List[List[Any]] = []
        for element in self.elements:
            if runs and runs[-1][0] == element.operator:
                runs[-1][1] += element.length
            else:
                runs.append([element.operator, element.length])
        return Cigar(tuple(
            CigarElement(operator=operator, length=length) for operator, length in runs
        ))
501 | 
502 | 
# The SAM tag to store which tool caused the QC fail flag to be set.  set_qc_fail() writes
# the tool's __name__ under this tag, and get_qc_fail() reads it back.
QcFailToolTag = 'qt'


# The SAM tag to store the reason why the tool caused the QC flag to be set.  set_qc_fail()
# writes the reason under this tag, and get_qc_fail() reads it back.
QcFailReasonTag = 'qr'
509 | 
510 | 
def set_qc_fail(rec: pysam.AlignedSegment, tool: Callable[..., Any], reason: str) -> None:
    """Flags a record as QC failed and records which tool failed it and why.

    The tool's ``__name__`` is stored under the ``QcFailToolTag`` tag and the reason under the
    ``QcFailReasonTag`` tag.

    Args:
        rec: the record to fail
        tool: the tool (as a callable) that failed this record
        reason: the reason for failing; must not contain a tab character
    """
    has_tab = '\t' in reason
    assert not has_tab, f"Reason may not contain tabs: {reason}"
    rec.is_qcfail = True
    annotations = ((QcFailToolTag, tool.__name__), (QcFailReasonTag, reason))
    for tag, value in annotations:
        rec.set_tag(tag, value)
522 | 
523 | 
def get_qc_fail(rec: pysam.AlignedSegment) -> Optional[Tuple[str, str]]:
    """Gets the tool and reason for why the QC fail flag is set, otherwise None if not set.

    If the QC fail flag is set, but the tool or filter reason SAM tag is missing, None will be
    returned.  Use the ``pysam.AlignedSegment.is_qcfail`` attribute to check if the record is
    simply QC failed.

    Args:
        rec: the record to query

    Returns:
        A ``(tool, reason)`` tuple read from the ``QcFailToolTag`` and ``QcFailReasonTag`` tags,
        or None if the flag is not set or either tag is absent.
    """
    if not rec.is_qcfail:
        return None
    # Both tags must be present: fetching a missing tag would fail rather than yield None
    if not (rec.has_tag(QcFailToolTag) and rec.has_tag(QcFailReasonTag)):
        return None
    tool_value = rec.get_tag(QcFailToolTag)
    reason_value = rec.get_tag(QcFailReasonTag)
    return (tool_value, reason_value)
539 | 
540 | 
def get_qc_fail_by_tool(rec: pysam.AlignedSegment,
                        tool: Optional[Callable[..., Any]] = None
                        ) -> Optional[Tuple[str, str]]:
    """Gets the tool and reason for the QC fail flag if the flag was set by the passed tool.

    None will be returned in the following cases:
      - The QC fail flag is not set
      - The QC fail flag is set, but the tool and filter reason SAM tags are not set
      - The tool and filter reason SAM tags were set by a different tool

    Use the ``pysam.AlignedSegment.is_qcfail`` attribute to check if the record is simply QC
    failed.

    Args:
        rec: the record to query
        tool: the tool that must have set the QC fail flag.  When None (the default) no tool
            restriction is applied and the stored tool and reason are returned as-is.

    Returns:
        The ``(tool, reason)`` tuple if present and attributable to ``tool``, otherwise None.
    """
    maybe_tool_and_reason = get_qc_fail(rec)
    # Nothing stored, or no tool to compare against: there is nothing to filter on.  Guarding
    # on ``tool is None`` also avoids dereferencing ``None.__name__`` for the default argument.
    if maybe_tool_and_reason is None or tool is None:
        return maybe_tool_and_reason
    tool_value = maybe_tool_and_reason[0]
    return maybe_tool_and_reason if tool.__name__ == tool_value else None
562 | 
563 | 
def isize(r1: AlignedSegment, r2: AlignedSegment) -> int:
    """Computes the insert size for a pair of records.

    Zero is returned when either record is unmapped or the two records have different reference
    ids.  Otherwise each record contributes its ``reference_end`` when it is on the reverse
    strand and its ``reference_start`` when it is not, and the result is r2's position minus
    r1's position.
    """
    either_unmapped = r1.is_unmapped or r2.is_unmapped
    if either_unmapped or r1.reference_id != r2.reference_id:
        return 0

    def anchor(rec: AlignedSegment) -> int:
        """The coordinate a record contributes to the insert size."""
        if rec.is_reverse:
            return rec.reference_end
        return rec.reference_start

    first = anchor(r1)
    second = anchor(r2)
    return second - first
572 | 
573 | 
def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = True) -> None:
    """Resets mate pair information between reads in a pair.

    Both r1 and r2 must be mapped and share a query name.  The records may already carry
    pairing flags, or may be independent R1 and R2 records currently flagged as SE reads.

    Both records are flagged as paired, r1 as read 1 and r2 as read 2, each record's mate
    fields and ``MC`` tag are copied from the other record, and the template lengths are set
    to the insert size (r1) and its negation (r2).

    Args:
        r1: read 1
        r2: read 2 with the same queryname as r1
        proper_pair: the value to assign to the proper-pair flag of both records
    """
    assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
    assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
    assert r1.query_name == r2.query_name, (
        f"Attempting to pair reads with different qnames {r1.query_name} vs {r2.query_name}."
    )

    # Pairing flags shared by both ends
    for rec in (r1, r2):
        rec.is_paired = True
        rec.is_proper_pair = proper_pair

    # Which end of the pair each record is
    r1.is_read1, r1.is_read2 = True, False
    r2.is_read2, r2.is_read1 = True, False

    def copy_mate_fields(mate: AlignedSegment, rec: AlignedSegment) -> None:
        """Writes the mate-describing fields of ``rec`` from the values found on ``mate``."""
        rec.next_reference_id = mate.reference_id
        rec.next_reference_start = mate.reference_start
        rec.mate_is_reverse = mate.is_reverse
        rec.mate_is_unmapped = False
        rec.set_tag("MC", mate.cigarstring)

    copy_mate_fields(r1, r2)
    copy_mate_fields(r2, r1)

    # Template lengths are equal in magnitude and opposite in sign
    insert_size = isize(r1, r2)
    r1.template_length = insert_size
    r2.template_length = - insert_size
608 | 
609 | 
@enum.unique
class SamOrder(enum.Enum):
    """
    Enumerations of possible sort orders for a SAM file.

    Each member's value is the lowercase string naming that sort order.
    """

    Unsorted = "unsorted"  #: the SAM / BAM / CRAM is unsorted
    Coordinate = "coordinate"  #: coordinate sorted
    QueryName = "queryname"  #: queryname sorted
    Unknown = "unknown"  #: unknown SAM / BAM / CRAM sort order
620 | 


--------------------------------------------------------------------------------
/samwell/sam/bwa_mem.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utility methods for running BWA
  3 | -------------------------------
  4 | 
  5 | This module contains methods for running BWA.  Currently only the "mem" algorithm is supported:
  6 |     - :func:`~samwell.bwa_mem.align` -- Aligns the given reads with BWA mem.
  7 | 
  8 | 
  9 | The options for running BWA can be customized via the three Options classes:
 10 |     - :class:`~samwell.bwa_mem.AlgorithmOptions` -- Bwa mem algorithm options.
 11 |     - :class:`~samwell.bwa_mem.ScoringOptions` -- Bwa mem scoring options
 12 |     - :class:`~samwell.bwa_mem.InputOutputOptions` -- Bwa mem input and output options
 13 | 
 14 | An input read for alignment must be minimally transformed into a FASTQ-like record:
 15 | 
 16 |     - :class:`~samwell.bwa_mem.FastqRecord` -- Fastq record used as input to alignment.
 17 | 
 18 | 
 19 | Implementation
 20 | ~~~~~~~~~~~~~~
 21 | 
 22 | Alignment of reads is performed asynchronously.
 23 | 
 24 | This is achieved by creating three sub-processes in :func:`~samwell.bwa_mem.align`:
 25 |     1. A process to consume the input iterable of reads and write them (a) to the stdin of
 26 |     BWA mem, and (b) to a queue of reads that are awaiting alignment results from BWA mem.
 27 |     2. A process to run BWA mem, where FASTQ records are written to the process' stdin,
 28 |     alignment results are returned to stdout, and any error/logging information from BWA mem is
 29 |     returned to stderr.
 30 |     3. A process to route the stderr of BWA mem to the given stderr handle (stderr_out).
 31 | 
 32 | Then :func:`~samwell.bwa_mem.align` method consumes the stdout of the BWA mem process and collates
 33 | that with the queue of reads that have been written/given to BWA mem (from process 1).  For each
 34 | input read, one or more alignments is expected to be returned by BWA mem.  The order in which
 35 | alignments of reads are returned by BWA mem is the same order as the order of reads given to BWA
 36 | mem.  The :func:`~samwell.bwa_mem.align` method then returns an iterable over the alignment
 37 | results.
 38 | 
 39 | Exceptions may occur in the thread to input FASTQ records to BWA mem, which are propagated
 40 | to the caller.  Furthermore, an exception is returned if the # of reads given to BWA mem is
 41 | not the same as the # of reads returned by BWA mem.
 42 | 
 43 | Some specific handling occurs around reading the BWA mem output with :py:mod:`~pysam`, since the
 44 | latter blocks waiting for at least some reads from BWA mem, which may not happen if there was an
 45 | issue in the various upstream processes (input to BWA mem or BWA mem itself).  This would have
 46 | caused a deadlock.
 47 | 
 48 | 
 49 | Examples
 50 | ~~~~~~~~
 51 | 
 52 | Typically, we have :class:`~pysam.AlignedSegment` records obtained from reading from a SAM or BAM
 53 | file.  These must first be converted into :class:`~samwell.bwa_mem.FastqRecord` objects.
 54 | 
 55 | .. code-block:: python
 56 | 
 57 |     >>> from samwell.sam.bwa_mem import FastqRecord
 58 |     >>> from samwell.sam import reader
 59 |     >>> reads = reader("/path/to/sample.sam")
 60 |     >>> fastq_reads = map(lambda read: FastqRecord.build(read), reads)
 61 | 
 62 | Next, those :class:`~samwell.bwa_mem.FastqRecord`s can be aligned with the
 63 | :func:`~samwell.bwa_mem.align` method.
 64 | 
 65 | .. code-block:: python
 66 | 
 67 |     >>> from samwell.sam.bwa_mem import align
 68 |     >>> results = map(lambda read: align(read), fastq_reads)
 69 | 
 70 | This returns an iterable over the alignment results.  An alignment result is a tuple
 71 | consisting of the original :class:`~samwell.bwa_mem.FastqRecord` and a list of the
 72 | alignments (see :class:`~pysam.AlignedSegment`).
 73 | 
 74 | .. code-block:: python
 75 | 
 76 |     >>> result = next(results)
 77 |     >>> fastq_read, alignments = result
 78 |     >>> str(fastq_read)
 79 |     @name
 80 |     GATTACA
 81 |     +
 82 |     HIJKLKM
 83 |     >>> len(alignments)
 84 |     2
 85 |     >>> alignment = alignments[0]
 86 |     >>> alignment.query_name
 87 |     name
 88 |     >>> type(alignment)
 89 |     <class 'pysam.libcalignedsegment.AlignedSegment'>
 90 | """
 91 | 
 92 | 
 93 | import enum
 94 | import logging
 95 | import queue
 96 | import subprocess
 97 | import sys
 98 | import threading
 99 | import time
100 | from pathlib import Path
101 | from typing import Any
102 | from typing import Callable
103 | from typing import ClassVar
104 | from typing import Dict
105 | from typing import Generic
106 | from typing import Iterable
107 | from typing import Iterator
108 | from typing import List
109 | from typing import Optional
110 | from typing import Tuple
111 | from typing import TypeVar
112 | from typing import Union
113 | from typing import cast
114 | 
115 | import attr
116 | import pysam
117 | 
118 | import samwell.sam as sam
119 | from samwell.itertools import PeekableIterator
120 | from samwell.sam import SamFileType
121 | from samwell.dnautils import reverse_complement
122 | 
# The type variable used to annotate the ``source`` attribute of
# :class:`samwell.sam.bwa_mem.FastqRecord`
FastqRecordSourceType = TypeVar('FastqRecordSourceType')
125 | 
126 | 
@attr.s(frozen=True, auto_attribs=True)
class FastqRecord:
    """Fastq record used as input to alignment.

    Attributes:
        name: the name of the read
        bases: the read bases
        quals: the base qualities
        source: optionally the :class:`~pysam.AlignedSegment` from which this was built
        needs_alignment: True if the read needs alignment, False otherwise
        read_number: optionally the read number; should be set to 1 or 2 for paired end
            reads.
    """

    name: str = attr.ib()
    bases: str = attr.ib()
    quals: str = attr.ib()
    source: Optional[FastqRecordSourceType] = None  # type: ignore
    needs_alignment: bool = True
    read_number: Optional[int] = None

    # Offset added to a numeric base quality to obtain its FASTQ character
    _BASE_QUALITY_OFFSET: ClassVar[int] = 33

    @classmethod
    def build(cls,
              read: pysam.AlignedSegment,
              needs_alignment: bool = True,
              aligned_bases_only: bool = False,
              clip_three_prime: int = 0
              ) -> 'FastqRecord':
        """Builds a :class:`~samwell.bwa_mem.FastqRecord` from a :class:`~pysam.AlignedSegment`

        When ``needs_alignment`` is False the bases and qualities are left empty, since they
        are never examined.  The read number is 1 or 2 for a paired read (based on its read 1
        flag) and None for an unpaired read.  The given read is always stored as ``source``.

        Args:
            read: the read to convert
            needs_alignment: True if the read should be aligned, False otherwise
            aligned_bases_only: only align the aligned bases (excludes soft-clipped bases)
            clip_three_prime: the number of bases to clip on the three-prime end of the read
                relative to the original direction of sequencing.  This will be applied after
                extracting the bases based on ``aligned_bases_only``.

        Returns:
            The new record.
        """
        # Get the bases and qualities
        if needs_alignment:
            if aligned_bases_only:
                bases = read.query_alignment_sequence
                quals = read.query_alignment_qualities
            else:
                bases = read.query_sequence
                quals = read.query_qualities

            # reverse complement if necessary, so the bases are in sequencing order
            if read.is_reverse:
                bases = reverse_complement(bases)
                quals = quals[::-1]

            # now that the bases are in sequencing order, the three-prime end is the tail
            if clip_three_prime > 0:
                index_from_end = -1 * clip_three_prime
                bases = bases[:index_from_end]
                quals = quals[:index_from_end]

            # convert the numeric qualities to their FASTQ string
            quals = "".join([chr(q + FastqRecord._BASE_QUALITY_OFFSET) for q in quals])
        else:
            # If we're not going to align it, no need to muck with bases and quals
            bases = ""
            quals = ""

        # Get the read number
        if read.is_paired:
            read_number = 1 if read.is_read1 else 2
        else:
            read_number = None

        return FastqRecord(name=read.query_name,
                           bases=bases,
                           quals=quals,
                           source=read,
                           needs_alignment=needs_alignment,
                           read_number=read_number)

    def __hash__(self) -> int:
        """Returns a hash value for this record given the inputs.

        If source is defined and is a :class:`~pysam.AlignedSegment`, then the source's hash
        will be returned.  Otherwise, the hash of the concatenation of the name, read number,
        bases, and qualities (see :meth:`str_with_read_number`) will be returned.
        """
        # NOTE(review): ``attr.s(frozen=True)`` with default equality may generate its own
        # ``__hash__`` and replace this method -- confirm which one is actually in effect.
        # ``source`` holds an instance, so this must be ``isinstance``; ``issubclass`` raises
        # TypeError when handed a non-class first argument.
        if self.source is not None and isinstance(self.source, pysam.AlignedSegment):
            return hash(self.source)
        else:
            return hash(self.str_with_read_number())

    def __str__(self) -> str:
        """Returns the record in four-line FASTQ format, newline terminated."""
        return f"@{self.name}\n{self.bases}\n+\n{self.quals}\n"

    def str_with_read_number(self) -> str:
        """Returns the record in FASTQ format, with the read number appended (colon delimited).

        A read number of None is written as ``0``.
        """
        name = self.name + ":" + (str(self.read_number) if self.read_number is not None else "0")
        return f"@{name}\n{self.bases}\n+\n{self.quals}\n"
225 | 
226 | 
class _CommandLineOptionGroup:
    """Base class for groups of bwa options using @attr.s.

    It is assumed that every attribute has the 'flag' key specified in its metadata field.  Use the
    :func:`~samwell.bwa_mem._flag` method to add additional flag attributes.
    """

    def args(self) -> List[str]:
        """Build the list of command line arguments from the defined options.

        Attributes whose value is None are omitted.  Boolean attributes contribute only their
        flag, and only when the value is True.  Enum values contribute their ``.value``.
        Strings are passed through unchanged, while any other iterable value (e.g. a tuple of
        numbers) is rendered as its items joined with commas.

        Returns:
            The flags and values, in attribute definition order.

        Raises:
            ValueError: if two attributes that are set share the same flag.
        """
        _args: List[str] = []
        flag_to_attribute_name: Dict[str, str] = {}
        # go through each attribute
        for attribute in attr.fields(type(self)):
            # get the value for the flag
            value = getattr(self, attribute.name)
            if isinstance(value, enum.Enum):
                value = value.value
            elif not isinstance(value, (str, bytes)):
                # Strings are iterable too, but must not be split into characters.  For any
                # other iterable, join the items with commas; each item is converted to a
                # string first since ``str.join`` rejects non-string items such as ints.
                try:
                    items = iter(value)
                except TypeError:
                    # not iterable (int, float, bool, Path, None): leave as-is
                    pass
                else:
                    value = ",".join(str(item) for item in items)

            # if it is set, add it to args
            if value is not None:
                # assume that they have metadata, with the "flag" specified.  Get the flag to use
                flag = attribute.metadata['flag']
                if flag in flag_to_attribute_name:
                    cur_name = attribute.name
                    other_name = flag_to_attribute_name[flag]
                    raise ValueError(
                        f"Flag '{flag}' found in attributes {cur_name} and {other_name}")
                flag_to_attribute_name[flag] = attribute.name
                if attribute.type in (bool, Optional[bool]):
                    # boolean options are bare switches: present when True, absent otherwise
                    if value is True:
                        _args.append(flag)
                else:
                    _args.extend([flag, str(value)])
        return _args
267 | 
268 | 
# Alias for the alignment result: the input FastqRecord together with the list of its alignments
AlignmentResult = Tuple[FastqRecord, List[pysam.AlignedSegment]]
271 | 
272 | 
@attr.s(frozen=True)
class AlgorithmOptions(_CommandLineOptionGroup):
    """Bwa mem algorithm options

    Every option defaults to None.  Each attribute records the command line flag it maps to in
    its ``flag`` metadata entry.

    Attributes:
        threads: number of threads
        min_seed_len: minimum seed length
        band_width: band width for banded alignment
        off_diagonal_dropoff: off-diagonal X-dropoff
        internal_seeds_length_factor: look for internal seeds inside a seed longer than
            min_seed_len * internal_seeds_length_factor
        max_third_seed_occurrence: seed occurrence for the 3rd round seeding
        max_seed_occurrence: skip seeds with more than INT occurrences
        drop_ratio: drop chains shorter than this fraction of the longest overlapping chain
        min_chain_weight: discard a chain if seeded bases shorter than this value
        max_mate_rescue_rounds: perform at most INT rounds of mate rescues for each read
        skip_mate_rescue: skip mate rescue
        skip_pairing: skip pairing; mate rescue performed unless skip_mate_rescue also in use
    """

    threads: Optional[int] = attr.ib(default=None, metadata={'flag': '-t'})
    min_seed_len: Optional[int] = attr.ib(default=None, metadata={'flag': '-k'})
    band_width: Optional[int] = attr.ib(default=None, metadata={'flag': '-w'})
    off_diagonal_dropoff: Optional[int] = attr.ib(default=None, metadata={'flag': '-d'})
    internal_seeds_length_factor: Optional[float] = attr.ib(default=None, metadata={'flag': '-r'})
    max_third_seed_occurrence: Optional[int] = attr.ib(default=None, metadata={'flag': '-y'})
    max_seed_occurrence: Optional[int] = attr.ib(default=None, metadata={'flag': '-c'})
    drop_ratio: Optional[float] = attr.ib(default=None, metadata={'flag': '-D'})
    min_chain_weight: Optional[int] = attr.ib(default=None, metadata={'flag': '-W'})
    max_mate_rescue_rounds: Optional[int] = attr.ib(default=None, metadata={'flag': '-m'})
    skip_mate_rescue: Optional[bool] = attr.ib(default=None, metadata={'flag': '-S'})
    skip_pairing: Optional[bool] = attr.ib(default=None, metadata={'flag': '-P'})
305 | 
306 | 
@enum.unique
class ReadType(enum.Enum):
    """The read type for BWA mem.

    Each member's value is the string handed to BWA mem for that read type.
    """

    PacBio = "pacbio"  #: PacBio reads
    OxfordNano2D = "ont2d"  #: Oxford Nanopore 2D reads
    IntraSpecies = "intractg"  #: intra-species contigs
314 | 
315 | 
@attr.s(frozen=True)
class ScoringOptions(_CommandLineOptionGroup):

    """Bwa mem scoring options

    Every option defaults to None.  Each attribute records the command line flag it maps to in
    its ``flag`` metadata entry.

    Attributes:
        match_score: the score for a sequence match, which scales options -TdBOELU unless
            overridden
        mismatch_score: penalty for a mismatch
        gap_open: gap open penalties for deletions and insertions (single value to use the same
            for both)
        gap_extend: gap extension penalty; a gap of size k cost '{-O} + {-E}*k' (single value to
            use the same for both)
        clipping_penalty: penalty for 5'- and 3'-end clipping
        unpaired_penalty: penalty for an unpaired read pair
        read_type: read type. Setting -x changes multiple parameters unless overridden:
                     pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0  (PacBio reads to ref)
                     ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0  (Oxford Nanopore 2D-reads to ref)
                     intractg: -B9 -O16 -L5  (intra-species contigs to ref)
    """

    match_score: Optional[int] = attr.ib(default=None, metadata={'flag': '-A'})
    mismatch_score: Optional[int] = attr.ib(default=None, metadata={'flag': '-B'})
    gap_open: Optional[Union[int, Tuple[int, int]]] = attr.ib(default=None,
                                                              metadata={'flag': '-O'})
    gap_extend: Optional[Union[int, Tuple[int, int]]] = attr.ib(default=None,
                                                                metadata={'flag': '-E'})
    clipping_penalty: Optional[Union[int, Tuple[int, int]]] = attr.ib(default=None,
                                                                      metadata={'flag': '-L'})
    unpaired_penalty: Optional[int] = attr.ib(default=None, metadata={'flag': '-U'})
    read_type: Optional[ReadType] = attr.ib(default=None, metadata={'flag': '-x'})
347 | 
348 | 
# The type for BWA mem's insert size parameter (-I) option: either a single float, or a tuple
# of two to four values.  Presumably ordered mean, standard deviation, max and min, following
# the ``insert_size_params`` description -- confirm against the BWA mem usage text.
InsertSizeParamsType = Union[
    float,
    Tuple[float, float],
    Tuple[float, float, int],
    Tuple[float, float, int, int]]
355 | 
356 | 
@attr.s(frozen=True)
class InputOutputOptions(_CommandLineOptionGroup):
    """Bwa mem input and output options

    Every option defaults to None except ``bases_per_batch``, which defaults to 115000.  Each
    attribute records the command line flag it maps to in its ``flag`` metadata entry.

    Attributes:
        interleaved_pairs: read pairs are consecutive (r1 then r2), otherwise fragment reads
        read_group: read group header line such as '@RG\\tID:foo\\tSM:bar'
        header_insert: insert STR to header if it starts with @; or insert lines in FILE
        alts_as_primary: treat ALT contigs as part of the primary assembly (i.e. ignore
            <idxbase>.alt file)
        verbosity: verbose level: 1=error, 2=warning, 3=message, 4+=debugging
        min_alignment_score: minimum score to output
        max_hits_within_max_score: if there are <INT hits with score >80% of the max score, output
            all in XA
        all_alignments: output all alignments for SE or unpaired PE
        append_fastq_comment: append FASTA/FASTQ comment to SAM output
        add_fasta_header_to_xr: output the reference FASTA header in the XR tag
        softclip_supplementary: use soft clipping for supplementary alignments
        split_hits_are_secondary: mark shorter split hits as secondary
        insert_size_params: specify the mean, standard deviation (10% of the mean if absent), max
            (4 sigma from the mean if absent) and min of the insert size distribution.  FR
            orientation only.
        bases_per_batch: how many bases of sequence data bwa should read from the input
            before triggering a batch of alignments.
    """

    interleaved_pairs: Optional[bool] = attr.ib(default=None, metadata={'flag': '-p'})
    read_group: Optional[str] = attr.ib(default=None, metadata={'flag': '-R'})
    header_insert: Optional[Union[str, Path]] = attr.ib(default=None, metadata={'flag': '-H'})
    alts_as_primary: Optional[bool] = attr.ib(default=None, metadata={'flag': '-j'})
    verbosity: Optional[int] = attr.ib(default=None, metadata={'flag': '-v'})
    min_alignment_score: Optional[int] = attr.ib(default=None, metadata={'flag': '-T'})
    max_hits_within_max_score: Optional[Union[int, Tuple[int, int]]] = \
        attr.ib(default=None, metadata={'flag': '-h'})
    all_alignments: Optional[bool] = attr.ib(default=None, metadata={'flag': '-a'})
    append_fastq_comment: Optional[bool] = attr.ib(default=None, metadata={'flag': '-C'})
    add_fasta_header_to_xr: Optional[bool] = attr.ib(default=None, metadata={'flag': '-V'})
    softclip_supplementary: Optional[bool] = attr.ib(default=None, metadata={'flag': '-Y'})
    split_hits_are_secondary: Optional[bool] = attr.ib(default=None, metadata={'flag': '-M'})
    insert_size_params: Optional[InsertSizeParamsType] = attr.ib(default=None,
                                                                 metadata={'flag': '-I'})
    bases_per_batch: Optional[int] = attr.ib(default=115000, metadata={'flag': '-K'})
399 | 
400 | 
401 | # The type for the source items in :class:`samwell.bwa_mem._SourceToSinkThread`
402 | SourceToSinkThreadType = TypeVar('SourceToSinkThreadType')
403 | 
404 | 
405 | class _SourceToSinkThread(threading.Thread, Generic[SourceToSinkThreadType]):
406 |     """A thread that consumes elements from the source and adds them to the sink
407 | 
408 |     Attributes:
409 |         num_added: the number of elements from the source added to the sink
410 |     """
411 | 
412 |     def __init__(self,
413 |                  source: Iterator[SourceToSinkThreadType],
414 |                  sink_add_func: Callable[[SourceToSinkThreadType], None],
415 |                  sink_close_func: Optional[Callable[..., None]] = None) -> None:
416 |         """Creates a new thread for consuming the source and adding to the sink.
417 | 
418 |         Args:
419 |             source: the source iterator from which to consume
420 |             sink_add_func: the method to use to add an element to the sink
421 |             sink_close_func: the method to call when all elements have been added to the sink
422 |         """
423 |         super().__init__(daemon=True)
424 |         self.num_added: int = 0
425 |         self._source = source
426 |         self._sink_add_method = sink_add_func
427 |         self._sink_close_method = sink_close_func
428 |         self.exception: Optional[Exception] = None
429 |         self.done = False
430 | 
431 |     def run(self) -> None:
432 |         """Runs the source to sink transfer"""
433 |         try:
434 |             for item in self._source:
435 |                 self._sink_add_method(item)
436 |                 self.num_added += 1
437 |         except Exception as e:
438 |             self.exception = e
439 |         finally:
440 |             if self._sink_close_method is not None:
441 |                 self._sink_close_method()
442 |             self.done = True
443 | 
444 | 
445 | def _same_read(read: FastqRecord, alignment: pysam.AlignedSegment) -> bool:
446 |     """True if an alignment for the given read, False otherwise.
447 | 
448 |     For the alignment to be considered as an alignment for this read, the read name and read number
449 |     must match.  The read number is appended to the alignment's query name.
450 |     """
451 |     if alignment.is_paired:
452 |         assert read.read_number is not None, f"Paired alignment but the read has no read #: {read}"
453 |         if read.name != alignment.query_name:
454 |             return False
455 |         else:
456 |             alignment_read_number = 1 if alignment.is_read1 else 2
457 |             return read.read_number == alignment_read_number
458 |     else:
459 |         alignment_name, alignment_read_number = alignment.query_name.rsplit(':', 1)
460 |         if read.name != alignment_name:
461 |             return False
462 |         elif read.read_number is None:
463 |             return int(alignment_read_number) == 0
464 |         else:
465 |             return read.read_number == int(alignment_read_number)
466 | 
467 | 
def _collate_alignments(reads_queue: queue.Queue,
                        alignments_reader: pysam.AlignmentFile,
                        suppress_secondaries: bool = False) -> Iterable[AlignmentResult]:
    """Pairs every read taken from the queue with the alignments reported for it.

    The alignments in the reader are expected in the same order as the reads in the queue, so
    both are walked once, in step.  Reads that do not need alignment are yielded with an empty
    list and consume nothing from the reader.

    This generator may block while waiting for the next read to appear in the queue, or while
    waiting for the next alignment from the reader.  Reading from the queue stops when it hands
    back None, which serves as the end-of-input sentinel.

    Args:
        reads_queue: the queue of reads, terminated by a None sentinel
        alignments_reader: the reader of sam records
        suppress_secondaries: true to discard all secondary alignments, false otherwise

    Returns:
        An iterable over the alignment results.  An alignment result is a tuple consisting of the
        original :class:`~samwell.bwa_mem.FastqRecord` and the list of its alignments (see
        :class:`~pysam.AlignedSegment`), each with its query name reset to the read's name.
    """
    peekable: PeekableIterator = PeekableIterator(alignments_reader)
    # Two-argument iter(): call reads_queue.get() repeatedly until it returns None
    pending_reads = cast(Iterator[FastqRecord], iter(reads_queue.get, None))
    keep_secondaries = not suppress_secondaries

    for read in pending_reads:
        collected: List[pysam.AlignedSegment] = []

        if read.needs_alignment:
            while peekable.can_peek():
                candidate = peekable.peek()
                if candidate is None or not _same_read(read, candidate):
                    # The next alignment belongs to a later read; leave it unconsumed
                    break
                next(peekable)  # consume the record that was just inspected
                if keep_secondaries or not candidate.is_secondary:
                    # Restore the query name since the read number may have been appended
                    candidate.query_name = read.name
                    collected.append(candidate)

        yield (read, collected)
    assert not peekable.can_peek(), 'Alignments exist but no more reads in the queue'
511 | 
512 | 
513 | def _build_command_line(idxbase: Path,
514 |                         executable_path: Path = Path('bwa'),
515 |                         algo_opts: Optional[AlgorithmOptions] = None,
516 |                         scoring_opts: Optional[ScoringOptions] = None,
517 |                         io_opts: Optional[InputOutputOptions] = None) -> List[str]:
518 |     """Builds the command line for bwa mem.
519 | 
520 |     Args:
521 |         idxbase: the path prefix for all the BWA-specific index files
522 |         executable_path: the path to the BWA executable
523 |         algo_opts: the algorithm options
524 |         scoring_opts: the scoring options
525 |         io_opts: the input and output options
526 |     """
527 |     # Start with the path to BWA, then the mem command
528 |     cmd: List[Any] = [executable_path, 'mem']
529 |     # Add any options
530 |     for opts in [algo_opts, scoring_opts, io_opts]:
531 |         if opts is not None:
532 |             cmd.extend(opts.args())
533 |     # Now the reference genome index basename
534 |     cmd.append(idxbase)
535 |     # Now set the input to be from standard input
536 |     cmd.append('/dev/stdin')
537 |     # Convert all args to strings
538 |     args = [str(arg) for arg in cmd]
539 |     return args
540 | 
541 | 
def _build_bwa_input_process(reads: Iterable[FastqRecord],
                             to_bwa_handle: Any,
                             to_output_queue: queue.Queue,
                             interleaved_pairs: Optional[bool] = None
                             ) -> _SourceToSinkThread:
    """Creates and starts the thread that feeds FASTQ records to BWA mem and to the given queue.

    Every read is placed on the output queue.  Only reads flagged as needing alignment are also
    written to the BWA mem handle.  When the reads are exhausted the handle is closed and a
    ``None`` sentinel is placed on the queue.

    Args:
        reads: the reads to input to the BWA mem subprocess' stdin
        to_bwa_handle: the IO handle to which to write the FASTQ records for BWA mem
        to_output_queue: the queue to also write to after writing a FASTQ record for BWA mem;
            this queue is mainly used for collating FASTQ reads and SAM alignments.
        interleaved_pairs: read pairs are consecutive (r1 then r2), otherwise unpaired reads

    Returns:
        the already-started thread
    """
    # (name, read number) of the most recent read that was sent for alignment
    previous_key: Optional[Tuple[str, Optional[int]]] = None

    def _write_read(read: FastqRecord) -> None:
        """Sends one FASTQ record to BWA mem (if it needs alignment) and to the queue."""
        nonlocal previous_key

        if not read.needs_alignment:
            # Nothing to give to BWA mem; the read is only passed along for collation
            to_output_queue.put(read)
            return

        if previous_key is not None:
            name, read_number = previous_key
            assert name != read.name or read_number != read.read_number, \
                'Consecutive reads have the same name and read number:' + \
                f'\n\t\tname: {name}\n\t\tread number: {read_number}' + \
                f'\n\t\tsource: {read.source}'
        previous_key = (read.name, read.read_number)

        # IMPORTANT: unless the pairs are interleaved, the read name has the read number
        # appended to disambiguate ends of a pair
        fastq_text = str(read) if interleaved_pairs is True else read.str_with_read_number()
        to_bwa_handle.write(fastq_text)
        to_output_queue.put(read)

    def _finish() -> None:
        """Closes the BWA mem input handle and terminates the queue with the sentinel."""
        to_bwa_handle.close()
        to_output_queue.put(None)  # the sentinel value

    feeder = _SourceToSinkThread(source=iter(reads),
                                 sink_add_func=_write_read,
                                 sink_close_func=_finish)
    feeder.start()
    return feeder
589 | 
590 | 
def align(reads: Iterable[FastqRecord],
          idxbase: Path,
          executable_path: Path = Path('bwa'),
          algo_opts: Optional[AlgorithmOptions] = None,
          scoring_opts: Optional[ScoringOptions] = None,
          io_opts: Optional[InputOutputOptions] = None,
          suppress_secondaries: bool = False,
          stderr_out: Any = sys.stderr
          ) -> Iterable[AlignmentResult]:
    """Aligns the given reads with BWA mem.

    See :py:mod:`~samwell.sam.bwa_mem` for a detailed explanation for the implementation approach.

    This function is a generator: BWA mem is launched when iteration starts and the clean-up
    (closing stdin, joining the helper threads, waiting for BWA mem) happens when the generator
    finishes or is closed.

    Args:
        reads: the reads to align
        idxbase: the path prefix for all the BWA-specific index files
        executable_path: the path to the BWA executable
        algo_opts: the algorithm options
        scoring_opts: the scoring options
        io_opts: the input and output options
        suppress_secondaries: true to discard all secondary alignments, false otherwise
        stderr_out: the handle to which everything BWA mem writes to its standard error is
            forwarded (via the handle's ``write`` method)

    Returns:
        An iterable over the alignment results.  An alignment result is a tuple consisting of the
        original :class:`~samwell.sam.bwa_mem.FastqRecord` and an iterator over the alignments
        (see :class:`~pysam.AlignedSegment`)

    Raises:
        RuntimeError: if the thread writing reads to BWA mem does not terminate after BWA mem's
            stdin has been closed
        ValueError: if the number of reads given to BWA mem differs from the number of alignment
            results that were yielded
        Exception: any exception raised in the thread writing reads to BWA mem is re-raised
    """

    # Build the command line used to run BWA MEM
    command_line = _build_command_line(idxbase=idxbase,
                                       executable_path=executable_path,
                                       algo_opts=algo_opts,
                                       scoring_opts=scoring_opts,
                                       io_opts=io_opts)

    # Create a sub-process in which to run BWA mem.  This process will read FASTQ records from
    # stdin, write SAM records to stdout, and write any error/logging information to stderr.
    bwa_mem_process = subprocess.Popen(args=command_line,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)

    # Create a thread in which we read the stderr of the BWA mem subprocess and write it to
    # the given stderr_out handle.
    bwa_mem_stderr_process = _SourceToSinkThread(source=iter(bwa_mem_process.stderr),
                                                 sink_add_func=stderr_out.write,
                                                 sink_close_func=None)
    bwa_mem_stderr_process.start()

    # Create a queue of FASTQ records that the thread who will write to BWA mem's stdin
    # will also write.  This is so we can collate/join the input FASTQ records with the output SAM
    # (or alignment) records.  A sentinel value (None) will be written to indicate no more reads
    # will be placed in the queue.
    reads_queue: queue.Queue = queue.Queue()

    # Create a thread to consume the input FASTQ records and write them to BWA mem's stdin. We
    # write in a separate thread to avoid any deadlock with waiting for output from BWA mem's
    # stdout.  This can happen in a synchronous implementation where BWA mem is buffering reads and
    # we are waiting for some results from BWA mem's stdout, but really BWA mem is waiting for
    # either more reads from stdin or for stdin to be closed.
    interleaved_pairs = io_opts.interleaved_pairs if io_opts is not None else None
    bwa_input_process = _build_bwa_input_process(reads=reads,
                                                 to_bwa_handle=bwa_mem_process.stdin,
                                                 to_output_queue=reads_queue,
                                                 interleaved_pairs=interleaved_pairs)

    # Go through the output
    num_aligned = 0
    try:
        # Wait for some reads to be written.  pysam will block opening the input file until some
        # data is available, or the stream is closed.  If no data is added, don't even try opening
        # the stream.
        while bwa_input_process.num_added == 0 and not bwa_input_process.done:
            # the input process is still running but no reads have been added
            time.sleep(.1)
        if bwa_input_process.num_added == 0 and bwa_input_process.done:
            # the input process is done (error or success) and no reads have been added, so skip
            # opening pysam.  NB: a plain return is used to end the generator; raising
            # StopIteration inside a generator is turned into a RuntimeError by Python 3.7+
            # (PEP 479).  The finally block below still runs.
            return
        # Read through the output of BWA mem, and collate that with the queue of reads given to
        # BWA mem
        with sam.reader(path=bwa_mem_process.stdout, file_type=SamFileType.SAM) as reader:
            alignment_results = _collate_alignments(reads_queue=reads_queue,
                                                    alignments_reader=reader,
                                                    suppress_secondaries=suppress_secondaries)
            # A simple loop with its only purpose to count the number of alignment results
            for result in alignment_results:
                num_aligned += 1
                yield result
    finally:
        # Close the stdin of the BWA mem process.  This should signal BWA mem to shut down, and
        # for the input thread to stop.
        bwa_mem_process.stdin.close()

        # Join the input thread as now stdin of the BWA mem process is closed.
        bwa_input_process.join(timeout=1.0)

        # Check if the inputting reads to BWA had an exception
        if bwa_input_process.exception is not None:
            raise bwa_input_process.exception
        elif bwa_input_process.is_alive():
            raise RuntimeError("BWA process encountered no errors but did not terminate.")

        # Check that the number of reads given to BWA mem was the same # returned by BWA mem
        num_left = bwa_input_process.num_added - num_aligned
        if num_left != 0:
            raise ValueError(f"Still had {num_left:,d} remaining reads from BWA")

        # Shut down the BWA mem process.  If it fails to shutdown, log a warning and continue on
        try:
            bwa_mem_process.wait(timeout=5.0)
        except subprocess.TimeoutExpired as ex:
            logger = logging.getLogger(__name__)
            logger.warning("Could not shutdown BWA, ignoring error: %s", str(ex))

        # Shut down the stderr thread
        bwa_mem_stderr_process.join(timeout=1.0)
709 | 


--------------------------------------------------------------------------------
/samwell/sam/clipping.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utility Functions for Soft-Clipping records in SAM/BAM Files
  3 | ------------------------------------------------------------
  4 | 
  5 | This module contains utility functions for soft-clipping reads.  There are four variants
  6 | that support clipping the beginnings and ends of reads, and specifying the amount to be
  7 | clipped in terms of query bases or reference bases:
  8 | 
  9 |     - :func:`~samwell.clipping.softclip_start_of_alignment_by_query` clips the start
 10 |       of the alignment in terms of query bases
 11 |     - :func:`~samwell.clipping.softclip_end_of_alignment_by_query` clips the end
 12 |       of the alignment in terms of query bases
 13 |     - :func:`~samwell.clipping.softclip_start_of_alignment_by_ref` clips the start
 14 |       of the alignment in terms of reference bases
 15 |     - :func:`~samwell.clipping.softclip_end_of_alignment_by_ref` clips the end
 16 |       of the alignment in terms of reference bases
 17 | 
 18 | The difference between query and reference based versions is apparent only when there are
 19 | insertions or deletions in the read as indels have lengths on either the query (insertions) or
 20 | reference (deletions) but not both.
 21 | 
 22 | Upon clipping a set of additional SAM tags are removed from reads as they are likely invalid.
 23 | 
 24 | For example, to clip the last 10 query bases of all records and reduce the qualities to Q2:
 25 | 
 26 | .. code-block:: python
 27 | 
 28 | 
 29 |     >>> from samwell.sam import reader, clipping
 30 |     >>> with reader("/path/to/sample.sam") as fh:
 31 |     ...     for rec in fh:
 32 |     ...         clipping.softclip_end_of_alignment_by_query(rec, 10, 2)
 33 |     ...         print(rec.cigarstring)
 34 | 
 35 | It should be noted that any clipping potentially makes the common SAM tags NM, MD and UQ
 36 | invalid, as well as potentially other alignment based SAM tags.  Any clipping added to the start
 37 | of an alignment changes the position (reference_start) of the record. Any reads that have no
 38 | aligned bases after clipping are set to be unmapped.  If writing the clipped reads back to a BAM
 39 | it should be noted that:
 40 | 
 41 |     - Mate pairs may have incorrect information about their mate's positions
 42 |     - Even if the input was coordinate sorted, the output may be out of order
 43 | 
 44 | To rectify these problems it is necessary to do the equivalent of:
 45 | 
 46 | .. code-block:: bash
 47 | 
 48 |     cat clipped.bam | samtools sort -n | samtools fixmate | samtools sort | samtools calmd
 49 | """
 50 | 
 51 | from array import array
 52 | from typing import Iterable
 53 | from typing import List
 54 | from typing import NamedTuple
 55 | from typing import Optional
 56 | from typing import Tuple
 57 | 
 58 | from pysam import AlignedSegment
 59 | 
 60 | from samwell import dnautils
 61 | from samwell import sam
 62 | from samwell.itertools import peekable
 63 | from samwell.sam import Cigar
 64 | from samwell.sam import CigarElement
 65 | from samwell.sam import CigarOp
 66 | 
 67 | """The default set of SAM tags which become invalid when clipping is applied."""
 68 | TAGS_TO_INVALIDATE: Iterable[str] = ("MD", "NM", "UQ")
 69 | 
 70 | 
 71 | class ClippingInfo(NamedTuple):
 72 |     """Named tuple holding the number of bases clipped on the query and reference respectively.
 73 | 
 74 |     Attributes:
 75 |         query_bases_clipped (int): the number of query bases in the alignment that were clipped.
 76 |         ref_bases_clipped (int): the number of reference bases in the alignment that were clipped.
 77 |     """
 78 |     query_bases_clipped: int
 79 |     ref_bases_clipped: int
 80 | 
 81 | 
 82 | def softclip_start_of_alignment_by_query(rec: AlignedSegment,
 83 |                                          bases_to_clip: int,
 84 |                                          clipped_base_quality: Optional[int] = None,
 85 |                                          tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE
 86 |                                          ) -> ClippingInfo:
 87 |     """
 88 |     Adds soft-clipping to the start of a read's alignment.
 89 | 
 90 |     Clipping is applied after any existing hard or soft clipping.  E.g. a read with cigar 5S100M
 91 |     that is clipped with bases_to_clip=10 will yield a cigar of 15S90M.
 92 | 
 93 |     If the read is unmapped or bases_to_clip < 1 then nothing is done.
 94 | 
 95 |     If the read has fewer clippable bases than requested the read will be unmapped.
 96 | 
 97 |     Args:
 98 |         rec: the BAM record to clip
 99 |         bases_to_clip: the number of additional bases of clipping desired in the read/query
100 |         clipped_base_quality: if not None, set bases in the clipped region to this quality
101 |         tags_to_invalidate: the set of extended attributes to remove upon clipping
102 | 
103 |     Returns:
104 |         ClippingInfo: a named tuple containing the number of query/read bases and the number
105 |             of target/reference bases clipped.
106 |     """
107 |     if rec.is_unmapped or bases_to_clip < 1:
108 |         return ClippingInfo(0, 0)
109 | 
110 |     num_clippable_bases = rec.query_alignment_length
111 | 
112 |     if bases_to_clip >= num_clippable_bases:
113 |         return _clip_whole_read(rec, tags_to_invalidate)
114 | 
115 |     cigar = Cigar.from_cigartuples(rec.cigartuples)
116 |     quals = rec.query_qualities
117 |     new_cigar, clipping_info = _clip(cigar, quals, bases_to_clip, clipped_base_quality)
118 |     rec.query_qualities = quals
119 | 
120 |     rec.reference_start += clipping_info.ref_bases_clipped
121 |     rec.cigarstring = str(new_cigar)
122 |     _cleanup(rec, tags_to_invalidate)
123 |     return clipping_info
124 | 
125 | 
def softclip_end_of_alignment_by_query(rec: AlignedSegment,
                                       bases_to_clip: int,
                                       clipped_base_quality: Optional[int] = None,
                                       tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE
                                       ) -> ClippingInfo:
    """
    Soft-clips additional query bases at the end of a read's alignment.

    The new clipping is placed before any hard or soft clipping already present.  E.g. a read
    with cigar 100M5S that is clipped with bases_to_clip=10 will yield a cigar of 90M15S.

    Nothing is done when the read is unmapped or when bases_to_clip < 1.

    When the request is for at least as many bases as are aligned, the read is made unmapped.

    Args:
        rec: the BAM record to clip
        bases_to_clip: the number of additional bases of clipping desired in the read/query
        clipped_base_quality: if not None, set bases in the clipped region to this quality
        tags_to_invalidate: the set of extended attributes to remove upon clipping

    Returns:
        ClippingInfo: a named tuple containing the number of query/read bases and the number
            of target/reference bases clipped.
    """
    nothing_to_do = rec.is_unmapped or bases_to_clip < 1
    if nothing_to_do:
        return ClippingInfo(0, 0)

    # Asking for every aligned base (or more) leaves nothing aligned
    if rec.query_alignment_length <= bases_to_clip:
        return _clip_whole_read(rec, tags_to_invalidate)

    # The clipping routine only works on the start of a cigar, so flip both the cigar and the
    # qualities to turn the end of the read into the start
    flipped_cigar = Cigar.from_cigartuples(rec.cigartuples).reversed()
    flipped_quals = rec.query_qualities
    flipped_quals.reverse()
    clipped_cigar, info = _clip(flipped_cigar, flipped_quals, bases_to_clip, clipped_base_quality)

    # Flip everything back into the original orientation before storing it on the record
    flipped_quals.reverse()
    rec.query_qualities = flipped_quals
    rec.cigarstring = str(clipped_cigar.reversed())

    _cleanup(rec, tags_to_invalidate)
    return info
172 | 
173 | 
def softclip_start_of_alignment_by_ref(rec: AlignedSegment,
                                       bases_to_clip: int,
                                       clipped_base_quality: Optional[int] = None,
                                       tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE
                                       ) -> ClippingInfo:
    """Soft-clips the start of an alignment by bases_to_clip bases on the reference.

    Clipping is applied after any existing hard or soft clipping.  E.g. a read with cigar 5S100M
    that is clipped with bases_to_clip=10 will yield a cigar of 15S90M.

    If the read is unmapped or bases_to_clip < 1 then nothing is done.

    If the read has fewer clippable bases than requested the read will be unmapped.

    Args:
        rec: the BAM record to clip
        bases_to_clip: the number of additional bases of clipping desired on the reference
        clipped_base_quality: if not None, set bases in the clipped region to this quality
        tags_to_invalidate: the set of extended attributes to remove upon clipping

    Returns:
        ClippingInfo: a named tuple containing the number of query/read bases and the number
            of target/reference bases clipped.
    """
    # Honour the documented no-op contract up front.  Without this guard an unmapped read (whose
    # reference_length is None in pysam) fails the comparison below, and a negative
    # bases_to_clip produces a reference position outside of the alignment.
    if rec.is_unmapped or bases_to_clip < 1:
        return ClippingInfo(0, 0)

    if rec.reference_length <= bases_to_clip:
        return _clip_whole_read(rec, tags_to_invalidate)

    # Translate the number of reference bases into a number of query bases: find the query
    # position aligned to the new first reference position (or the next aligned query position
    # if that reference position is deleted in the read).
    new_start = rec.reference_start + bases_to_clip
    new_query_start = _read_pos_at_ref_pos(rec, new_start, previous=False)
    query_bases_to_clip = new_query_start - rec.query_alignment_start
    return softclip_start_of_alignment_by_query(rec,
                                                query_bases_to_clip,
                                                clipped_base_quality,
                                                tags_to_invalidate)
208 | 
209 | 
def softclip_end_of_alignment_by_ref(rec: AlignedSegment,
                                     bases_to_clip: int,
                                     clipped_base_quality: Optional[int] = None,
                                     tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE
                                     ) -> ClippingInfo:
    """Soft-clips the end of an alignment by bases_to_clip bases on the reference.

    Clipping is applied before any existing hard or soft clipping.  E.g. a read with cigar 100M5S
    that is clipped with bases_to_clip=10 will yield a cigar of 90M15S.

    If the read is unmapped or bases_to_clip < 1 then nothing is done.

    If the read has fewer clippable bases than requested the read will be unmapped.

    Args:
        rec: the BAM record to clip
        bases_to_clip: the number of additional bases of clipping desired on the reference
        clipped_base_quality: if not None, set bases in the clipped region to this quality
        tags_to_invalidate: the set of extended attributes to remove upon clipping

    Returns:
        ClippingInfo: a named tuple containing the number of query/read bases and the number
            of target/reference bases clipped.
    """
    # Honour the documented no-op contract up front.  Without this guard an unmapped read (whose
    # reference_length is None in pysam) fails the comparison below, and bases_to_clip < 1
    # produces a reference position at or beyond reference_end, which is outside the alignment.
    if rec.is_unmapped or bases_to_clip < 1:
        return ClippingInfo(0, 0)

    if rec.reference_length <= bases_to_clip:
        return _clip_whole_read(rec, tags_to_invalidate)

    # Translate the number of reference bases into a number of query bases: new_end is the first
    # reference position to be clipped, so find the query position aligned to it (or the next
    # aligned query position if that reference position is deleted in the read).
    new_end = rec.reference_end - bases_to_clip
    new_query_end = _read_pos_at_ref_pos(rec, new_end, previous=False)
    query_bases_to_clip = rec.query_alignment_end - new_query_end
    return softclip_end_of_alignment_by_query(rec,
                                              query_bases_to_clip,
                                              clipped_base_quality,
                                              tags_to_invalidate)
244 | 
245 | 
def _clip_whole_read(rec: AlignedSegment, tags_to_invalidate: Iterable[str]) -> ClippingInfo:
    """Private method that removes the given tags, unmaps the read, and reports what was lost.

    The returned ClippingInfo holds the query alignment length and reference length the record
    had before it was unmapped.
    """
    # Both lengths must be read before the mapping information is removed
    query_length = rec.query_alignment_length
    ref_length = rec.reference_length

    _cleanup(rec, tags_to_invalidate)
    _make_read_unmapped(rec)
    return ClippingInfo(query_length, ref_length)
252 | 
253 | 
def _make_read_unmapped(rec: AlignedSegment) -> None:
    """Strips the mapping information from a read, leaving it flagged as unmapped.

    A read on the reverse strand has its bases reverse complemented and its qualities reversed,
    and is then no longer flagged as reverse.
    """
    if rec.is_reverse:
        # The qualities are fetched before the sequence is replaced and re-assigned afterwards
        # (presumably because assigning the sequence resets the qualities in pysam -- confirm)
        flipped_quals = rec.query_qualities
        flipped_quals.reverse()
        rec.query_sequence = dnautils.reverse_complement(rec.query_sequence)
        rec.query_qualities = flipped_quals
        rec.is_reverse = False

    # Clear the position, cigar and mapping-derived values
    rec.reference_id = sam.NO_REF_INDEX
    rec.reference_start = sam.NO_REF_POS
    rec.cigar = None
    rec.mapping_quality = 0
    rec.template_length = 0

    # Turn off the flags that are cleared for the unmapped read, then mark it as unmapped
    for flag_name in ("is_duplicate", "is_secondary", "is_supplementary", "is_proper_pair"):
        setattr(rec, flag_name, False)
    rec.is_unmapped = True
273 | 
274 | 
275 | def _cleanup(rec: AlignedSegment, tags_to_invalidate: Iterable[str]) -> None:
276 |     """Removes extended tags from a record that may have become invalid after clipping."""
277 |     for tag in tags_to_invalidate:
278 |         rec.set_tag(tag, None)
279 | 
280 | 
281 | def _read_pos_at_ref_pos(rec: AlignedSegment,
282 |                          ref_pos: int,
283 |                          previous: Optional[bool] = None) -> Optional[int]:
284 |     """
285 |     Returns the read or query position at the reference position.
286 | 
287 |     If the reference position is not within the span of reference positions to which the
288 |     read is aligned an exception will be raised.  If the reference position is within the span
289 |     but is not aligned (i.e. it is deleted in the read) behavior is controlled by the
290 |     "previous" argument.
291 | 
292 |     Args:
293 |         rec: the AlignedSegment within which to find the read position
294 |         ref_pos: the reference position to be found
295 |         previous: Controls behavior when the reference position is not aligned to any
296 |             read position.  True indicates to return the previous read position, False
297 |             indicates to return the next read position and None indicates to return None.
298 | 
299 |     Returns:
300 |         The read position at the reference position, or None.
301 |     """
302 |     if ref_pos < rec.reference_start or ref_pos >= rec.reference_end:
303 |         raise ValueError(f"{ref_pos} is not within the reference span for read {rec.query_name}")
304 | 
305 |     pairs = rec.get_aligned_pairs()
306 |     index = 0
307 |     read_pos = None
308 |     for read, ref in pairs:
309 |         if ref == ref_pos:
310 |             read_pos = read
311 |             break
312 |         else:
313 |             index += 1
314 | 
315 |     if not read_pos and previous is not None:
316 |         if previous:
317 |             while read_pos is None and index > 0:
318 |                 index -= 1
319 |                 read_pos = pairs[index][0]
320 |         else:
321 |             while read_pos is None and index < len(pairs):
322 |                 read_pos = pairs[index][0]
323 |                 index += 1
324 | 
325 |     return read_pos
326 | 
327 | 
def _clip(cigar: Cigar,
          quals: array,
          bases_to_clip: int,
          clipped_base_quality: Optional[int]) -> Tuple[Cigar, ClippingInfo]:
    """Workhorse private clipping method that clips the start of cigars.

    Always works on the start of the cigars/quals; end-clipping is accomplished by
    reversing value before calling this function.  Since the function is private it
    makes the following assumptions:

    1. There are at least bases_to_clip bases available for clipping in the read
    2. The cigar and quals agree on the query length
    3. clipped_base_quality is either None or a valid integer base quality

    Args:
        cigar: the cigar whose start is to be clipped
        quals: the base qualities; modified in place when clipped_base_quality is not None
        bases_to_clip: the number of additional query bases to soft-clip
        clipped_base_quality: if not None, the quality assigned to every soft-clipped base,
            including bases that were already soft-clipped

    Returns:
        A tuple of the new cigar and a ClippingInfo with the number of query and reference
        bases that were newly clipped.  The number of query bases can be larger than
        bases_to_clip when the clipping point falls within an insertion.

    Raises:
        ValueError: if the cigar contains padding
    """

    if any(cig.operator == CigarOp.P for cig in cigar.elements):
        raise ValueError(f"Cannot handle cigars that contain padding: {cigar}")

    # Pull off the leading hard clips and then the leading soft clips; elems is left positioned
    # at the first element after the existing clipping
    elems = peekable(cigar.elements)
    existing_hard_clips = elems.takewhile(lambda c: c.operator == CigarOp.H)
    existing_soft_clips = elems.takewhile(lambda c: c.operator == CigarOp.S)
    read_bases_clipped = 0
    ref_bases_clipped = 0
    new_elems: List[CigarElement] = []  # buffer of cigar elements used to make the returned cigar

    # Returns true if the operator immediately after the clipping point is a deletion
    def is_trailing_deletion() -> bool:
        # Four conditions must be met:
        # 1. The number of bases _to_ clip equals the number of bases _already_ clipped
        # 2. The clipping point falls between operators (i.e. new_elems is empty)
        # 3. There is at least one more element to consider.
        # 4. The next element is a deletion.
        return read_bases_clipped == bases_to_clip \
            and not new_elems \
            and elems.peek() is not None \
            and elems.peek().operator == CigarOp.D

    # The loop skips over all operators that are getting turned into clipping, while keeping track
    # of how many reference bases and how many read bases are skipped over.  If the clipping point
    # falls between existing operators then the new_elems buffer is empty at the end of the while
    # loop. If the clip point falls within:
    #    a) an alignment operator then the operator is split and the remainder added to the buffer
    #    b) an insertion: the remainder of the insertion is also clipped
    # If the operator immediately after the clip is a deletion, it is also discarded.
    #
    # At the end of the while loop new_elems is either:
    #   a) Empty
    #   b) Contains a single element which is the remainder of an element that had to be split
    while read_bases_clipped < bases_to_clip or is_trailing_deletion():
        elem = next(elems)
        op: CigarOp = elem.operator
        length: int = elem.length
        remaining_to_clip = bases_to_clip - read_bases_clipped

        if op.consumes_query and length > remaining_to_clip:
            # The clipping point falls inside this element
            if op == CigarOp.I:
                # Clip the whole insertion; no reference bases are counted for it
                read_bases_clipped += length
            else:
                # Split the element: clip the first part and keep the remainder
                remaining_length = length - remaining_to_clip
                read_bases_clipped += remaining_to_clip
                ref_bases_clipped += remaining_to_clip
                new_elems.append(CigarElement(remaining_length, op))
        else:
            # The whole element is consumed by the clipping
            read_bases_clipped += elem.length_on_query
            ref_bases_clipped += elem.length_on_target

    # Add in the remainder of the elements post-clipping
    new_elems.extend(elems)

    # Add in the clips: existing hard clips are kept as they are, while the existing soft clips
    # are merged with the newly clipped bases into a single soft-clip element
    clip_elems = []
    hard_clip_length = sum(map(lambda e: e.length, existing_hard_clips))
    soft_clip_length = sum(map(lambda e: e.length, existing_soft_clips)) + read_bases_clipped
    if hard_clip_length > 0:
        clip_elems.append(CigarElement(hard_clip_length, CigarOp.H))
    if soft_clip_length > 0:
        clip_elems.append(CigarElement(soft_clip_length, CigarOp.S))

    # Touch up the qualities if requested
    if clipped_base_quality is not None:
        for index in range(0, soft_clip_length):
            quals[index] = clipped_base_quality

    new_cigar = Cigar(tuple(clip_elems + new_elems))
    return new_cigar, ClippingInfo(read_bases_clipped, ref_bases_clipped)
413 | 


--------------------------------------------------------------------------------
/samwell/sam/sambuilder.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Classes for generating SAM and BAM files and records for testing
  3 | ----------------------------------------------------------------
  4 | 
  5 | This module contains utility classes for the generation of SAM and BAM files and
  6 | alignment records, for use in testing.
  7 | 
  8 | The module contains the following public classes:
  9 | 
 10 |     - :class:`~samwell.sam.sambuilder.SamBuilder` -- A builder class that allows the accumulation
 11 |         of alignment records and access as a list and writing to file.
 12 | """
 13 | 
 14 | from pathlib import Path
 15 | from random import Random
 16 | from tempfile import NamedTemporaryFile
 17 | from typing import Any
 18 | from typing import Callable
 19 | from typing import Dict
 20 | from typing import IO
 21 | from typing import List
 22 | from typing import Optional
 23 | from typing import Tuple
 24 | 
 25 | import pysam
 26 | from pysam import AlignmentHeader, AlignedSegment
 27 | 
 28 | from samwell import sam
 29 | from samwell.sam import SamOrder
 30 | 
 31 | 
 32 | class SamBuilder:
 33 |     """Builder for constructing one or more sam records (`AlignmentSegment`s in pysam terms).
 34 | 
 35 |     Provides the ability to manufacture records from minimal arguments, while generating
 36 |     any remaining attributes to ensure a valid record.
 37 | 
 38 |     A builder is constructed with a handful of defaults including lengths for generated R1s
 39 |     and R2s, the default base quality score to use, a sequence dictionary and a single read group.
 40 | 
 41 |     Records are then added using the :func:`~samwell.sambuilder.SamBuilder.add_pair` method.
 42 |     Once accumulated the records can be accessed in the order in which they were created through
 43 |     the :func:`~samwell.sambuilder.SamBuilder.to_unsorted_list` function, or in a list sorted
 44 |     by coordinate order via :func:`~samwell.sambuilder.SamBuilder.to_sorted_list`.  The latter
 45 |     creates a temporary file to do the sorting and is somewhat slower as a result.
 46 | 
 47 |     Records can be further modified after being returned from
 48 |     :func:`~samwell.sam.sambuilder.SamBuilder.add_pair`,
 49 |     :func:`~samwell.sam.sambuilder.SamBuilder.to_unsorted_list`, or
 50 |     :func:`~samwell.sam.sambuilder.SamBuilder.to_sorted_list` by directly accessing their
 51 |     attributes through the
 52 |     [AlignedSegment](https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment) API.
 53 | 
 54 |     Lastly, the records can be written to a temporary file using
 55 |     :func:`~samwell.sam.sambuilder.SamBuilder.to_path`.
 56 |     """
 57 | 
 58 |     # The default read one length
 59 |     DEFAULT_R1_LENGTH: int = 100
 60 | 
 61 |     # The default read two length
 62 |     DEFAULT_R2_LENGTH: int = 100
 63 | 
 64 |     @staticmethod
 65 |     def default_sd() -> List[Dict[str, Any]]:
 66 |         """Generates the sequence dictionary that is used by default by SamBuilder.
 67 | 
 68 |         Matches the names and lengths of the HG19 reference in use in production.
 69 | 
 70 |         Returns:
 71 |             A new copy of the sequence dictionary as a list of dictionaries, one per chromosome.
 72 |         """
 73 |         return [
 74 |             {"SN": "chr1", "LN": 249250621},
 75 |             {"SN": "chr2", "LN": 243199373},
 76 |             {"SN": "chr3", "LN": 198022430},
 77 |             {"SN": "chr4", "LN": 191154276},
 78 |             {"SN": "chr5", "LN": 180915260},
 79 |             {"SN": "chr6", "LN": 171115067},
 80 |             {"SN": "chr7", "LN": 159138663},
 81 |             {"SN": "chr8", "LN": 146364022},
 82 |             {"SN": "chr9", "LN": 141213431},
 83 |             {"SN": "chr10", "LN": 135534747},
 84 |             {"SN": "chr11", "LN": 135006516},
 85 |             {"SN": "chr12", "LN": 133851895},
 86 |             {"SN": "chr13", "LN": 115169878},
 87 |             {"SN": "chr14", "LN": 107349540},
 88 |             {"SN": "chr15", "LN": 102531392},
 89 |             {"SN": "chr16", "LN": 90354753},
 90 |             {"SN": "chr17", "LN": 81195210},
 91 |             {"SN": "chr18", "LN": 78077248},
 92 |             {"SN": "chr19", "LN": 59128983},
 93 |             {"SN": "chr20", "LN": 63025520},
 94 |             {"SN": "chr21", "LN": 48129895},
 95 |             {"SN": "chr22", "LN": 51304566},
 96 |             {"SN": "chrX", "LN": 155270560},
 97 |             {"SN": "chrY", "LN": 59373566},
 98 |             {"SN": "chrM", "LN": 16571}
 99 |         ]
100 | 
101 |     @staticmethod
102 |     def default_rg() -> Dict[str, str]:
103 |         """Returns the default read group used by the SamBuilder, as a dictionary."""
104 |         return {"ID": "1", "SM": "1_AAAAAA", "LB": "default", "PL": "ILLUMINA", "PU": "xxx.1"}
105 | 
106 |     def __init__(self,
107 |                  r1_len: Optional[int] = None,
108 |                  r2_len: Optional[int] = None,
109 |                  base_quality: int = 30,
110 |                  mapping_quality: int = 60,
111 |                  sd: Optional[List[Dict[str, Any]]] = None,
112 |                  rg: Optional[Dict[str, str]] = None,
113 |                  extra_header: Optional[Dict[str, Any]] = None,
114 |                  seed: int = 42,
115 |                  sort_order: Optional[SamOrder] = SamOrder.Coordinate,
116 |                  ) -> None:
117 |         """Initializes a new SamBuilder for generating alignment records and SAM/BAM files.
118 | 
119 |         Args:
120 |             r1_len: The length of R1s to create unless otherwise specified
121 |             r2_len: The length of R2s to create unless otherwise specified
122 |             base_quality: The base quality of bases to create unless otherwise specified
123 |             sd: a sequence dictionary as a list of dicts; defaults to calling default_sd() if None
124 |             rg: a single read group as a dict; defaults to calling default_sd() if None
125 |             extra_header: a dictionary of extra values to add to the header, None otherwise.  See
126 |                           `::class::~pysam.AlignmentHeader` for more details.
127 |             seed: a seed value for random number/string generation
128 |             sort_order: optional sort order, if `None` reads will be output in the same order as
129 |                 they were appended. If `SamOrder.Coordinate`, reads will be ordered by reference
130 |                 index and coordinate order. If `SamOrder.QueryName`, reads will be ordered by
131 |                 query name.
132 |         """
133 | 
134 |         self.r1_len: int = r1_len if r1_len is not None else self.DEFAULT_R1_LENGTH
135 |         self.r2_len: int = r2_len if r2_len is not None else self.DEFAULT_R2_LENGTH
136 |         self.base_quality: int = base_quality
137 |         self.mapping_quality: int = mapping_quality
138 | 
139 |         sort_order = (
140 |             SamOrder.Unsorted
141 |             if sort_order is None
142 |             else sort_order
143 |         )
144 |         assert sort_order in [SamOrder.Coordinate, SamOrder.QueryName, SamOrder.Unsorted], (
145 |             "`sort_order for `SamBuilder` must be one of `Coordinate` `QueryName` or `Unsorted`"
146 |         )
147 |         self.sort_order: SamOrder = sort_order
148 | 
149 |         self._header: Dict[str, Any] = {
150 |             "HD": {"VN": "1.5", "SO": sort_order.value},
151 |             "SQ": (sd if sd is not None else SamBuilder.default_sd()),
152 |             "RG": [(rg if rg is not None else SamBuilder.default_rg())]
153 |         }
154 |         if extra_header is not None:
155 |             self._header = {**self._header, **extra_header}
156 |         self._samheader = AlignmentHeader.from_dict(self._header)
157 |         self._seq_lookup = dict([(s["SN"], s) for s in self._header["SQ"]])
158 | 
159 |         self._random: Random = Random(seed)
160 |         self._records: List[AlignedSegment] = []
161 |         self._counter: int = 0
162 | 
163 |     def _next_name(self) -> str:
164 |         """Returns the next available query/template name."""
165 |         n = self._counter
166 |         self._counter += 1
167 |         return f"q{n:>04}"
168 | 
169 |     def _bases(self, length: int) -> str:
170 |         """Returns a random string of bases of the length requested."""
171 |         return "".join(self._random.choices("ACGT", k=length))  # type: ignore
172 | 
173 |     def _new_rec(self,
174 |                  name: str,
175 |                  chrom: str,
176 |                  start: int,
177 |                  attrs: Optional[Dict[str, Any]]) -> AlignedSegment:
178 |         """Generates a new AlignedSegment.  Sets the segment up with the correct
179 |         header and adds the RG attribute if not contained in attrs.
180 | 
181 |         Args:
182 |             name: the name of the read/template
183 |             chrom: the chromosome to which the read is mapped
184 |             start: the start position of the read on the chromosome
185 |             attrs: an optional dictionary of SAM attributes with two-char keys
186 | 
187 |         Returns:
188 |             AlignedSegment: an aligned segment with name, chrom, pos, attributes the
189 |                 read group, and the unmapped flag all set appropriately.
190 |         """
191 |         if chrom is not sam.NO_REF_NAME and chrom not in self._seq_lookup:
192 |             raise ValueError(f"{chrom} is not a valid chromosome name in this builder.")
193 | 
194 |         rec = AlignedSegment(header=self._samheader)
195 |         rec.query_name = name
196 |         rec.reference_name = chrom
197 |         rec.reference_start = start
198 |         rec.mapping_quality = self.mapping_quality
199 | 
200 |         if chrom == sam.NO_REF_NAME or start == sam.NO_REF_POS:
201 |             rec.is_unmapped = True
202 | 
203 |         attrs = attrs if attrs else dict()
204 |         if "RG" not in attrs:
205 |             attrs["RG"] = self.rg_id
206 |         rec.set_tags(list(attrs.items()))
207 |         return rec
208 | 
209 |     def _set_flags(self, rec: pysam.AlignedSegment, is_r1: bool, strand: str) -> None:
210 |         """Appropriately sets most flag fields on the given read.
211 | 
212 |         Args:
213 |             rec: the read to set the flags on
214 |             is_r1: True if the read is a R1, False if it is an R2
215 |             strand: Either "+" or "-" to indicate strand of the read
216 |         """
217 |         rec.is_paired = True
218 |         rec.is_read1 = is_r1
219 |         rec.is_read2 = not is_r1
220 |         rec.is_qcfail = False
221 |         rec.is_duplicate = False
222 |         rec.is_secondary = False
223 |         rec.is_supplementary = False
224 |         if not rec.is_unmapped:
225 |             rec.is_reverse = strand != "+"
226 | 
227 |     def _set_length_dependent_fields(self,
228 |                                      rec: pysam.AlignedSegment,
229 |                                      length: int,
230 |                                      bases: Optional[str] = None,
231 |                                      quals: Optional[List[int]] = None,
232 |                                      cigar: Optional[str] = None,
233 |                                      ) -> None:
234 |         """Fills in bases, quals and cigar on a record.
235 | 
236 |         If any of bases, quals or cigar are defined, they must all have the same length/query
237 |         length.  If none are defined then the length parameter is used.  Undefined values are
238 |         synthesize at the inferred length.
239 | 
240 |         Args:
241 |             rec: a SAM record
242 |             length: the length to use if all of bases/quals/cigar are None
243 |             bases: an optional string of bases for the read
244 |             quals: an optional list of qualities for the read
245 |             cigar: an optional cigar string for the read
246 |         """
247 | 
248 |         # Do some validation to make sure all defined things have the same lengths
249 |         lengths = set()
250 |         if bases is not None:
251 |             lengths.add(len(bases))
252 |         if quals is not None:
253 |             lengths.add(len(quals))
254 |         if cigar is not None:
255 |             cig = sam.Cigar.from_cigarstring(cigar)
256 |             lengths.add(sum([elem.length_on_query for elem in cig.elements]))
257 | 
258 |         if not lengths:
259 |             lengths.add(length)
260 | 
261 |         if len(lengths) != 1:
262 |             raise ValueError("Provided bases/quals/cigar are not length compatible.")
263 | 
264 |         # Fill in the record, making any parts that were not defined as params
265 |         length = lengths.pop()
266 |         rec.query_sequence = bases if bases else self._bases(length)
267 |         rec.query_qualities = quals if quals else [self.base_quality] * length
268 |         if not rec.is_unmapped:
269 |             rec.cigarstring = cigar if cigar else f"{length}M"
270 | 
    def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
        """Sets the mate information on a pair of sam records.

        Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
        are unmapped.  Both records are modified in place: the template length and proper-pair
        flag are reset first and then, depending on the case, the mate reference/position/strand
        fields, the mate-unmapped flag and the "MC" tag are filled in.

        Args:
            r1: the first read in the pair
            r2: the second read in the pair
        """
        # Start from a clean slate; only the both-mapped, same-reference case below
        # overwrites these values.
        for rec in r1, r2:
            rec.template_length = 0
            rec.is_proper_pair = False

        if r1.is_unmapped and r2.is_unmapped:
            # If they're both unmapped just clean the records up
            for rec, other in [(r1, r2), (r2, r1)]:
                rec.reference_id = sam.NO_REF_INDEX
                rec.next_reference_id = sam.NO_REF_INDEX
                rec.reference_start = sam.NO_REF_POS
                rec.next_reference_start = sam.NO_REF_POS
                rec.is_unmapped = True
                rec.mate_is_unmapped = True
                rec.is_proper_pair = False
                rec.mate_is_reverse = other.is_reverse

        elif r1.is_unmapped or r2.is_unmapped:
            # If only one is mapped/unmapped copy over the relevant stuff
            # m is the mapped read, u is the unmapped read
            (m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
            # The unmapped read takes on the reference and position of its mapped mate
            u.reference_id = m.reference_id
            u.reference_start = m.reference_start
            u.next_reference_id = m.reference_id
            u.next_reference_start = m.reference_start
            u.mate_is_reverse = m.is_reverse
            u.mate_is_unmapped = False
            u.set_tag("MC", m.cigarstring)

            # u's reference fields were just copied from m, so m's mate fields end up
            # pointing at m's own reference and start position
            m.next_reference_id = u.reference_id
            m.next_reference_start = u.reference_start
            m.mate_is_reverse = u.is_reverse
            m.mate_is_unmapped = True

        else:
            # Else they are both mapped
            for rec, other in [(r1, r2), (r2, r1)]:
                rec.next_reference_id = other.reference_id
                rec.next_reference_start = other.reference_start
                rec.mate_is_reverse = other.is_reverse
                rec.mate_is_unmapped = False
                rec.set_tag("MC", other.cigarstring)

            if r1.reference_id == r2.reference_id:
                # Measure from reference_end for reverse-strand reads and from
                # reference_start otherwise; the two reads get equal and opposite lengths
                r1p = r1.reference_end if r1.is_reverse else r1.reference_start
                r2p = r2.reference_end if r2.is_reverse else r2.reference_start
                r1.template_length = r2p - r1p
                r2.template_length = r1p - r2p

                # Arbitrarily set proper pair if we have an FR pair with isize <= 1000
                if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
                    # fpos/rpos are the positions of the forward and reverse read respectively
                    fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
                    if fpos < rpos:
                        r1.is_proper_pair = True
                        r2.is_proper_pair = True
334 | 
335 |     @property
336 |     def rg(self) -> Dict[str, Any]:
337 |         """Returns the single read group that is defined in the header."""
338 |         rgs = self._header["RG"]
339 |         assert len(rgs) == 1, "Header did not contain exactly one read group!"
340 |         return rgs[0]
341 | 
342 |     @property
343 |     def rg_id(self) -> str:
344 |         """Returns the ID of the single read group that is defined in the header."""
345 |         return self.rg["ID"]
346 | 
347 |     def add_pair(self, *,
348 |                  name: Optional[str] = None,
349 |                  bases1: Optional[str] = None,
350 |                  bases2: Optional[str] = None,
351 |                  quals1: Optional[List[int]] = None,
352 |                  quals2: Optional[List[int]] = None,
353 |                  chrom: str = sam.NO_REF_NAME,
354 |                  start1: int = sam.NO_REF_POS,
355 |                  start2: int = sam.NO_REF_POS,
356 |                  cigar1: Optional[str] = None,
357 |                  cigar2: Optional[str] = None,
358 |                  strand1: str = "+",
359 |                  strand2: str = "-",
360 |                  attrs: Optional[Dict[str, Any]] = None) -> Tuple[AlignedSegment, AlignedSegment]:
361 |         """Generates a new pair of reads, adds them to the internal collection, and returns them.
362 | 
363 |         Most fields are optional.
364 | 
365 |         An unmapped pair can be created by calling the method with no parameters (specifically,
366 |         not setting chrom, start1 or start2).  If either cigar is provided, it will be ignored.
367 | 
368 |         A pair with only one of the two reads mapped is created by setting e.g. chrom and start1.
369 |         The values will be automaticaly transferred to the unmapped mate, and flags set correctly.
370 | 
371 |         A mapped pair is created by providing all three of chrom, start1 and start2.
372 | 
373 |         For a given read (i.e. R1 or R2) the length of the read is determined based on the presence
374 |         or absence of bases, quals, and cigar.  If values are provided for one or more of these
375 |         parameters, the lengths must match, and the length will be used to generate any
376 |         unsupplied values.  If none of bases, quals, and cigar are provided, all three will be
377 |         synthesized based on either the r1_len or r2_len stored on the class as appropriate.
378 | 
379 |         When synthesizing, bases are always a random sequence of bases, quals are all the default
380 |         base quality (supplied when constructing a SamBuilder) and the cigar is always a single M
381 |         operator of the read length.
382 | 
383 |         Alignment attributes not exposed through the method parameters can be modified directly on
384 |         the returned AlignedSegment objects. Modifications will be reflected when records are
385 |         written to a temporary file with :func:`~samwell.sambuilder.SamBuilder.to_path`.
386 | 
387 |         Args:
388 |             name: The name of the template. If None is given a unique name will be auto-generated.
389 |             bases1: The bases for R1. If None is given a random sequence is generated.
390 |             bases2: The bases for R2. If None is given a random sequence is generated.
391 |             quals1: The list of int qualities for R1. If None, the default base quality is used.
392 |             quals2: The list of int qualities for R2. If None, the default base quality is used.
393 |             chrom: The chromosome to which both reads are mapped. Defaults to the unmapped value.
394 |             start1: The start position of R1. Defaults to the unmapped value.
395 |             start2: The start position of R2. Defaults to the unmapped value.
396 |             cigar1: The cigar string for R1. Defaults to None for unmapped reads, otherwise all M.
397 |             cigar2: The cigar string for R2. Defaults to None for unmapped reads, otherwise all M.
398 |             strand1: The strand for R1, either "+" or "-". Defaults to "+".
399 |             strand2: The strand for R2, either "+" or "-". Defaults to "-".
400 |             attrs: An optional dictionary of SAM attribute to place on both R1 and R2.
401 | 
402 |         Raises:
403 |             ValueError: if either strand field is not "+" or "-"
404 |             ValueError: if bases/quals/cigar are set in a way that is not self-consistent
405 | 
406 |         Returns:
407 |             Tuple[AlignedSegment, AlignedSegment]: The pair of records created, R1 then R2.
408 |         """
409 | 
410 |         if strand1 not in ["+", "-"]: raise ValueError(f"Invalid value for strand1: {strand1}")
411 |         if strand2 not in ["+", "-"]: raise ValueError(f"Invalid value for strand2: {strand2}")
412 | 
413 |         name = name if name is not None else self._next_name()
414 | 
415 |         # Setup R1
416 |         r1 = self._new_rec(name=name, chrom=chrom, start=start1, attrs=attrs)
417 |         self._set_flags(r1, is_r1=True, strand=strand1)
418 |         self._set_length_dependent_fields(
419 |             rec=r1, length=self.r1_len, bases=bases1, quals=quals1, cigar=cigar1)
420 | 
421 |         # Setup R2
422 |         r2 = self._new_rec(name=name, chrom=chrom, start=start2, attrs=attrs)
423 |         self._set_flags(r2, is_r1=False, strand=strand2)
424 |         self._set_length_dependent_fields(
425 |             rec=r2, length=self.r2_len, bases=bases2, quals=quals2, cigar=cigar2)
426 | 
427 |         # Sync up mate info and we're done!
428 |         self._set_mate_info(r1, r2)
429 |         self._records.append(r1)
430 |         self._records.append(r2)
431 |         return r1, r2
432 | 
    def to_path(self,
                path: Optional[Path] = None,
                index: bool = True,
                pred: Callable[[AlignedSegment], bool] = lambda r: True) -> Path:
        """Writes the accumulated records to a BAM file and returns its Path.

        If a path is provided, it will be written to, otherwise a temporary file (which is not
        deleted automatically) is created and returned.  When the builder's `sort_order` is
        `QueryName` or `Coordinate`, the records are first written to a scratch temporary file
        and then sorted into `path`; when it is `Unsorted` they are written to `path` directly
        in the order they were added.

        Args:
            path: a path at which to write the file, otherwise a temp file is used.
            index: if True and `sort_order` is `Coordinate` index is generated, otherwise not.
            pred: optional predicate to specify which reads should be output

        Returns:
            Path: The path to the written (possibly sorted and possibly indexed) file.
        """

        if path is None:
            # Only used to reserve a file name; delete=False keeps the file for the caller
            with NamedTemporaryFile(suffix=".bam", delete=False) as fp:
                path = Path(fp.name)

        # Scratch file holding the records before sorting; removed when the block exits
        with NamedTemporaryFile(suffix=".bam", delete=True) as fp:
            file_handle: IO
            if self.sort_order is SamOrder.Unsorted:
                # No sorting needed, so write straight to the destination
                # NOTE(review): 'w' opens the path in text mode although BAM output is
                # requested below -- confirm that sam.writer copes with a text-mode handle.
                file_handle = path.open('w')
            else:
                file_handle = fp.file

            with sam.writer(file_handle,  # type: ignore
                            header=self._samheader,
                            file_type=sam.SamFileType.BAM) as writer:
                for rec in self._records:
                    if pred(rec):
                        writer.write(rec)

            # Arguments shared by both sort invocations: output path, then the scratch input
            default_samtools_opt_list = ["-o", str(path), fp.name]

            # Closed explicitly before sorting; presumably this ensures everything written
            # is on disk before the scratch file is read by name -- TODO confirm.
            file_handle.close()
            if self.sort_order == SamOrder.QueryName:
                # "-n" is added to the sort arguments for the QueryName order
                pysam.sort(*(["-n"] + default_samtools_opt_list))
            elif self.sort_order == SamOrder.Coordinate:
                pysam.sort(*default_samtools_opt_list)
                if index:
                    pysam.index(str(path))
        return path
478 | 
479 |     def __len__(self) -> int:
480 |         """Returns the number of records accumulated so far."""
481 |         return len(self._records)
482 | 
483 |     def to_unsorted_list(self) -> List[pysam.AlignedSegment]:
484 |         """Returns the accumulated records in the order they were created."""
485 |         return list(self._records)
486 | 
487 |     def to_sorted_list(self) -> List[pysam.AlignedSegment]:
488 |         """Returns the accumulated records in coordinate order."""
489 |         with NamedTemporaryFile(suffix=".bam", delete=True) as fp:
490 |             filename = fp.name
491 |             path = self.to_path(path=Path(filename), index=False)
492 |             bam = sam.reader(path)
493 |             return list(bam)
494 | 
495 |     @property
496 |     def header(self) -> AlignmentHeader:
497 |         """Returns a copy of the alignmentt header used by this builder"""
498 |         return AlignmentHeader.from_dict(self._header)
499 | 


--------------------------------------------------------------------------------
/samwell/sam/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myriad-opensource/samwell/47c4d809e3a228cf2be7af09871ab70e706763a1/samwell/sam/tests/__init__.py


--------------------------------------------------------------------------------
/samwell/sam/tests/data/valid.sam:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.0	SO:coordinate
 2 | @SQ	SN:chr1	LN:101
 3 | @SQ	SN:chr2	LN:101
 4 | @SQ	SN:chr3	LN:101
 5 | @SQ	SN:chr4	LN:101
 6 | @SQ	SN:chr5	LN:101
 7 | @SQ	SN:chr6	LN:101
 8 | @SQ	SN:chr7	LN:404
 9 | @SQ	SN:chr8	LN:202
10 | @RG	ID:0	SM:Hi,Mom!	LB:my-library	PL:ILLUMINA
11 | @RG	ID:1	SM:Hi,Mom!	LB:my-library	PL:ILLUMINA
12 | @RG	ID:2	SM:Hi,Mom!	LB:my-library	PL:Illumina
13 | @PG	ID:1	PN:Hey!	VN:2.0
14 | both_reads_align_clip_marked	1107	chr7	1	255	101M	=	302	201	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:0	PG:Z:1	NM:i:0	MQ:i:255	XT:Z:foo	OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
15 | both_reads_present_only_first_aligns	89	chr7	1	255	101M	*	0	0	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:1	PG:Z:1	NM:i:3	MQ:i:255	XT:Z:foo	OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
16 | read_2_too_many_gaps	83	chr7	1	255	101M	=	302	201	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:2	PG:Z:1	NM:i:8	MQ:i:255	XT:Z:foo2	OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
17 | both_reads_align_clip_adapter	147	chr7	16	255	101M	=	21	-96	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:1	PG:Z:1	NM:i:1	MQ:i:255	XT:Z:foo2	OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
18 | both_reads_align_clip_adapter	99	chr7	21	255	101M	=	16	96	CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN	)'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/&	RG:Z:1	PG:Z:1	NM:i:1	MQ:i:255	XT:Z:foo2	OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
19 | both_reads_align_clip_marked	163	chr7	302	255	101M	=	1	-201	NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA	&/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1	RG:Z:0	PG:Z:1	NM:i:5	MQ:i:255	OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
20 | read_2_too_many_gaps	163	chr7	302	255	10M1D10M5I76M	=	1	-201	NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA	&/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1	RG:Z:2	PG:Z:1	NM:i:6	MQ:i:255	OQ:Z:11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
21 | both_reads_present_only_first_aligns	165	*	0	0	*	chr7	1	0	NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA	&/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1	RG:Z:1	PG:Z:1
22 | 


--------------------------------------------------------------------------------
/samwell/sam/tests/test_bwa_mem.py:
--------------------------------------------------------------------------------
  1 | import distutils.spawn
  2 | import subprocess
  3 | from pathlib import Path
  4 | from tempfile import NamedTemporaryFile as NamedTemp
  5 | from typing import List
  6 | from typing import Optional
  7 | from typing import Tuple
  8 | 
  9 | import attr
 10 | import pytest
 11 | from py._path.local import LocalPath as TmpDir
 12 | from pysam import AlignedSegment
 13 | 
 14 | from samwell.sam.bwa_mem import FastqRecord
 15 | from samwell.sam.bwa_mem import InputOutputOptions
 16 | from samwell.sam.bwa_mem import align
 17 | 
 18 | 
 19 | BwaExecutable: Optional[str] = distutils.spawn.find_executable("bwa")
 20 | 
 21 | 
 22 | @pytest.fixture
 23 | def ref_fasta(tmpdir: TmpDir) -> Path:
 24 |     with NamedTemp(suffix=".fasta", dir=tmpdir, mode='w', delete=False) as fp:
 25 |         filename = Path(fp.name).name
 26 |     ref_fasta = tmpdir / filename
 27 | 
 28 |     with ref_fasta.open('w') as fh:
 29 |         fh.write(">1\n")
 30 |         fh.write("CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAA\n")
 31 |     subprocess.check_call(args=["bwa", "index", str(fp.name)])
 32 |     return ref_fasta
 33 | 
 34 | 
 35 | @pytest.fixture
 36 | def fastq_record() -> FastqRecord:
 37 |     read_bases = "CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAA"
 38 |     return FastqRecord(
 39 |         name="some_name",
 40 |         bases=read_bases,
 41 |         quals="".join(["I" for _ in read_bases])
 42 |     )
 43 | 
 44 | 
 45 | def _assert_alignment_for_fastq_record(read: FastqRecord,
 46 |                                        results: Tuple[FastqRecord, List[AlignedSegment]]) -> None:
 47 | 
 48 |     fastq, alignments = results
 49 |     assert len(alignments) == 1
 50 |     alignment = alignments[0]
 51 | 
 52 |     assert alignment.query_name == read.name
 53 |     assert alignment.query_sequence == read.bases
 54 |     assert "".join([chr(q + 33) for q in alignment.query_qualities]) == read.quals
 55 |     assert alignment.query_name == read.name
 56 |     assert alignment.reference_name == "1"
 57 |     assert alignment.reference_start == 0
 58 |     assert alignment.cigarstring == "60M"
 59 |     if read.read_number is not None:
 60 |         assert read.read_number == 1 or read.read_number == 2
 61 |         assert alignment.is_paired
 62 |         assert read.read_number == (1 if alignment.is_read1 else 2)
 63 | 
 64 | 
 65 | @pytest.mark.skipif(BwaExecutable is None, reason="requires bwa 0.7.17")
 66 | def test_single_alignment(fastq_record: FastqRecord, ref_fasta: Path) -> None:
 67 |     # run BWA
 68 |     results = list(align(reads=[fastq_record], idxbase=ref_fasta))
 69 | 
 70 |     # Check the returned alignments
 71 |     assert len(results) == 1
 72 |     _assert_alignment_for_fastq_record(read=fastq_record, results=results[0])
 73 | 
 74 | 
 75 | @pytest.mark.skipif(BwaExecutable is None, reason="requires bwa 0.7.17")
 76 | def test_fails_consecutive_reads_with_the_same_name_and_number(fastq_record: FastqRecord,
 77 |                                                                ref_fasta: Path) -> None:
 78 |     # run BWA
 79 |     with pytest.raises(Exception, match="Consecutive reads"):
 80 |         list(align(reads=[fastq_record, fastq_record], idxbase=ref_fasta))
 81 | 
 82 | 
 83 | @pytest.mark.skipif(BwaExecutable is None, reason="requires bwa 0.7.17")
 84 | def test_paired_end_reads(fastq_record: FastqRecord, ref_fasta: Path) -> None:
 85 |     # run BWA
 86 |     r1 = attr.evolve(fastq_record, read_number=1)
 87 |     r2 = attr.evolve(r1, read_number=2)
 88 |     io_opts = InputOutputOptions(interleaved_pairs=True)
 89 |     results = list(align(reads=[r1, r2], idxbase=ref_fasta, io_opts=io_opts))
 90 |     _assert_alignment_for_fastq_record(read=r1, results=results[0])
 91 |     _assert_alignment_for_fastq_record(read=r2, results=results[1])
 92 | 
 93 | 
 94 | @pytest.mark.skipif(BwaExecutable is None, reason="requires bwa 0.7.17")
 95 | def test_needs_alignment(fastq_record: FastqRecord, ref_fasta: Path) -> None:
 96 |     # run BWA
 97 |     rec1 = fastq_record
 98 |     rec2 = FastqRecord(
 99 |         name="needs_alignment=False",
100 |         bases=rec1.bases,
101 |         quals=rec1.quals,
102 |         needs_alignment=False
103 |     )
104 | 
105 |     results = list(align(reads=[rec1, rec2], idxbase=ref_fasta))
106 | 
107 |     # Check the returned alignments
108 |     assert len(results) == 2
109 |     for result in results:
110 |         fastq, alignments = result
111 |         if fastq.needs_alignment:
112 |             assert fastq == rec1
113 |             _assert_alignment_for_fastq_record(read=fastq_record, results=result)
114 |         else:
115 |             assert len(alignments) == 0
116 |             assert fastq.name == "needs_alignment=False"
117 |             assert fastq == rec2
118 | 
119 | 
120 | @pytest.mark.skipif(BwaExecutable is None, reason="requires bwa 0.7.17")
121 | def test_no_alignment(ref_fasta: Path) -> None:
122 |     fastq_record = FastqRecord(
123 |         name="unmapped",
124 |         bases="A" * 60,
125 |         quals="I" * 60,
126 |         needs_alignment=False
127 |     )
128 | 
129 |     # run BWA
130 |     results = list(align(reads=[fastq_record], idxbase=ref_fasta))
131 | 
132 |     # Check the returned alignments
133 |     assert len(results) == 1
134 |     fastq, alignments = results[0]
135 |     assert len(alignments) == 0
136 |     assert fastq_record == fastq
137 | 


--------------------------------------------------------------------------------
/samwell/sam/tests/test_clipping.py:
--------------------------------------------------------------------------------
  1 | """Tests for :py:mod:`~samwell.sam.clipping`"""
  2 | 
  3 | from typing import Optional
  4 | 
  5 | import pytest
  6 | from pysam import AlignedSegment
  7 | 
  8 | from samwell import sam
  9 | from samwell.sam import clipping
 10 | from samwell.sam.sambuilder import SamBuilder
 11 | 
 12 | 
 13 | def r(start: Optional[int], cigar: Optional[str], strand: Optional[str] = "+") -> AlignedSegment:
 14 |     """"Constructs a read for testing."""
 15 |     builder = SamBuilder()
 16 |     if start:
 17 |         r1, r2 = builder.add_pair(chrom="chr1", start1=start, cigar1=cigar, strand1=strand)
 18 |     else:
 19 |         r1, r2 = builder.add_pair()
 20 |     return r1
 21 | 
 22 | 
 23 | def test_make_read_unmapped() -> None:
 24 |     builder = SamBuilder()
 25 |     r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=250)
 26 | 
 27 |     clipping._make_read_unmapped(r1)
 28 |     assert r1.is_unmapped
 29 |     assert r1.reference_id == sam.NO_REF_INDEX
 30 |     assert r1.reference_name is None
 31 |     assert r1.reference_start == sam.NO_REF_POS
 32 | 
 33 | 
 34 | ###############################################################################
 35 | # Tests for _read_pos_at_ref_pos()
 36 | ###############################################################################
 37 | 
 38 | def test_read_pos_at_ref_pos_simple() -> None:
 39 |     rec = r(100, "100M")
 40 |     assert clipping._read_pos_at_ref_pos(rec, 100) == 0
 41 |     assert clipping._read_pos_at_ref_pos(rec, 150) == 50
 42 | 
 43 | 
 44 | def test_read_pos_at_ref_pos_fails_with_position_outside_range() -> None:
 45 |     rec = r(100, "100M")
 46 |     assert clipping._read_pos_at_ref_pos(rec, 100) == 0
 47 |     assert clipping._read_pos_at_ref_pos(rec, 199) == 99
 48 | 
 49 |     with pytest.raises(ValueError):
 50 |         clipping._read_pos_at_ref_pos(rec, 99)
 51 |     with pytest.raises(ValueError):
 52 |         clipping._read_pos_at_ref_pos(rec, 200)
 53 | 
 54 | 
 55 | def test_read_pos_at_ref_pos_with_indels_nearby() -> None:
 56 |     rec = r(100, "25M1D25M1I25M")
 57 |     assert clipping._read_pos_at_ref_pos(rec, 100) == 0
 58 |     assert clipping._read_pos_at_ref_pos(rec, 110) == 10
 59 |     assert clipping._read_pos_at_ref_pos(rec, 120) == 20
 60 |     assert clipping._read_pos_at_ref_pos(rec, 130) == 29
 61 |     assert clipping._read_pos_at_ref_pos(rec, 140) == 39
 62 |     assert clipping._read_pos_at_ref_pos(rec, 150) == 49
 63 |     assert clipping._read_pos_at_ref_pos(rec, 160) == 60
 64 | 
 65 | 
 66 | def test_read_pos_at_ref_pos_with_clipping() -> None:
 67 |     rec = r(100, "10S90M")
 68 |     assert clipping._read_pos_at_ref_pos(rec, 100) == 10
 69 | 
 70 | 
 71 | def test_read_pos_at_ref_pos_with_refpos_in_deletion() -> None:
 72 |     rec = r(100, "50M5D50M")
 73 |     assert clipping._read_pos_at_ref_pos(rec, 152) is None
 74 |     assert clipping._read_pos_at_ref_pos(rec, 152, previous=None) is None
 75 |     assert clipping._read_pos_at_ref_pos(rec, 152, previous=True) == 49
 76 |     assert clipping._read_pos_at_ref_pos(rec, 152, previous=False) == 50
 77 | 
 78 | 
 79 | ###############################################################################
 80 | # Tests for softclip_start_of_alignment_by_query()
 81 | ###############################################################################
 82 | 
 83 | def test_softclip_start_of_alignment_by_query_clips_10_aligned_bases() -> None:
 84 |     rec = r(10, "50M", "+")
 85 |     info = clipping.softclip_start_of_alignment_by_query(rec, 10)
 86 |     assert info.query_bases_clipped == 10
 87 |     assert info.ref_bases_clipped == 10
 88 |     assert rec.reference_start == 20
 89 |     assert rec.cigarstring == "10S40M"
 90 | 
 91 | 
 92 | def test_softclip_start_of_alignment_by_query_masking_qualities() -> None:
 93 |     for new_qual in None, 0, 2:
 94 |         rec = r(10, "50M", "+")
 95 |         clipping.softclip_start_of_alignment_by_query(rec, 10, clipped_base_quality=new_qual)
 96 |         quals = rec.query_qualities
 97 | 
 98 |         for i in range(0, 10):
 99 |             assert quals[i] == (30 if new_qual is None else new_qual)
100 | 
101 | 
102 | def test_soft_clip_start_of_alignment_by_query_clips_10_aligned_and_inserted_bases() -> None:
103 |     for strand in "+", "-":
104 |         rec = r(10, "4M2I44M", strand)
105 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
106 |         assert info.query_bases_clipped == 10
107 |         assert info.ref_bases_clipped == 8
108 |         assert rec.reference_start == 18
109 |         assert rec.cigarstring == "10S40M"
110 | 
111 | 
112 | def test_softclip_start_of_alignment_by_query_clips_10_aligned_and_deleted_bases() -> None:
113 |     for strand in "+", "-":
114 |         rec = r(10, "6M2D44M", strand)
115 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
116 |         assert info.query_bases_clipped == 10
117 |         assert info.ref_bases_clipped == 12
118 |         assert rec.reference_start == 22
119 |         assert rec.cigarstring == "10S40M"
120 | 
121 | 
122 | def test_softclip_start_of_alignment_by_query_clips_10_more_bases() -> None:
123 |     for strand in "+", "-":
124 |         rec = r(10, "10S40M", strand)
125 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
126 |         assert info.query_bases_clipped == 10
127 |         assert info.ref_bases_clipped == 10
128 |         assert rec.reference_start == 20
129 |         assert rec.cigarstring == "20S30M"
130 | 
131 | 
132 | def test_softclip_start_of_alignment_by_query_preserves_hard_clipping() -> None:
133 |     for strand in "+", "-":
134 |         rec = r(10, "10H40M", strand)
135 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
136 |         assert info.query_bases_clipped == 10
137 |         assert info.ref_bases_clipped == 10
138 |         assert rec.reference_start == 20
139 |         assert rec.cigarstring == "10H10S30M"
140 | 
141 | 
142 | def test_softclip_start_of_alignment_by_query_with_complicated_cigar() -> None:
143 |     for strand in "+", "-":
144 |         rec = r(10, "2H4S16M10I5M5I10M", strand)
145 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
146 |         assert info.query_bases_clipped == 10
147 |         assert info.ref_bases_clipped == 10
148 |         assert rec.reference_start == 20
149 |         assert rec.cigarstring == "2H14S6M10I5M5I10M"
150 | 
151 | 
152 | def test_softclip_start_of_alignment_by_query_consumes_rest_of_insertion() -> None:
153 |     for strand in "+", "-":
154 |         rec = r(10, "8M4I38M", strand)
155 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
156 |         assert info.query_bases_clipped == 12
157 |         assert info.ref_bases_clipped == 8
158 |         assert rec.reference_start == 18
159 |         assert rec.cigarstring == "12S38M"
160 | 
161 | 
162 | def test_softclip_start_of_alignment_by_query_preserves_insertion_adjacent_to_clipping() -> None:
163 |     for strand in "+", "-":
164 |         rec = r(10, "10M4I36M", strand)
165 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
166 |         assert info.query_bases_clipped == 10
167 |         assert info.ref_bases_clipped == 10
168 |         assert rec.reference_start == 20
169 |         assert rec.cigarstring == "10S4I36M"
170 | 
171 | 
172 | def test_softclip_start_of_alignment_by_query_removes_deletion_following_clipping() -> None:
173 |     for strand in "+", "-":
174 |         rec = r(10, "10M4D40M", strand)
175 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
176 |         assert info.query_bases_clipped == 10
177 |         assert info.ref_bases_clipped == 14
178 |         assert rec.reference_start == 24
179 |         assert rec.cigarstring == "10S40M"
180 | 
181 | 
182 | def test_softclip_start_of_alignment_by_query_preserves_deletions_post_clipping_region() -> None:
183 |     for strand in "+", "-":
184 |         rec = r(10, "25M4D25M", strand)
185 |         info = clipping.softclip_start_of_alignment_by_query(rec, 10)
186 |         assert info.query_bases_clipped == 10
187 |         assert info.ref_bases_clipped == 10
188 |         assert rec.reference_start == 20
189 |         assert rec.cigarstring == "10S15M4D25M"
190 | 
191 | 
192 | def test_softclip_start_of_alignment_by_query_unmapped_reads_ok() -> None:
193 |     rec = r(start=None, cigar=None)
194 |     info = clipping.softclip_start_of_alignment_by_query(rec, 10)
195 |     assert info.query_bases_clipped == 0
196 |     assert info.ref_bases_clipped == 0
197 | 
198 | 
199 | def test_softclip_start_of_alignment_by_query_unmaps_read_when_clipping_all_bases() -> None:
200 |     rec = r(10, "50M")
201 |     assert not rec.is_unmapped
202 |     info = clipping.softclip_start_of_alignment_by_query(rec, 50)
203 |     assert info.query_bases_clipped == 50
204 |     assert info.ref_bases_clipped == 50
205 |     assert rec.is_unmapped
206 | 
207 | 
208 | ###############################################################################
209 | # Tests for softclip_end_of_alignment_by_query()
210 | ###############################################################################
211 | 
212 | def test_softclip_end_of_alignment_by_query_clips_last10_bases_of_fully_aligned_read() -> None:
213 |     for strand in "+", "-":
214 |         rec = r(10, "50M", strand)
215 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
216 |         assert info.query_bases_clipped == 10
217 |         assert info.ref_bases_clipped == 10
218 |         assert rec.reference_start == 10
219 |         assert rec.cigarstring == "40M10S"
220 | 
221 | 
222 | def test_softclip_end_of_alignment_by_query_masks_qualities_when_softclipping() -> None:
223 |     for new_qual in None, 2:
224 |         rec = r(10, "50M", "+")
225 |         clipping.softclip_end_of_alignment_by_query(rec, 10, clipped_base_quality=new_qual)
226 |         quals = rec.query_qualities
227 | 
228 |         for i in range(40, 50):
229 |             assert quals[i] == (30 if new_qual is None else new_qual)
230 | 
231 | 
232 | def test_soft_clip_end_of_alignment_by_query_clips_10_aligned_and_inserted_bases() -> None:
233 |     for strand in "+", "-":
234 |         rec = r(10, "44M2I4M", strand)
235 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
236 |         assert info.query_bases_clipped == 10
237 |         assert info.ref_bases_clipped == 8
238 |         assert rec.reference_start == 10
239 |         assert rec.cigarstring == "40M10S"
240 | 
241 | 
242 | def test_softclip_end_of_alignment_by_query_clips_10_aligned_and_deleted_bases() -> None:
243 |     for strand in "+", "-":
244 |         rec = r(10, "44M2D6M", strand)
245 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
246 |         assert info.query_bases_clipped == 10
247 |         assert info.ref_bases_clipped == 12
248 |         assert rec.reference_start == 10
249 |         assert rec.cigarstring == "40M10S"
250 | 
251 | 
252 | def test_softclip_end_of_alignment_by_query_clips_10_more_bases() -> None:
253 |     for strand in "+", "-":
254 |         rec = r(10, "40M10S", strand)
255 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
256 |         assert info.query_bases_clipped == 10
257 |         assert info.ref_bases_clipped == 10
258 |         assert rec.reference_start == 10
259 |         assert rec.cigarstring == "30M20S"
260 | 
261 | 
262 | def test_softclip_end_of_alignment_by_query_preserves_hard_clipping() -> None:
263 |     for strand in "+", "-":
264 |         rec = r(10, "40M10H", strand)
265 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
266 |         assert info.query_bases_clipped == 10
267 |         assert info.ref_bases_clipped == 10
268 |         assert rec.reference_start == 10
269 |         assert rec.cigarstring == "30M10S10H"
270 | 
271 | 
272 | def test_softclip_end_of_alignment_by_query_with_complicated_cigar() -> None:
273 |     for strand in "+", "-":
274 |         rec = r(10, "10M5I5M10I16M4S2H", strand)
275 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
276 |         assert info.query_bases_clipped == 10
277 |         assert info.ref_bases_clipped == 10
278 |         assert rec.reference_start == 10
279 |         assert rec.cigarstring == "10M5I5M10I6M14S2H"
280 | 
281 | 
282 | def test_softclip_end_of_alignment_by_query_consumes_rest_of_insertion() -> None:
283 |     for strand in "+", "-":
284 |         rec = r(10, "38M4I8M", strand)
285 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
286 |         assert info.query_bases_clipped == 12
287 |         assert info.ref_bases_clipped == 8
288 |         assert rec.reference_start == 10
289 |         assert rec.cigarstring == "38M12S"
290 | 
291 | 
292 | def test_softclip_end_of_alignment_by_query_preserves_insertion_following_clipping() -> None:
293 |     for strand in "+", "-":
294 |         rec = r(10, "36M4I10M", strand)
295 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
296 |         assert info.query_bases_clipped == 10
297 |         assert info.ref_bases_clipped == 10
298 |         assert rec.reference_start == 10
299 |         assert rec.cigarstring == "36M4I10S"
300 | 
301 | 
302 | def test_softclip_end_of_alignment_by_query_removes_deletion_following_clipping() -> None:
303 |     for strand in "+", "-":
304 |         rec = r(10, "40M4D10M", strand)
305 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
306 |         assert info.query_bases_clipped == 10
307 |         assert info.ref_bases_clipped == 14
308 |         assert rec.reference_start == 10
309 |         assert rec.cigarstring == "40M10S"
310 | 
311 | 
312 | def test_softclip_end_of_alignment_by_query_preserves_deletions_post_clipping_region() -> None:
313 |     for strand in "+", "-":
314 |         rec = r(10, "25M4D25M", strand)
315 |         info = clipping.softclip_end_of_alignment_by_query(rec, 10)
316 |         assert info.query_bases_clipped == 10
317 |         assert info.ref_bases_clipped == 10
318 |         assert rec.reference_start == 10
319 |         assert rec.cigarstring == "25M4D15M10S"
320 | 
321 | 
322 | def test_softclip_end_of_alignment_by_query_unmapped_reads_ok() -> None:
323 |     rec = r(start=None, cigar=None)
324 |     info = clipping.softclip_end_of_alignment_by_query(rec, 10)
325 |     assert info.query_bases_clipped == 0
326 |     assert info.ref_bases_clipped == 0
327 | 
328 | 
329 | def test_softclip_end_of_alignment_by_query_makes_read_unmapped_when_clipping_all_bases() -> None:
330 |     rec = r(10, "50M")
331 |     assert not rec.is_unmapped
332 |     info = clipping.softclip_end_of_alignment_by_query(rec, 50)
333 |     assert info.query_bases_clipped == 50
334 |     assert info.ref_bases_clipped == 50
335 |     assert rec.is_unmapped
336 | 
337 | ###############################################################################
338 | # Tests for functions that clip _reference_ bases instead of query bases
339 | ###############################################################################
340 | 
341 | 
342 | def test_softclip_start_of_alignment_by_ref_simple() -> None:
343 |     rec = r(10, "50M")
344 |     info = clipping.softclip_start_of_alignment_by_ref(rec, 10)
345 |     assert info.query_bases_clipped == 10
346 |     assert info.ref_bases_clipped == 10
347 |     assert rec.reference_start == 20
348 |     assert rec.cigarstring == "10S40M"
349 | 
350 | 
351 | def test_softclip_start_of_alignment_by_ref_with_deletion() -> None:
352 |     rec = r(10, "5M5D45M")
353 |     info = clipping.softclip_start_of_alignment_by_ref(rec, 10)
354 |     assert info.query_bases_clipped == 5
355 |     assert info.ref_bases_clipped == 10
356 |     assert rec.reference_start == 20
357 |     assert rec.cigarstring == "5S45M"
358 | 
359 | 
360 | def test_softclip_start_of_alignment_by_ref_with_insertion() -> None:
361 |     rec = r(10, "5M5I45M")
362 |     info = clipping.softclip_start_of_alignment_by_ref(rec, 10)
363 |     assert info.query_bases_clipped == 15
364 |     assert info.ref_bases_clipped == 10
365 |     assert rec.reference_start == 20
366 |     assert rec.cigarstring == "15S40M"
367 | 
368 | 
369 | def test_softclip_end_of_alignment_by_ref_simple() -> None:
370 |     rec = r(10, "50M")
371 |     info = clipping.softclip_end_of_alignment_by_ref(rec, 10)
372 |     assert info.query_bases_clipped == 10
373 |     assert info.ref_bases_clipped == 10
374 |     assert rec.reference_start == 10
375 |     assert rec.cigarstring == "40M10S"
376 | 
377 | 
378 | def test_softclip_end_of_alignment_by_ref_with_deletion() -> None:
379 |     rec = r(10, "45M5D5M")
380 |     info = clipping.softclip_end_of_alignment_by_ref(rec, 10)
381 |     assert info.query_bases_clipped == 5
382 |     assert info.ref_bases_clipped == 10
383 |     assert rec.reference_start == 10
384 |     assert rec.cigarstring == "45M5S"
385 | 
386 | 
387 | def test_softclip_end_of_alignment_by_ref_with_insertion() -> None:
388 |     rec = r(10, "45M5I5M")
389 |     info = clipping.softclip_end_of_alignment_by_ref(rec, 10)
390 |     assert info.query_bases_clipped == 15
391 |     assert info.ref_bases_clipped == 10
392 |     assert rec.reference_start == 10
393 |     assert rec.cigarstring == "40M15S"
394 | 


--------------------------------------------------------------------------------
/samwell/sam/tests/test_sam.py:
--------------------------------------------------------------------------------
  1 | """Tests for :py:mod:`~samwell.sam`"""
  2 | 
  3 | from pathlib import Path
  4 | from tempfile import NamedTemporaryFile as NamedTemp
  5 | from typing import Any
  6 | from typing import Dict
  7 | from typing import Generator
  8 | from typing import List
  9 | from typing import Tuple
 10 | from typing import Union
 11 | 
 12 | import pysam
 13 | import pytest
 14 | from py._path.local import LocalPath as TmpDir
 15 | 
 16 | import samwell.sam as sam
 17 | from samwell.sam import Cigar
 18 | from samwell.sam import CigarElement
 19 | from samwell.sam import CigarOp
 20 | from samwell.sam import CigarParsingException
 21 | from samwell.sam import SamFileType
 22 | from samwell.sam.sambuilder import SamBuilder
 23 | 
 24 | 
 25 | @pytest.mark.parametrize("file_type", list(SamFileType))
 26 | @pytest.mark.parametrize("as_str", [True, False])
 27 | def test_sam_file_type_from_path(file_type: SamFileType, as_str: bool) -> None:
 28 |     path: Union[Path, str]
 29 |     if as_str:
 30 |         path = "/path/to/some/file" + file_type.ext
 31 |     else:
 32 |         path = Path("/path/to/some/file" + file_type.ext)
 33 |     assert SamFileType.from_path(path=path) == file_type
 34 | 
 35 | 
 36 | def test_sam_file_type_invalid_path() -> None:
 37 |     path = "/path/to/excel.xls"
 38 |     with pytest.raises(ValueError) as ex:
 39 |         SamFileType.from_path(path=path)
 40 |     assert "Could not infer file type from " + path in str(ex)
 41 | 
 42 | 
 43 | @pytest.fixture
 44 | def valid_sam() -> Path:
 45 |     return Path(__file__).parent / 'data' / 'valid.sam'
 46 | 
 47 | 
 48 | @pytest.fixture
 49 | def valid_bam(valid_sam: Path) -> Generator[Path, None, None]:
 50 |     bam: Path = Path(__file__).parent / 'data' / 'valid.bam'
 51 |     num_read = 0
 52 |     with sam.reader(valid_sam) as fh_in:
 53 |         with sam.writer(bam, fh_in.header, file_type=SamFileType.BAM) as fh_out:
 54 |             for rec in fh_in:
 55 |                 num_read += 1
 56 |                 fh_out.write(rec)
 57 |     assert num_read == 8
 58 |     yield bam
 59 |     bam.unlink()
 60 | 
 61 | 
 62 | @pytest.fixture(scope="function")
 63 | def in_path(request: Any, valid_sam: Path, valid_bam: Path) -> Path:
 64 |     """A fixture for test_sam_file_open_reading to modify in_path prior to executing.
 65 | 
 66 |     Returns:
 67 |          the path corresponding to the given file type (i.e. SAM or BAM).
 68 |     """
 69 |     file_type = request.param
 70 |     return valid_sam if file_type == SamFileType.SAM else valid_bam
 71 | 
 72 | 
 73 | @pytest.mark.parametrize("in_path,file_type", [
 74 |     (SamFileType.SAM, SamFileType.SAM),
 75 |     (SamFileType.BAM, SamFileType.BAM)
 76 | ], indirect=['in_path'])  # Note: This modifies in_path via the in_path fixture
 77 | def test_sam_file_open_reading(in_path: Path,
 78 |                                file_type: SamFileType) -> None:
 79 | 
 80 |     # file pointer
 81 |     with in_path.open(mode="rb") as fp:
 82 |         with sam._pysam_open(path=fp, open_for_reading=True, file_type=file_type) as samfile:
 83 |             assert sum(1 for _ in samfile) == 8
 84 | 
 85 |     # Path
 86 |     with sam._pysam_open(path=in_path, open_for_reading=True, file_type=file_type) as samfile:
 87 |         assert sum(1 for _ in samfile) == 8
 88 | 
 89 |     # str
 90 |     str_path = str(in_path)
 91 |     with sam._pysam_open(path=str_path, open_for_reading=True, file_type=file_type) as samfile:
 92 |         assert sum(1 for _ in samfile) == 8
 93 | 
 94 | 
 95 | def test_sam_file_open_reading_autorecognize(valid_sam: Path) -> None:
 96 |     with sam._pysam_open(path=valid_sam, open_for_reading=True, file_type=None) as samfile:
 97 |         assert sum(1 for _ in samfile) == 8
 98 | 
 99 | 
100 | def test_sam_file_open_reading_with_reader(valid_sam: Path) -> None:
101 |     with sam.reader(path=valid_sam, file_type=None) as samfile:
102 |         assert sum(1 for _ in samfile) == 8
103 | 
104 | 
105 | @pytest.fixture
106 | def expected_records(valid_sam: Path) -> List[pysam.AlignedSegment]:
107 |     """Returns the records that are found in the valid_sam. """
108 |     with sam.reader(valid_sam) as fh:
109 |         return [r for r in fh]
110 | 
111 | 
112 | @pytest.fixture
113 | def header_dict(valid_sam: Path) -> Dict[str, Any]:
114 |     """Returns the multi-level dictionary in the valid_sam. """
115 |     with sam.reader(valid_sam) as fh:
116 |         return fh.header
117 | 
118 | 
119 | @pytest.fixture
120 | def header_text(valid_sam: Path) -> Dict[str, Any]:
121 |     """Returns the raw dictionary text in the valid_sam. """
122 |     with sam.reader(valid_sam) as fh:
123 |         return fh.text
124 | 
125 | 
126 | def assert_actual_vs_expected(actual_path: str,
127 |                               expected_records: List[pysam.AlignedSegment]) -> None:
128 |     """Helper method to ensure the expected records are in the SAM/BAM at the actual path."""
129 |     with sam.reader(actual_path) as sam_reader:
130 |         actual_records = [r for r in sam_reader]
131 |     for actual, expected in zip(actual_records, expected_records):
132 |         assert actual == expected
133 |     assert len(actual_records) == len(expected_records)
134 | 
135 | 
136 | @pytest.mark.parametrize("file_type", [SamFileType.SAM, SamFileType.BAM])
137 | def test_sam_file_open_writing(file_type: SamFileType,
138 |                                expected_records: List[pysam.AlignedSegment],
139 |                                header_dict: Dict[str, Any],
140 |                                tmpdir: TmpDir) -> None:
141 |     # use header as a keyword argument
142 |     with NamedTemp(suffix=file_type.ext, dir=tmpdir, mode='w', delete=False) as fp:
143 |         kwargs = {"header": header_dict}
144 |         with sam._pysam_open(path=fp.file,  # type: ignore
145 |                              open_for_reading=False,
146 |                              file_type=file_type,
147 |                              **kwargs) as sam_writer:
148 |             for r in expected_records:
149 |                 sam_writer.write(r)
150 |     assert_actual_vs_expected(fp.name, expected_records)
151 | 
152 | 
153 | def test_sam_file_open_writing_header_keyword(expected_records: List[pysam.AlignedSegment],
154 |                                               header_dict: Dict[str, Any],
155 |                                               tmpdir: TmpDir) -> None:
156 |     # Use SamWriter
157 |     # use header as a keyword argument
158 |     with NamedTemp(suffix=".sam", dir=tmpdir, mode='w', delete=False) as fp:
159 |         with sam.writer(path=fp.name,
160 |                         header=header_dict,
161 |                         file_type=SamFileType.SAM) as sam_writer:
162 |             for r in expected_records:
163 |                 sam_writer.write(r)
164 |     assert_actual_vs_expected(fp.name, expected_records)
165 | 
166 | # FIXME: the test below is disabled because of a bug in pysam, see:
167 | # https://github.com/pysam-developers/pysam/pull/656
168 | # def test_sam_file_open_writing_text_keyword(expected_records: List[pysam.AlignedSegment],
169 | #                                             header_text: str,
170 | #                                             tmpdir: TmpDir) -> None:
171 | #     # Try without a file type
172 | #     with NamedTemp(suffix=".sam", dir=tmpdir, mode='w', delete=False) as fp:
173 | #         kwargs = {"text": header_text}
174 | #         with sam.writer(path=fp.name,
175 | #                             header=header_dict,
176 | #                             file_type=None) as sam_writer:
177 | #             for r in expected_records:
178 | #                 sam_writer.write(r)
179 | #     assert_actual_vs_expected(fp.name, expected_records)
180 | 
181 | 
182 | def test_cigar_op_util_from_character() -> None:
183 |     operators = [operator for operator in CigarOp]
184 |     characters = [operator.character for operator in operators]
185 |     for i, character in enumerate(characters):
186 |         assert CigarOp.from_character(character) == operators[i]
187 | 
188 | 
189 | def test_cigar_op_util_from_code() -> None:
190 |     operators = [operator for operator in CigarOp]
191 |     codes = [operator.code for operator in operators]
192 |     for i, code in enumerate(codes):
193 |         assert CigarOp.from_code(code) == operators[i]
194 | 
195 | 
196 | @pytest.mark.parametrize("character,operator_length,length_on_query,length_on_target", [
197 |     ("M", 10, 10, 10),
198 |     ("I", 10, 10, 0),
199 |     ("D", 10, 0, 10),
200 |     ("S", 10, 10, 0)
201 | ])
202 | def test_cigar_element_length_on(character: str,
203 |                                  operator_length: int,
204 |                                  length_on_query: int,
205 |                                  length_on_target: int) -> None:
206 |     operator = CigarOp.from_character(character)
207 |     element = CigarElement(operator_length, operator)
208 |     assert element.length == operator_length
209 |     assert element.length_on_query == length_on_query
210 |     assert element.length_on_target == length_on_target
211 | 
212 | 
213 | @pytest.mark.parametrize("in_cigar,out_cigar", [
214 |     ("75M", "75M"),
215 |     ("10M10M", "20M"),
216 |     ("10M10I10M", "10M10I10M"),
217 |     ("10S10S10S10S10M", "40S10M")
218 | ])
219 | def test_cigar_coalesce(in_cigar: str, out_cigar: str) -> None:
220 |     assert str(Cigar.from_cigarstring(in_cigar).coalesce()) == out_cigar
221 | 
222 | 
223 | @pytest.mark.parametrize("cigartuples,cigarstring", [
224 |     ([], "*"),  # Empty cigar
225 |     ([(0, 10), (1, 5), (0, 1)], "10M5I1M"),  # A simple example
226 |     ([(0, 10), (1, 5), (1, 5)], "10M5I5I"),  # do not join adjacent operators of the same type
227 |     ([(op.code, op.code + 1) for op in CigarOp], "1M2I3D4N5S6H7P8=9X")  # all operators
228 | ])
229 | def test_cigar_from_cigartuples(cigartuples: List[Tuple[int, int]], cigarstring: str) -> None:
230 |     cigar = Cigar.from_cigartuples(cigartuples)
231 |     assert str(cigar) == cigarstring
232 | 
233 | 
234 | def test_cigar_from_cigartuples_malformed() -> None:
235 |     with pytest.raises(CigarParsingException, match=r'.*Malformed cigar tuples.*'):
236 |         cigartuples = [(0, 10), (1, 5), (22, 1)]
237 |         Cigar.from_cigartuples(cigartuples)
238 | 
239 | 
240 | def test_pretty_cigarstring_exception() -> None:
241 |     cigar = "10M5U4M"
242 |     index = 4
243 |     expected = "10M5[U]4M"
244 |     with pytest.raises(CigarParsingException, match=r'.*Malformed cigar') as ex:
245 |         raise Cigar._pretty_cigarstring_exception(cigar, index)
246 |     assert expected in str(ex)
247 | 
248 |     expected = cigar + "[]"
249 |     with pytest.raises(CigarParsingException, match=r'.*Malformed cigar') as ex:
250 |         raise Cigar._pretty_cigarstring_exception(cigar, len(cigar))
251 |     assert expected in str(ex)
252 | 
253 | 
254 | def test_from_cigarstring() -> None:
255 |     # Empty cigar
256 |     assert str(Cigar.from_cigarstring("*")) == "*"
257 | 
258 |     elements = []
259 |     for i, operator in enumerate(CigarOp):
260 |         elements.append(CigarElement(i + 1, operator))
261 |     cigarstring = str(Cigar(tuple(elements)))
262 |     assert str(Cigar.from_cigarstring(cigarstring)) == cigarstring
263 | 
264 | 
265 | def test_from_cigarstring_op_should_start_with_digit() -> None:
266 |     cigars = ["", "M", "10MI", "10M5SU"]
267 |     errors = ["", "[M]", "10M[I]", "10M5S[U]"]
268 |     for cigar, error in zip(cigars, errors):
269 |         match = "Malformed cigar: " + error if cigar else 'Cigar string was empty'
270 |         with pytest.raises(CigarParsingException) as ex:
271 |             Cigar.from_cigarstring(cigar)
272 |         assert match in str(ex)
273 | 
274 | 
275 | def test_from_cigarstring_no_length() -> None:
276 |     cigars = ["M", "10MS"]
277 |     errors = ["", "10M[S]"]
278 |     for cigar, error in zip(cigars, errors):
279 |         with pytest.raises(CigarParsingException) as ex:
280 |             Cigar.from_cigarstring(cigar)
281 |         assert "Malformed cigar: " + error in str(ex)
282 | 
283 | 
284 | def test_from_cigarstring_invalid_operator() -> None:
285 |     cigars = ["10U", "10M5U"]
286 |     errors = ["10[U]", "10M5[U]"]
287 |     for cigar, error in zip(cigars, errors):
288 |         with pytest.raises(CigarParsingException) as ex:
289 |             Cigar.from_cigarstring(cigar)
290 |         assert "Malformed cigar: " + error in str(ex)
291 | 
292 | 
293 | def test_from_cigarstring_missing_operator() -> None:
294 |     cigars = ["10", "10M5"]
295 |     errors = ["10[]", "10M5[]"]
296 |     for cigar, error in zip(cigars, errors):
297 |         with pytest.raises(CigarParsingException) as ex:
298 |             Cigar.from_cigarstring(cigar)
299 |         assert "Malformed cigar: " + error in str(ex)
300 | 
301 | 
302 | def test_is_indel() -> None:
303 |     indels = [op for op in CigarOp if op.is_indel]
304 |     assert indels == [CigarOp.I, CigarOp.D]
305 | 
306 | 
def test_get_and_set_qc_fail() -> None:
    """Setting a QC failure with a tool and reason can be read back, per tool or not."""
    read, _ = SamBuilder().add_pair()
    this_tool = test_get_and_set_qc_fail

    def foo() -> None:
        pass

    # A record that is not flagged as QC-failed yields nothing from either getter
    assert sam.get_qc_fail(read) is None
    assert sam.get_qc_fail_by_tool(read) is None

    # Flagging the record without recording a tool/reason still yields nothing
    read.is_qcfail = True
    assert sam.get_qc_fail(read) is None
    assert sam.get_qc_fail_by_tool(read) is None

    # Once a tool and reason are recorded, both getters hand them back
    sam.set_qc_fail(read, this_tool, "some reason")
    tool_name, why = sam.get_qc_fail(read)
    assert tool_name == this_tool.__name__
    assert why == "some reason"
    tool_name, why = sam.get_qc_fail_by_tool(read, tool=this_tool)
    assert tool_name == this_tool.__name__
    assert why == "some reason"

    # Asking about a tool other than the one that failed the record yields nothing
    assert sam.get_qc_fail_by_tool(read, tool=foo) is None
334 | 
335 | 
def test_isize() -> None:
    """Insert size flips sign with argument order and is zero when a mate is unmapped."""
    first, second = SamBuilder().add_pair(
        chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M"
    )
    expected = 190
    assert sam.isize(first, second) == expected
    assert sam.isize(second, first) == -expected

    # An unmapped mate means there is no insert to measure
    second.is_unmapped = True
    assert sam.isize(first, second) == 0
344 | 


--------------------------------------------------------------------------------
/samwell/sam/tests/test_sambuilder.py:
--------------------------------------------------------------------------------
  1 | """Basic tests of the sambuilder module."""
  2 | 
  3 | import pytest
  4 | 
  5 | from pathlib import Path
  6 | from py._path.local import LocalPath as TmpDir
  7 | from samwell import sam
  8 | from samwell.sam import SamOrder
  9 | from samwell.sam.sambuilder import SamBuilder
 10 | from typing import Optional
 11 | from typing import List
 12 | 
 13 | 
 14 | def test_add_pair_all_fields() -> None:
 15 |     builder = SamBuilder()
 16 |     builder.add_pair(
 17 |         name="q1",
 18 |         chrom="chr1",
 19 |         bases1="ACGTG",
 20 |         quals1=[20, 21, 22, 23, 24],
 21 |         start1=10000,
 22 |         cigar1="5M",
 23 |         strand1="+",
 24 |         bases2="GCGC",
 25 |         quals2=[30, 31, 32, 33],
 26 |         start2=10200,
 27 |         cigar2="4M",
 28 |         strand2="-",
 29 |         attrs={"aa": "Hello", "bb": 42}
 30 |     )
 31 |     recs = builder.to_sorted_list()
 32 |     assert len(recs) == 2
 33 |     for rec in recs:
 34 |         assert rec.query_name == "q1"
 35 |         assert rec.reference_name == "chr1"
 36 |         assert rec.is_paired
 37 |         assert abs(rec.template_length) == 204
 38 |         assert rec.get_tag("aa") == "Hello"
 39 |         assert rec.get_tag("bb") == 42
 40 |         if rec.is_read1:
 41 |             assert rec.reference_start == 10000
 42 |             assert not rec.is_reverse
 43 |             assert rec.query_sequence == "ACGTG"
 44 |             assert list(rec.query_qualities) == [20, 21, 22, 23, 24]
 45 |             assert rec.cigarstring == "5M"
 46 |         else:
 47 |             assert rec.reference_start == 10200
 48 |             assert rec.is_reverse
 49 |             assert rec.query_sequence == "GCGC"
 50 |             assert list(rec.query_qualities) == [30, 31, 32, 33]
 51 |             assert rec.cigarstring == "4M"
 52 | 
 53 | 
 54 | def test_add_pair_minimal() -> None:
 55 |     builder = SamBuilder(r1_len=10, r2_len=5, base_quality=25)
 56 |     r1, r2 = builder.add_pair(chrom="chr1", start1=1000, start2=1200)
 57 |     assert r1.query_name == r2.query_name
 58 |     assert r1.reference_name == r2.reference_name == "chr1"
 59 |     assert r1.reference_start == 1000
 60 |     assert r2.reference_start == 1200
 61 |     assert not r1.is_reverse
 62 |     assert r2.is_reverse
 63 |     assert len(r1.query_sequence) == len(r1.query_qualities) == 10
 64 |     assert len(r2.query_sequence) == len(r2.query_qualities) == 5
 65 |     assert r1.cigarstring == "10M"
 66 |     assert r2.cigarstring == "5M"
 67 |     assert r1.get_tag("RG") == builder.rg_id
 68 |     assert r2.get_tag("RG") == builder.rg_id
 69 | 
 70 | 
 71 | def test_add_pair_mix_and_match() -> None:
 72 |     builder = SamBuilder(r1_len=100, r2_len=100, base_quality=30)
 73 |     r1, r2 = builder.add_pair(chrom="chr1", start1=500, start2=700, cigar1="75M", cigar2="9M1I30M")
 74 |     assert len(r1.query_sequence) == len(r1.query_qualities) == 75
 75 |     assert len(r2.query_sequence) == len(r2.query_qualities) == 40
 76 | 
 77 |     r1, r2 = builder.add_pair(chrom="chr1", start1=500, start2=700,
 78 |                               bases1="ACGTGCATGC", bases2="ACGAC")
 79 |     assert len(r1.query_sequence) == len(r1.query_qualities) == 10
 80 |     assert len(r2.query_sequence) == len(r2.query_qualities) == 5
 81 |     assert r1.cigarstring == "10M"
 82 |     assert r2.cigarstring == "5M"
 83 | 
 84 |     r1, r2 = builder.add_pair(chrom="chr1", start1=500, start2=700,
 85 |                               quals1=[30] * 20, quals2=[20] * 10)
 86 |     assert len(r1.query_sequence) == len(r1.query_qualities) == 20
 87 |     assert len(r2.query_sequence) == len(r2.query_qualities) == 10
 88 |     assert r1.cigarstring == "20M"
 89 |     assert r2.cigarstring == "10M"
 90 | 
 91 |     # Now what if we provide multiple values that are inconsistent
 92 |     with pytest.raises(ValueError, match="not length compatible"):
 93 |         builder.add_pair(chrom="chr1", start1=10, start2=99, bases1="ACGTG", cigar1="10M")
 94 | 
 95 |     with pytest.raises(ValueError, match="not length compatible"):
 96 |         builder.add_pair(chrom="chr1", start1=10, start2=99, bases1="ACGTG", quals1=[2, 2])
 97 | 
 98 |     with pytest.raises(ValueError, match="not length compatible"):
 99 |         builder.add_pair(chrom="chr1", start1=10, start2=99, quals1=[2, 2], cigar1="5M")
100 | 
101 | 
def test_unmapped_reads() -> None:
    """An end without a start is unmapped and takes its mate's reference and position."""
    builder = SamBuilder()

    # Only read 1 has a start: read 2 is the unmapped end
    first, second = builder.add_pair(chrom="chr1", start1=1000)
    assert not first.is_unmapped and first.mate_is_unmapped
    assert second.is_unmapped and not second.mate_is_unmapped
    for rec in (first, second):
        assert (rec.reference_name, rec.reference_start) == ("chr1", 1000)
        assert (rec.next_reference_name, rec.next_reference_start) == ("chr1", 1000)

    # Only read 2 has a start: read 1 is the unmapped end
    first, second = builder.add_pair(chrom="chr1", start2=2000)
    assert first.is_unmapped and not first.mate_is_unmapped
    assert not second.is_unmapped and second.mate_is_unmapped
    for rec in (first, second):
        assert (rec.reference_name, rec.reference_start) == ("chr1", 2000)
        assert (rec.next_reference_name, rec.next_reference_start) == ("chr1", 2000)

    # No reference at all: both ends are unmapped and carry no coordinates
    first, second = builder.add_pair(chrom=sam.NO_REF_NAME)
    assert first.is_unmapped and first.mate_is_unmapped
    assert second.is_unmapped and second.mate_is_unmapped
    for rec in (first, second):
        assert rec.reference_name is None
        assert rec.reference_start == sam.NO_REF_POS
        assert rec.next_reference_name is None
        assert rec.next_reference_start == sam.NO_REF_POS
136 | 
137 | 
def test_invalid_strand() -> None:
    """Strand values such as "F"/"R" are refused with a ValueError mentioning the strand."""
    builder = SamBuilder()
    with pytest.raises(ValueError, match="strand"):
        builder.add_pair(chrom="chr1", start1=100, start2=200, strand1="F", strand2="R")
141 | 
142 | 
def test_proper_pair() -> None:
    """Only inward-facing mapped pairs are flagged as proper pairs."""
    builder = SamBuilder()

    # Each scenario is (add_pair keyword arguments, whether a proper pair is expected)
    scenarios = [
        # Regular innies
        ({"chrom": "chr1", "start1": 5000, "start2": 5200, "strand1": "+", "strand2": "-"}, True),
        ({"chrom": "chr1", "start1": 5200, "start2": 5000, "strand1": "-", "strand2": "+"}, True),
        # Outies
        ({"chrom": "chr1", "start1": 5000, "start2": 5200, "strand1": "-", "strand2": "+"}, False),
        ({"chrom": "chr1", "start1": 5200, "start2": 5000, "strand1": "+", "strand2": "-"}, False),
        # Unmapped
        ({"chrom": "chr1", "start1": 5000, "strand1": "+"}, False),
        ({"chrom": "chr1", "start2": 5000, "strand2": "+"}, False),
        ({}, False),
    ]
    for pair_kwargs, expect_proper in scenarios:
        for rec in builder.add_pair(**pair_kwargs):
            assert bool(rec.is_proper_pair) == expect_proper
165 | 
166 | 
def test_sorting() -> None:
    """``to_sorted_list`` returns records ordered by reference id, then by start."""
    builder = SamBuilder()
    builder.add_pair(chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+")
    for chrom in ("chr1", "chr5", "chr2"):
        builder.add_pair(chrom=chrom, start1=4000, start2=4300)

    # Tuples compare lexicographically, i.e. by reference id first and start second
    previous = (-1, -1)
    for rec in builder.to_sorted_list():
        current = (rec.reference_id, rec.reference_start)
        assert current >= previous
        previous = current
182 | 
183 | 
def make_sort_order_builder(tmpdir: TmpDir, sort_order: Optional[SamOrder]) -> Path:
    """Writes a small BAM with four named pairs using the given sort order.

    The pairs are added in the order test3, test2, test1, test4 so that coordinate order,
    query-name order and insertion order all differ from one another.

    Args:
        tmpdir: directory in which ``test.bam`` is created
        sort_order: sort order handed to the builder; ``None`` is accepted because the
            parametrized caller passes it for one of its cases

    Returns:
        the path of the BAM file that was written
    """
    builder = SamBuilder(sort_order=sort_order)
    builder.add_pair(
        name="test3",
        chrom="chr1",
        start1=5000,
        start2=4700,
        strand1="-",
        strand2="+"
    )
    builder.add_pair(name="test2", chrom="chr1", start1=4000, start2=4300)
    builder.add_pair(name="test1", chrom="chr5", start1=4000, start2=4300)
    builder.add_pair(name="test4", chrom="chr2", start1=4000, start2=4300)

    bam_path = Path(str(tmpdir)) / "test.bam"
    builder.to_path(bam_path)
    return bam_path
201 | 
202 | 
@pytest.mark.parametrize(
    argnames=["sort_order", "expected_name_order"],
    argvalues=[
        (SamOrder.Coordinate, ["test2", "test3", "test4", "test1"]),
        (SamOrder.QueryName, ["test1", "test2", "test3", "test4"]),
        (SamOrder.Unsorted, ["test3", "test2", "test1", "test4"]),
        (None, ["test3", "test2", "test1", "test4"])
    ],
    ids=["Coordinate sorting", "Query name sorting", "Unsorted output", "Unsorted output - None"]
)
def test_sort_types(
    tmpdir: TmpDir,
    sort_order: Optional[SamOrder],
    expected_name_order: List[str]
) -> None:
    """Records written by the builder come back in the order implied by ``sort_order``.

    Each pair contributes two consecutive records sharing a query name, so every expected
    name must be seen twice in a row, and nothing may follow the last expected pair.
    """
    bam_path = make_sort_order_builder(tmpdir=tmpdir, sort_order=sort_order)
    # The message is shared by every parametrized sort order, so it must not name one
    mismatch = "Read order did not match expectation for the requested sort order"
    with sam.reader(bam_path) as in_bam:
        for name in expected_name_order:
            for _ in range(2):
                assert name == next(in_bam).query_name, mismatch
        # All expected pairs were consumed; any further record is unexpected
        assert next(in_bam, None) is None, "Found records beyond the expected pairs"
229 | 
230 | 
def test_custom_sd() -> None:
    """A builder only accepts chromosome names present in its own sequence dictionary."""
    default_builder = SamBuilder()
    custom_builder = SamBuilder(sd=[{"SN": "hi", "LN": 999}, {"SN": "bye", "LN": 888}])

    # Each builder accepts a name from its own dictionary
    default_builder.add_pair(chrom="chr1", start1=200, start2=400)
    custom_builder.add_pair(chrom="hi", start1=200, start2=400)

    # ...and refuses a name that only the other builder accepted
    for builder, foreign_chrom in ((default_builder, "hi"), (custom_builder, "chr1")):
        with pytest.raises(ValueError, match="not a valid chromosome name"):
            builder.add_pair(chrom=foreign_chrom, start1=200, start2=400)
242 | 
243 | 
def test_custom_rg() -> None:
    """Records carry the ID of a custom read group in their ``RG`` tag."""
    read_group = {"ID": "novel", "SM": "custom_rg", "LB": "foo", "PL": "ILLUMINA"}
    builder = SamBuilder(rg=read_group)
    records = builder.add_pair(chrom="chr1", start1=100, start2=200)
    for record in records:
        assert record.get_tag("RG") == "novel"
248 | 


--------------------------------------------------------------------------------
/samwell/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myriad-opensource/samwell/47c4d809e3a228cf2be7af09871ab70e706763a1/samwell/tests/__init__.py


--------------------------------------------------------------------------------
/samwell/tests/test_dnautils.py:
--------------------------------------------------------------------------------
 1 | """Tests for :py:mod:`~samwell.dnautils`"""
 2 | 
 3 | import pytest
 4 | 
 5 | from samwell import dnautils
 6 | 
 7 | 
 8 | def test_reverse_complement() -> None:
 9 |     assert dnautils.reverse_complement("") == ""
10 |     assert dnautils.reverse_complement("AATTCCGGaattccgg") == "ccggaattCCGGAATT"
11 |     assert dnautils.reverse_complement("ACGTN") == "NACGT"
12 | 
13 |     with pytest.raises(KeyError):
14 |         dnautils.reverse_complement("ACGT.GAT")
15 | 
16 |     with pytest.raises(KeyError):
17 |         dnautils.reverse_complement("RMNY")
18 | 
19 | 
def test_mask_long_homopolymers() -> None:
    """Runs of at least the given length are replaced by Ns; shorter runs are kept."""
    single_base_cases = [(0, "N"), (1, "N"), (2, "A")]
    for min_length, expected in single_base_cases:
        assert dnautils.mask_long_homopolymers("A", min_length) == expected

    bases = "ACCGGGTTTTAAAAATTTT"
    masked_by_length = {
        1: "NNNNNNNNNNNNNNNNNNN",
        2: "ANNNNNNNNNNNNNNNNNN",
        3: "ACCNNNNNNNNNNNNNNNN",
        4: "ACCGGGNNNNNNNNNNNNN",
        5: "ACCGGGTTTTNNNNNTTTT",
        6: "ACCGGGTTTTAAAAATTTT",
    }
    for min_length, expected in masked_by_length.items():
        assert dnautils.mask_long_homopolymers(bases, min_length) == expected
30 | 
31 | 
def test_has_long_homopolymer() -> None:
    """A homopolymer counts as long only when it is longer than the given length."""
    cases = [
        ("A", 0, True),
        ("A", 1, False),
        ("ACCGGGTTTTAAAAATTTT", 4, True),
        ("ACCGGGTTTTAAAAATTTT", 5, False),
        ("ACCGGGTTTTAAAAATTTT", 10, False),
    ]
    for bases, max_length, expected in cases:
        assert bool(dnautils.has_long_homopolymer(bases, max_length)) == expected
38 | 


--------------------------------------------------------------------------------
/samwell/tests/test_itertools.py:
--------------------------------------------------------------------------------
 1 | """Tests for :py:mod:`~samwell.itertools`"""
 2 | 
 3 | import pytest
 4 | 
 5 | from samwell.itertools import PeekableIterator
 6 | from samwell.itertools import peekable
 7 | from samwell.itertools import MergingIterator
 8 | 
 9 | 
def test_peekable_iterator_empty() -> None:
    """An empty peekable iterator has nothing to peek at and nothing to yield."""
    nothing: PeekableIterator[None] = peekable([])
    assert not nothing.can_peek()
    assert nothing.maybe_peek() is None, "maybe_peek was not None for empty iterator"

    # Both peeking and advancing signal exhaustion
    with pytest.raises(StopIteration):
        nothing.peek()
    with pytest.raises(StopIteration):
        next(nothing)
18 | 
19 | 
def test_peekable_iterator_nonempty() -> None:
    """Peeking shows the upcoming item without consuming it, until the items run out."""
    count = 10
    numbers = peekable(range(count))
    for expected in range(count):
        assert numbers.can_peek()
        assert numbers.peek() == expected
        assert numbers.maybe_peek() == expected, "maybe_peek value didn't match expectation"
        assert next(numbers) == expected

    # Once exhausted: maybe_peek gives None, while peek and next raise
    assert numbers.maybe_peek() is None, "maybe_peek was not None for exhausted iterator"
    with pytest.raises(StopIteration):
        numbers.peek()
    with pytest.raises(StopIteration):
        next(numbers)
33 | 
34 | 
def test_peekable_with_nones() -> None:
    """``None`` items in the underlying iterable are peeked and yielded like any other."""
    items = [1, 2, None, 4, None, 6]
    iterator = peekable(items)

    for expected in items:
        assert iterator.peek() is expected
        assert iterator.maybe_peek() is expected
        assert next(iterator) is expected
43 | 
44 | 
def test_takewhile() -> None:
    """``takewhile`` returns the leading run matching the predicate and stops before the rest."""

    def is_even(value: int) -> bool:
        return value % 2 == 0

    def is_odd(value: int) -> bool:
        return value % 2 == 1

    iterator = peekable([2, 4, 6, 8, 11, 13, 15, 17, 19, 20, 22, 24])
    assert iterator.takewhile(is_even) == [2, 4, 6, 8]
    assert iterator.takewhile(is_odd) == [11, 13, 15, 17, 19]
    # The next item is even, so nothing odd can be taken
    assert iterator.takewhile(is_odd) == []
    assert iterator.takewhile(is_even) == [20, 22, 24]
52 | 
53 | 
def test_dropwhile() -> None:
    """``dropwhile`` discards the leading run matching the predicate and keeps the rest."""
    iterator = peekable([2, 4, 6, 8, 11, 13, 15, 17, 19, 20, 22, 24])

    def is_even(value: int) -> bool:
        return value % 2 == 0

    def at_most_twenty(value: int) -> bool:
        return value <= 20

    iterator.dropwhile(is_even)
    iterator.dropwhile(at_most_twenty)
    remaining = list(iterator)
    assert remaining == [22, 24]
60 | 
61 | 
def test_merging_iterator() -> None:
    """Two key-ordered iterators are merged into a single key-ordered stream."""
    # Integers merged on their own value
    odds = iter([1, 3, 5, 7, 9])
    mostly_evens = iter([2, 4, 6, 8, 9])
    merged_numbers = MergingIterator(odds, mostly_evens, keyfunc=lambda x: x)
    assert list(merged_numbers) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 9]

    # Strings merged on their length
    left = iter(["one", "enormous", "hippopotamus"])
    right = iter(["a", "little", "diplodocus"])
    merged_words = MergingIterator(left, right, keyfunc=len)
    assert list(merged_words) == [
        "a", "one", "little", "enormous", "diplodocus", "hippopotamus"
    ]
72 | 


--------------------------------------------------------------------------------
/samwell/tests/test_overlap_detector.py:
--------------------------------------------------------------------------------
  1 | """Tests for :py:mod:`~samwell.overlap_detector`"""
  2 | 
  3 | from typing import List
  4 | 
  5 | from samwell.overlap_detector import Interval
  6 | from samwell.overlap_detector import OverlapDetector
  7 | 
  8 | 
  9 | def run_test(targets: List[Interval], query: Interval, results: List[Interval]) -> None:
 10 |     detector = OverlapDetector()
 11 |     # Use add_all() to covert itself and add()
 12 |     detector.add_all(intervals=targets)
 13 |     # Test overlaps_any()
 14 |     assert detector.overlaps_any(query) == (len(results) > 0)
 15 |     # Test get_overlaps()
 16 |     assert detector.get_overlaps(query) == results
 17 | 
 18 | 
 19 | def test_same_interval() -> None:
 20 |     interval = Interval("1", 10, 100)
 21 |     run_test(targets=[interval], query=interval, results=[interval])
 22 | 
 23 | 
 24 | def test_query_wholly_contained_in_target() -> None:
 25 |     target = Interval("1", 10, 100)
 26 |     query = Interval("1", 11, 99)
 27 |     run_test(targets=[target], query=query, results=[target])
 28 | 
 29 | 
 30 | def test_target_wholly_contained_in_query() -> None:
 31 |     target = Interval("1", 10, 100)
 32 |     query = Interval("1", 9, 101)
 33 |     run_test(targets=[target], query=query, results=[target])
 34 | 
 35 | 
 36 | def test_target_overlaps_first_base_of_query() -> None:
 37 |     target = Interval("1", 10, 100)
 38 |     query = Interval("1", 99, 100)
 39 |     run_test(targets=[target], query=query, results=[target])
 40 | 
 41 | 
 42 | def test_target_overlaps_last_base_of_query() -> None:
 43 |     target = Interval("1", 10, 100)
 44 |     query = Interval("1", 10, 11)
 45 |     run_test(targets=[target], query=query, results=[target])
 46 | 
 47 | 
 48 | def test_query_before_target() -> None:
 49 |     target = Interval("1", 10, 100)
 50 |     query = Interval("1", 9, 10)
 51 |     run_test(targets=[target], query=query, results=[])
 52 | 
 53 | 
 54 | def test_query_after_target() -> None:
 55 |     target = Interval("1", 10, 100)
 56 |     query = Interval("1", 100, 101)
 57 |     run_test(targets=[target], query=query, results=[])
 58 | 
 59 | 
 60 | def test_different_references() -> None:
 61 |     target = Interval("1", 10, 100)
 62 |     query = Interval("2", 10, 100)
 63 |     run_test(targets=[target], query=query, results=[])
 64 | 
 65 | 
 66 | def test_multiple_overlaps() -> None:
 67 |     interval_a = Interval("1", 10, 20)
 68 |     interval_b = Interval("1", 15, 25)
 69 |     interval_c = Interval("1", 19, 30)
 70 |     interval_d = Interval("1", 24, 35)
 71 | 
 72 |     # B overlaps both A and C
 73 |     run_test(targets=[interval_a, interval_c], query=interval_b, results=[interval_a, interval_c])
 74 |     # C overlaps both A and B
 75 |     run_test(targets=[interval_a, interval_b], query=interval_c, results=[interval_a, interval_b])
 76 |     # D overlaps only B and C (is after A)
 77 |     run_test(targets=[interval_a, interval_b, interval_c],
 78 |              query=interval_d,
 79 |              results=[interval_b, interval_c])
 80 | 
 81 | 
 82 | def test_multiple_references() -> None:
 83 |     target_chr1 = Interval("1", 10, 20)
 84 |     target_chr2 = Interval("2", 10, 20)
 85 |     run_test(targets=[target_chr1, target_chr2], query=target_chr1, results=[target_chr1])
 86 |     run_test(targets=[target_chr1, target_chr2], query=target_chr2, results=[target_chr2])
 87 | 
 88 | 
 89 | def test_same_interval_twice() -> None:
 90 |     interval = Interval("1", 10, 100)
 91 |     run_test(targets=[interval, interval], query=interval, results=[interval])
 92 | 
 93 | 
 94 | def test_wholly_contained_target() -> None:
 95 |     target_inner = Interval("1", 50, 60)
 96 |     target_outer = Interval("1", 40, 80)
 97 | 
 98 |     run_test(targets=[target_inner, target_outer],
 99 |              query=target_inner,
100 |              results=[target_outer, target_inner])
101 | 
102 | 
def test_get_enclosing_intervals() -> None:
    """``get_enclosing_intervals`` returns the stored intervals that contain the query."""
    a = Interval("1", 1, 250)
    b = Interval("1", 5, 30)
    c = Interval("1", 10, 99)
    d = Interval("1", 15, 19)
    e = Interval("1", 16, 20)

    detector = OverlapDetector()
    detector.add_all([a, b, c, d, e])

    # (query, stored intervals expected to enclose it)
    cases = [
        (Interval("1", 10, 100), [a]),
        (Interval("1", 15, 20), [a, b, c]),
        (Interval("1", 18, 19), [a, b, c, d, e]),
        (Interval("1", 50, 99), [a, c]),
    ]
    for query, enclosing in cases:
        assert detector.get_enclosing_intervals(query) == enclosing
117 | 
118 | 
def test_get_enclosed() -> None:
    """``get_enclosed`` returns the stored intervals that lie within the query."""
    a = Interval("1", 10, 100)
    b = Interval("1", 15, 20)
    c = Interval("1", 18, 19)
    d = Interval("1", 50, 99)

    detector = OverlapDetector()
    detector.add_all([a, b, c, d])

    # (query, stored intervals expected to lie within it)
    cases = [
        (Interval("1", 1, 250), [a, b, c, d]),
        (Interval("1", 5, 30), [b, c]),
        (Interval("1", 16, 20), [c]),
        (Interval("1", 15, 19), [c]),
        (Interval("1", 10, 99), [b, c, d]),
    ]
    for query, enclosed in cases:
        assert detector.get_enclosed(query) == enclosed
133 | 


--------------------------------------------------------------------------------