├── .gitlab-ci.yml ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── build-wheels.sh ├── build.py ├── conda ├── build.sh ├── conda_build_config.yaml └── meta.yaml ├── images └── ONT_logo_590x106.png ├── modbampy └── __init__.py ├── requirements.txt ├── setup.py ├── src ├── args.c ├── args.h ├── bamiter.c ├── bamiter.h ├── common.c ├── common.h ├── counts.c ├── counts.h ├── modbam2bed.c └── version.h ├── test ├── test_api.py └── test_motifs.py └── test_data ├── 400ecoli.bam ├── 400ecoli.bam.bai ├── ecoli.fasta.gz ├── tag_codes.bam └── tag_codes.bam.bai /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - project: "epi2melabs/ci-templates" 3 | file: 4 | - "push-github.yaml" 5 | - "push-conda.yaml" 6 | - "snippets.yaml" 7 | 8 | image: ${UBUNTUIMAGE}:20.04 9 | 10 | variables: 11 | GIT_SUBMODULE_STRATEGY: recursive 12 | 13 | 14 | .prep-image: &prep-image | 15 | export DEBIAN_FRONTEND=noninteractive 16 | apt update -qq 17 | apt install -y --no-install-recommends gcc autoconf libtool automake valgrind make curl wget zlib1g-dev libbz2-dev libreadline-dev libssl-dev libffi-dev liblzma-dev libcurl4-gnutls-dev 18 | 19 | .minimal-python: &minimal-python | 20 | export DEBIAN_FRONTEND=noninteractive 21 | apt-get update -qq && apt-get install -y -qq python3-all-dev python3-venv 22 | 23 | 24 | stages: 25 | - test 26 | - prerelease 27 | - release 28 | 29 | 30 | bld:program: 31 | stage: test 32 | before_script: 33 | - *prep-image 34 | script: 35 | - !reference [.check, license] 36 | - make modbam2bed 37 | - ./modbam2bed --help 38 | - make mem_check 39 | 40 | 41 | bld:api-test: 42 | stage: test 43 | script: 44 | - *prep-image 45 | - *minimal-python 46 | - make test_api 47 | - make test_python 48 | 49 | 50 | deploy-checks: 51 | stage: prerelease 52 | variables: 53 | PACKAGE_NAME: modbampy 54 | script: 55 | - !reference [.check, argp-c-version] 56 | - !reference [.check, 
python-version] 57 | - !reference [.check, changelog] 58 | - !reference [.check, license] 59 | rules: 60 | - if: '$CI_COMMIT_TAG =~ /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/' 61 | 62 | 63 | .before-script: &before-script | 64 | export CONDA_PKG=${CI_PROJECT_NAME} 65 | export CONDA_PKG_VERSION=${CI_COMMIT_TAG/v/} 66 | mkdir conda-build 67 | cd conda-build 68 | 69 | 70 | conda: 71 | extends: .deploy-conda-linux 72 | variables: 73 | RECIPEPATH: "../conda" 74 | before_script: 75 | - *prep-image 76 | - *before-script 77 | 78 | conda-arm: 79 | extends: .deploy-conda-linux-arm 80 | variables: 81 | RECIPEPATH: "../conda" 82 | before_script: 83 | - *prep-image 84 | - *before-script 85 | 86 | conda-mac: 87 | extends: .deploy-conda-mac 88 | variables: 89 | RECIPEPATH: "../conda" 90 | before_script: 91 | - *before-script 92 | 93 | conda-mac-arm: 94 | extends: .deploy-conda-mac-arm 95 | variables: 96 | RECIPEPATH: "../conda" 97 | before_script: 98 | - *before-script 99 | 100 | ### Python 101 | 102 | bld:py-sdist: 103 | stage: prerelease 104 | script: 105 | - *prep-image 106 | - *minimal-python 107 | - make sdist 108 | - cd dist 109 | - python3 -m venv venv 110 | - . venv/bin/activate 111 | - pip install --upgrade pip 112 | - pip install *.tar.gz 113 | artifacts: 114 | paths: 115 | - dist/*.tar.gz 116 | 117 | .many-linux: &many-linux-def 118 | stage: prerelease 119 | variables: 120 | DO_COUNT_TEST: 1 121 | script: 122 | - echo "Building a Python ${PYWHEEL} wheel on manylinux_${FLAVOUR}" 123 | - ./build-wheels.sh . 
${PYWHEEL} 124 | artifacts: 125 | paths: 126 | - wheelhouse-final/*.whl 127 | only: 128 | - tags 129 | 130 | 131 | make-wheels-2010: 132 | extends: .many-linux 133 | image: "quay.io/pypa/manylinux2010_x86_64" 134 | parallel: 135 | matrix: 136 | - PYWHEEL: [7, 8] 137 | FLAVOUR: ["2010"] 138 | 139 | 140 | make-wheels-2014: 141 | extends: .many-linux 142 | image: "quay.io/pypa/manylinux2014_x86_64" 143 | parallel: 144 | matrix: 145 | - PYWHEEL: [7, 8, 9] 146 | FLAVOUR: ["2014"] 147 | 148 | 149 | make-wheels-2_24: 150 | extends: .many-linux 151 | image: "quay.io/pypa/manylinux_2_24_x86_64" 152 | parallel: 153 | matrix: 154 | - PYWHEEL: [8, 9, 10] 155 | FLAVOUR: ["2_24"] 156 | 157 | 158 | deploy:pypi: 159 | stage: release 160 | script: 161 | - *minimal-python 162 | - make pypi_build/bin/activate 163 | - source pypi_build/bin/activate 164 | - twine upload --non-interactive dist/modbampy*.tar.gz wheelhouse-final/modbampy*.whl 165 | rules: 166 | - if: '$CI_COMMIT_TAG =~ /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/' 167 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libdeflate"] 2 | path = libdeflate 3 | url = https://github.com/ebiggers/libdeflate.git 4 | [submodule "htslib"] 5 | path = htslib 6 | url = https://github.com/samtools/htslib 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v0.10.0] 8 | ### Changed 9 | Read iterator now returns copies of alignments for more Pythonic behaviour. 
10 | 11 | ## [v0.9.5] 12 | This version adds no user facing changes. 13 | 14 | ### Fixed 15 | - Pinned pip in Makefile so CI tests will run. 16 | 17 | ## [v0.9.4] 18 | ### Fixed 19 | - Python source distribution did not include libdeflate directory. 20 | 21 | ## [v0.9.3] 22 | ### Added 23 | - Linux and macOS ARM conda builds. 24 | 25 | ## [v0.9.2] 26 | ### Changed 27 | - Rebuild for conda with more specific htslib version pin. 28 | 29 | ## [v0.9.1] 30 | ### Added 31 | - `--map_q` command line option to filter reads by minimum mapping quality. 32 | ### Fixed 33 | - The default mapping quality was erroneously 1 not 0. 34 | 35 | ## [v0.9.0] 36 | ### Added 37 | - "Other" modified base column to extended output. For example, when using 38 | `-m 5mC` to count 5-methylcytosine in reads, the "other" column will 39 | enumerate counts of other cytosine modifications present. When using the 40 | option `--combine` this column will contain zero (the counts being included 41 | in the modified base count). 42 | - `--threshold` option to replace both `-a` and `-b`. 43 | ### Fixed 44 | - In line with the "other" column, when not using the `--combine` option 45 | the potential presence of other modifications in the same family of the 46 | base requested is taken into account. This has an effect of distributing 47 | some previously erroneous "canonical/non-modified" calls to "other" and 48 | "filtered" counts. 49 | ### Removed 50 | - The options `-a` and `-b` are deprecated. Instead use `--threshold`. 51 | 52 | ## [v0.8.0] 53 | ### Added 54 | - `--combine` option to combine calls from all modified bases in a family. 55 | The previous behaviour was that the non-modified (canonical) count would 56 | have been incremented. For example when searching for 5mC modifications 57 | with `-m 5mC` and a 5hmC base was present, the read would contribute 58 | to the canonical count, not the modified count. 
59 | 60 | ## [v0.7.0] 61 | ### Added 62 | - `--pileup` option to output full raw base counts rather than BED methyl. 63 | ### Changed 64 | - `-c` no longer synonym to `--cpg`. 65 | - `?`-style MM subtags now handled correctly with "missing" entries being recorded 66 | as "no call" rather than implied canonical. 67 | - extended output now includes a 15th column for "no call" bases. 68 | ### Fixed 69 | - Links in README. 70 | 71 | ## [v0.6.3] 72 | ### Changed 73 | - Bumped htslib version to version 1.16 for fixes to MM tag parsing/validation. 74 | - Change conda build back to bioconda::htslib since we're using a released version. 75 | 76 | ## [v0.6.2] 77 | ### Fixed 78 | - Off-by-one in pointless BED field. 79 | 80 | ## [v0.6.1] 81 | ### Fixed 82 | - ChEBI codes not cast correctly in Python API. 83 | ### Added 84 | - Support ambiguous modified bases as listed in HTS tags specification. 85 | 86 | ## [v0.6.0] 87 | ### Changed 88 | - Sites with no coverage now report "nan" methylation frequency and score. 89 | ### Added 90 | - Option `--aggregate` to pair information from two strands and output additional files. 91 | - Support for ChEBI codes in C and Python pileup API. 92 | 93 | ## [v0.5.3] 94 | ### Added 95 | - Python 3.9 and 3.10 wheel builds. 96 | 97 | ## [v0.5.2] 98 | ### Changed 99 | - Use commit `e51f72f` of htslib for `?` and `.` parsing of Mm tag. 100 | 101 | ## [v0.5.1] 102 | ### Added 103 | - `--max_depth` argument, and do not limit by default. 104 | - `--chh` and `--chg` filter options. 105 | 106 | ## [v0.5.0] 107 | ### Changed 108 | - Decouple file opening from read iteration. 109 | - Move Python pileup function to method of ModBam class. 110 | 111 | ## [v0.4.6] 112 | ### Changed 113 | - Reworked compilation to remove argparser from Python module. 114 | ### Fixed 115 | - Memory leak in modbampy. 116 | 117 | ## [v0.4.5] 118 | ### Fixed 119 | - Unmasking of reference sites (again). 120 | ### Added 121 | - Option `--mask`/`-k` to respect reference soft-masking. 
122 | 123 | ## [v0.4.4] 124 | ### Fixed 125 | - Logic error in filtering CpG sites for masked bases. 126 | 127 | ## [v0.4.3] 128 | ### Changed 129 | - Update modbampy version to match C code. 130 | 131 | ## [v0.4.2] 132 | ### Changed 133 | - Include soft-masked reference positions when performing CpG filtering. 134 | 135 | ## [v0.4.1] 136 | ### Fixed 137 | - Inaccuracies in README. 138 | 139 | ## [v0.4.0] 140 | ### Fixed 141 | - Python pileup access after addition of multi-BAM support. 142 | ### Added 143 | - Additional properties to alignment objects in Python API. 144 | 145 | ## [v0.3.3] 146 | ### Changed 147 | - conda build now uses htslib from bioconda. 148 | 149 | ## [v0.3.2] 150 | ### Changed 151 | - Updated software build to use official version 1.14 htslib release. 152 | - Reorganised and updated README. 153 | 154 | ## [v0.3.1] 155 | ### Changed 156 | - Build conda package with explicit libdeflate version. 157 | 158 | ## [v0.3.0] 159 | ### Added 160 | - Multiple BAM parsing to streamline interaction with data from Guppy/MinKNOW. 161 | ### Changed 162 | - Reference file must now be given before list of BAM files on command-line. 163 | ### Fixed 164 | - Removed some debugging text. 165 | 166 | ## [v0.2.2] 167 | ### Changed 168 | - Updated README to note Python package. 169 | 170 | ## [v0.2.1] 171 | ### Fixed 172 | - Incorrect processing of non-primary alignments in Python API. 173 | ### Added 174 | - Add Python packages, available on PyPI. 175 | ### Changed 176 | - Updated htslib to version from samtools/dev. 177 | 178 | 179 | ## [v0.2.0] 180 | ### Fixed 181 | - Segmentation fault on exit caused by double free of faidx member. 182 | ### Added 183 | - Python API to pileup and read-level parsing. 184 | 185 | ## [v0.1.1] 186 | ### Fixed 187 | - Check input files are present and readable rather than segfaulting. 188 | ### Changed 189 | - Clearer error messaging. 190 | 191 | 192 | ## [v0.1.0] 193 | 194 | First release. 
195 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. "Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. 
"Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. 
Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. 
Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. 
Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. 
If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. 
Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. 
If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 
321 | 322 | You may add additional accurate notices of copyright ownership. 323 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/*.c 2 | include src/*.h 3 | include README.md 4 | include LICENSE.md 5 | include requirements.txt 6 | include build.py 7 | include Makefile 8 | graft htslib* 9 | prune htslib/test/ 10 | include htslib/test/*.c 11 | include htslib/test/*.h 12 | include htslib/test/fuzz/*.c 13 | include htslib/test/fuzz/*.h 14 | prune htslib/htscodecs/tests 15 | include htslib/htscodecs/tests**/*.c 16 | include htslib/htscodecs/tests**/*.h 17 | graft libdeflate* 18 | graft images 19 | prune build 20 | prune docs 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | OS := $(shell uname) 2 | ARCH := $(shell arch) 3 | 4 | OS := $(shell uname) 5 | ifeq ($(OS), Darwin) 6 | # mainly for dev builds using homebrew things 7 | EXTRA_LDFLAGS ?= -L$(shell brew --prefix openssl@1.1)/lib 8 | ARGP ?= $(shell brew --prefix argp-standalone)/lib/libargp.a 9 | ARGP_INCLUDE ?= -I$(shell brew --prefix argp-standalone)/include 10 | else 11 | ARGP ?= 12 | ARGP_INCLUDE ?= 13 | endif 14 | 15 | 16 | CC ?= gcc 17 | CFLAGS ?= -fpic -msse3 -O3 -std=c99 18 | DEFLATE ?= $(PWD)/libdeflate 19 | STATIC_HTSLIB ?= htslib/libhts.a 20 | EXTRA_CFLAGS ?= 21 | EXTRA_LDFLAGS ?= 22 | EXTRA_LIBS ?= 23 | HTS_CONF_ARGS ?= 24 | HTS_CONF_ENV ?= CFLAGS="$(CFLAGS) $(EXTRA_CFLAGS)" 25 | 26 | WITHDEFLATE ?= 27 | DEFLATEREQ = 28 | ifeq ($(WITHDEFLATE), 1) 29 | CFLAGS += -I$(DEFLATE) -L$(DEFLATE) 30 | HTS_CONF_ARGS += --with-libdeflate 31 | HTS_CONF_ENV += LDFLAGS="-L$(DEFLATE)" 32 | EXTRA_LIBS += -ldeflate 33 | DEFLATEREQ = libdeflate/libdeflate.so.0 34 | endif 35 | 36 | NOTHREADS ?= 37 | ifeq ($(NOTHREADS), 1) 38 | CFLAGS += -DNOTHREADS 39 
| endif 40 | 41 | VALGRIND ?= valgrind 42 | 43 | 44 | .PHONY: default 45 | default: modbam2bed 46 | 47 | libdeflate/libdeflate.so.0: 48 | @echo Compiling $(@F) 49 | cd libdeflate && make 50 | 51 | 52 | htslib/libhts.a: $(DEFLATEREQ) 53 | @echo Compiling $(@F) 54 | cd htslib/ \ 55 | && autoreconf -i \ 56 | && autoheader \ 57 | && autoconf \ 58 | && $(HTS_CONF_ENV) ./configure $(HTS_CONF_ARGS) \ 59 | && make -j 4 60 | 61 | 62 | .PHONY: clean_htslib 63 | clean_htslib: 64 | rm -rf htslib/autom4te.cache/ 65 | cd htslib && make clean || exit 0 66 | 67 | 68 | %.o: src/%.c 69 | mkdir -p obj && \ 70 | $(CC) -c -pthread -Wall -fstack-protector-strong -D_FORTIFY_SOURCE=2 $(CFLAGS) \ 71 | -Isrc -Ihtslib $(ARGP_INCLUDE) $(EXTRA_CFLAGS) $^ -o $@ 72 | 73 | .PHONY: clean_obj 74 | clean_obj: 75 | rm -rf *.o 76 | 77 | 78 | modbam2bed: modbam2bed.o common.o counts.o bamiter.o args.o $(STATIC_HTSLIB) 79 | $(CC) -pthread -Wall -fstack-protector-strong -D_FORTIFY_SOURCE=2 $(CFLAGS) \ 80 | -Isrc -Ihtslib $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS)\ 81 | $^ $(ARGP) \ 82 | -lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \ 83 | -o $(@) 84 | 85 | .PHONY: clean 86 | clean: clean_obj clean_htslib 87 | rm -rf modbam2bed modbampy.egg-info pymod.a venv obj 88 | 89 | .PHONY: mem_check 90 | mem_check: modbam2bed 91 | $(VALGRIND) --error-exitcode=1 --tool=memcheck --leak-check=full --show-leak-kinds=all -s \ 92 | ./modbam2bed --threshold 0.66 -t 2 -r ecoli1 test_data/ecoli.fasta.gz test_data/400ecoli.bam test_data/400ecoli.bam > /dev/null 93 | 94 | 95 | .PHONY: test_api 96 | test_api: python 97 | ${IN_VENV} && pip install pytest 98 | ${IN_VENV} && pytest test --doctest-modules 99 | 100 | ### Python 101 | 102 | PYTHON ?= python3 103 | VENV ?= venv 104 | venv: ${VENV}/bin/activate 105 | IN_VENV=. 
./${VENV}/bin/activate 106 | 107 | $(VENV)/bin/activate: 108 | test -d $(VENV) || $(PYTHON) -m venv $(VENV) --prompt "modbam" 109 | ${IN_VENV} && pip install pip==23.0.1 --upgrade 110 | ${IN_VENV} && pip install setuptools 111 | 112 | .PHONY: python 113 | python: htslib/libhts.a pymod.a $(VENV)/bin/activate 114 | ${IN_VENV} && pip install -r requirements.txt 115 | ${IN_VENV} && WITHDEFLATE=$(WITHDEFLATE) LDFLAGS=$(EXTRA_LDFLAGS) pip install -e . 116 | 117 | .PHONY: clean_python 118 | clean_python: clean_obj 119 | rm -rf dist build modbampy.egg-info pymod.a libmodbampy.abi3.so ${VENV} 120 | 121 | pymod.a: common.o bamiter.o counts.o 122 | ar rcs $@ $^ 123 | 124 | test_python: python 125 | ${IN_VENV} && pip install flake8 flake8-rst-docstrings flake8-docstrings flake8-import-order 126 | ${IN_VENV} && flake8 modbampy \ 127 | --import-order-style google --application-import-names modbampy,libmodbampy \ 128 | --statistics 129 | ${IN_VENV} && modbampy test_data/400ecoli.bam ecoli1 0 4000000 | wc -l 130 | ${IN_VENV} && modbampy test_data/400ecoli.bam ecoli1 0 4000000 --pileup | wc -l 131 | 132 | IN_BUILD=. 
./pypi_build/bin/activate 133 | pypi_build/bin/activate: 134 | test -d pypi_build || $(PYTHON) -m venv pypi_build --prompt "(pypi) " 135 | ${IN_BUILD} && pip install pip --upgrade 136 | ${IN_BUILD} && pip install --upgrade pip setuptools twine wheel readme_renderer[md] keyrings.alt 137 | 138 | .PHONY: sdist 139 | sdist: pypi_build/bin/activate 140 | ${IN_BUILD} && python setup.py sdist 141 | 142 | 143 | .PHONY: wheels 144 | wheels: clean clean_python 145 | docker run -v `pwd`:/io quay.io/pypa/manylinux2010_x86_64 /io/build-wheels.sh /io 6 7 8 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Oxford Nanopore Technologies logo](https://github.com/epi2me-labs/modbam2bed/raw/master/images/ONT_logo_590x106.png) 2 | 3 | 4 | We have a new bioinformatic resource that replaces the functionality of this project! See our new repository here: 5 | [modkit](https://github.com/nanoporetech/modkit/). 6 | 7 | This repository is now unsupported and we do not recommend its use. Please contact Oxford Nanopore: support@nanoporetech.com for help with your application if it is not possible to upgrade. 8 | 9 | 10 | ****************** 11 | 12 | 13 | Modified-base BAM to bedMethyl 14 | ------------------------------ 15 | 16 | A program to aggregate modified base counts stored in a 17 | [modified-base BAM](https://samtools.github.io/hts-specs/SAMtags.pdf) (Section 2.1) file to 18 | a [bedMethyl](https://www.encodeproject.org/data-standards/wgbs/) file. 19 | 20 | A Python module is also available to obtain modified base information 21 | from BAM files in a convenient form. It is envisaged that this will eventually 22 | be replaced by an implementation in [pysam](https://pysam.readthedocs.io/en/latest/index.html). 
23 | 24 | ### Installation 25 | 26 | The program is available from our conda channel, so can be installed with: 27 | 28 | mamba create -n modbam2bed -c bioconda -c conda-forge -c epi2melabs modbam2bed 29 | 30 | Packages are available for both Linux and MacOS. 31 | 32 | Alternatively to install from the source code, clone the repository and then use make: 33 | 34 | git clone --recursive https://github.com/epi2me-labs/modbam2bed.git 35 | make modbam2bed 36 | ./modbam2bed 37 | 38 | See the Makefile for more information. The code has been tested on MacOS (with 39 | dependencies from brew) and on Ubuntu 18.04 and 20.04. 40 | 41 | ### Usage 42 | 43 | The code requires aligned reads with the `Mm` and `Ml` tags (`MM` and `ML` also supported), 44 | and the reference sequence used for alignment. 45 | 46 | The below is a snapshot of the command-line interface; it may not be up-to-date, please 47 | refer to the program `--help` option for the most accurate guidance. 48 | 49 | ``` 50 | Usage: modbam2bed [OPTION...] [ ...] 51 | modbam2bed -- summarise one or more BAM with modified base tags to bedMethyl. 52 | 53 | General options: 54 | --aggregate Output additional aggregated (across strand) 55 | counts, requires --cpg or --chg. 56 | --combine Create output with combined modified counts: i.e. 57 | alternative modified bases within the same family 58 | (same canonical base) are included. 59 | -c, --pileup Output (full) raw base counts rather than BED 60 | file. 61 | -e, --extended Output extended bedMethyl including counts of 62 | canonical, modified, and filtered bases (in that 63 | order). 64 | -m, --mod_base=BASE Modified base of interest, one of: 5mC, 5hmC, 5fC, 65 | 5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao. (Or modA, 66 | modC, modG, modT, modU, modN for generic modified 67 | base). 68 | -p, --prefix=PREFIX Output file prefix. Only used when multiple output 69 | filters are given. 70 | -r, --region=chr:start-end Genomic region to process. 
71 | -t, --threads=THREADS Number of threads for BAM processing. 72 | 73 | Base filtering options: 74 | -a, --canon_threshold=THRESHOLD 75 | Deprecated. The option will be removed in a future 76 | version. Please use --threshold. 77 | -b, --mod_threshold=THRESHOLD Deprecated. The option will be removed in a 78 | future version. Please use --threshold. 79 | --chg Output records filtered to CHG sites. 80 | --chh Output records filtered to CHH sites. 81 | --cpg Output records filtered to CpG sites. 82 | -f, --threshold=THRESHOLD Bases with a call probability < THRESHOLD are 83 | filtered from results (default 0.66). 84 | -k, --mask Respect soft-masking in reference file. 85 | 86 | Read filtering options: 87 | -d, --max_depth=DEPTH Max. per-file depth; avoids excessive memory 88 | usage. 89 | -g, --read_group=RG Only process reads from given read group. 90 | --haplotype=VAL Only process reads from a given haplotype. 91 | Equivalent to --tag_name HP --tag_value VAL. 92 | --tag_name=TN Only process reads with a given tag (see 93 | --tag_value). 94 | --tag_value=VAL Only process reads with a given tag value. 95 | 96 | -?, --help Give this help list 97 | --usage Give a short usage message 98 | -V, --version Print program version 99 | 100 | Mandatory or optional arguments to long options are also mandatory or optional 101 | for any corresponding short options. 102 | ``` 103 | 104 | ### Method and output format 105 | 106 | Oxford Nanopore Technogies' sequencing chemistries and basecallers can detect 107 | any number of modified bases. Compared to traditional methods which force a 108 | false dichoctomy between say cytosine and 5-methylcytosine, this rich biology 109 | needs to be remembered when interpreting modified base calls. 110 | 111 | The htslib pileup API is used to create a matrix of per-strand base counts 112 | including substitutions, modified bases and deletions. Inserted bases are not 113 | counted. 
Bases of an ambiguous nature (referred to as "filtered" below), as 114 | defined by the filter threshold probability option `-f`/`--threshold` are masked and used 115 | (along with substitutions and deletions) in the definition of the "score" 116 | (column 5) and "coverage" (column 10) entries of the bedMethyl file.
146 | * `--combine`: alternative modified bases are lumped together into the 147 | "modified" count and ultimately into a single modification frequency. 148 | 149 | ***A particular case where `--combine` is useful is when comparing to the result of bisulfite sequencing.*** 150 | 151 | **Output format** 152 | 153 | > The description of the [bedMethyl](https://www.encodeproject.org/data-standards/wgbs/) 154 | > format on the ENCODE project website is rather loose. The definitions below are chosen pragmatically. 155 | 156 | The table below describes precisely the entries in each column of the output BED 157 | file. Columns seven to nine inclusive are included for compatibility with the BED 158 | file specification, the values written are fixed and no meaning should be derived 159 | from them. Columns 5, 10, and 11 are defined in terms of counts of observed 160 | bases to agree with reasonable interpretations of the bedMethyl specifications: 161 | 162 | * Ncanon - canonical (unmodified) base count, (contigent on the use of `--combine`, see above.) 163 | * Nmod - modified base count. 164 | * Nfilt - count of bases where read does not contain a substitution or deletion 165 | with respect to the reference, but the modification status is ambiguous: these bases 166 | were filtered from the calculation of the modification frequency. 167 | * Nsub - count of reads with a substitution with respect to the reference. 168 | * Ndel - count of reads with a deletion with respect to the reference. 169 | * Nno call - counts of reads with an absent modification call (but not a substitution or deletion). 170 | * Nalt mod - counts of reads with and alternative modification call (but not a substitution or deletion). 171 | 172 | Since these interpretations may differ from other tools an extended output is 173 | available (enabled with the `-e` option) which includes three additional columns 174 | with verbatim base counts. 
175 | 176 | | column | description | 177 | |--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 178 | | 1 | reference sequence name | 179 | | 2 | 0-based start position | 180 | | 3 | 0-based exclusive end position (invariably start + 1) | 181 | | 4 | Abbreviated name of modified-base examined | 182 | | 5 | "Score" 1000 * (Nmod + Ncanon) / (Nmod + Ncanon + Nno call + Nalt mod + Nfilt + Nsub + Ndel). The quantity reflects the extent to which the calculated modification frequency in Column 11 is confounded by the alternative calls. The denominator here is the total read coverage as given in Column 10. | 183 | | 6 | Strand (of reference sequence). Forward "+", or reverse "-". | 184 | | 7-9 | Ignore, included simply for compatibility. | 185 | | 10 | Read coverage at reference position including all canonical, modified, undecided (no calls and filtered), substitutions from reference, and deletions. Nmod + Ncanon + Nno call + Nalt mod + Nfilt + Nsub + Ndel | 186 | | 11 | Percentage of modified bases, as a proportion of canonical and modified (excluding no calls, filtered, substitutions, and deletions). 100 \* Nmod / (Nmod + Nalt mod + Ncanon) | 187 | | 12\* | Ncanon | 188 | | 13\* | Nmod | 189 | | 14\* | Nfilt those bases with a modification probability falling between given thresholds. | 190 | | 15\* | Nno call those bases for which the query base was the correct canonical base for the modified base being considered, but no call was made (see the definition of the `.` and `?` flags in the SAM tag specification). | 191 | | 16\* | Nalt mod those bases for which the query base was the correct canonical base for the modified base being considered, but and alternative modification was present. | 192 | 193 | \* Included in extended output only. 
194 | 195 | 196 | ### Limitations 197 | 198 | The code has not been developed extensively and currently has some limitations: 199 | 200 | * Support for motif filtering is limited to CpG, CHG, and CHH, sites. Without 201 | this filtering enabled all reference positions that are the canonical base 202 | (on forward or reverse strand) equivalent to the modified base under 203 | consideration are reported. 204 | * Insertion columns are completely ignored for simplicitly (and avoid 205 | any heuristics). 206 | * Second strand `MM` subtags (i.e. `MM:C-m` as compared with `MM:C+m`) 207 | are not supported. These are not typically used so shouldn't affect most users. 208 | If such a tag is detected and warning will be thrown and the tag ignored. These tags 209 | do come in to play for duplex basecalls. 210 | 211 | ### Python package 212 | 213 | A Python package is available on [PyPI](https://pypi.org/project/modbampy/) which 214 | contains basic functionality for parsing BAM files with modified-base information. 215 | It is envisaged that this will eventually be replaced by an implementation in 216 | [pysam](https://pysam.readthedocs.io/en/latest/index.html). As such the interface 217 | is supplements but does not integrate or replace pysam. 218 | 219 | The package can be installed with: 220 | 221 | ``` 222 | pip install modbampy 223 | ``` 224 | 225 | The package contains simply to modes of use. Firstly an interface to iterate 226 | over reads in a BAM file and report modification sites: 227 | 228 | ``` 229 | from modbampy import ModBam 230 | with ModBam(args.bam) as bam: 231 | for read in bam.reads(args.chrom, args.start, args.end): 232 | for pos_mod in read.mod_sites: 233 | print(*pos_mod) 234 | ``` 235 | 236 | Each line of the above reports the 237 | 238 | * read_id, 239 | * reference position, 240 | * query (read) position, 241 | * reference strand (+ or -), 242 | * modification strand (0 or 1, as defined in the HTSlib tag specification. 
This is invariable 0), 243 | * canonical base associated with modification, 244 | * modified base, 245 | * modified-base score (scaled to 0-255). 246 | 247 | A second method is provided which mimics the couting procedure implemented in 248 | `modbam2bed`: 249 | 250 | ``` 251 | from modbampy import ModBam 252 | with ModBam(args.bam) as bam: 253 | positions, counts = bam.pileup( 254 | args.chrom, args.start, args.end 255 | low_threshold=0.33, high_threshold=0.66, mod_base="m") 256 | ``` 257 | 258 | The result is two [numpy](https://numpy.org/) arrays. The first indicates the reference 259 | positions associated with the counts in the second array. Each row of the second array 260 | (`counts` above) enumerates the observed counts of bases in the order: 261 | 262 | a c g t A C G T d D m M f F n N 263 | 264 | where uppercase letters refer to bases on the forward strand, lowercase letters 265 | relate to the reverse strand: 266 | 267 | * A, C, G, T are the usual DNA bases, 268 | * D indicates deletion counts, 269 | * M modified base counts, 270 | * F filtered counts - bases in reads with a modified-base record but which were filtered 271 | according to the thresholds provided. 272 | * N no call base counts. 273 | 274 | **Extras** 275 | 276 | The read iterator API also contains a minimal set of functionality mirroring properties of 277 | alignments available from pysam. See the [code](https://github.com/epi2me-labs/modbam2bed/blob/master/modbampy/__init__.py) 278 | for further details. 279 | 280 | ### Acknowledgements 281 | 282 | We thank [jkbonfield](https://github.com/jkbonfield) for developing the modified base 283 | functionality into the htslib pileup API, and [Jared Simpson](https://github.com/jts) 284 | for testing and comparison to his independently developed code. 285 | 286 | ### Help 287 | 288 | **Licence and Copyright** 289 | 290 | © 2021- Oxford Nanopore Technologies Ltd. 
291 | 292 | `modbam2bed` is distributed under the terms of the Mozilla Public License 2.0. 293 | 294 | **Research Release** 295 | 296 | Research releases are provided as technology demonstrators to provide early 297 | access to features or stimulate Community development of tools. Support for 298 | this software will be minimal and is only provided directly by the developers. 299 | Feature requests, improvements, and discussions are welcome and can be 300 | implemented by forking and pull requests. However much as we would 301 | like to rectify every issue and piece of feedback users may have, the 302 | developers may have limited resource for support of this software. Research 303 | releases may be unstable and subject to rapid iteration by Oxford Nanopore 304 | Technologies. 305 | -------------------------------------------------------------------------------- /build-wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: ./build-wheels.sh ... 3 | set -e -x 4 | 5 | PACKAGE_NAME=modbampy 6 | 7 | workdir=$1 8 | shift 9 | 10 | echo "Changing cwd to ${workdir}" 11 | cd ${workdir} 12 | 13 | # some many linux containers are centos-based, others are debian! 
14 | if [ -f /etc/centos-release ]; then 15 | yum install -y zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel 16 | else 17 | # https://stackoverflow.com/questions/76094428/debian-stretch-repositories-404-not-found 18 | sed -i -e 's/deb.debian.org/archive.debian.org/g' \ 19 | -e 's|security.debian.org|archive.debian.org/|g' \ 20 | -e '/stretch-updates/d' /etc/apt/sources.list 21 | apt update 22 | apt install -y zlib1g-dev libbz2-dev liblzma-dev libncurses5-dev libcurl4-gnutls-dev libssl-dev libffi-dev 23 | fi 24 | 25 | # downgrade autoconf to work more nicely with htslib 26 | curl -L -O http://ftp.gnu.org/gnu/autoconf/autoconf-2.69.tar.gz 27 | tar zxf autoconf-2.69.tar.gz 28 | cd autoconf-2.69 29 | ./configure 30 | make && make install 31 | cd .. 32 | 33 | export WITHDEFLATE=1 34 | LIBDEFLATE="${PWD}/libdeflate" 35 | LDFLAGS="-L${LIBDEFLATE}" 36 | 37 | make htslib/libhts.a 38 | mkdir -p wheelhouse 39 | 40 | echo "PYTHON VERSIONS AVAILABLE" 41 | ls /opt/python/ 42 | 43 | # Compile wheels 44 | for minor in $@; do 45 | if [[ "${minor}" == "8" ]] || [[ "${minor}" == "9" ]] || [[ "${minor}" == "10" ]]; then 46 | PYBIN="/opt/python/cp3${minor}-cp3${minor}/bin" 47 | else 48 | PYBIN="/opt/python/cp3${minor}-cp3${minor}m/bin" 49 | fi 50 | # auditwheel/issues/102 51 | "${PYBIN}"/pip install --upgrade setuptools pip wheel==0.31.1 cffi==1.15.0 52 | "${PYBIN}"/pip wheel --no-dependencies . 
-w ./wheelhouse/ 53 | done 54 | 55 | 56 | # Bundle external shared libraries into the wheels 57 | export LD_LIBRARY_PATH=$PWD/libdeflate 58 | ls ${LD_LIBRARY_PATH} 59 | for whl in "wheelhouse/${PACKAGE_NAME}"*.whl; do 60 | LD_LIBRARY_PATH=${LIBDEFLATE} auditwheel repair "${whl}" -w ./wheelhouse/ 61 | done 62 | unset LD_LIBRARY_PATH 63 | 64 | 65 | ## Install packages 66 | for minor in $@; do 67 | if [[ "${minor}" == "8" ]] || [[ "${minor}" == "9" ]] || [[ "${minor}" == "10" ]]; then 68 | PYBIN="/opt/python/cp3${minor}-cp3${minor}/bin" 69 | else 70 | PYBIN="/opt/python/cp3${minor}-cp3${minor}m/bin" 71 | fi 72 | "${PYBIN}"/pip install -r requirements.txt 73 | "${PYBIN}"/pip install "${PACKAGE_NAME}" --no-index -f ./wheelhouse 74 | "${PYBIN}"/modbampy --pileup test_data/400ecoli.bam ecoli1 105000 105100 75 | done 76 | 77 | mkdir wheelhouse-final 78 | cp wheelhouse/${PACKAGE_FILE_NAME}*manylinux* wheelhouse-final 79 | -------------------------------------------------------------------------------- /build.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | 4 | from cffi import FFI 5 | 6 | dir_path = os.path.dirname(os.path.realpath(__file__)) 7 | src_dir='src' 8 | libraries=['m', 'lzma', 'bz2', 'pthread', 'curl', 'crypto'] 9 | library_dirs=[] 10 | print("WITHDEFLATE:", os.getenv('WITHDEFLATE')) 11 | if os.getenv('WITHDEFLATE') == "1": 12 | print("Using deflate") 13 | libraries.append('deflate') 14 | library_dirs.append(os.path.join(dir_path, 'libdeflate')) 15 | 16 | ffibuilder = FFI() 17 | ffibuilder.set_source("libmodbampy", 18 | r""" 19 | #include "htslib/sam.h" 20 | #include "bamiter.h" 21 | #include "common.h" 22 | #include "counts.h" 23 | 24 | """, 25 | libraries=libraries, 26 | library_dirs=library_dirs, 27 | include_dirs=[src_dir, 'htslib'], 28 | extra_compile_args=['-std=c99', '-msse3', '-O3'], 29 | extra_objects=[ 30 | 'pymod.a', 31 | os.path.join('htslib', 'libhts.a')] 32 | ) 33 | 34 | cdef 
= [""" 35 | // START: custom header 36 | 37 | // export free 38 | void free(void *ptr); 39 | 40 | typedef int64_t hts_pos_t; 41 | 42 | // basic bam opening/handling 43 | typedef struct bam1_core_t { 44 | hts_pos_t pos; 45 | int32_t tid; 46 | uint16_t bin; // NB: invalid on 64-bit pos 47 | uint8_t qual; 48 | uint8_t l_extranul; 49 | uint16_t flag; 50 | uint16_t l_qname; 51 | uint32_t n_cigar; 52 | int32_t l_qseq; 53 | int32_t mtid; 54 | hts_pos_t mpos; 55 | hts_pos_t isize; 56 | } bam1_core_t; 57 | 58 | 59 | typedef struct bam1_t { 60 | bam1_core_t core; 61 | uint64_t id; 62 | uint8_t *data; 63 | int l_data; 64 | uint32_t m_data; 65 | uint32_t mempolicy:2, :30 /* Reserved */; 66 | } bam1_t; 67 | 68 | bam1_t *bam_init1(); 69 | void bam_destroy1(bam1_t *b); 70 | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc); 71 | typedef struct mplp_data {...;} mplp_data; 72 | 73 | // opening bam with idx and hdr info 74 | typedef struct { ...; } bam_fset; 75 | bam_fset* create_bam_fset(char* fname); 76 | void destroy_bam_fset(bam_fset* fset); 77 | typedef struct set_fsets { 78 | bam_fset **fsets; 79 | size_t n; 80 | } set_fsets; 81 | set_fsets *create_filesets(const char **bams); 82 | void destroy_filesets(set_fsets *s); 83 | 84 | mplp_data *create_bam_iter_data( 85 | const bam_fset* fset, const char *chr, int start, int end, 86 | const char *read_group, const char tag_name[2], const int tag_value, const int min_mapq); 87 | void destroy_bam_iter_data(mplp_data *data); 88 | // iterate a file 89 | int read_bam(void *data, bam1_t *b); 90 | // cigar parsing 91 | int *qpos2rpos(bam1_t *b); 92 | 93 | // things from htslib 94 | hts_pos_t bam_endpos(const bam1_t *b); 95 | 96 | // retrieving mod data 97 | typedef struct hts_base_mod_state hts_base_mod_state; 98 | hts_base_mod_state *hts_base_mod_state_alloc(); 99 | void hts_base_mod_state_free(hts_base_mod_state *state); 100 | int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state); 101 | 102 | typedef struct hts_base_mod { 
103 | int modified_base; 104 | int canonical_base; 105 | int strand; 106 | int qual; 107 | } hts_base_mod; 108 | int bam_next_basemod( 109 | const bam1_t *b, hts_base_mod_state *state, 110 | hts_base_mod *mods, int n_mods, int *pos); 111 | 112 | // from common.h needed in functions in counts.h 113 | //typedef struct mod_base {...;} mod_base; 114 | 115 | // END: custom header 116 | """] 117 | 118 | # add in some things from headers, removing directives 119 | for header in ('src/common.h', 'src/counts.h'): 120 | with open(header, 'r') as fh: 121 | cdef.append("// START: {}".format(header)) 122 | cdef.append( 123 | ''.join( 124 | x for x in fh.readlines() 125 | if not (x.startswith('#') or x.startswith("static inline int")))) 126 | cdef.append("// END: {}".format(header)) 127 | 128 | ffibuilder.cdef('\n\n'.join(cdef)) 129 | 130 | 131 | if __name__ == "__main__": 132 | ffibuilder.compile(verbose=True) 133 | -------------------------------------------------------------------------------- /conda/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=modbam2bed 4 | 5 | ## self-built htslib 6 | #export HTS_CONF_ARGS="--prefix=${PREFIX} --enable-libcurl --with-libdeflate --enable-plugins --enable-gcs --enable-s3" 7 | #export EXTRA_CFLAGS="-I$PREFIX/include" 8 | #export EXTRA_LDFLAGS="-L$PREFIX/lib" 9 | #export EXTRA_LIBS="-ldl -lhts -ldeflate" 10 | ##export STATIC_HTSLIB="" 11 | 12 | # just link to htslib from bioconda 13 | export EXTRA_CFLAGS="-I$PREFIX/include" 14 | export STATIC_HTSLIB="" 15 | export EXTRA_LDFLAGS="-L$PREFIX/lib" 16 | export EXTRA_LIBS="-ldl -lhts" 17 | 18 | OS=$(uname) 19 | if [[ "$OS" == "Darwin" ]]; then 20 | echo "Setting Darwin args" 21 | export ARGP=${PREFIX}/lib/libargp.a 22 | export EXTRA_CFLAGS="${EXTRA_CFLAGS} -isysroot ${CONDA_BUILD_SYSROOT} -mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET}" 23 | fi 24 | 25 | make clean $NAME 26 | 27 | mkdir -p $PREFIX/bin 28 | cp $NAME 
$PREFIX/bin && chmod +x $PREFIX/bin/$NAME 29 | -------------------------------------------------------------------------------- /conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | CONDA_BUILD_SYSROOT: 2 | - /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk # [osx] 3 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: {{ environ.get('CONDA_PKG') }} 3 | version: {{ environ.get('CONDA_PKG_VERSION') }} 4 | 5 | source: 6 | path: ../ 7 | 8 | build: 9 | number: {{ environ.get('CONDA_PKG_BUILD', 0) }} 10 | 11 | requirements: 12 | build: 13 | - {{ compiler('c') }} 14 | host: 15 | - argp-standalone # [osx] 16 | - binutils # [not osx] 17 | # explicitly list htslib to get same versions of 18 | # other things, even when we build our own. When 19 | # using bioconda htslib the other things can be 20 | # removed from here. The pin is because we copy 21 | # private interface code for hts_base_mod_state 22 | # so need to ensure compatibility. We should 23 | # hopefully be fine with ABI 24 | - bioconda::htslib==1.16 25 | # - libcurl 26 | # - bzip2 27 | # - xz 28 | # - zlib 29 | # - libdeflate 30 | # - openssl # [not osx] 31 | run: 32 | - bioconda::htslib==1.16 33 | # - libcurl 34 | # - bzip2 35 | # - xz 36 | # - zlib 37 | # - libdeflate 38 | # - openssl # [not osx] 39 | 40 | test: 41 | commands: 42 | - modbam2bed --help 43 | 44 | about: 45 | home: "https://github.com/epi2me-labs/modbam2bed" 46 | license: Mozilla Public License 2.0 47 | license_family: OTHER 48 | license_file: LICENSE 49 | summary: "Summarise BAM files containing modified-base information to bedMethyl format." 
50 | doc_url: https://github.com/epi2me-labs/modbam2bed 51 | dev_url: https://github.com/epi2me-labs/modbam2bed 52 | 53 | extra: 54 | recipe-maintainers: 55 | - cjw85 56 | -------------------------------------------------------------------------------- /images/ONT_logo_590x106.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/images/ONT_logo_590x106.png -------------------------------------------------------------------------------- /modbampy/__init__.py: -------------------------------------------------------------------------------- 1 | """Functionality for interacting with modified base tags in BAM files.""" 2 | 3 | import argparse 4 | import collections 5 | 6 | import numpy as np 7 | 8 | import libmodbampy 9 | 10 | # remember to bump version in src/version.h too 11 | __version__ = "0.10.0" 12 | ffi = libmodbampy.ffi 13 | libbam = libmodbampy.lib 14 | 15 | MAX_MODS = 256 # from htslib 16 | 17 | ModInfo = collections.namedtuple( 18 | 'ModInfo', ( 19 | 'query_name', 'rpos', 'qpos', 'strand', 'mstrand', 20 | 'cbase', 'mbase', 'qual')) 21 | 22 | 23 | def _tidy_args(read_group, tag_name, tag_value): 24 | """Turn Python variables into CFFI ones.""" 25 | if read_group is None: 26 | read_group = ffi.NULL 27 | else: 28 | read_group = ffi.new("char[]", read_group.encode()) 29 | if tag_name is None: 30 | tag_name = ffi.new("char[2]", "".encode()) 31 | tag_value = 0 32 | elif len(tag_name) != 2: 33 | raise ValueError("'tag_name' must be a length-2 string.") 34 | else: 35 | tag_name = ffi.new("char[2]", tag_name.encode()) 36 | return read_group, tag_name, tag_value 37 | 38 | 39 | class ModBase: 40 | """Helper to create a mod_base instance. 41 | 42 | :param code: modified base ChEBI code (e.g. "h" or 104) 43 | :param base: one of {A, C, G, T} 44 | :param name: long name of modified base (e.g. 
"5-methylcytosine") 45 | :param abbrev: short name of modified base (e.g. "5mC") 46 | 47 | Actually just a compatible list is created. Reuses the predefined 48 | instances from header where possible. 49 | """ 50 | 51 | def __init__(self, code, base=None, name="unknown", abbrev="unknown"): 52 | """Initialise the instance.""" 53 | self._name = ffi.new("char[]", name.encode()) 54 | self._abbrev = ffi.new("char[]", abbrev.encode()) 55 | self._base = base 56 | err = TypeError( 57 | "'base' should be a single character or None") 58 | if isinstance(self._base, str): 59 | if len(self._base) != 1: 60 | raise err 61 | self._base = base.encode() 62 | self._base_i = {"A": 1, "C": 2, "G": 4, "T": 8}[base] 63 | elif self._base is not None: 64 | raise err 65 | 66 | err = TypeError( 67 | "'code' should be a single character or an " 68 | "integer (ChEBI) code.") 69 | self._code = code 70 | if isinstance(self._code, str): 71 | # ffi won't coerce a char to int, so we need to do it 72 | if len(self._code) != 1: 73 | raise err 74 | self._code = ord(self._code) 75 | elif not isinstance(self._code, int): 76 | raise err 77 | 78 | @property 79 | def struct(self): 80 | """Return a list compatible with C structure.""" 81 | for i in range(libbam.n_mod_bases): 82 | if libbam.mod_bases[i].code == self._code: 83 | return libbam.mod_bases[i] 84 | 85 | # make a new mod_base using a code and a canonical base 86 | if self._base is None: 87 | raise ValueError( 88 | f"Modified base type '{self._code}' unknown. Please provide " 89 | "a value for 'base' to describe the unmodified base.") 90 | mod_base_type = [ 91 | self._name, self._abbrev, 92 | self._base, self._base_i, self._code] 93 | return mod_base_type 94 | 95 | 96 | class ModBam: 97 | """A minimal class to iterate over a bam.""" 98 | 99 | def __init__(self, bam): 100 | """Open a BAM file. 101 | 102 | :param bam: BAM file to open. 
103 | """ 104 | self.bam = bam 105 | self._bam_fset = ffi.gc( 106 | libbam.create_bam_fset(self.bam.encode()), 107 | libbam.destroy_bam_fset) 108 | 109 | def __enter__(self): 110 | """Open context.""" 111 | return self 112 | 113 | def __exit__(self, type, value, traceback): 114 | """Exit context.""" 115 | pass 116 | 117 | def reads( 118 | self, chrom, start, end, 119 | read_group=None, tag_name=None, tag_value=None, min_mapq=0): 120 | """Iterate over (filtered) alignments in file. 121 | 122 | :param chrom: reference sequence from BAM. 123 | :param start: reference start coordinate. 124 | :param end: reference end coordinate. 125 | :param read group: read group of read to return. 126 | :param tag_name: read tag to check during read filtering. 127 | :param tag_value: tag value for reads to keep. 128 | :param min_mapq: minimum read mapping quality. 129 | """ 130 | read_group, tag_name, tag_value = _tidy_args( 131 | read_group, tag_name, tag_value) 132 | 133 | it = libbam.create_bam_iter_data( 134 | self._bam_fset, chrom.encode(), start, end, 135 | read_group, tag_name, tag_value, min_mapq) 136 | if it == ffi.NULL: 137 | return 138 | 139 | data = ffi.gc(it, libbam.destroy_bam_iter_data) 140 | mod_state = ffi.gc( 141 | libbam.hts_base_mod_state_alloc(), 142 | libbam.hts_base_mod_state_free) 143 | 144 | bam1_t = ffi.gc(libbam.bam_init1(), libbam.bam_destroy1) 145 | while libbam.read_bam(data, bam1_t) > 0: 146 | yield ModRead(bam1_t, mod_state) 147 | 148 | def pileup( 149 | self, chrom, start, end, 150 | read_group=None, tag_name=None, tag_value=None, 151 | low_threshold=0.33, high_threshold=0.66, threshold=0.66, 152 | mod_base="m", max_depth=None, canon_base=None, combine=False, 153 | min_mapq=0): 154 | """Create a base count matrix. 155 | 156 | :param chrom: reference sequence from BAM. 157 | :param start: reference start coordinate. 158 | :param end: reference end coordinate. 159 | :param read group: read group of read to return. 
160 | :param tag_name: read tag to check during read filtering. 161 | :param tag_value: tag value for reads to keep. 162 | :param threshold: probability filter threshold for excluding 163 | calls from counts. 164 | :param mod_base: ChEBI code of modified base to examine. 165 | :param max_depth: maximum read depth to examine. 166 | :param canon_base: canonical base corresponding to `mod_base`. 167 | Required only if `mod_base` is not a modification known to 168 | the code. 169 | :param combine: combine (include) all alternative modifications 170 | with the same parent canonical base. 171 | :param min_mapq: minimum read mapping quality. 172 | """ 173 | for thresh in (low_threshold, high_threshold): 174 | if thresh < 0.0 or thresh > 1.0: 175 | raise ValueError("Thresholds should be in (0,1).") 176 | threshold = int(threshold * 255.0) 177 | # C code currently uses high_threshold as the only threshold 178 | high_threshold = threshold 179 | read_group, tag_name, tag_value = _tidy_args( 180 | read_group, tag_name, tag_value) 181 | 182 | if max_depth is None: 183 | max_depth = libbam._INT_MAX 184 | 185 | _f = ffi.new("bam_fset *[]", [self._bam_fset]) 186 | fsets = ffi.new("set_fsets *", {"fsets": _f, "n": 1}) 187 | mod_base = ModBase(code=mod_base, base=canon_base) 188 | plp_data = libbam.calculate_pileup( 189 | fsets, chrom.encode(), start, end, 190 | read_group, tag_name, tag_value, 191 | threshold, mod_base.struct, 192 | combine, max_depth, min_mapq) 193 | # TODO: check for NULL 194 | 195 | # copy data to numpy, we could be more clever here an wrap 196 | # the pointer in a subclass of ndarray to track its lifetime 197 | # and avoid the explicit copy 198 | n_rows = libbam.featlen 199 | size_sizet = np.dtype(np.uintp).itemsize 200 | np_counts = np.frombuffer(ffi.buffer( 201 | plp_data.matrix, size_sizet * plp_data.n_cols * n_rows), 202 | dtype=np.uintp 203 | ).reshape(plp_data.n_cols, n_rows).copy() 204 | np_positions = np.frombuffer( 205 | ffi.buffer(plp_data.major, 
size_sizet * plp_data.n_cols), 206 | dtype=np.uintp).copy() 207 | libbam.destroy_plp_data(plp_data) 208 | return np_positions, np_counts 209 | 210 | 211 | class ModRead: 212 | """Proxy for a bam alignment. 213 | 214 | The class is not intended to be instantiated by users. 215 | """ 216 | 217 | def __init__(self, bam1_t, mod_state, header=None): 218 | """Create an interface to alignment. 219 | 220 | The input alignment is copied. 221 | """ 222 | self._bam1_t = ffi.gc(libbam.bam_init1(), libbam.bam_destroy1) 223 | libbam.bam_copy1(self._bam1_t, bam1_t) 224 | self._mod_state = mod_state 225 | self._header = header 226 | 227 | @property 228 | def flags(self): 229 | """Return alignment flags.""" 230 | return self._bam1_t.core.flag 231 | 232 | @property 233 | def is_unmapped(self): 234 | """Return if read is unmapped.""" 235 | return self._bam1_t.core.flag & 4 > 0 236 | 237 | @property 238 | def is_reverse(self): 239 | """Return if alignment is to reverse strand.""" 240 | return self._bam1_t.core.flag & 16 > 0 241 | 242 | @property 243 | def is_secondary(self): 244 | """Return if alignment is a secondary alignment.""" 245 | return self._bam1_t.core.flag & 256 > 0 246 | 247 | @property 248 | def is_supplementary(self): 249 | """Return is alignment is a supplementary alignment.""" 250 | return self._bam1_t.core.flag & 2048 > 0 251 | 252 | @property 253 | def mapping_quality(self): 254 | """Return mapping quality.""" 255 | return self._bam1_t.core.qual 256 | 257 | @property 258 | def strand(self): 259 | """Return strand as '+' or '-'.""" 260 | return "+-"[self.is_reverse] 261 | 262 | @property 263 | def query_name(self): 264 | """Return query name.""" 265 | return ffi.string( 266 | (ffi.cast("char*", self._bam1_t.data))).decode() 267 | 268 | @property 269 | def query_length(self): 270 | """Return query length as record in BAM. 
See `query_sequence`.""" 271 | return self._bam1_t.core.l_qseq 272 | 273 | @property 274 | def query_sequence(self): 275 | """Return the query sequence as recorded in the BAM. 276 | 277 | Includes soft-clipped bases, does not include hard-clipped bases, and 278 | may return an error when sequence is not recorded. 279 | """ 280 | # bam1_seq() define 281 | # (b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname) 282 | raise NotImplementedError("query_sequence not implemented") 283 | 284 | @property 285 | def query_qualities(self): 286 | """Return the query quality array. 287 | 288 | Includes soft-clipped bases as for `query_sequence`. 289 | """ 290 | # bam1_qual define 291 | # ((b)->data + ((b)->core.n_cigar<<2) 292 | # + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) 293 | raise NotImplementedError("query_qualities not implemented") 294 | 295 | @property 296 | def reference_name(self): 297 | """Return the reference name associated with the alignment.""" 298 | if self._bam1_t.core.tid == -1: 299 | return None 300 | elif self.header is None: 301 | raise IndexError( 302 | "Require header information to retrieve reference_name") 303 | else: 304 | raise NotImplementedError( 305 | "Fetching reference_name not implemented") 306 | 307 | @property 308 | def reference_start(self): 309 | """Return the 0-based start position of the alignment.""" 310 | return self._bam1_t.core.pos 311 | 312 | @property 313 | def reference_end(self): 314 | """Return the 0-based (exclusive) end position of the alignment.""" 315 | return libbam.bam_endpos(self._bam1_t) 316 | 317 | @property 318 | def reference_length(self): 319 | """Return the length of the alignment on the reference.""" 320 | return self.reference_end - self.reference_start 321 | 322 | @property 323 | def get_aligned_pairs(self): 324 | """Return aligned query and reference positions.""" 325 | raise NotImplementedError("get_aligned_pairs not implemented") 326 | 327 | @property 328 | def alignment(self): 329 | """Create array 
representing alignment. 330 | 331 | The returned item is of length self.query_length 332 | """ 333 | if not hasattr(self, "_alignment"): 334 | self._alignment = ffi.gc( 335 | libbam.qpos2rpos(self._bam1_t), libbam.free) 336 | return self._alignment 337 | 338 | @property 339 | def mod_sites(self): 340 | """Iterate over all modified bases in read. 341 | 342 | :yields: (read_id, ref. pos., query pos., ref. strand, 343 | mod. strand, canon. base, mod. base, mod. quality) 344 | 345 | The ref. strand is that recorded in the Mm tag from the bam. 346 | """ 347 | mods = ffi.new("hts_base_mod[{}]".format(MAX_MODS)) 348 | pos = ffi.new("int *") 349 | align = self.alignment 350 | libbam.bam_parse_basemod(self._bam1_t, self._mod_state) 351 | n = 1 352 | while n > 0: 353 | n = libbam.bam_next_basemod( 354 | self._bam1_t, self._mod_state, mods, MAX_MODS, pos) 355 | rpos = align[pos[0]] 356 | if n > 0: 357 | for i in range(n): 358 | m = mods[i] 359 | # note m.strand refers to strand recorded in the Mm tag. 
360 | modbase = m.modified_base 361 | if modbase > 0: 362 | modbase = chr(modbase) 363 | else: 364 | modbase = -modbase 365 | yield ModInfo( 366 | self.query_name, rpos, pos[0], self.strand, m.strand, 367 | chr(m.canonical_base), modbase, m.qual) 368 | 369 | 370 | def main(): 371 | """Test entry point.""" 372 | parser = argparse.ArgumentParser( 373 | description="Modified base demo program.", 374 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 375 | parser.add_argument( 376 | "bam", help="Indexed .bam file.") 377 | parser.add_argument( 378 | "chrom", help="Chromosome for which to fetch read") 379 | parser.add_argument( 380 | "start", type=int, 381 | help="Reference start coordinate.") 382 | parser.add_argument( 383 | "end", type=int, 384 | help="Reference end coordinate.") 385 | parser.add_argument( 386 | "--pileup", action="store_true", 387 | help="Create pileup counts rather than per-read modified base data") 388 | parser.add_argument( 389 | "--mod_base", default="m", 390 | help="Modified base to count during pileup.") 391 | parser.add_argument( 392 | "--low_threshold", type=float, default=0.33, 393 | help="Lower threshold for filtering.") 394 | parser.add_argument( 395 | "--high_threshold", type=float, default=0.66, 396 | help="High threshold for filtering.") 397 | args = parser.parse_args() 398 | 399 | with ModBam(args.bam) as bam: 400 | if args.pileup: 401 | codes = ffi.string(libbam.plp_bases).decode() 402 | print("pos\t", end="") 403 | print("\t".join(x for x in codes)) 404 | positions, counts = bam.pileup( 405 | args.chrom, args.start, args.end, mod_base=args.mod_base, 406 | low_threshold=args.low_threshold, 407 | high_threshold=args.high_threshold) 408 | for p, row in zip(positions, counts): 409 | print(p, end='\t') 410 | print("\t".join(str(x) for x in row)) 411 | else: 412 | counts = collections.Counter() 413 | for read in bam.reads(args.chrom, args.start, args.end): 414 | for pos_mod in read.mod_sites: 415 | counts[pos_mod.qual] += 1 416 | for 
k in sorted(counts.keys()): 417 | print(k, counts[k]) 418 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Add comments to keep track of why we are using particular versions 2 | cffi==1.15.0 # 1.15.1 leads to c/_cffi_backend.c:15:10: fatal error: ffi.h: No such file or directory 3 | numpy 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import re 5 | import shutil 6 | import platform 7 | from glob import glob 8 | from setuptools import setup, find_packages, Extension 9 | from setuptools import Distribution, Command 10 | from setuptools.command.install import install 11 | from setuptools.command.build_ext import build_ext 12 | import subprocess 13 | import pkg_resources 14 | 15 | 16 | __pkg_name__ = 'modbampy' 17 | __author__ = 'cwright' 18 | __description__ = 'Accessing modified-base data from BAM files.' 
19 | 20 | # Use readme as long description and say its github-flavour markdown 21 | from os import path 22 | this_directory = path.abspath(path.dirname(__file__)) 23 | kwargs = {'encoding':'utf-8'} if sys.version_info.major == 3 else {} 24 | with open(path.join(this_directory, 'README.md'), **kwargs) as f: 25 | __long_description__ = f.read() 26 | __long_description_content_type__ = 'text/markdown' 27 | 28 | __path__ = os.path.dirname(__file__) 29 | __pkg_path__ = os.path.join(os.path.join(__path__, __pkg_name__)) 30 | 31 | # Get the version number from __init__.py 32 | verstrline = open(os.path.join(__pkg_name__, '__init__.py'), 'r').read() 33 | vsre = r"^__version__ = ['\"]([^'\"]*)['\"]" 34 | mo = re.search(vsre, verstrline, re.M) 35 | if mo: 36 | __version__ = mo.group(1) 37 | else: 38 | raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(__pkg_name__)) 39 | 40 | dir_path = os.path.dirname(__file__) 41 | with open(os.path.join(dir_path, 'requirements.txt')) as fh: 42 | install_requires = [ 43 | str(requirement) for requirement in 44 | pkg_resources.parse_requirements(fh)] 45 | 46 | data_files = [] 47 | extra_requires = {} 48 | extensions = [] 49 | 50 | class HTSBuild(build_ext): 51 | # uses the Makefile to build libhts.a, this will get done before the cffi extension 52 | def run(self): 53 | 54 | def compile_hts(): 55 | subprocess.check_call(['make', os.path.join('htslib', 'libhts.a'), 'pymod.a']) 56 | 57 | self.execute(compile_hts, [], 'Compiling htslib using Makefile') 58 | build_ext.run(self) 59 | 60 | 61 | setup( 62 | name=__pkg_name__, 63 | version=__version__, 64 | url='https://github.com/epi2me-labs/modbam2bed', 65 | author=__author__, 66 | author_email='{}@nanoporetech.com'.format(__author__), 67 | cffi_modules=["build.py:ffibuilder"], 68 | description=__description__, 69 | long_description=__long_description__, 70 | long_description_content_type=__long_description_content_type__, 71 | dependency_links=[], 72 | 
ext_modules=extensions, 73 | install_requires=install_requires, 74 | tests_require=[].extend(install_requires), 75 | extras_require=extra_requires, 76 | # don't include any testing subpackages in dist 77 | packages=find_packages(exclude=['*.test', '*.test.*', 'test.*', 'test']), 78 | package_data={__pkg_name__:[os.path.join('data', '*')]}, 79 | zip_safe=False, 80 | data_files=data_files, 81 | entry_points={ 82 | 'console_scripts': [ 83 | '{0} = {0}:main'.format(__pkg_name__) 84 | ] 85 | }, 86 | cmdclass={ 87 | 'build_ext': HTSBuild 88 | }, 89 | scripts=[] 90 | ) 91 | -------------------------------------------------------------------------------- /src/args.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "htslib/sam.h" 9 | #include "htslib/faidx.h" 10 | #include "args.h" 11 | #include "version.h" 12 | 13 | const char *argp_program_bug_address = "chris.wright@nanoporetech.com"; 14 | static char doc[] = 15 | "modbam2bed -- summarise one or more BAM with modified base tags to bedMethyl.\ 16 | \vModification information store in the BAM files is examine to derive\ 17 | an identity of a possibly modified base. Calls are filtered by the\ 18 | user-provided threshold probability. By default a single-modified base\ 19 | is reported in the output, though the `--combine` option can fuse\ 20 | calls for all modification in a family. Column 5 (\"score\") of the output\ 21 | is calculated as the proportion of bases called as the canonical or modified\ 22 | reference base with respect to the number of spanning reads, scaled to a\ 23 | maximum of 1000. Column 10 is the total read coverage including reads with:\ 24 | canonical base, modified base, undetermined (filtered) base, substituted\ 25 | base (a base other than the canonical or modified base under consideration),\ 26 | and deletions. 
Column 11 is the percentage of reference-base calls identified\ 27 | as being modified (as a proportion of those confidently determined as\ 28 | canonical or modified). Extended output (-e option) can give raw counts\ 29 | of canonical, modified, alternatively modified, and undetermined bases\ 30 | for completeness. See https://github.com/epi2me-labs/modbam2bed for a\ 31 | overly precise explanation of the output.\ 32 | \n\nOutput is to standard output unless multiple motif filters are specified.\ 33 | In such cases the --prefix option controls the output file name."; 34 | static char args_doc[] = " [ ...]"; 35 | static struct argp_option options[] = { 36 | {0, 0, 0, 0, 37 | "General options:"}, 38 | {"region", 'r', "chr:start-end", 0, 39 | "Genomic region to process."}, 40 | {"extended", 'e', 0, 0, 41 | "Output extended bedMethyl including counts of canonical, modified, and filtered bases (in that order)."}, 42 | {"mod_base", 'm', "BASE", 0, 43 | "Modified base of interest, one of: 5mC, 5hmC, 5fC, 5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao. (Or modA, modC, modG, modT, modU, modN for generic modified base)."}, 44 | {"combine", 0x800, 0, 0, 45 | "Create output with combined modified counts: i.e. alternative modified bases within the same family (same canonical base) are included."}, 46 | {"aggregate", 0x600, 0, 0, 47 | "Output additional aggregated (across strand) counts, requires --cpg or --chg."}, 48 | {"threads", 't', "THREADS", 0, 49 | "Number of threads for BAM processing."}, 50 | {"prefix", 'p', "PREFIX", 0, 51 | "Output file prefix. Only used when multiple output filters are given."}, 52 | {"pileup", 'c', 0, 0, 53 | "Output (full) raw base counts rather than BED file."}, 54 | {0, 0, 0, 0, 55 | "Base filtering options:"}, 56 | {"canon_threshold", 'a', "THRESHOLD", 0, 57 | "Deprecated. The option will be removed in a future version. Please use --threshold.", 2}, 58 | {"mod_threshold", 'b', "THRESHOLD", 0, 59 | "Deprecated. 
The option will be removed in a future version. Please use --threshold.", 2}, 60 | {"threshold", 'f', "THRESHOLD", 0, 61 | "Bases with a call probability < THRESHOLD are filtered from results (default 0.66).", 2}, 62 | {"cpg", 0x700, 0, 0, 63 | "Output records filtered to CpG sites.", 2}, 64 | {"chh", 0x400, 0, 0, 65 | "Output records filtered to CHH sites.", 2}, 66 | {"chg", 0x500, 0, 0, 67 | "Output records filtered to CHG sites.", 2}, 68 | {"mask", 'k', 0, 0, 69 | "Respect soft-masking in reference file.", 2}, 70 | {0, 0, 0, 0, 71 | "Read filtering options:"}, 72 | {"max_depth", 'd', "DEPTH", 0, 73 | "Max. per-file depth; avoids excessive memory usage.", 3}, 74 | {"read_group", 'g', "RG", 0, 75 | "Only process reads from given read group.", 3}, 76 | {"tag_name", 0x100, "TN", 0, 77 | "Only process reads with a given tag (see --tag_value).", 3}, 78 | {"tag_value", 0x200, "VAL", 0, 79 | "Only process reads with a given tag value.", 3}, 80 | {"haplotype", 0x300, "VAL", 0, 81 | "Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL.", 3}, 82 | {"map_q", 0x900, "QUAL", 0, 83 | "Filter reads below this mapping quality.", 3}, 84 | { 0 } 85 | }; 86 | 87 | bool file_exists(char* filename) { 88 | struct stat st; 89 | return (stat(filename, &st) == 0); 90 | } 91 | 92 | static int tag_items = 0; 93 | static bool tag_given = false; 94 | static bool hp_given = false; 95 | static error_t parse_opt (int key, char *arg, struct argp_state *state) { 96 | arguments_t *arguments = state->input; 97 | float thresh; 98 | bool found = false; 99 | switch (key) { 100 | case 'a': 101 | case 'b': 102 | argp_error (state, "Options `-a` and `-b` are deprecated, Please use only `-f`. 
These option will be removed in a future version.\n"); 103 | break; 104 | case 'f': 105 | thresh = atof(arg); 106 | if (thresh < 0 || thresh > 1.0) { 107 | argp_error (state, "Threshold parameter must be in (0,1), got %s", arg); 108 | } 109 | arguments->threshold = (int)(thresh * 255); 110 | break; 111 | case 'm': 112 | for (size_t i = 0; i < n_mod_bases; ++i) { 113 | if (!strcmp(mod_bases[i].abbrev, arg)) { 114 | arguments->mod_base = mod_bases[i]; 115 | found = true; 116 | break; 117 | } 118 | } 119 | if (!found) { 120 | argp_error( 121 | state, 122 | "Unrecognised modified base type: %s. ChEBI codes are not supported", arg); 123 | } 124 | break; 125 | case 0x800: 126 | arguments->combine = true; 127 | break; 128 | case 'r': 129 | arguments->region = arg; 130 | break; 131 | case 0x700: 132 | arguments->cpg = true; 133 | break; 134 | case 0x400: 135 | arguments->chh = true; 136 | break; 137 | case 0x500: 138 | arguments->chg = true; 139 | break; 140 | case 0x600: 141 | arguments->accumulated = true; 142 | break; 143 | case 'k': 144 | arguments->mask = true; 145 | break; 146 | case 'e': 147 | arguments->extended = true; 148 | break; 149 | case 'g': 150 | arguments->read_group = arg; 151 | break; 152 | case 'd': 153 | arguments->hts_maxcnt = atoi(arg); 154 | break; 155 | case 0x100: 156 | if (strlen(arg) > 2) { 157 | argp_error(state, "Tag name should be a two-letter code, received: '%s'.", arg); 158 | } 159 | memcpy(arguments->tag_name, arg, 2 *sizeof(char)); 160 | tag_items += 1; 161 | tag_given = true; 162 | break; 163 | case 0x200: 164 | arguments->tag_value = atoi(arg); 165 | tag_items += 1; 166 | tag_given = true; 167 | break; 168 | case 0x300: 169 | memcpy(arguments->tag_name, "HP", 2 * sizeof(char)); 170 | arguments->tag_value = atoi(arg); 171 | tag_items += 2; 172 | hp_given = true; 173 | break; 174 | case 0x900: 175 | arguments->min_mapQ = atoi(arg); 176 | break; 177 | case 't': 178 | arguments->threads = atoi(arg); 179 | break; 180 | case 'p': 181 | 
arguments->prefix = arg; 182 | break; 183 | case 'c': 184 | arguments->pileup = true; 185 | break; 186 | case ARGP_KEY_NO_ARGS: 187 | argp_usage (state); 188 | break; 189 | case ARGP_KEY_ARG: 190 | if (state->arg_num == 0) { 191 | arguments->ref = arg; 192 | if (!file_exists(arg)) { 193 | argp_error(state, "Cannot access reference input file: '%s'.", arg); 194 | } 195 | faidx_t *fai = fai_load(arg); 196 | if (fai == NULL) { 197 | argp_error(state, "Cannot read .fasta(.gz) file: '%s'.", arg); 198 | } 199 | fai_destroy(fai); 200 | break; 201 | } else { 202 | arguments->bam = (const char**)(&state->argv[state->next - 1]); 203 | state->next = state->argc; 204 | break; 205 | } 206 | break; 207 | case ARGP_KEY_END: 208 | if (state->arg_num < 2) 209 | argp_usage (state); 210 | break; 211 | default: 212 | return ARGP_ERR_UNKNOWN; 213 | } 214 | return 0; 215 | } 216 | 217 | static struct argp argp = {options, parse_opt, args_doc, doc}; 218 | 219 | arguments_t parse_arguments(int argc, char** argv) { 220 | arguments_t args; 221 | args.mod_base = default_mod_base; 222 | args.combine = false; 223 | args.threshold = (int)(0.66 * MAX_QUAL); 224 | args.bam = NULL; 225 | args.ref = NULL; 226 | args.region = NULL; 227 | args.read_group = NULL; 228 | args.tag_name[0] = '\0'; 229 | args.tag_value = -1; 230 | args.cpg = false; 231 | args.chh = false; 232 | args.chg = false; 233 | args.mask = false; 234 | args.accumulated = false; 235 | args.extended = false; 236 | args.threads = 1; 237 | args.prefix = "mod-counts"; 238 | args.pileup = false; 239 | args.hts_maxcnt = INT_MAX; 240 | args.min_mapQ = 0; 241 | argp_parse(&argp, argc, argv, 0, 0, &args); 242 | // allow CpG only for C! 
243 | if (args.cpg || args.chh || args.chg) { 244 | if (args.mod_base.base != 'C') { 245 | fprintf(stderr, "ERROR: Options '--cpg/--chh/--chg' can only be used with cytosine modifications."); 246 | exit(1); 247 | }; 248 | } 249 | if (args.cpg + args.chh + args.chh > 1) { 250 | fprintf(stderr, "INFO: Multiple filters given, output will be to files named e.g. '%s.cpg.bed'.\n", args.prefix); 251 | } 252 | if (tag_items % 2 > 0) { 253 | fprintf(stderr, "ERROR: Both or neither of --tag_name and --tag_value must be given.\n"); 254 | exit(1); 255 | } 256 | if (tag_given && hp_given) { 257 | fprintf(stderr, "ERROR: If --haplotype is given neither of --tag_name or --tag_value should be provided.\n"); 258 | exit(1); 259 | } 260 | if (strncmp("5mC", args.mod_base.abbrev, 3) == 0 || strncmp("5hmC", args.mod_base.abbrev, 4)) { 261 | fprintf(stderr, 262 | "WARNING: You have specified either 5mC or 5hmC as a modified base.\n\ 263 | Oxford Nanopore Basecallers jointly call C, 5mC, and 5hmC. If you\n\ 264 | wish to combine calls of these bases into a single 'modified'\n\ 265 | count, please use the `--combine` option. 
The default behaviour\n\ 266 | is that calls of alternative modified bases are added to the\n\ 267 | alternatively-modified count."); 268 | } 269 | return args; 270 | } 271 | -------------------------------------------------------------------------------- /src/args.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_ARGS_H 2 | #define _MODBAMBED_ARGS_H 3 | 4 | #include 5 | 6 | #include "common.h" 7 | 8 | typedef struct arguments { 9 | const char** bam; 10 | char* ref; 11 | char* region; 12 | char* read_group; 13 | char tag_name[2]; 14 | int tag_value; 15 | mod_base mod_base; 16 | bool combine; 17 | bool mask; 18 | bool cpg; 19 | bool chh; 20 | bool chg; 21 | bool extended; 22 | bool accumulated; 23 | int threads; 24 | int threshold; 25 | char* prefix; 26 | bool pileup; 27 | int hts_maxcnt; 28 | int min_mapQ; 29 | } arguments_t; 30 | 31 | arguments_t parse_arguments(int argc, char** argv); 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/bamiter.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "bamiter.h" 5 | #include "common.h" 6 | 7 | 8 | // Initialise BAM file, index and header structures 9 | bam_fset* create_bam_fset(const char* fname) { 10 | bam_fset* fset = xalloc(1, sizeof(bam_fset), "bam fileset"); 11 | fset->fp = hts_open(fname, "rb"); 12 | fset->idx = sam_index_load(fset->fp, fname); 13 | fset->hdr = sam_hdr_read(fset->fp); 14 | if (fset->hdr == 0 || fset->idx == 0 || fset->fp == 0) { 15 | destroy_bam_fset(fset); 16 | fprintf(stderr, "Failed to read .bam file '%s'.", fname); 17 | exit(1); 18 | } 19 | return fset; 20 | } 21 | 22 | // Destory BAM file, index and header structures 23 | void destroy_bam_fset(bam_fset* fset) { 24 | hts_close(fset->fp); 25 | hts_idx_destroy(fset->idx); 26 | sam_hdr_destroy(fset->hdr); 27 | free(fset); 28 | } 29 | 30 | // Initialise multiple 
BAM filesets
31 | set_fsets *create_filesets(const char **bam_files) {
32 |     int nfile = 0; for (; bam_files[nfile]; nfile++);
33 |     set_fsets *sets = xalloc(1, sizeof(set_fsets), "bam file sets");
34 |     sets->fsets = xalloc(nfile, sizeof(bam_fset*), "bam files");
35 |     sets->n = nfile;
36 |     for (size_t i = 0; i < nfile; ++i) {
37 |         sets->fsets[i] = create_bam_fset((const char *) bam_files[i]);
38 |         if (sets->fsets[i] == NULL) {
39 |             for (size_t j = 0; j < i; ++j) {
40 |                 // fixed: was fsets[i] — destroyed the failed (NULL) entry i times
40 |                 // and leaked every previously opened fileset
40 |                 destroy_bam_fset(sets->fsets[j]);
41 |             }
42 |             free(sets->fsets); free(sets);
43 |             return NULL;
44 |         }
45 |     }
46 |     return sets;
47 | }
48 | 
49 | // Destroy multiple BAM filesets
50 | void destroy_filesets(set_fsets *s) {
51 |     for (size_t i = 0; i < s->n; ++i) {
52 |         destroy_bam_fset(s->fsets[i]);
53 |     }
54 |     free(s->fsets); free(s);
55 | }
56 | 
57 | 
58 | /** Set up a bam file for reading (filtered) records.
59 |  *
60 |  * @param bam_file input alignment file.
61 |  * @param chr bam target name.
62 |  * @param start start position of chr to consider.
63 |  * @param end end position of chr to consider.
64 |  * @param read_group by which to filter alignments.
65 |  * @param tag_name by which to filter alignments.
66 |  * @param tag_value associated with tag_name.
67 |  *
68 |  * The return value can be freed with destroy_bam_iter_data.
69 |  *
70 |  */
71 | mplp_data *create_bam_iter_data(
72 |     const bam_fset* bam_set, const char *chr, int start, int end,
73 |     const char *read_group, const char tag_name[2], const int tag_value,
74 |     const int min_mapQ) {
75 | 
76 |     // open bam etc. 
77 | // this is all now deferred to the caller 78 | htsFile *fp = bam_set->fp; 79 | hts_idx_t *idx = bam_set->idx; 80 | sam_hdr_t *hdr = bam_set->hdr; 81 | 82 | // find the target index for query below 83 | int mytid = -1; 84 | for (int i=0; i < hdr->n_targets; ++i) { 85 | if(!strcmp(hdr->target_name[i], chr)) { 86 | mytid = i; 87 | break; 88 | } 89 | } 90 | if (mytid == -1) { 91 | fprintf(stderr, "Failed to find reference sequence '%s' in bam.\n", chr); 92 | return NULL; 93 | } 94 | 95 | // setup bam interator 96 | mplp_data *data = xalloc(1, sizeof(mplp_data), "pileup init data"); 97 | data->fp = fp; data->idx = idx; data->hdr = hdr; 98 | data->iter = bam_itr_queryi(idx, mytid, start, end); 99 | memcpy(data->tag_name, tag_name, 2); data->tag_value = tag_value; 100 | data->min_mapQ = min_mapQ; data->read_group = read_group; 101 | 102 | return data; 103 | } 104 | 105 | /** Clean up auxiliary bam reading data. 106 | * 107 | * @param data auxiliary structure to clean. 108 | * 109 | */ 110 | void destroy_bam_iter_data(mplp_data *data) { 111 | bam_itr_destroy(data->iter); 112 | free(data); 113 | } 114 | 115 | 116 | /** Read a bam record. 117 | * 118 | * @param data an mplp_data encoding the bam file to read with filter options. 119 | * @param b output pointer. 120 | * 121 | */ 122 | int read_bam(void *data, bam1_t *b) { 123 | mplp_data *aux = (mplp_data*) data; 124 | uint8_t *tag; 125 | bool check_tag = (strcmp(aux->tag_name, "") != 0); 126 | bool have_rg = (aux->read_group != NULL); 127 | uint8_t *rg; 128 | char *rg_val; 129 | int ret; 130 | while (1) { 131 | ret = aux->iter ? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); 132 | if (ret<0) break; 133 | // only take primary alignments 134 | if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FQCFAIL | BAM_FDUP)) continue; 135 | // filter by mapping quality 136 | if ((int)b->core.qual < aux->min_mapQ) continue; 137 | // filter by tag 138 | if (check_tag) { 139 | tag = bam_aux_get((const bam1_t*) b, aux->tag_name); 140 | if (tag == NULL){ // tag isn't present or is currupt 141 | if (aux->keep_missing) { 142 | break; 143 | } else { 144 | continue; 145 | } 146 | } 147 | int tag_value = bam_aux2i(tag); 148 | if (errno == EINVAL) continue; // tag was not integer 149 | if (tag_value != aux->tag_value) continue; 150 | } 151 | // filter by RG (read group): 152 | if (have_rg) { 153 | rg = bam_aux_get((const bam1_t*) b, "RG"); 154 | if (rg == NULL) continue; // missing 155 | rg_val = bam_aux2Z(rg); 156 | if (errno == EINVAL) continue; // bad parse 157 | if (strcmp(aux->read_group, rg_val) != 0) continue; // not wanted 158 | } 159 | break; 160 | } 161 | return ret; 162 | } 163 | 164 | 165 | /** Create an map of query position to reference position 166 | * 167 | * @param b alignment record 168 | * 169 | * The length of the returned array is b->core->l_qlen. 170 | */ 171 | int *qpos2rpos(bam1_t *b) { 172 | // we only deal in primary/soft-clipped alignments so length 173 | // ok qseq member is the length of the intact query sequence. 
174 | uint32_t qlen = b->core.l_qseq; 175 | uint32_t *cigar = bam_get_cigar(b); 176 | int *posmap = xalloc(qlen, sizeof(uint32_t), "pos_map"); 177 | for (size_t i = 0; i < qlen; ++i) posmap[i] = -1; // unaligned 178 | int qpos = 0, rpos = b->core.pos; 179 | for (size_t i = 0; i < b->core.n_cigar; ++i){ 180 | uint32_t op = bam_cigar_op(cigar[i]); 181 | uint32_t len = bam_cigar_oplen(cigar[i]); 182 | uint32_t take = bam_cigar_type(op); 183 | if (((take&0x1)>0) & ((take&0x2)>0)) { 184 | // consumes query and ref 185 | for (size_t j = 0; j < len; ++j, ++qpos, ++rpos) { 186 | posmap[qpos] = rpos; 187 | } 188 | } 189 | else if ((take&0x1)>0) { 190 | // consumes query only 191 | qpos += len; 192 | } 193 | else { 194 | // consumes ref 195 | rpos += len; 196 | } 197 | } 198 | return posmap; 199 | } 200 | -------------------------------------------------------------------------------- /src/bamiter.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_BAMITER_H 2 | #define _MODBAMBED_BAMITER_H 3 | 4 | #include 5 | #include "htslib/sam.h" 6 | 7 | // parameters for bam iteration 8 | typedef struct { 9 | htsFile *fp; 10 | hts_idx_t *idx; 11 | sam_hdr_t *hdr; 12 | hts_itr_t *iter; 13 | int min_mapQ; 14 | char tag_name[2]; 15 | int tag_value; 16 | bool keep_missing; 17 | const char *read_group; 18 | } mplp_data; 19 | 20 | 21 | typedef struct { 22 | htsFile *fp; 23 | hts_idx_t *idx; 24 | sam_hdr_t *hdr; 25 | } bam_fset; 26 | 27 | typedef struct set_fsets { 28 | bam_fset **fsets; 29 | size_t n; 30 | } set_fsets; 31 | 32 | 33 | // Initialise BAM file, index and header structures 34 | bam_fset* create_bam_fset(const char* fname); 35 | 36 | // Destory BAM file, index and header structures 37 | void destroy_bam_fset(bam_fset* fset); 38 | 39 | // Initialise multiple BAM filesets 40 | set_fsets *create_filesets(const char **bams); 41 | 42 | // Destroy multiple BAM filesets 43 | void destroy_filesets(set_fsets *s); 44 | 45 | 46 | /** Set 
up a bam file for reading (filtered) records. 47 | * 48 | * @param bam_fset A BAM fileset from create_bam_fset 49 | * @param chr bam target name. 50 | * @param start start position of chr to consider. 51 | * @param end end position of chr to consider. 52 | * @param read_group by which to filter alignments. 53 | * @param tag_name by which to filter alignments. 54 | * @param tag_value associated with tag_name. 55 | * @param min_mapQ minimum mapping quality of reads. 56 | * 57 | * The return value can be freed with destroy_bam_iter_data. 58 | * 59 | */ 60 | mplp_data *create_bam_iter_data( 61 | const bam_fset* fset, const char *chr, int start, int end, 62 | const char *read_group, const char tag_name[2], const int tag_value, 63 | const int min_mapQ); 64 | 65 | /** Clean up auxiliary bam reading data. 66 | * 67 | * @param data auxiliary structure to clean. 68 | * 69 | */ 70 | void destroy_bam_iter_data(mplp_data *data); 71 | 72 | /** Read a bam record. 73 | * 74 | * @param data an mplp_data encoding the bam file to read with filter options. 75 | * @param b output pointer. 76 | * 77 | */ 78 | int read_bam(void *data, bam1_t *b); 79 | 80 | /** Create an map of query position to reference position 81 | * 82 | * @param b alignment record 83 | * 84 | * The length of the returned array is b->core->l_qlen. 85 | */ 86 | int *qpos2rpos(bam1_t *b); 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /src/common.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | 9 | 10 | /** Allocates zero-initialised memory with a message on failure. 11 | * 12 | * @param num number of elements to allocate. 13 | * @param size size of each element. 14 | * @param msg message to describe allocation on failure. 
15 | * @returns pointer to allocated memory 16 | * 17 | */ 18 | void *xalloc(size_t num, size_t size, char* msg){ 19 | void *res = calloc(num, size); 20 | if (res == NULL){ 21 | fprintf(stderr, "Failed to allocate mem for %s\n", msg); 22 | exit(1); 23 | } 24 | return res; 25 | } 26 | 27 | 28 | /** Reallocates memory with a message on failure. 29 | * 30 | * @param ptr pointer to realloc. 31 | * @param size size of each element. 32 | * @param msg message to describe allocation on failure. 33 | * @returns pointer to allocated memory 34 | * 35 | */ 36 | void *xrealloc(void *ptr, size_t size, char* msg){ 37 | void *res = realloc(ptr, size); 38 | if (res == NULL){ 39 | fprintf(stderr, "Failed to reallocate mem for %s\n", msg); 40 | exit(1); 41 | } 42 | return res; 43 | } 44 | 45 | 46 | /** Retrieves a substring. 47 | * 48 | * @param string input string. 49 | * @param postion start position of substring. 50 | * @param length length of substring required. 51 | * @returns string pointer. 52 | * 53 | */ 54 | char *substring(char *string, int position, int length) { 55 | char *ptr; 56 | size_t i; 57 | 58 | ptr = malloc(length + 1); 59 | 60 | for (i = 0 ; i < length ; i++) { 61 | *(ptr + i) = *(string + position); 62 | string++; 63 | } 64 | 65 | *(ptr + i) = '\0'; 66 | return ptr; 67 | } 68 | 69 | 70 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_COMMON_H 2 | #define _MODBAMBED_COMMON_H 3 | 4 | #include 5 | 6 | 7 | typedef struct mod_base { 8 | char *name; 9 | char *abbrev; 10 | char base; 11 | int base_i; // 16bit IUPAC form A:1, C:2, G:4, T:8 12 | int code; // to enable htslib ChEBI support, chars below so simplicity 13 | } mod_base; 14 | 15 | static const size_t n_mod_bases = 16; 16 | static const mod_base mod_bases[] = { 17 | // C mods 18 | {"5-methylcytosine", "5mC", 'C', 2, 'm'}, 19 | {"5-hydroxymethylcytosine", 
"5hmC", 'C', 2, 'h'}, 20 | {"5-formylcytosine", "5fC", 'C', 2, 'f'}, 21 | {"5-carboxylcytosine", "5caC", 'C', 2, 'c'}, 22 | {"Ambiguous C modification", "modC", 'C', 2, 'C'}, 23 | // T mods 24 | {"5-hydroxymethyluracil", "5hmU", 'T', 8, 'g'}, 25 | {"5-formyluracil", "5fU", 'T', 8, 'e'}, 26 | {"5-carboxyluracil", "5caU", 'T', 8, 'b'}, 27 | {"Ambiguous T modification", "modT", 'T', 8, 'T'}, 28 | // A mods 29 | {"6-methyladenine", "6mA", 'A', 1, 'a'}, 30 | {"Ambiguous A modification", "modA", 'A', 1, 'A'}, 31 | // G mods 32 | {"8-Oxoguanine", "8oxoG", 'G', 4, 'o'}, 33 | {"Ambiguous G modification", "modG", 'G', 4, 'G'}, 34 | // U mods 35 | {"Ambiguous U modification", "modU", 'U', 15, 'U'}, // TODO: should 15 (N) be something else? 36 | // N Mods 37 | {"Xanthosine", "Xao", 'N', 15, 'n'}, 38 | {"Ambiguous N modification", "modN", 'N', 15, 'N'}, 39 | }; 40 | static const mod_base default_mod_base = {"5-methylcytosine", "5mC", 'C', 2, 'm'}; 41 | 42 | //0123456789ABCDEF 43 | //=ACMGRSVTWYHKDBN aka seq_nt16_str[] 44 | //=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str 45 | //084C2A6E195D3B7F 46 | static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; 47 | 48 | static const int MAX_QUAL = 255; 49 | 50 | /** Simple integer min/max 51 | * @param a 52 | * @param b 53 | * 54 | * @returns the min/max of a and b 55 | * 56 | */ 57 | static inline int max ( int a, int b ) { return a > b ? a : b; } 58 | static inline int min ( int a, int b ) { return a < b ? a : b; } 59 | 60 | 61 | /** Allocates zero-initialised memory with a message on failure. 62 | * 63 | * @param num number of elements to allocate. 64 | * @param size size of each element. 65 | * @param msg message to describe allocation on failure. 66 | * @returns pointer to allocated memory 67 | * 68 | */ 69 | void *xalloc(size_t num, size_t size, char* msg); 70 | 71 | 72 | /** Retrieves a substring. 73 | * 74 | * @param string input string. 75 | * @param postion start position of substring. 
76 | * @param length length of substring required. 77 | * @returns string pointer. 78 | * 79 | */ 80 | char *substring(char *string, int position, int length); 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /src/counts.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "htslib/sam.h" 13 | #include "htslib/faidx.h" 14 | #include "htslib/thread_pool.h" 15 | 16 | #include "bamiter.h" 17 | #include "common.h" 18 | #include "counts.h" 19 | 20 | #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) 21 | #define bam1_seqi(s, i) (bam_seqi((s), (i))) 22 | #define bam_nt16_rev_table seq_nt16_str 23 | #define bam_nt16_table seq_nt16_table 24 | 25 | 26 | /** Constructs a pileup data structure. 27 | * 28 | * @param buffer_cols maximum number of pileup columns. 29 | * @param rname reference name. 30 | * @see destroy_plp_data 31 | * @returns a plp_data pointer. 32 | * 33 | * The return value can be freed with destroy_plp_data. 34 | * 35 | */ 36 | plp_data create_plp_data(size_t buffer_cols, const char *rname) { 37 | plp_data data = xalloc(1, sizeof(_plp_data), "plp_data"); 38 | data->buffer_cols = buffer_cols; 39 | data->n_cols = 0; 40 | //fprintf(stderr, buffer_cols); 41 | data->matrix = xalloc(featlen * buffer_cols, sizeof(size_t), "matrix"); 42 | data->major = xalloc(buffer_cols, sizeof(size_t), "major"); 43 | data->rname = xalloc(strlen(rname) + 1, sizeof(char), "chr"); 44 | strcpy(data->rname, rname); 45 | return data; 46 | } 47 | 48 | 49 | /** Destroys a pileup data structure. 50 | * 51 | * @param data the object to cleanup. 52 | * @returns void. 
53 | * 54 | */ 55 | void destroy_plp_data(plp_data data) { 56 | free(data->matrix); free(data->major); free(data->rname); free(data); 57 | } 58 | 59 | 60 | /** Prints a pileup data structure. 61 | * 62 | * @param pileup a pileup structure. 63 | * @returns void 64 | * 65 | */ 66 | void print_pileup_data(plp_data pileup){ 67 | fprintf(stdout, "chrom\tpos\t"); 68 | for (size_t j = 0; j < featlen; ++j){ 69 | fprintf(stdout, "%c\t", plp_bases[j]); 70 | } 71 | fprintf(stdout, "depth\n"); 72 | for (size_t j = 0; j < pileup->n_cols; ++j) { 73 | int s = 0; 74 | fprintf(stdout, "%s\t%zu\t", pileup->rname, pileup->major[j]); 75 | for (size_t i = 0; i < featlen; ++i){ 76 | size_t c = pileup->matrix[j * featlen + i]; 77 | s += c; 78 | fprintf(stdout, "%zu\t", c); 79 | } 80 | fprintf(stdout, "%d\n", s); 81 | } 82 | } 83 | 84 | 85 | output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated) { 86 | output_files files = xalloc(1, sizeof(_output_files), "output_files"); 87 | // default to stdout for zero or one filters 88 | files->multi = (int)cpg + chh + chg > 1; 89 | files->take_all = (int)cpg + chh + chg == 0; 90 | files->accumulated = accumulated; 91 | files->fcpg = stdout; 92 | files->fchh = stdout; 93 | files->fchg = stdout; 94 | files->fcpg_acc = NULL; 95 | files->fchh_acc = NULL; 96 | files->fchg_acc = NULL; 97 | files->cpg = cpg; 98 | files->chh = chh; 99 | files->chg = chg; 100 | // use distinct files if more than one filter 101 | if (files->multi) { 102 | char* fname = xalloc(strlen(prefix) + 9, sizeof(char), "fname"); 103 | if (cpg) { 104 | strcpy(fname, prefix); strcat(fname, ".cpg.bed"); 105 | files->fcpg = fopen(fname, "w"); 106 | } 107 | if (chh) { 108 | strcpy(fname, prefix); strcat(fname, ".chh.bed"); 109 | files->fchh = fopen(fname, "w"); 110 | } 111 | if (chg) { 112 | strcpy(fname, prefix); strcat(fname, ".chg.bed"); 113 | files->fchg = fopen(fname, "w"); 114 | } 115 | free(fname); 116 | } 117 | 118 | if (files->accumulated) { 119 | 
char* fname_acc = xalloc(strlen(prefix) + 13, sizeof(char), "fname"); 120 | if (cpg) { 121 | strcpy(fname_acc, prefix); strcat(fname_acc, ".cpg.acc.bed"); 122 | files->fcpg_acc = fopen(fname_acc, "w"); 123 | } 124 | if (chg) { 125 | strcpy(fname_acc, prefix); strcat(fname_acc, ".chg.acc.bed"); 126 | files->fchg_acc = fopen(fname_acc, "w"); 127 | } 128 | free(fname_acc); 129 | } 130 | 131 | // store these in an array for later ease 132 | // [CpG, CHG] 133 | init_output_buffers(files); 134 | files->buf_size = _buf_size; 135 | files->motif_offsets[0] = 1; 136 | files->motif_offsets[1] = 2; 137 | files->motif_acc_files[0] = files->fcpg_acc; 138 | files->motif_acc_files[1] = files->fchg_acc; 139 | return files; 140 | } 141 | 142 | void close_bed_files(output_files files) { 143 | if (files->fcpg != stdout) { fclose(files->fcpg); } 144 | if (files->fchh != stdout) { fclose(files->fchh); } 145 | if (files->fchg != stdout) { fclose(files->fchg); } 146 | if (files->fcpg_acc != NULL) { fclose(files->fcpg_acc); } 147 | if (files->fchh_acc != NULL) { fclose(files->fchh_acc); } 148 | if (files->fchg_acc != NULL) { fclose(files->fchg_acc); } 149 | free(files); 150 | } 151 | 152 | 153 | // Check sequences for motifs 154 | 155 | // CpG 156 | bool extern inline is_cpg_fwd(size_t rpos, int rlen, char* ref){ 157 | return rpos < rlen - 1 && ref[rpos] == 'C' && ref[rpos + 1] == 'G'; 158 | } 159 | bool extern inline is_cpg_rev(size_t rpos, int rlen, char* ref){ 160 | return rpos != 0 && ref[rpos] == 'G' && ref[rpos - 1] == 'C'; 161 | } 162 | // CHN 163 | bool extern inline _is_chn_fwd(size_t rpos, int rlen, char* ref) { 164 | bool is_chn = false; 165 | if (rpos < rlen - 2 && ref[rpos] == 'C') { 166 | char b = ref[rpos + 1]; 167 | // these are all not G 168 | is_chn = (b == 'A' || b == 'C' || b == 'T' || b == 'M' || b == 'W' || b == 'Y' || b == 'H'); 169 | } 170 | return is_chn; 171 | } 172 | bool extern inline _is_chn_rev(size_t rpos, int rlen, char* ref) { 173 | bool is_chn = false; 174 
| if (rpos > 1 && ref[rpos] == 'G') { 175 | char b = ref[rpos - 1]; 176 | // these are all not C 177 | is_chn = (b == 'A' || b == 'G' || b == 'T' || b == 'R' || b == 'W' || b == 'K' || b == 'D'); 178 | } 179 | return is_chn; 180 | } 181 | // CHH 182 | bool extern inline is_chh_fwd(size_t rpos, int rlen, char* ref) { 183 | bool is_chh = _is_chn_fwd(rpos, rlen, ref); 184 | if (is_chh) { 185 | char b = ref[rpos + 2]; 186 | // these are all not G 187 | is_chh = (b == 'A' || b == 'C' || b == 'T' || b == 'M' || b == 'W' || b == 'Y' || b == 'H'); 188 | } 189 | return is_chh; 190 | } 191 | bool extern inline is_chh_rev(size_t rpos, int rlen, char* ref) { 192 | bool is_chh = _is_chn_rev(rpos, rlen, ref); 193 | if (is_chh) { 194 | char b = ref[rpos - 2]; 195 | // these are all not C 196 | is_chh = (b == 'A' || b == 'G' || b == 'T' || b == 'R' || b == 'W' || b == 'K' || b == 'D'); 197 | } 198 | return is_chh; 199 | } 200 | // CHG 201 | bool extern inline is_chg_fwd(size_t rpos, int rlen, char* ref) { 202 | bool is_chg = _is_chn_fwd(rpos, rlen, ref); 203 | if (is_chg) { 204 | is_chg = ref[rpos + 2] == 'G'; 205 | } 206 | return is_chg; 207 | } 208 | bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) { 209 | bool is_chg = _is_chn_rev(rpos, rlen, ref); 210 | if (is_chg) { 211 | is_chg = ref[rpos - 2] == 'C'; 212 | } 213 | return is_chg; 214 | } 215 | 216 | 217 | void inline print_record( 218 | FILE* fout, const char* rname, size_t start, size_t end, 219 | char* feature, char orient, size_t depth, 220 | bool extended, size_t cd, size_t md, size_t fd, size_t xd, size_t od) { 221 | // https://www.encodeproject.org/data-standards/wgbs/ 222 | // column 11: "Percentage of reads that show methylation at this position in the genome" 223 | // - Seems to disregard possibility of non-C canonical calls 224 | // lets calculate this as proportion of meth:non-meth C 225 | size_t tot = cd + md + od; 226 | float meth = tot == 0 ? 
nanf("") : (100.0f * md) / tot; 227 | // column 5: "Score from 0-1000. Capped number of reads" 228 | // lets go with proportion of (mod or canon):(mod or canon or filtered) 229 | size_t score = depth == 0 ? nanf("") : (1000 * tot) / depth; 230 | 231 | // TODO: don't print when nan? 232 | fprintf(fout, 233 | "%s\t%zu\t%zu\t" 234 | "%s\t%zu\t%c\t" 235 | "%zu\t%zu\t0,0,0\t%zu\t%.2f", 236 | rname, start, end, 237 | feature, score, orient, 238 | start, end, depth, meth); 239 | if (extended) { 240 | fprintf(fout, "\t%zu\t%zu\t%zu\t%zu\t%zu\n", cd, md, fd, xd, od); 241 | } else { 242 | fprintf(fout, "\n"); 243 | } 244 | } 245 | 246 | 247 | void init_output_buffers(output_files bed_files) { 248 | // information regarding motif offset pairing 249 | for (size_t i=0; i < bed_files->buf_size; ++i) { 250 | bed_files->out_buffer[i] = (bed_buffer){-1, false, 0, 0, 0, 0}; 251 | } 252 | } 253 | 254 | void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature) { 255 | // flush accumulation buffers 256 | if (bed_files->accumulated) { 257 | for(size_t ibuf=0; ibuf < bed_files->buf_size; ++ibuf) { 258 | bed_buffer buf = bed_files->out_buffer[ibuf]; 259 | FILE* fout = bed_files->motif_acc_files[ibuf]; 260 | if (buf.pos != -1 && fout != NULL) { 261 | print_record( 262 | fout, chr, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev], 263 | buf.depth, extended, buf.cd, buf.md, buf.fd, buf.xd, buf.od); 264 | } 265 | } 266 | } 267 | } 268 | 269 | /** Prints a pileup data structure as bedmethyl file 270 | * 271 | * @param pileup a pileup counts structure. 272 | * @param ref reference sequence. 273 | * @param rstart starting reference coordinate corresponding to ref. 274 | * @param extended whether to include counts of canonical, modified and filtered bases. 275 | * @param feature name to use for feature column of BED (e.g. 5mC). 276 | * @param canon_base canonical base to match. 277 | * @param output_files file handles and output options. 
278 | * @param out_buffer state for strand accumulation (modified on output). 279 | * @returns void 280 | * 281 | */ 282 | void print_bedmethyl( 283 | plp_data pileup, char *ref, int rstart, bool extended, 284 | char* feature, char canon_base, output_files bed_files) { 285 | // ecoli1 100718 100719 . 4 + 100718 100719 0,0,0 3 0 286 | 287 | // this is a bit naff, we should introspect these indices, or have them 288 | // as data in the header. 289 | size_t ci, mi, fi, xi, oi; 290 | size_t *bases; 291 | bool isrev; 292 | char rc_canon_base = ' '; 293 | size_t cif, cir; 294 | 295 | // TODO: if canon_base were passed as an htslib int this would be cleaner 296 | if (canon_base == 'A') {cif=fwd_A; cir=rev_T; rc_canon_base = 'T';} 297 | else if (canon_base == 'C') {cif=fwd_C; cir=rev_G; rc_canon_base = 'G';} 298 | else if (canon_base == 'G') {cif=fwd_G; cir=rev_C; rc_canon_base = 'C';} 299 | else if (canon_base == 'T') {cif=fwd_T; cir=rev_A; rc_canon_base = 'A';} 300 | else {fprintf(stderr, "ERROR: Unrecognised canonical base: '%c'\n", canon_base); exit(1);} 301 | 302 | int rlen = strlen(ref); 303 | 304 | for (size_t i = 0; i < pileup->n_cols; ++i) { 305 | size_t pos = pileup->major[i]; 306 | size_t rpos = pos - rstart; 307 | char rbase = ref[rpos]; 308 | bool is_cpg = false; 309 | bool is_chh = false; 310 | bool is_chg = false; 311 | if (rbase == canon_base) { 312 | if (!bed_files->take_all) { 313 | if (!( 314 | (bed_files->cpg && (is_cpg = is_cpg_fwd(rpos, rlen, ref))) 315 | || (bed_files->chh && (is_chh = is_chh_fwd(rpos, rlen, ref))) 316 | || (bed_files->chg && (is_chg = is_chg_fwd(rpos, rlen, ref))) 317 | ) ) { continue; } 318 | } 319 | isrev = 0; mi = fwd_mod; fi = fwd_filt; xi = fwd_nocall; oi = fwd_other; ci = cif; 320 | bases = (size_t *) fwdbases; 321 | } else if (rbase == rc_canon_base) { 322 | if (!bed_files->take_all) { 323 | if (!( 324 | (bed_files->cpg && (is_cpg = is_cpg_rev(rpos, rlen, ref))) 325 | || (bed_files->chh && (is_chh = is_chh_rev(rpos, rlen, 
ref))) 326 | || (bed_files->chg && (is_chg = is_chg_rev(rpos, rlen, ref))) 327 | ) ) { continue; } 328 | } 329 | isrev = 1; mi = rev_mod; fi = rev_filt; xi = rev_nocall; oi = rev_other; ci = cir; 330 | bases = (size_t *)revbases; 331 | } 332 | else { 333 | continue; 334 | } 335 | // calculate depth on strand 336 | size_t depth = 0; 337 | for (size_t j = 0; j < numbases; ++j) { 338 | depth += pileup->matrix[i * featlen + bases[j]]; 339 | } 340 | size_t cd = pileup->matrix[i * featlen + ci]; 341 | size_t md = pileup->matrix[i * featlen + mi]; 342 | size_t fd = pileup->matrix[i * featlen + fi]; 343 | size_t xd = pileup->matrix[i * featlen + xi]; 344 | size_t od = pileup->matrix[i * featlen + oi]; 345 | 346 | // choose output for this locus, the motifs are mutually exclusive so 347 | // no need to loop 348 | FILE* fout = stdout; 349 | if (bed_files->multi) { 350 | if (is_cpg) { fout = bed_files->fcpg; } 351 | else if (is_chh) { fout = bed_files->fchh; } 352 | else if (is_chg) { fout = bed_files->fchg; } 353 | } 354 | print_record( 355 | fout, pileup->rname, pos, pos + 1, feature, "+-"[isrev], 356 | depth, extended, cd, md, fd, xd, od); 357 | 358 | // strand accumulated 359 | if (bed_files->accumulated && (is_cpg || is_chg)) { 360 | size_t ibuf, motif_offset; 361 | bool do_output; 362 | if (is_cpg) { 363 | ibuf = 0; do_output = bed_files->cpg; 364 | } else { // chg 365 | ibuf = 1; do_output = bed_files->chh; 366 | } 367 | motif_offset = bed_files->motif_offsets[ibuf]; 368 | fout = bed_files->motif_acc_files[ibuf]; 369 | if (do_output) { 370 | assert(fout != NULL); 371 | bed_buffer buf = bed_files->out_buffer[ibuf]; 372 | if (buf.pos == -1) { 373 | bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd, xd, od}; 374 | } else if (pos - buf.pos == motif_offset ) { // paired 375 | assert(buf.isrev != isrev); // shouldn't happen, they can't be same 376 | buf.depth += depth; 377 | buf.cd += cd; 378 | buf.md += md; 379 | buf.fd += fd; 380 | buf.xd += xd; 381 
| buf.od += od; 382 | print_record( 383 | fout, pileup->rname, buf.pos, buf.pos + motif_offset + 1, feature, '.', 384 | buf.depth, extended, buf.cd, buf.md, buf.fd, buf.xd, buf.od); 385 | bed_files->out_buffer[ibuf] = (bed_buffer){-1, false, 0, 0, 0, 0, 0, 0}; 386 | } else { // unrelated 387 | print_record( 388 | fout, pileup->rname, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev], 389 | buf.depth, extended, buf.cd, buf.md, buf.fd, buf.xd, buf.od); 390 | bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd, xd, od}; 391 | } 392 | } 393 | } 394 | 395 | } // position loop 396 | 397 | } 398 | 399 | 400 | // Control client data for pileup: in this case the mod base data 401 | int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { 402 | hts_base_mod_state *m = hts_base_mod_state_alloc(); 403 | bam_parse_basemod(b, m); cd->p = m; 404 | return 0; 405 | } 406 | 407 | int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) { 408 | hts_base_mod_state_free(cd->p); 409 | return 0; 410 | } 411 | 412 | 413 | // TODO: this is taken from sam.c, its here so we can introspec some things 414 | // for which there's no public interface. A little spicey to redefine 415 | // this, but we do what we can. 416 | // https://github.com/samtools/htslib/issues/1550 417 | #define MAX_BASE_MOD 256 418 | struct hts_base_mod_state { 419 | int type[MAX_BASE_MOD]; // char or minus-CHEBI 420 | int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) 421 | char strand[MAX_BASE_MOD]; // strand of modification; + or - 422 | int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod 423 | char *MM[MAX_BASE_MOD]; // next pos delta (string) 424 | char *MMend[MAX_BASE_MOD]; // end of pos-delta string 425 | uint8_t *ML[MAX_BASE_MOD]; // next qual 426 | int MLstride[MAX_BASE_MOD]; // bytes between quals for this type 427 | int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? 
428 | int seq_pos; // current position along sequence 429 | int nmods; // used array size (0 to MAX_BASE_MOD-1). 430 | }; 431 | 432 | 433 | // Query if a specific MM subtag is present 434 | bool query_mod_subtag(hts_base_mod_state *state, int qtype, int qcanonical, char qstrand, int qimplicit) { 435 | bool found = false; 436 | for (size_t i=0; inmods; ++i) { 437 | if ((state->type[i] == qtype || state->type[i] == -qtype) 438 | && state->canonical[i] == qcanonical 439 | // although strand is typed char and documented as + or -, its actually 0/1 440 | && "+-"[state->strand[i]] == qstrand 441 | && state->implicit[i] == qimplicit) { 442 | found = true; 443 | break; 444 | } 445 | } 446 | return found; 447 | } 448 | 449 | /** Generates base counts from a region of a bam. 450 | * 451 | * @param bam_file input aligment file. 452 | * @param chr bam target name. 453 | * @param start start position of chr to consider. 454 | * @param end end position of chr to consider. 455 | * @param read_group by which to filter alignments. 456 | * @param tag_name by which to filter alignments. 457 | * @param tag_value associated with tag_name 458 | * @param threshold probability filter for excluding calls from counts. 459 | * @param mb BAM code for modified base to report. (e.g. h for 5hmC), or a ChEBI code. 460 | * @param combine combine all modified bases corresponding to same canonical base as mb 461 | * @param max_depth maximum depth of pileup. 462 | * @param min_mapQ minimum mapping quality of reads. 463 | * @returns a pileup data pointer. 464 | * 465 | * The return value can be freed with destroy_plp_data. 
466 | * 467 | */ 468 | plp_data calculate_pileup( 469 | const set_fsets *fsets, const char *chr, int start, int end, 470 | const char *read_group, const char tag_name[2], const int tag_value, 471 | int threshold, mod_base mb, bool combine, int max_depth, int min_mapQ) { 472 | 473 | static bool shown_second_strand_warning = false; 474 | 475 | // counting mod calls other than the one asked for 476 | int rev_in_family = rev_other; 477 | int fwd_in_family = fwd_other; 478 | if (combine) { rev_in_family = rev_mod; fwd_in_family = fwd_mod; } 479 | 480 | // setup bam reading 481 | size_t nfile = fsets->n; 482 | mplp_data **data = xalloc(fsets->n, sizeof(mplp_data*), "bam files"); 483 | for (size_t i = 0; i < nfile; ++i) { 484 | data[i] = create_bam_iter_data( 485 | fsets->fsets[i], chr, start, end, read_group, tag_name, tag_value, min_mapQ); 486 | if (data[i] == NULL) { 487 | // TODO: clean-up all j 0)) { 508 | const char *c_name = data[0]->hdr->target_name[tid]; 509 | if (strcmp(c_name, chr) != 0) continue; 510 | if (pos < start) continue; 511 | if (pos >= end) break; 512 | 513 | pileup->major[n_cols] = pos; // dont need insert columns for this 514 | 515 | // go through all files, and all reads in each 516 | for (size_t file = 0; file < nfile; ++file) { 517 | for (int i = 0; i < n_plp[file]; ++i) { 518 | const bam_pileup1_t *p = plp[file] + i; 519 | if (p->is_refskip) continue; 520 | 521 | // ONT calls are "query based", this means an attempt at a mod call is 522 | // made only if the first-pass canon basecall was the base of interest. 523 | // They are NOT "reference based": a mod call being attempted when the 524 | // query position aligns to a reference position containing the 525 | // of-interest base. (Actually reading between the lines of the spec 526 | // discussions, there was an implied assumption that mod calls are 527 | // always query based). 528 | // 529 | // There are two modes: 530 | // i) "." 
- implicit = 1; Unlisted positions are assumed canonical 531 | // ii) "?" - implicit = 0; Nothing can really be said about unlisted 532 | // 533 | // Case i) is trivial and easy to handle: no mod calls, assume canonical. 534 | // This is like just not having a tag at all. If the above found no mods, 535 | // any query base (ACGT) is assumed canonical 536 | // 537 | // Case ii) is a bit more icky for us. Before deciding canon/no-call we 538 | // need to know if there was even a tag present, e.g. C+m for 5mC. For 539 | // canon base types other than that relating to our mod base, we make 540 | // no claims about modification status: all forms are lumped together. 541 | // 542 | // For the most part ONT callers output `?` and have a call for every 543 | // of-interest base. There are two cases where this isn't true: 544 | // i) Guppy elided some low prob calls (as in the `.` mode) 545 | // ii) callers which specialise to CpG (so don't have an entry for every C) 546 | // 547 | // To complicate things further we can have tags such as "G-m" indicating 548 | // methylation on the second strand of the sequenced read. Such tags ought 549 | // not to occur without a corresponding "C+m" tag: in a simple case this 550 | // would imply a caller had called methylation on the strand that wasn't 551 | // sequenced but not on the strand that was sequenced. A more realistic 552 | // situation would be making calls only on the second strands of duplex reads. 
553 | // 554 | // Here we simplify our lives by restricting to the case of skipping any 555 | // such second strand tags, for the reasons above but also primarily 556 | // because ideally the second strand tag should be jointly interpreted 557 | // with the first strand tag: 558 | // to detect hemimethylation 559 | // understand and correctly report depth 560 | // made hard by them being on different positions 561 | 562 | int base_i = -1; // index into counts matrix 563 | int base_j = bam1_seqi(bam1_seq(p->b), p->qpos); 564 | if (p->is_del) { 565 | // deletions are interesting for counting depth 566 | base_i = bam_is_rev(p->b) ? rev_del : fwd_del; 567 | } else if (!( 568 | (base_j == mb.base_i && !bam_is_rev(p->b)) 569 | || (seqi_rc[base_j] == mb.base_i && bam_is_rev(p->b)))) { 570 | // e.g. if query we're looking for 5mC and qbase in {A,T} 571 | // we'll just count a plain A/T 572 | // NOTE: this test assumes only first strand subtags (e.g. C+m, not C-m) 573 | base_i = num2countbase[bam_is_rev(p->b) ? base_j + 16: base_j]; 574 | } else { 575 | // We have the correct query base for the orientation of the alignment 576 | // so now look for modified bases. 
577 | size_t n_mods = 256; 578 | hts_base_mod_state *mod_state = p->cd.p; 579 | hts_base_mod allmod[n_mods]; 580 | int nm = bam_mods_at_qpos(p->b, p->qpos, mod_state, allmod, n_mods); 581 | if (nm < 0 ) continue; // ignore reads which give error 582 | hts_base_mod mod; 583 | int our_mod = -1; 584 | int best_mod = -1; 585 | int best_score = 0; 586 | int canon_score = MAX_QUAL; // we subtract from this below 587 | if (nm > 0) { 588 | for (int k = 0; k < nm && k < n_mods; ++k) { 589 | mod = allmod[k]; 590 | if (mod.strand == 1) { // second strand tag 591 | if (!shown_second_strand_warning) { 592 | fprintf(stderr, "WARNING: Skipping second strand tag."); 593 | shown_second_strand_warning = true; 594 | } 595 | continue; 596 | } 597 | // our mod 598 | if (mb.code == mod.modified_base || mb.code == -mod.modified_base) { 599 | our_mod = k; 600 | } 601 | // any mod in the family 602 | if (mod.canonical_base == mb.base) { 603 | if (mod.qual > best_score) { best_mod = k; best_score = mod.qual; } 604 | canon_score -= mod.qual; 605 | } 606 | } 607 | } 608 | 609 | // Now analyse scores. Note: ignoring the old lowthreshold here. 610 | if (best_mod != -1) { 611 | // we found some mods, lets not worry about funny mixes 612 | // of calls and no calls i.e. were assuming we have a call 613 | // for all the mods present (implicit non-mod doesn't matter here therefore). 614 | if (canon_score > threshold) { // implied canon score 615 | base_i = num2countbase[bam_is_rev(p->b) ? base_j + 16 : base_j]; 616 | } 617 | else if (best_mod == our_mod) { // the mod requested 618 | base_i = (best_score > threshold) ? 619 | (bam_is_rev(p->b) ? rev_mod : fwd_mod) : 620 | (bam_is_rev(p->b) ? rev_filt : fwd_filt); 621 | } 622 | else { // some other mod in the family 623 | base_i = (best_score > threshold) ? 624 | (bam_is_rev(p->b) ? rev_in_family : fwd_in_family) : // either mod or other depending on combine 625 | (bam_is_rev(p->b) ? 
rev_filt : fwd_filt); 626 | } 627 | } 628 | else { 629 | // we didn't find any mods in the family 630 | // In the case of explicit `?` 631 | // tag we should not assume canonical, otherwise we can. 632 | // NOTE: we don't look for second strand `-` tags. 633 | // or a mess of `?` and `.` for alternative mods 634 | if (query_mod_subtag(mod_state, mb.code, mb.base_i, '+', 0)) { 635 | // we had an explicit tag, but no call for this position 636 | base_i = bam_is_rev(p->b) ? rev_nocall : fwd_nocall; 637 | } 638 | else { 639 | // for everything else theres canonical 640 | base_i = num2countbase[bam_is_rev(p->b) ? base_j + 16 : base_j]; 641 | } 642 | } 643 | } 644 | if (base_i != -1) { // not an ambiguity code 645 | pileup->matrix[major_col + base_i] += 1; 646 | } // read loop 647 | } // file loop 648 | } 649 | major_col += featlen; 650 | n_cols++; 651 | } 652 | pileup->n_cols = n_cols; 653 | 654 | free(plp); 655 | free(n_plp); 656 | bam_mplp_destroy(mplp); 657 | for (size_t i = 0; i < nfile; ++i) { 658 | destroy_bam_iter_data(data[i]); 659 | } 660 | free(data); 661 | 662 | return pileup; 663 | } 664 | 665 | 666 | -------------------------------------------------------------------------------- /src/counts.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_COUNTS_H 2 | #define _MODBAMBED_COUNTS_H 3 | 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | 9 | static const int _INT_MAX = INT_MAX; 10 | 11 | // medaka-style feature data 12 | typedef struct _plp_data { 13 | size_t buffer_cols; 14 | size_t n_cols; 15 | char *rname; 16 | size_t *matrix; 17 | size_t *major; 18 | } _plp_data; 19 | typedef _plp_data *plp_data; 20 | 21 | typedef struct bed_buffer { 22 | int pos; 23 | bool isrev; 24 | size_t depth, cd, md, fd, xd, od; 25 | } bed_buffer; 26 | 27 | // files open for writing outputs 28 | // this buf_size is silly, but its to work around CFFI sillyness 29 | static const size_t _buf_size = 2; 30 | typedef struct 
_output_files { 31 | bool multi; 32 | bool take_all; 33 | bool accumulated; 34 | bool cpg; 35 | bool chh; 36 | bool chg; 37 | FILE *fcpg; 38 | FILE *fchh; 39 | FILE *fchg; 40 | FILE *fcpg_acc; 41 | FILE *fchh_acc; 42 | FILE *fchg_acc; 43 | size_t buf_size; 44 | bed_buffer out_buffer[2]; 45 | size_t motif_offsets[2]; 46 | FILE* motif_acc_files[2]; 47 | } _output_files; 48 | typedef _output_files *output_files; 49 | 50 | 51 | output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated); 52 | void close_bed_files(output_files); 53 | // reset state of buffers (to handle loci split by thread blocks) 54 | void init_output_buffers(output_files bed_files); 55 | void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature); 56 | 57 | 58 | // Check sequences for motifs 59 | // CpG 60 | bool extern inline is_cpg_fwd(size_t rpos, int rlen, char* ref); 61 | bool extern inline is_cpg_rev(size_t rpos, int rlen, char* ref); 62 | // CHN 63 | bool extern inline _is_chn_fwd(size_t rpos, int rlen, char* ref); 64 | bool extern inline _is_chn_rev(size_t rpos, int rlen, char* ref); 65 | // CHH 66 | bool extern inline is_chh_fwd(size_t rpos, int rlen, char* ref); 67 | bool extern inline is_chh_rev(size_t rpos, int rlen, char* ref); 68 | // CHG 69 | bool extern inline is_chg_fwd(size_t rpos, int rlen, char* ref); 70 | bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref); 71 | 72 | // medaka-style base encoding - augmented with (a) modified base counts 73 | static const char plp_bases[] = "acgtACGTdDmMfoOfFxX"; // o: "other mod", f:"filtered", x:"no call" 74 | 75 | enum plp_index { 76 | rev_A, rev_C, rev_G, rev_T, 77 | fwd_A, fwd_C, fwd_G, fwd_T, 78 | rev_del, fwd_del, 79 | rev_mod, fwd_mod, 80 | rev_other, fwd_other, 81 | rev_filt, fwd_filt, 82 | rev_nocall, fwd_nocall, 83 | featlen 84 | }; 85 | static const size_t fwdbases[] = 86 | {fwd_A, fwd_C, fwd_G, fwd_T, fwd_del, fwd_mod, fwd_other, fwd_filt, fwd_nocall}; 87 
| static const size_t revbases[] = 88 | {rev_A, rev_C, rev_G, rev_T, rev_del, rev_mod, rev_other, rev_filt, rev_nocall}; 89 | static const size_t numbases = featlen / 2; 90 | 91 | // convert 16bit IUPAC (+16 for strand) to plp_bases index 92 | // e.g. G=4 => fwd_G => plp_bases[6] 93 | static const int num2countbase[32] = { 94 | -1, fwd_A, fwd_C, -1, fwd_G, -1, -1, -1, 95 | fwd_T, -1, -1, -1, -1, -1, -1, -1, 96 | -1, rev_A, rev_C, -1, rev_G, -1, -1, -1, 97 | rev_T, -1, -1, -1, -1, -1, -1, -1, 98 | }; 99 | 100 | 101 | /** Constructs a pileup data structure. 102 | * 103 | * @param buffer_cols maximum number of pileup columns. 104 | * @param rname reference name. 105 | * @see destroy_plp_data 106 | * @returns a plp_data pointer. 107 | * 108 | * The return value can be freed with destroy_plp_data. 109 | * 110 | */ 111 | plp_data create_plp_data(size_t buffer_cols, const char *rname); 112 | 113 | 114 | /** Destroys a pileup data structure. 115 | * 116 | * @param data the object to cleanup. 117 | * @returns void. 118 | * 119 | */ 120 | void destroy_plp_data(plp_data data); 121 | 122 | 123 | /** Prints a pileup data structure. 124 | * 125 | * @param pileup a pileup counts structure. 126 | * @returns void 127 | * 128 | */ 129 | void print_pileup_data(plp_data pileup); 130 | 131 | 132 | /** Prints a pileup data structure as bedmethyl file 133 | * 134 | * @param pileup a pileup counts structure. 135 | * @param ref reference sequence. 136 | * @param rstart starting reference coordinate corresponding to ref. 137 | * @param extended whether to include counts of canonical, modified and filtered bases. 138 | * @param feature name to use for feature column of BED (e.g. 5mC). 139 | * @param canon_base canonical base to match. 140 | * @param bed_files output file handles (and filters). 
141 | * @returns void 142 | * 143 | */ 144 | void print_bedmethyl( 145 | plp_data pileup, char *ref, int rstart, bool extended, 146 | char *feature, char canon_base, output_files bed_files); 147 | 148 | 149 | /** Generates base counts from a region of a bam. 150 | * 151 | * @param bam_file input aligment file. 152 | * @param chr bam target name. 153 | * @param start start position of chr to consider. 154 | * @param end end position of chr to consider. 155 | * @param read_group by which to filter alignments. 156 | * @param tag_name by which to filter alignments. 157 | * @param tag_value associated with tag_name 158 | * @param lowthreshold highest probability to call base as canonical. 159 | * @param highthreshold lowest probablity to call base as modified. 160 | * @param mod_base a mod_base instance 161 | * @param combine combine all modified bases corresponding to same canonical base as mb 162 | * @param max_depth maximum depth of pileup. 163 | * @param min_mapQ 164 | * @returns a pileup data pointer. 165 | * 166 | * The return value can be freed with destroy_plp_data. 
167 | * 168 | */ 169 | plp_data calculate_pileup( 170 | const set_fsets *fsets, const char *chr, int start, int end, 171 | const char *read_group, const char tag_name[2], const int tag_value, 172 | int threshold, mod_base mb, bool combine, int max_depth, int min_mapQ); 173 | 174 | #endif 175 | -------------------------------------------------------------------------------- /src/modbam2bed.c: -------------------------------------------------------------------------------- 1 | // modbam2bed program 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "htslib/sam.h" 11 | #include "htslib/faidx.h" 12 | #include "htslib/thread_pool.h" 13 | 14 | #include "bamiter.h" 15 | #include "common.h" 16 | #include "counts.h" 17 | #include "args.h" 18 | 19 | 20 | typedef struct twarg { 21 | arguments_t args; 22 | const char *chr; 23 | int start; 24 | int end; 25 | } twarg; 26 | 27 | 28 | void *pileup_worker(void *arg) { 29 | twarg j = *(twarg *)arg; 30 | set_fsets *files = create_filesets(j.args.bam); 31 | if (files == NULL) { free(arg); return NULL; } 32 | plp_data pileup = calculate_pileup( 33 | files, j.chr, j.start, j.end, 34 | j.args.read_group, j.args.tag_name, j.args.tag_value, 35 | j.args.threshold, j.args.mod_base, j.args.combine, 36 | j.args.hts_maxcnt, j.args.min_mapQ); 37 | destroy_filesets(files); 38 | free(arg); 39 | return pileup; 40 | } 41 | 42 | 43 | /* Process and print a single region using a threadpool 44 | * 45 | * @param args program arguments. 46 | * @param chr reference sequence to process. 47 | * @param start reference coordinate to process (0-based). 48 | * @param end reference coordiate to process (exclusive). 49 | * @param ref reference sequence. 
50 | * 51 | */ 52 | #ifdef NOTHREADS 53 | void process_region(arguments_t args, const char *chr, int start, int end, char *ref, output_files bed_files) { 54 | fprintf(stderr, "Processing: %s:%d-%d\n", chr, start, end); 55 | set_fsets* files = create_filesets(j.args.bam); 56 | if (files == NULL) return; 57 | plp_data pileup = calculate_pileup( 58 | args.bam, chr, start, end, 59 | args.read_group, args.tag_name, args.tag_value, 60 | args.threshold, args.mod_base, args.combine, 61 | args.hts_maxcnt, args.min_mapQ); 62 | if (pileup == NULL) return; 63 | 64 | init_output_buffers(bed_files); 65 | if (args.pileup) { 66 | print_pileup_data(pileup); 67 | } else { 68 | print_bedmethyl(pileup, ref, 0, args.extended, args.mod_base.abbrev, args.mod_base.base, bed_files); 69 | } 70 | flush_output_buffers(bed_files, chr, args.extended, args.mod_base.abbrev); 71 | destroy_plp_data(pileup); 72 | } 73 | #else 74 | void process_region(arguments_t args, const char *chr, int start, int end, char *ref, output_files bed_files) { 75 | fprintf(stderr, "Processing: %s:%d-%d\n", chr, start, end); 76 | // create thread pool 77 | hts_tpool *p = hts_tpool_init(args.threads); 78 | hts_tpool_process *q = hts_tpool_process_init(p, 2 * args.threads, 0); 79 | hts_tpool_result *r; 80 | const int width = 1000000; 81 | 82 | init_output_buffers(bed_files); 83 | int nregs = 1 + (end - start) / width; float done = 0; 84 | for (int rstart = start; rstart < end; rstart += width) { 85 | twarg *tw_args = xalloc(1, sizeof(*tw_args), "thread worker args"); // freed in worker 86 | tw_args->args = args; 87 | tw_args->chr = chr; tw_args->start = rstart; tw_args->end=min(rstart + width, end); 88 | int blk; 89 | do { 90 | blk = hts_tpool_dispatch2(p, q, pileup_worker, tw_args, 1); 91 | if ((r = hts_tpool_next_result(q))) { 92 | plp_data res = (plp_data)hts_tpool_result_data(r); 93 | if (res != NULL) { 94 | if (args.pileup) { 95 | print_pileup_data(res); 96 | } else { 97 | print_bedmethyl( 98 | res, ref, 0, 99 | 
args.extended, args.mod_base.abbrev, args.mod_base.base, bed_files); 100 | } 101 | destroy_plp_data(res); 102 | done++; 103 | fprintf(stderr, "\r%.1f %%", 100*done/nregs); 104 | } 105 | hts_tpool_delete_result(r, 0); 106 | } 107 | } while (blk == -1); 108 | } 109 | 110 | // wait for jobs, then collect. 111 | hts_tpool_process_flush(q); 112 | while ((r = hts_tpool_next_result(q))) { 113 | plp_data res = (plp_data)hts_tpool_result_data(r); 114 | if (res != NULL) { 115 | if (args.pileup) { 116 | print_pileup_data(res); 117 | } else { 118 | print_bedmethyl( 119 | res, ref, 0, 120 | args.extended, args.mod_base.abbrev, args.mod_base.base, bed_files); 121 | } 122 | destroy_plp_data(res); 123 | done++; 124 | fprintf(stderr, "\r%.1f %%", 100*done/nregs); 125 | } 126 | hts_tpool_delete_result(r, 0); 127 | } 128 | 129 | // finalise any remaining singleton strands 130 | flush_output_buffers(bed_files, chr, args.extended, args.mod_base.abbrev); 131 | 132 | fprintf(stderr, "\r100 %% "); 133 | fprintf(stderr, "\n"); 134 | // clean up pool 135 | hts_tpool_process_destroy(q); 136 | hts_tpool_destroy(p); 137 | } 138 | #endif 139 | 140 | 141 | int main(int argc, char *argv[]) { 142 | clock_t begin = clock(); 143 | arguments_t args = parse_arguments(argc, argv); 144 | fprintf( 145 | stderr, "Analysing: %s (%s, %c>%c)\n", 146 | args.mod_base.name, args.mod_base.abbrev, args.mod_base.base, args.mod_base.code); 147 | #ifdef NOTHREADS 148 | if (args.threads != 1) { 149 | fprintf( 150 | stderr, 151 | "--threads set to %d, but threading not supported by this build.\n", args.threads); 152 | } 153 | #endif 154 | 155 | // large basecaller runs can produce more files than a single 156 | // process can open, check this ahead of time. 
157 | #ifndef WASM 158 | struct rlimit reslimit; 159 | int nfile = 0; for (; args.bam[nfile]; nfile++); 160 | if (getrlimit(RLIMIT_NOFILE, &reslimit) == 0) { 161 | if (nfile * args.threads > reslimit.rlim_cur - 100) { 162 | fprintf(stderr, 163 | "ERROR: Too many BAM files provided (%i). Try running " 164 | "samtools merge on subsets of files to produce fewer files", nfile); 165 | exit(EXIT_FAILURE); 166 | } 167 | } 168 | #endif 169 | 170 | // open output files, sort out filter options 171 | output_files bed_files = open_bed_files( 172 | args.prefix, args.cpg, args.chh, args.chg, args.accumulated); 173 | 174 | // load ref sequence 175 | faidx_t *fai = fai_load(args.ref); 176 | if (fai == NULL) { 177 | fprintf(stderr, 178 | "ERROR: Failed to parse reference file\n"); 179 | exit(EXIT_FAILURE); 180 | } 181 | if (args.region == NULL) { 182 | // process all regions 183 | int nseq = faidx_nseq(fai); 184 | for (int i = 0; i < nseq; ++i) { 185 | const char *chr = faidx_iseq(fai, i); 186 | int len = faidx_seq_len(fai, chr); 187 | int alen; 188 | char *ref = faidx_fetch_seq(fai, chr, 0, len, &alen); 189 | if (!args.mask) { 190 | for (size_t i=0; i 1) 54 | 55 | def test_040_nonexisting_chrom(self): 56 | with ModBam(test_bam) as bam: 57 | reads = list(bam.reads("ecoli1xx", 0, 4000000)) 58 | assert(reads == []) 59 | -------------------------------------------------------------------------------- /test/test_motifs.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unittest 3 | 4 | NOT_G = 'ACTMWYH' 5 | NOT_C = 'AGTRWKD' 6 | MAYBE_G = 'GRSKVDBN' 7 | MAYBE_C = 'CMQYVHBN' 8 | 9 | 10 | from modbampy import libbam 11 | 12 | class MotifTest(unittest.TestCase): 13 | 14 | def test_001_is_cpg(self): 15 | is_cpg_fwd = libbam.is_cpg_fwd 16 | is_cpg_rev = libbam.is_cpg_rev 17 | 18 | # forward 19 | assert is_cpg_fwd(3, 8, b"AAACGAAA") == True 20 | assert is_cpg_fwd(3, 8, b"AAACTAAA") == False 21 | # reverse 22 | assert 
is_cpg_rev(4, 8, b"AAACGAAA") == True 23 | assert is_cpg_rev(4, 8, b"AAACTAAA") == False 24 | # end, but overrun 25 | assert is_cpg_fwd(6, 7, b"AAAAAACG") == False 26 | assert is_cpg_fwd(6, 8, b"AAAAAACG") == True 27 | # don't break 28 | assert is_cpg_rev(0, 8, b"AAAAAACG") == False 29 | 30 | 31 | def test_010_is_chh(self): 32 | is_chh_fwd = libbam.is_chh_fwd 33 | is_chh_rev = libbam.is_chh_rev 34 | 35 | # forward 36 | for b1, b2 in itertools.product(NOT_G, repeat=2): 37 | assert is_chh_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == True 38 | for b1, b2 in itertools.product(MAYBE_G, repeat=2): 39 | assert is_chh_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == False 40 | for b1, b2 in itertools.product(NOT_G, MAYBE_G): 41 | assert is_chh_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == False 42 | assert is_chh_fwd(3, 8, f"AAAC{b2}{b1}AA".encode()) == False 43 | 44 | # reverse 45 | for b1, b2 in itertools.product(NOT_C, repeat=2): 46 | assert is_chh_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == True 47 | for b1, b2 in itertools.product(MAYBE_C, repeat=2): 48 | assert is_chh_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == False 49 | for b1, b2 in itertools.product(NOT_C, MAYBE_C): 50 | assert is_chh_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == False 51 | assert is_chh_rev(5, 8, f"AAA{b2}{b1}GAA".encode()) == False 52 | 53 | # end, but overrun 54 | assert is_chh_fwd(5, 7, b"AAAAACHH") == False 55 | assert is_chh_fwd(5, 8, b"AAAAACHH") == True 56 | 57 | # don't break 58 | for i in 0, 1: 59 | assert is_chh_rev(5, 7, b"AAAAACHH") == False 60 | 61 | 62 | def test_020_is_chg(self): 63 | is_chg_fwd = libbam.is_chg_fwd 64 | is_chg_rev = libbam.is_chg_rev 65 | 66 | # forward 67 | for b1 in NOT_G: 68 | assert is_chg_fwd(3, 8, f"AAAC{b1}GAA".encode()) == True 69 | for b1 in MAYBE_G: 70 | assert libbam.is_chg_fwd(3, 8, f"AAAC{b1}GAA".encode()) == False 71 | for b1, b2 in itertools.product(MAYBE_G, NOT_G + MAYBE_G): 72 | assert is_chg_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == False 73 | 74 | # reverse 75 | for b1 in 
NOT_C: 76 | assert is_chg_rev(5, 8, f"AAAC{b1}GAA".encode()) == True 77 | for b1 in MAYBE_C: 78 | assert is_chg_rev(5, 8, f"AAAC{b1}GAA".encode()) == False 79 | for b1, b2 in itertools.product(NOT_C + MAYBE_C, MAYBE_C): 80 | assert is_chg_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == False 81 | 82 | # end, but overrun 83 | assert is_chg_fwd(5, 7, b"AAAAACHG") == False 84 | assert is_chg_fwd(5, 8, b"AAAAACHG") == True 85 | 86 | # don't break 87 | for i in 0, 1: 88 | assert is_chg_rev(5, 7, b"AAAAACHG") == False 89 | -------------------------------------------------------------------------------- /test_data/400ecoli.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/400ecoli.bam -------------------------------------------------------------------------------- /test_data/400ecoli.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/400ecoli.bam.bai -------------------------------------------------------------------------------- /test_data/ecoli.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/ecoli.fasta.gz -------------------------------------------------------------------------------- /test_data/tag_codes.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/tag_codes.bam -------------------------------------------------------------------------------- /test_data/tag_codes.bam.bai: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/tag_codes.bam.bai --------------------------------------------------------------------------------