├── .gitlab-ci.yml ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── build-wheels.sh ├── build.py ├── conda ├── build.sh ├── conda_build_config.yaml └── meta.yaml ├── images └── ONT_logo_590x106.png ├── modbampy └── __init__.py ├── requirements.txt ├── setup.py ├── src ├── args.c ├── args.h ├── bamiter.c ├── bamiter.h ├── common.c ├── common.h ├── counts.c ├── counts.h ├── modbam2bed.c └── version.h ├── test ├── test_api.py └── test_motifs.py └── test_data ├── 400ecoli.bam ├── 400ecoli.bam.bai ├── ecoli.fasta.gz ├── tag_codes.bam └── tag_codes.bam.bai /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - project: "epi2melabs/ci-templates" 3 | file: 4 | - "push-github.yaml" 5 | - "push-conda.yaml" 6 | - "snippets.yaml" 7 | 8 | image: ${UBUNTUIMAGE}:20.04 9 | 10 | variables: 11 | GIT_SUBMODULE_STRATEGY: recursive 12 | 13 | 14 | .prep-image: &prep-image | 15 | export DEBIAN_FRONTEND=noninteractive 16 | apt update -qq 17 | apt install -y --no-install-recommends gcc autoconf libtool automake valgrind make curl wget zlib1g-dev libbz2-dev libreadline-dev libssl-dev libffi-dev liblzma-dev libcurl4-gnutls-dev 18 | 19 | .minimal-python: &minimal-python | 20 | export DEBIAN_FRONTEND=noninteractive 21 | apt-get update -qq && apt-get install -y -qq python3-all-dev python3-venv 22 | 23 | 24 | stages: 25 | - test 26 | - prerelease 27 | - release 28 | 29 | 30 | bld:program: 31 | stage: test 32 | before_script: 33 | - *prep-image 34 | script: 35 | - !reference [.check, license] 36 | - make modbam2bed 37 | - ./modbam2bed --help 38 | - make mem_check 39 | 40 | 41 | bld:api-test: 42 | stage: test 43 | script: 44 | - *prep-image 45 | - *minimal-python 46 | - make test_api 47 | - make test_python 48 | 49 | 50 | deploy-checks: 51 | stage: prerelease 52 | variables: 53 | PACKAGE_NAME: modbampy 54 | script: 55 | - !reference [.check, argp-c-version] 56 | - !reference [.check, 
python-version] 57 | - !reference [.check, changelog] 58 | - !reference [.check, license] 59 | rules: 60 | - if: '$CI_COMMIT_TAG =~ /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/' 61 | 62 | 63 | .before-script: &before-script | 64 | export CONDA_PKG=${CI_PROJECT_NAME} 65 | export CONDA_PKG_VERSION=${CI_COMMIT_TAG/v/} 66 | mkdir conda-build 67 | cd conda-build 68 | 69 | 70 | conda: 71 | extends: .deploy-conda-linux 72 | variables: 73 | RECIPEPATH: "../conda" 74 | before_script: 75 | - *prep-image 76 | - *before-script 77 | 78 | conda-arm: 79 | extends: .deploy-conda-linux-arm 80 | variables: 81 | RECIPEPATH: "../conda" 82 | before_script: 83 | - *prep-image 84 | - *before-script 85 | 86 | conda-mac: 87 | extends: .deploy-conda-mac 88 | variables: 89 | RECIPEPATH: "../conda" 90 | before_script: 91 | - *before-script 92 | 93 | conda-mac-arm: 94 | extends: .deploy-conda-mac-arm 95 | variables: 96 | RECIPEPATH: "../conda" 97 | before_script: 98 | - *before-script 99 | 100 | ### Python 101 | 102 | bld:py-sdist: 103 | stage: prerelease 104 | script: 105 | - *prep-image 106 | - *minimal-python 107 | - make sdist 108 | - cd dist 109 | - python3 -m venv venv 110 | - . venv/bin/activate 111 | - pip install --upgrade pip 112 | - pip install *.tar.gz 113 | artifacts: 114 | paths: 115 | - dist/*.tar.gz 116 | 117 | .many-linux: &many-linux-def 118 | stage: prerelease 119 | variables: 120 | DO_COUNT_TEST: 1 121 | script: 122 | - echo "Building a Python ${PYWHEEL} wheel on manylinux_${FLAVOUR}" 123 | - ./build-wheels.sh . 
${PYWHEEL} 124 | artifacts: 125 | paths: 126 | - wheelhouse-final/*.whl 127 | only: 128 | - tags 129 | 130 | 131 | make-wheels-2010: 132 | extends: .many-linux 133 | image: "quay.io/pypa/manylinux2010_x86_64" 134 | parallel: 135 | matrix: 136 | - PYWHEEL: [7, 8] 137 | FLAVOUR: ["2010"] 138 | 139 | 140 | make-wheels-2014: 141 | extends: .many-linux 142 | image: "quay.io/pypa/manylinux2014_x86_64" 143 | parallel: 144 | matrix: 145 | - PYWHEEL: [7, 8, 9] 146 | FLAVOUR: ["2014"] 147 | 148 | 149 | make-wheels-2_24: 150 | extends: .many-linux 151 | image: "quay.io/pypa/manylinux_2_24_x86_64" 152 | parallel: 153 | matrix: 154 | - PYWHEEL: [8, 9, 10] 155 | FLAVOUR: ["2_24"] 156 | 157 | 158 | deploy:pypi: 159 | stage: release 160 | script: 161 | - *minimal-python 162 | - make pypi_build/bin/activate 163 | - source pypi_build/bin/activate 164 | - twine upload --non-interactive dist/modbampy*.tar.gz wheelhouse-final/modbampy*.whl 165 | rules: 166 | - if: '$CI_COMMIT_TAG =~ /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/' 167 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libdeflate"] 2 | path = libdeflate 3 | url = https://github.com/ebiggers/libdeflate.git 4 | [submodule "htslib"] 5 | path = htslib 6 | url = https://github.com/samtools/htslib 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v0.10.0] 8 | ### Changed 9 | Read iterator now returns copies of alignments for more Pythonic behaviour. 
10 | 11 | ## [v0.9.5] 12 | This version adds no user facing changes. 13 | 14 | ### Fixed 15 | - Pinned pip in Makefile so CI tests will run. 16 | 17 | ## [v0.9.4] 18 | ### Fixed 19 | - Python source distribution did not include libdeflate directory. 20 | 21 | ## [v0.9.3] 22 | ### Added 23 | - Linux and macOS ARM conda builds. 24 | 25 | ## [v0.9.2] 26 | ### Changed 27 | - Rebuild for conda with more specific htslib version pin. 28 | 29 | ## [v0.9.1] 30 | ### Added 31 | - `--map_q` command line option to filter reads by minimum mapping quality. 32 | ### Fixed 33 | - The default mapping quality was erroneously 1 not 0. 34 | 35 | ## [v0.9.0] 36 | ### Added 37 | - "Other" modified base column to extended output. For example, when using 38 | `-m 5mC` to count 5-methylcytosine in reads, the "other" column will 39 | enumerate counts of other cytosine modifications present. When using the 40 | option `--combine` this column will contain zero (the counts being included 41 | in the modified base count). 42 | - `--threshold` option to replace both `-a` and `-b`. 43 | ### Fixed 44 | - In line with the "other" column, when not using the `--combine` option 45 | the potential presence of other modifications in the same family of the 46 | base requested is taken into account. This has an effect of distributing 47 | some previously erroneous "canonical/non-modified" calls to "other" and 48 | "filtered" counts. 49 | ### Removed 50 | - The options `-a` and `-b` are deprecated. Instead use `--threshold`. 51 | 52 | ## [v0.8.0] 53 | ### Added 54 | - `--combine` option to combine calls from all modified bases in a family. 55 | The previous behaviour was that the non-modified (canonical) count would 56 | have been incremented. For example when searching for 5mC modifications 57 | with `-m 5mC` and a 5hmC base was present, the read would contribute 58 | to the canonical count, not the modified count. 
59 | 60 | ## [v0.7.0] 61 | ### Added 62 | - `--pileup` option to output full raw base counts rather than BED methyl. 63 | ### Changed 64 | - `-c` no longer synonym to `--cpg`. 65 | - `?`-style MM subtags now handled correctly with "missing" entries being recorded 66 | as "no call" rather than implied canonical. 67 | - extended output now includes a 15th column for "no call" bases. 68 | ### Fixed 69 | - Links in README. 70 | 71 | ## [v0.6.3] 72 | ### Changed 73 | - Bumped htslib version to version 1.16 for fixes to MM tag parsing/validation. 74 | - Change conda build back to bioconda::htslib since we're using a released version. 75 | 76 | ## [v0.6.2] 77 | ### Fixed 78 | - Off-by-one in pointless BED field. 79 | 80 | ## [v0.6.1] 81 | ### Fixed 82 | - ChEBI codes not cast correctly in Python API. 83 | ### Added 84 | - Support ambiguous modified bases as listed in HTS tags specification. 85 | 86 | ## [v0.6.0] 87 | ### Changed 88 | - Sites with no coverage now report "nan" methylation frequency and score. 89 | ### Added 90 | - Option `--aggregate` to pair information from two strands and output additional files. 91 | - Support for ChEBI codes in C and Python pileup API. 92 | 93 | ## [v0.5.3] 94 | ### Added 95 | - Python 3.9 and 3.10 wheel builds. 96 | 97 | ## [v0.5.2] 98 | ### Changed 99 | - Use commit `e51f72f` of htslib for `?` and `.` parsing of Mm tag. 100 | 101 | ## [v0.5.1] 102 | ### Added 103 | - `--max_depth` argument, and do not limit by default. 104 | - `--chh` and `--chg` filter options. 105 | 106 | ## [v0.5.0] 107 | ### Changed 108 | - Decouple file opening from read iteration. 109 | - Move Python pileup function to method of ModBam class. 110 | 111 | ## [v0.4.6] 112 | ### Changed 113 | - Reworked compilation to remove argparser from Python module. 114 | ### Fixed 115 | - Memory leak in modbampy. 116 | 117 | ## [v0.4.5] 118 | ### Fixed 119 | - Unmasking of reference sites (again). 120 | ### Added 121 | - Option `--mask`/`-k` to respect reference soft-masking. 
122 | 123 | ## [v0.4.4] 124 | ### Fixed 125 | - Logic error in filtering CpG sites for masked bases. 126 | 127 | ## [v0.4.3] 128 | ### Changed 129 | - Update modbampy version to match C code. 130 | 131 | ## [v0.4.2] 132 | ### Changed 133 | - Include soft-masked reference positions when performing CpG filtering. 134 | 135 | ## [v0.4.1] 136 | ### Fixed 137 | - Inaccuracies in README. 138 | 139 | ## [v0.4.0] 140 | ### Fixed 141 | - Python pileup access after addition of multi-BAM support. 142 | ### Added 143 | - Additional properties to alignment objects in Python API. 144 | 145 | ## [v0.3.3] 146 | ### Changed 147 | - conda build now uses htslib from bioconda. 148 | 149 | ## [v0.3.2] 150 | ### Changed 151 | - Updated software build to use official version 1.14 htslib release. 152 | - Reorganised and updated README. 153 | 154 | ## [v0.3.1] 155 | ### Changed 156 | - Build conda package with explicit libdeflate version. 157 | 158 | ## [v0.3.0] 159 | ### Added 160 | - Multiple BAM parsing to streamline interaction with data from Guppy/MinKNOW. 161 | ### Changed 162 | - Reference file must now be given before list of BAM files on command-line. 163 | ### Fixed 164 | - Removed some debugging text. 165 | 166 | ## [v0.2.2] 167 | ### Changed 168 | - Updated README to note Python package. 169 | 170 | ## [v0.2.1] 171 | ### Fixed 172 | - Incorrect processing of non-primary alignments in Python API. 173 | ### Added 174 | - Add Python packages, available on PyPI. 175 | ### Changed 176 | - Updated htslib to version from samtools/dev. 177 | 178 | 179 | ## [v0.2.0] 180 | ### Fixed 181 | - Segmentation fault on exit caused by double free of faidx member. 182 | ### Added 183 | - Python API to pileup and read-level parsing. 184 | 185 | ## [v0.1.1] 186 | ### Fixed 187 | - Check input files are present and readable rather than segfaulting. 188 | ### Changed 189 | - Clearer error messaging. 190 | 191 | 192 | ## [v0.1.0] 193 | 194 | First release. 
195 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. "Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. 
"Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. 
Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. 
Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. 
Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. 
If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. 
Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. 
If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 
321 | 322 | You may add additional accurate notices of copyright ownership. 323 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/*.c 2 | include src/*.h 3 | include README.md 4 | include LICENSE.md 5 | include requirements.txt 6 | include build.py 7 | include Makefile 8 | graft htslib* 9 | prune htslib/test/ 10 | include htslib/test/*.c 11 | include htslib/test/*.h 12 | include htslib/test/fuzz/*.c 13 | include htslib/test/fuzz/*.h 14 | prune htslib/htscodecs/tests 15 | include htslib/htscodecs/tests**/*.c 16 | include htslib/htscodecs/tests**/*.h 17 | graft libdeflate* 18 | graft images 19 | prune build 20 | prune docs 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | OS := $(shell uname) 2 | ARCH := $(shell arch) 3 | 4 | OS := $(shell uname) 5 | ifeq ($(OS), Darwin) 6 | # mainly for dev builds using homebrew things 7 | EXTRA_LDFLAGS ?= -L$(shell brew --prefix openssl@1.1)/lib 8 | ARGP ?= $(shell brew --prefix argp-standalone)/lib/libargp.a 9 | ARGP_INCLUDE ?= -I$(shell brew --prefix argp-standalone)/include 10 | else 11 | ARGP ?= 12 | ARGP_INCLUDE ?= 13 | endif 14 | 15 | 16 | CC ?= gcc 17 | CFLAGS ?= -fpic -msse3 -O3 -std=c99 18 | DEFLATE ?= $(PWD)/libdeflate 19 | STATIC_HTSLIB ?= htslib/libhts.a 20 | EXTRA_CFLAGS ?= 21 | EXTRA_LDFLAGS ?= 22 | EXTRA_LIBS ?= 23 | HTS_CONF_ARGS ?= 24 | HTS_CONF_ENV ?= CFLAGS="$(CFLAGS) $(EXTRA_CFLAGS)" 25 | 26 | WITHDEFLATE ?= 27 | DEFLATEREQ = 28 | ifeq ($(WITHDEFLATE), 1) 29 | CFLAGS += -I$(DEFLATE) -L$(DEFLATE) 30 | HTS_CONF_ARGS += --with-libdeflate 31 | HTS_CONF_ENV += LDFLAGS="-L$(DEFLATE)" 32 | EXTRA_LIBS += -ldeflate 33 | DEFLATEREQ = libdeflate/libdeflate.so.0 34 | endif 35 | 36 | NOTHREADS ?= 37 | ifeq ($(NOTHREADS), 1) 38 | CFLAGS += -DNOTHREADS 39 
| endif 40 | 41 | VALGRIND ?= valgrind 42 | 43 | 44 | .PHONY: default 45 | default: modbam2bed 46 | 47 | libdeflate/libdeflate.so.0: 48 | @echo Compiling $(@F) 49 | cd libdeflate && make 50 | 51 | 52 | htslib/libhts.a: $(DEFLATEREQ) 53 | @echo Compiling $(@F) 54 | cd htslib/ \ 55 | && autoreconf -i \ 56 | && autoheader \ 57 | && autoconf \ 58 | && $(HTS_CONF_ENV) ./configure $(HTS_CONF_ARGS) \ 59 | && make -j 4 60 | 61 | 62 | .PHONY: clean_htslib 63 | clean_htslib: 64 | rm -rf htslib/autom4te.cache/ 65 | cd htslib && make clean || exit 0 66 | 67 | 68 | %.o: src/%.c 69 | mkdir -p obj && \ 70 | $(CC) -c -pthread -Wall -fstack-protector-strong -D_FORTIFY_SOURCE=2 $(CFLAGS) \ 71 | -Isrc -Ihtslib $(ARGP_INCLUDE) $(EXTRA_CFLAGS) $^ -o $@ 72 | 73 | .PHONY: clean_obj 74 | clean_obj: 75 | rm -rf *.o 76 | 77 | 78 | modbam2bed: modbam2bed.o common.o counts.o bamiter.o args.o $(STATIC_HTSLIB) 79 | $(CC) -pthread -Wall -fstack-protector-strong -D_FORTIFY_SOURCE=2 $(CFLAGS) \ 80 | -Isrc -Ihtslib $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS)\ 81 | $^ $(ARGP) \ 82 | -lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \ 83 | -o $(@) 84 | 85 | .PHONY: clean 86 | clean: clean_obj clean_htslib 87 | rm -rf modbam2bed modbampy.egg-info pymod.a venv obj 88 | 89 | .PHONY: mem_check 90 | mem_check: modbam2bed 91 | $(VALGRIND) --error-exitcode=1 --tool=memcheck --leak-check=full --show-leak-kinds=all -s \ 92 | ./modbam2bed --threshold 0.66 -t 2 -r ecoli1 test_data/ecoli.fasta.gz test_data/400ecoli.bam test_data/400ecoli.bam > /dev/null 93 | 94 | 95 | .PHONY: test_api 96 | test_api: python 97 | ${IN_VENV} && pip install pytest 98 | ${IN_VENV} && pytest test --doctest-modules 99 | 100 | ### Python 101 | 102 | PYTHON ?= python3 103 | VENV ?= venv 104 | venv: ${VENV}/bin/activate 105 | IN_VENV=. 
./${VENV}/bin/activate 106 | 107 | $(VENV)/bin/activate: 108 | test -d $(VENV) || $(PYTHON) -m venv $(VENV) --prompt "modbam" 109 | ${IN_VENV} && pip install pip==23.0.1 --upgrade 110 | ${IN_VENV} && pip install setuptools 111 | 112 | .PHONY: python 113 | python: htslib/libhts.a pymod.a $(VENV)/bin/activate 114 | ${IN_VENV} && pip install -r requirements.txt 115 | ${IN_VENV} && WITHDEFLATE=$(WITHDEFLATE) LDFLAGS=$(EXTRA_LDFLAGS) pip install -e . 116 | 117 | .PHONY: clean_python 118 | clean_python: clean_obj 119 | rm -rf dist build modbampy.egg-info pymod.a libmodbampy.abi3.so ${VENV} 120 | 121 | pymod.a: common.o bamiter.o counts.o 122 | ar rcs $@ $^ 123 | 124 | test_python: python 125 | ${IN_VENV} && pip install flake8 flake8-rst-docstrings flake8-docstrings flake8-import-order 126 | ${IN_VENV} && flake8 modbampy \ 127 | --import-order-style google --application-import-names modbampy,libmodbampy \ 128 | --statistics 129 | ${IN_VENV} && modbampy test_data/400ecoli.bam ecoli1 0 4000000 | wc -l 130 | ${IN_VENV} && modbampy test_data/400ecoli.bam ecoli1 0 4000000 --pileup | wc -l 131 | 132 | IN_BUILD=. 
./pypi_build/bin/activate 133 | pypi_build/bin/activate: 134 | test -d pypi_build || $(PYTHON) -m venv pypi_build --prompt "(pypi) " 135 | ${IN_BUILD} && pip install pip --upgrade 136 | ${IN_BUILD} && pip install --upgrade pip setuptools twine wheel readme_renderer[md] keyrings.alt 137 | 138 | .PHONY: sdist 139 | sdist: pypi_build/bin/activate 140 | ${IN_BUILD} && python setup.py sdist 141 | 142 | 143 | .PHONY: wheels 144 | wheels: clean clean_python 145 | docker run -v `pwd`:/io quay.io/pypa/manylinux2010_x86_64 /io/build-wheels.sh /io 6 7 8 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Oxford Nanopore Technologies logo](https://github.com/epi2me-labs/modbam2bed/raw/master/images/ONT_logo_590x106.png) 2 | 3 | 4 | We have a new bioinformatic resource that replaces the functionality of this project! See our new repository here: 5 | [modkit](https://github.com/nanoporetech/modkit/). 6 | 7 | This repository is now unsupported and we do not recommend its use. Please contact Oxford Nanopore: support@nanoporetech.com for help with your application if it is not possible to upgrade. 8 | 9 | 10 | ****************** 11 | 12 | 13 | Modified-base BAM to bedMethyl 14 | ------------------------------ 15 | 16 | A program to aggregate modified base counts stored in a 17 | [modified-base BAM](https://samtools.github.io/hts-specs/SAMtags.pdf) (Section 2.1) file to 18 | a [bedMethyl](https://www.encodeproject.org/data-standards/wgbs/) file. 19 | 20 | A Python module is also available to obtain modified base information 21 | from BAM files in a convenient form. It is envisaged that this will eventually 22 | be replaced by an implementation in [pysam](https://pysam.readthedocs.io/en/latest/index.html). 
23 | 24 | ### Installation 25 | 26 | The program is available from our conda channel, so can be installed with: 27 | 28 | mamba create -n modbam2bed -c bioconda -c conda-forge -c epi2melabs modbam2bed 29 | 30 | Packages are available for both Linux and MacOS. 31 | 32 | Alternatively to install from the source code, clone the repository and then use make: 33 | 34 | git clone --recursive https://github.com/epi2me-labs/modbam2bed.git 35 | make modbam2bed 36 | ./modbam2bed 37 | 38 | See the Makefile for more information. The code has been tested on MacOS (with 39 | dependencies from brew) and on Ubuntu 18.04 and 20.04. 40 | 41 | ### Usage 42 | 43 | The code requires aligned reads with the `Mm` and `Ml` tags (`MM` and `ML` also supported), 44 | and the reference sequence used for alignment. 45 | 46 | The below is a snapshot of the command-line interface; it may not be up-to-date, please 47 | refer to the program `--help` option for the most accurate guidance. 48 | 49 | ``` 50 | Usage: modbam2bed [OPTION...] [ ...] 51 | modbam2bed -- summarise one or more BAM with modified base tags to bedMethyl. 52 | 53 | General options: 54 | --aggregate Output additional aggregated (across strand) 55 | counts, requires --cpg or --chg. 56 | --combine Create output with combined modified counts: i.e. 57 | alternative modified bases within the same family 58 | (same canonical base) are included. 59 | -c, --pileup Output (full) raw base counts rather than BED 60 | file. 61 | -e, --extended Output extended bedMethyl including counts of 62 | canonical, modified, and filtered bases (in that 63 | order). 64 | -m, --mod_base=BASE Modified base of interest, one of: 5mC, 5hmC, 5fC, 65 | 5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao. (Or modA, 66 | modC, modG, modT, modU, modN for generic modified 67 | base). 68 | -p, --prefix=PREFIX Output file prefix. Only used when multiple output 69 | filters are given. 70 | -r, --region=chr:start-end Genomic region to process. 
71 | -t, --threads=THREADS Number of threads for BAM processing. 72 | 73 | Base filtering options: 74 | -a, --canon_threshold=THRESHOLD 75 | Deprecated. The option will be removed in a future 76 | version. Please use --threshold. 77 | -b, --mod_threshold=THRESHOLD Deprecated. The option will be removed in a 78 | future version. Please use --threshold. 79 | --chg Output records filtered to CHG sites. 80 | --chh Output records filtered to CHH sites. 81 | --cpg Output records filtered to CpG sites. 82 | -f, --threshold=THRESHOLD Bases with a call probability < THRESHOLD are 83 | filtered from results (default 0.66). 84 | -k, --mask Respect soft-masking in reference file. 85 | 86 | Read filtering options: 87 | -d, --max_depth=DEPTH Max. per-file depth; avoids excessive memory 88 | usage. 89 | -g, --read_group=RG Only process reads from given read group. 90 | --haplotype=VAL Only process reads from a given haplotype. 91 | Equivalent to --tag_name HP --tag_value VAL. 92 | --tag_name=TN Only process reads with a given tag (see 93 | --tag_value). 94 | --tag_value=VAL Only process reads with a given tag value. 95 | 96 | -?, --help Give this help list 97 | --usage Give a short usage message 98 | -V, --version Print program version 99 | 100 | Mandatory or optional arguments to long options are also mandatory or optional 101 | for any corresponding short options. 102 | ``` 103 | 104 | ### Method and output format 105 | 106 | Oxford Nanopore Technogies' sequencing chemistries and basecallers can detect 107 | any number of modified bases. Compared to traditional methods which force a 108 | false dichoctomy between say cytosine and 5-methylcytosine, this rich biology 109 | needs to be remembered when interpreting modified base calls. 110 | 111 | The htslib pileup API is used to create a matrix of per-strand base counts 112 | including substitutions, modified bases and deletions. Inserted bases are not 113 | counted. 
Bases of an ambiguous nature (referred to as "filtered" below), as 114 | defined by the filter threshold probability option `-f`/`--threshold` are masked and used 115 | (along with substitutions and deletions) in the definition of the "score" 116 | (column 5) and "coverage" (column 10) entries of the bedMethyl file.
146 | * `--combine`: alternative modified bases are lumped together into the 147 | "modified" count and ultimately into a single modification frequency. 148 | 149 | ***A particular case where `--combine` is useful is when comparing to the result of bisulfite sequencing.*** 150 | 151 | **Output format** 152 | 153 | > The description of the [bedMethyl](https://www.encodeproject.org/data-standards/wgbs/) 154 | > format on the ENCODE project website is rather loose. The definitions below are chosen pragmatically. 155 | 156 | The table below describes precisely the entries in each column of the output BED 157 | file. Columns seven to nine inclusive are included for compatibility with the BED 158 | file specification, the values written are fixed and no meaning should be derived 159 | from them. Columns 5, 10, and 11 are defined in terms of counts of observed 160 | bases to agree with reasonable interpretations of the bedMethyl specifications: 161 | 162 | * Ncanon - canonical (unmodified) base count, (contigent on the use of `--combine`, see above.) 163 | * Nmod - modified base count. 164 | * Nfilt - count of bases where read does not contain a substitution or deletion 165 | with respect to the reference, but the modification status is ambiguous: these bases 166 | were filtered from the calculation of the modification frequency. 167 | * Nsub - count of reads with a substitution with respect to the reference. 168 | * Ndel - count of reads with a deletion with respect to the reference. 169 | * Nno call - counts of reads with an absent modification call (but not a substitution or deletion). 170 | * Nalt mod - counts of reads with and alternative modification call (but not a substitution or deletion). 171 | 172 | Since these interpretations may differ from other tools an extended output is 173 | available (enabled with the `-e` option) which includes three additional columns 174 | with verbatim base counts. 
175 | 176 | | column | description | 177 | |--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 178 | | 1 | reference sequence name | 179 | | 2 | 0-based start position | 180 | | 3 | 0-based exclusive end position (invariably start + 1) | 181 | | 4 | Abbreviated name of modified-base examined | 182 | | 5 | "Score" 1000 * (Nmod + Ncanon) / (Nmod + Ncanon + Nno call + Nalt mod + Nfilt + Nsub + Ndel). The quantity reflects the extent to which the calculated modification frequency in Column 11 is confounded by the alternative calls. The denominator here is the total read coverage as given in Column 10. | 183 | | 6 | Strand (of reference sequence). Forward "+", or reverse "-". | 184 | | 7-9 | Ignore, included simply for compatibility. | 185 | | 10 | Read coverage at reference position including all canonical, modified, undecided (no calls and filtered), substitutions from reference, and deletions. Nmod + Ncanon + Nno call + Nalt mod + Nfilt + Nsub + Ndel | 186 | | 11 | Percentage of modified bases, as a proportion of canonical and modified (excluding no calls, filtered, substitutions, and deletions). 100 \* Nmod / (Nmod + Nalt mod + Ncanon) | 187 | | 12\* | Ncanon | 188 | | 13\* | Nmod | 189 | | 14\* | Nfilt those bases with a modification probability falling between given thresholds. | 190 | | 15\* | Nno call those bases for which the query base was the correct canonical base for the modified base being considered, but no call was made (see the definition of the `.` and `?` flags in the SAM tag specification). | 191 | | 16\* | Nalt mod those bases for which the query base was the correct canonical base for the modified base being considered, but and alternative modification was present. | 192 | 193 | \* Included in extended output only. 
194 | 195 | 196 | ### Limitations 197 | 198 | The code has not been developed extensively and currently has some limitations: 199 | 200 | * Support for motif filtering is limited to CpG, CHG, and CHH, sites. Without 201 | this filtering enabled all reference positions that are the canonical base 202 | (on forward or reverse strand) equivalent to the modified base under 203 | consideration are reported. 204 | * Insertion columns are completely ignored for simplicitly (and avoid 205 | any heuristics). 206 | * Second strand `MM` subtags (i.e. `MM:C-m` as compared with `MM:C+m`) 207 | are not supported. These are not typically used so shouldn't affect most users. 208 | If such a tag is detected and warning will be thrown and the tag ignored. These tags 209 | do come in to play for duplex basecalls. 210 | 211 | ### Python package 212 | 213 | A Python package is available on [PyPI](https://pypi.org/project/modbampy/) which 214 | contains basic functionality for parsing BAM files with modified-base information. 215 | It is envisaged that this will eventually be replaced by an implementation in 216 | [pysam](https://pysam.readthedocs.io/en/latest/index.html). As such the interface 217 | is supplements but does not integrate or replace pysam. 218 | 219 | The package can be installed with: 220 | 221 | ``` 222 | pip install modbampy 223 | ``` 224 | 225 | The package contains simply to modes of use. Firstly an interface to iterate 226 | over reads in a BAM file and report modification sites: 227 | 228 | ``` 229 | from modbampy import ModBam 230 | with ModBam(args.bam) as bam: 231 | for read in bam.reads(args.chrom, args.start, args.end): 232 | for pos_mod in read.mod_sites: 233 | print(*pos_mod) 234 | ``` 235 | 236 | Each line of the above reports the 237 | 238 | * read_id, 239 | * reference position, 240 | * query (read) position, 241 | * reference strand (+ or -), 242 | * modification strand (0 or 1, as defined in the HTSlib tag specification. 
This is invariable 0), 243 | * canonical base associated with modification, 244 | * modified base, 245 | * modified-base score (scaled to 0-255). 246 | 247 | A second method is provided which mimics the couting procedure implemented in 248 | `modbam2bed`: 249 | 250 | ``` 251 | from modbampy import ModBam 252 | with ModBam(args.bam) as bam: 253 | positions, counts = bam.pileup( 254 | args.chrom, args.start, args.end 255 | low_threshold=0.33, high_threshold=0.66, mod_base="m") 256 | ``` 257 | 258 | The result is two [numpy](https://numpy.org/) arrays. The first indicates the reference 259 | positions associated with the counts in the second array. Each row of the second array 260 | (`counts` above) enumerates the observed counts of bases in the order: 261 | 262 | a c g t A C G T d D m M f F n N 263 | 264 | where uppercase letters refer to bases on the forward strand, lowercase letters 265 | relate to the reverse strand: 266 | 267 | * A, C, G, T are the usual DNA bases, 268 | * D indicates deletion counts, 269 | * M modified base counts, 270 | * F filtered counts - bases in reads with a modified-base record but which were filtered 271 | according to the thresholds provided. 272 | * N no call base counts. 273 | 274 | **Extras** 275 | 276 | The read iterator API also contains a minimal set of functionality mirroring properties of 277 | alignments available from pysam. See the [code](https://github.com/epi2me-labs/modbam2bed/blob/master/modbampy/__init__.py) 278 | for further details. 279 | 280 | ### Acknowledgements 281 | 282 | We thank [jkbonfield](https://github.com/jkbonfield) for developing the modified base 283 | functionality into the htslib pileup API, and [Jared Simpson](https://github.com/jts) 284 | for testing and comparison to his independently developed code. 285 | 286 | ### Help 287 | 288 | **Licence and Copyright** 289 | 290 | © 2021- Oxford Nanopore Technologies Ltd. 
291 | 292 | `modbam2bed` is distributed under the terms of the Mozilla Public License 2.0. 293 | 294 | **Research Release** 295 | 296 | Research releases are provided as technology demonstrators to provide early 297 | access to features or stimulate Community development of tools. Support for 298 | this software will be minimal and is only provided directly by the developers. 299 | Feature requests, improvements, and discussions are welcome and can be 300 | implemented by forking and pull requests. However much as we would 301 | like to rectify every issue and piece of feedback users may have, the 302 | developers may have limited resource for support of this software. Research 303 | releases may be unstable and subject to rapid iteration by Oxford Nanopore 304 | Technologies. 305 | -------------------------------------------------------------------------------- /build-wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: ./build-wheels.sh ... 3 | set -e -x 4 | 5 | PACKAGE_NAME=modbampy 6 | 7 | workdir=$1 8 | shift 9 | 10 | echo "Changing cwd to ${workdir}" 11 | cd ${workdir} 12 | 13 | # some many linux containers are centos-based, others are debian! 
14 | if [ -f /etc/centos-release ]; then 15 | yum install -y zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel 16 | else 17 | # https://stackoverflow.com/questions/76094428/debian-stretch-repositories-404-not-found 18 | sed -i -e 's/deb.debian.org/archive.debian.org/g' \ 19 | -e 's|security.debian.org|archive.debian.org/|g' \ 20 | -e '/stretch-updates/d' /etc/apt/sources.list 21 | apt update 22 | apt install -y zlib1g-dev libbz2-dev liblzma-dev libncurses5-dev libcurl4-gnutls-dev libssl-dev libffi-dev 23 | fi 24 | 25 | # downgrade autoconf to work more nicely with htslib 26 | curl -L -O http://ftp.gnu.org/gnu/autoconf/autoconf-2.69.tar.gz 27 | tar zxf autoconf-2.69.tar.gz 28 | cd autoconf-2.69 29 | ./configure 30 | make && make install 31 | cd .. 32 | 33 | export WITHDEFLATE=1 34 | LIBDEFLATE="${PWD}/libdeflate" 35 | LDFLAGS="-L${LIBDEFLATE}" 36 | 37 | make htslib/libhts.a 38 | mkdir -p wheelhouse 39 | 40 | echo "PYTHON VERSIONS AVAILABLE" 41 | ls /opt/python/ 42 | 43 | # Compile wheels 44 | for minor in $@; do 45 | if [[ "${minor}" == "8" ]] || [[ "${minor}" == "9" ]] || [[ "${minor}" == "10" ]]; then 46 | PYBIN="/opt/python/cp3${minor}-cp3${minor}/bin" 47 | else 48 | PYBIN="/opt/python/cp3${minor}-cp3${minor}m/bin" 49 | fi 50 | # auditwheel/issues/102 51 | "${PYBIN}"/pip install --upgrade setuptools pip wheel==0.31.1 cffi==1.15.0 52 | "${PYBIN}"/pip wheel --no-dependencies . 
-w ./wheelhouse/ 53 | done 54 | 55 | 56 | # Bundle external shared libraries into the wheels 57 | export LD_LIBRARY_PATH=$PWD/libdeflate 58 | ls ${LD_LIBRARY_PATH} 59 | for whl in "wheelhouse/${PACKAGE_NAME}"*.whl; do 60 | LD_LIBRARY_PATH=${LIBDEFLATE} auditwheel repair "${whl}" -w ./wheelhouse/ 61 | done 62 | unset LD_LIBRARY_PATH 63 | 64 | 65 | ## Install packages 66 | for minor in $@; do 67 | if [[ "${minor}" == "8" ]] || [[ "${minor}" == "9" ]] || [[ "${minor}" == "10" ]]; then 68 | PYBIN="/opt/python/cp3${minor}-cp3${minor}/bin" 69 | else 70 | PYBIN="/opt/python/cp3${minor}-cp3${minor}m/bin" 71 | fi 72 | "${PYBIN}"/pip install -r requirements.txt 73 | "${PYBIN}"/pip install "${PACKAGE_NAME}" --no-index -f ./wheelhouse 74 | "${PYBIN}"/modbampy --pileup test_data/400ecoli.bam ecoli1 105000 105100 75 | done 76 | 77 | mkdir wheelhouse-final 78 | cp wheelhouse/${PACKAGE_FILE_NAME}*manylinux* wheelhouse-final 79 | -------------------------------------------------------------------------------- /build.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | 4 | from cffi import FFI 5 | 6 | dir_path = os.path.dirname(os.path.realpath(__file__)) 7 | src_dir='src' 8 | libraries=['m', 'lzma', 'bz2', 'pthread', 'curl', 'crypto'] 9 | library_dirs=[] 10 | print("WITHDEFLATE:", os.getenv('WITHDEFLATE')) 11 | if os.getenv('WITHDEFLATE') == "1": 12 | print("Using deflate") 13 | libraries.append('deflate') 14 | library_dirs.append(os.path.join(dir_path, 'libdeflate')) 15 | 16 | ffibuilder = FFI() 17 | ffibuilder.set_source("libmodbampy", 18 | r""" 19 | #include "htslib/sam.h" 20 | #include "bamiter.h" 21 | #include "common.h" 22 | #include "counts.h" 23 | 24 | """, 25 | libraries=libraries, 26 | library_dirs=library_dirs, 27 | include_dirs=[src_dir, 'htslib'], 28 | extra_compile_args=['-std=c99', '-msse3', '-O3'], 29 | extra_objects=[ 30 | 'pymod.a', 31 | os.path.join('htslib', 'libhts.a')] 32 | ) 33 | 34 | cdef 
= [""" 35 | // START: custom header 36 | 37 | // export free 38 | void free(void *ptr); 39 | 40 | typedef int64_t hts_pos_t; 41 | 42 | // basic bam opening/handling 43 | typedef struct bam1_core_t { 44 | hts_pos_t pos; 45 | int32_t tid; 46 | uint16_t bin; // NB: invalid on 64-bit pos 47 | uint8_t qual; 48 | uint8_t l_extranul; 49 | uint16_t flag; 50 | uint16_t l_qname; 51 | uint32_t n_cigar; 52 | int32_t l_qseq; 53 | int32_t mtid; 54 | hts_pos_t mpos; 55 | hts_pos_t isize; 56 | } bam1_core_t; 57 | 58 | 59 | typedef struct bam1_t { 60 | bam1_core_t core; 61 | uint64_t id; 62 | uint8_t *data; 63 | int l_data; 64 | uint32_t m_data; 65 | uint32_t mempolicy:2, :30 /* Reserved */; 66 | } bam1_t; 67 | 68 | bam1_t *bam_init1(); 69 | void bam_destroy1(bam1_t *b); 70 | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc); 71 | typedef struct mplp_data {...;} mplp_data; 72 | 73 | // opening bam with idx and hdr info 74 | typedef struct { ...; } bam_fset; 75 | bam_fset* create_bam_fset(char* fname); 76 | void destroy_bam_fset(bam_fset* fset); 77 | typedef struct set_fsets { 78 | bam_fset **fsets; 79 | size_t n; 80 | } set_fsets; 81 | set_fsets *create_filesets(const char **bams); 82 | void destroy_filesets(set_fsets *s); 83 | 84 | mplp_data *create_bam_iter_data( 85 | const bam_fset* fset, const char *chr, int start, int end, 86 | const char *read_group, const char tag_name[2], const int tag_value, const int min_mapq); 87 | void destroy_bam_iter_data(mplp_data *data); 88 | // iterate a file 89 | int read_bam(void *data, bam1_t *b); 90 | // cigar parsing 91 | int *qpos2rpos(bam1_t *b); 92 | 93 | // things from htslib 94 | hts_pos_t bam_endpos(const bam1_t *b); 95 | 96 | // retrieving mod data 97 | typedef struct hts_base_mod_state hts_base_mod_state; 98 | hts_base_mod_state *hts_base_mod_state_alloc(); 99 | void hts_base_mod_state_free(hts_base_mod_state *state); 100 | int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state); 101 | 102 | typedef struct hts_base_mod { 
103 | int modified_base; 104 | int canonical_base; 105 | int strand; 106 | int qual; 107 | } hts_base_mod; 108 | int bam_next_basemod( 109 | const bam1_t *b, hts_base_mod_state *state, 110 | hts_base_mod *mods, int n_mods, int *pos); 111 | 112 | // from common.h needed in functions in counts.h 113 | //typedef struct mod_base {...;} mod_base; 114 | 115 | // END: custom header 116 | """] 117 | 118 | # add in some things from headers, removing directives 119 | for header in ('src/common.h', 'src/counts.h'): 120 | with open(header, 'r') as fh: 121 | cdef.append("// START: {}".format(header)) 122 | cdef.append( 123 | ''.join( 124 | x for x in fh.readlines() 125 | if not (x.startswith('#') or x.startswith("static inline int")))) 126 | cdef.append("// END: {}".format(header)) 127 | 128 | ffibuilder.cdef('\n\n'.join(cdef)) 129 | 130 | 131 | if __name__ == "__main__": 132 | ffibuilder.compile(verbose=True) 133 | -------------------------------------------------------------------------------- /conda/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=modbam2bed 4 | 5 | ## self-built htslib 6 | #export HTS_CONF_ARGS="--prefix=${PREFIX} --enable-libcurl --with-libdeflate --enable-plugins --enable-gcs --enable-s3" 7 | #export EXTRA_CFLAGS="-I$PREFIX/include" 8 | #export EXTRA_LDFLAGS="-L$PREFIX/lib" 9 | #export EXTRA_LIBS="-ldl -lhts -ldeflate" 10 | ##export STATIC_HTSLIB="" 11 | 12 | # just link to htslib from bioconda 13 | export EXTRA_CFLAGS="-I$PREFIX/include" 14 | export STATIC_HTSLIB="" 15 | export EXTRA_LDFLAGS="-L$PREFIX/lib" 16 | export EXTRA_LIBS="-ldl -lhts" 17 | 18 | OS=$(uname) 19 | if [[ "$OS" == "Darwin" ]]; then 20 | echo "Setting Darwin args" 21 | export ARGP=${PREFIX}/lib/libargp.a 22 | export EXTRA_CFLAGS="${EXTRA_CFLAGS} -isysroot ${CONDA_BUILD_SYSROOT} -mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET}" 23 | fi 24 | 25 | make clean $NAME 26 | 27 | mkdir -p $PREFIX/bin 28 | cp $NAME 
$PREFIX/bin && chmod +x $PREFIX/bin/$NAME 29 | -------------------------------------------------------------------------------- /conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | CONDA_BUILD_SYSROOT: 2 | - /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk # [osx] 3 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: {{ environ.get('CONDA_PKG') }} 3 | version: {{ environ.get('CONDA_PKG_VERSION') }} 4 | 5 | source: 6 | path: ../ 7 | 8 | build: 9 | number: {{ environ.get('CONDA_PKG_BUILD', 0) }} 10 | 11 | requirements: 12 | build: 13 | - {{ compiler('c') }} 14 | host: 15 | - argp-standalone # [osx] 16 | - binutils # [not osx] 17 | # explicitly list htslib to get same versions of 18 | # other things, even when we build our own. When 19 | # using bioconda htslib the other things can be 20 | # removed from here. The pin is because we copy 21 | # private interface code for hts_base_mod_state 22 | # so need to ensure compatibility. We should 23 | # hopefully be fine with ABI 24 | - bioconda::htslib==1.16 25 | # - libcurl 26 | # - bzip2 27 | # - xz 28 | # - zlib 29 | # - libdeflate 30 | # - openssl # [not osx] 31 | run: 32 | - bioconda::htslib==1.16 33 | # - libcurl 34 | # - bzip2 35 | # - xz 36 | # - zlib 37 | # - libdeflate 38 | # - openssl # [not osx] 39 | 40 | test: 41 | commands: 42 | - modbam2bed --help 43 | 44 | about: 45 | home: "https://github.com/epi2me-labs/modbam2bed" 46 | license: Mozilla Public License 2.0 47 | license_family: OTHER 48 | license_file: LICENSE 49 | summary: "Summarise BAM files containing modified-base information to bedMethyl format." 
50 | doc_url: https://github.com/epi2me-labs/modbam2bed 51 | dev_url: https://github.com/epi2me-labs/modbam2bed 52 | 53 | extra: 54 | recipe-maintainers: 55 | - cjw85 56 | -------------------------------------------------------------------------------- /images/ONT_logo_590x106.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/images/ONT_logo_590x106.png -------------------------------------------------------------------------------- /modbampy/__init__.py: -------------------------------------------------------------------------------- 1 | """Functionality for interacting with modified base tags in BAM files.""" 2 | 3 | import argparse 4 | import collections 5 | 6 | import numpy as np 7 | 8 | import libmodbampy 9 | 10 | # remember to bump version in src/version.h too 11 | __version__ = "0.10.0" 12 | ffi = libmodbampy.ffi 13 | libbam = libmodbampy.lib 14 | 15 | MAX_MODS = 256 # from htslib 16 | 17 | ModInfo = collections.namedtuple( 18 | 'ModInfo', ( 19 | 'query_name', 'rpos', 'qpos', 'strand', 'mstrand', 20 | 'cbase', 'mbase', 'qual')) 21 | 22 | 23 | def _tidy_args(read_group, tag_name, tag_value): 24 | """Turn Python variables into CFFI ones.""" 25 | if read_group is None: 26 | read_group = ffi.NULL 27 | else: 28 | read_group = ffi.new("char[]", read_group.encode()) 29 | if tag_name is None: 30 | tag_name = ffi.new("char[2]", "".encode()) 31 | tag_value = 0 32 | elif len(tag_name) != 2: 33 | raise ValueError("'tag_name' must be a length-2 string.") 34 | else: 35 | tag_name = ffi.new("char[2]", tag_name.encode()) 36 | return read_group, tag_name, tag_value 37 | 38 | 39 | class ModBase: 40 | """Helper to create a mod_base instance. 41 | 42 | :param code: modified base ChEBI code (e.g. "h" or 104) 43 | :param base: one of {A, C, G, T} 44 | :param name: long name of modified base (e.g. 
"5-methylcytosine") 45 | :param abbrev: short name of modified base (e.g. "5mC") 46 | 47 | Actually just a compatible list is created. Reuses the predefined 48 | instances from header where possible. 49 | """ 50 | 51 | def __init__(self, code, base=None, name="unknown", abbrev="unknown"): 52 | """Initialise the instance.""" 53 | self._name = ffi.new("char[]", name.encode()) 54 | self._abbrev = ffi.new("char[]", abbrev.encode()) 55 | self._base = base 56 | err = TypeError( 57 | "'base' should be a single character or None") 58 | if isinstance(self._base, str): 59 | if len(self._base) != 1: 60 | raise err 61 | self._base = base.encode() 62 | self._base_i = {"A": 1, "C": 2, "G": 4, "T": 8}[base] 63 | elif self._base is not None: 64 | raise err 65 | 66 | err = TypeError( 67 | "'code' should be a single character or an " 68 | "integer (ChEBI) code.") 69 | self._code = code 70 | if isinstance(self._code, str): 71 | # ffi won't coerce a char to int, so we need to do it 72 | if len(self._code) != 1: 73 | raise err 74 | self._code = ord(self._code) 75 | elif not isinstance(self._code, int): 76 | raise err 77 | 78 | @property 79 | def struct(self): 80 | """Return a list compatible with C structure.""" 81 | for i in range(libbam.n_mod_bases): 82 | if libbam.mod_bases[i].code == self._code: 83 | return libbam.mod_bases[i] 84 | 85 | # make a new mod_base using a code and a canonical base 86 | if self._base is None: 87 | raise ValueError( 88 | f"Modified base type '{self._code}' unknown. Please provide " 89 | "a value for 'base' to describe the unmodified base.") 90 | mod_base_type = [ 91 | self._name, self._abbrev, 92 | self._base, self._base_i, self._code] 93 | return mod_base_type 94 | 95 | 96 | class ModBam: 97 | """A minimal class to iterate over a bam.""" 98 | 99 | def __init__(self, bam): 100 | """Open a BAM file. 101 | 102 | :param bam: BAM file to open. 
103 | """ 104 | self.bam = bam 105 | self._bam_fset = ffi.gc( 106 | libbam.create_bam_fset(self.bam.encode()), 107 | libbam.destroy_bam_fset) 108 | 109 | def __enter__(self): 110 | """Open context.""" 111 | return self 112 | 113 | def __exit__(self, type, value, traceback): 114 | """Exit context.""" 115 | pass 116 | 117 | def reads( 118 | self, chrom, start, end, 119 | read_group=None, tag_name=None, tag_value=None, min_mapq=0): 120 | """Iterate over (filtered) alignments in file. 121 | 122 | :param chrom: reference sequence from BAM. 123 | :param start: reference start coordinate. 124 | :param end: reference end coordinate. 125 | :param read group: read group of read to return. 126 | :param tag_name: read tag to check during read filtering. 127 | :param tag_value: tag value for reads to keep. 128 | :param min_mapq: minimum read mapping quality. 129 | """ 130 | read_group, tag_name, tag_value = _tidy_args( 131 | read_group, tag_name, tag_value) 132 | 133 | it = libbam.create_bam_iter_data( 134 | self._bam_fset, chrom.encode(), start, end, 135 | read_group, tag_name, tag_value, min_mapq) 136 | if it == ffi.NULL: 137 | return 138 | 139 | data = ffi.gc(it, libbam.destroy_bam_iter_data) 140 | mod_state = ffi.gc( 141 | libbam.hts_base_mod_state_alloc(), 142 | libbam.hts_base_mod_state_free) 143 | 144 | bam1_t = ffi.gc(libbam.bam_init1(), libbam.bam_destroy1) 145 | while libbam.read_bam(data, bam1_t) > 0: 146 | yield ModRead(bam1_t, mod_state) 147 | 148 | def pileup( 149 | self, chrom, start, end, 150 | read_group=None, tag_name=None, tag_value=None, 151 | low_threshold=0.33, high_threshold=0.66, threshold=0.66, 152 | mod_base="m", max_depth=None, canon_base=None, combine=False, 153 | min_mapq=0): 154 | """Create a base count matrix. 155 | 156 | :param chrom: reference sequence from BAM. 157 | :param start: reference start coordinate. 158 | :param end: reference end coordinate. 159 | :param read group: read group of read to return. 
160 | :param tag_name: read tag to check during read filtering. 161 | :param tag_value: tag value for reads to keep. 162 | :param threshold: probability filter threshold for excluding 163 | calls from counts. 164 | :param mod_base: ChEBI code of modified base to examine. 165 | :param max_depth: maximum read depth to examine. 166 | :param canon_base: canonical base corresponding to `mod_base`. 167 | Required only if `mod_base` is not a modification known to 168 | the code. 169 | :param combine: combine (include) all alternative modifications 170 | with the same parent canonical base. 171 | :param min_mapq: minimum read mapping quality. 172 | """ 173 | for thresh in (low_threshold, high_threshold): 174 | if thresh < 0.0 or thresh > 1.0: 175 | raise ValueError("Thresholds should be in (0,1).") 176 | threshold = int(threshold * 255.0) 177 | # C code currently uses high_threshold as the only threshold 178 | high_threshold = threshold 179 | read_group, tag_name, tag_value = _tidy_args( 180 | read_group, tag_name, tag_value) 181 | 182 | if max_depth is None: 183 | max_depth = libbam._INT_MAX 184 | 185 | _f = ffi.new("bam_fset *[]", [self._bam_fset]) 186 | fsets = ffi.new("set_fsets *", {"fsets": _f, "n": 1}) 187 | mod_base = ModBase(code=mod_base, base=canon_base) 188 | plp_data = libbam.calculate_pileup( 189 | fsets, chrom.encode(), start, end, 190 | read_group, tag_name, tag_value, 191 | threshold, mod_base.struct, 192 | combine, max_depth, min_mapq) 193 | # TODO: check for NULL 194 | 195 | # copy data to numpy, we could be more clever here an wrap 196 | # the pointer in a subclass of ndarray to track its lifetime 197 | # and avoid the explicit copy 198 | n_rows = libbam.featlen 199 | size_sizet = np.dtype(np.uintp).itemsize 200 | np_counts = np.frombuffer(ffi.buffer( 201 | plp_data.matrix, size_sizet * plp_data.n_cols * n_rows), 202 | dtype=np.uintp 203 | ).reshape(plp_data.n_cols, n_rows).copy() 204 | np_positions = np.frombuffer( 205 | ffi.buffer(plp_data.major, 
size_sizet * plp_data.n_cols), 206 | dtype=np.uintp).copy() 207 | libbam.destroy_plp_data(plp_data) 208 | return np_positions, np_counts 209 | 210 | 211 | class ModRead: 212 | """Proxy for a bam alignment. 213 | 214 | The class is not intended to be instantiated by users. 215 | """ 216 | 217 | def __init__(self, bam1_t, mod_state, header=None): 218 | """Create an interface to alignment. 219 | 220 | The input alignment is copied. 221 | """ 222 | self._bam1_t = ffi.gc(libbam.bam_init1(), libbam.bam_destroy1) 223 | libbam.bam_copy1(self._bam1_t, bam1_t) 224 | self._mod_state = mod_state 225 | self._header = header 226 | 227 | @property 228 | def flags(self): 229 | """Return alignment flags.""" 230 | return self._bam1_t.core.flag 231 | 232 | @property 233 | def is_unmapped(self): 234 | """Return if read is unmapped.""" 235 | return self._bam1_t.core.flag & 4 > 0 236 | 237 | @property 238 | def is_reverse(self): 239 | """Return if alignment is to reverse strand.""" 240 | return self._bam1_t.core.flag & 16 > 0 241 | 242 | @property 243 | def is_secondary(self): 244 | """Return if alignment is a secondary alignment.""" 245 | return self._bam1_t.core.flag & 256 > 0 246 | 247 | @property 248 | def is_supplementary(self): 249 | """Return is alignment is a supplementary alignment.""" 250 | return self._bam1_t.core.flag & 2048 > 0 251 | 252 | @property 253 | def mapping_quality(self): 254 | """Return mapping quality.""" 255 | return self._bam1_t.core.qual 256 | 257 | @property 258 | def strand(self): 259 | """Return strand as '+' or '-'.""" 260 | return "+-"[self.is_reverse] 261 | 262 | @property 263 | def query_name(self): 264 | """Return query name.""" 265 | return ffi.string( 266 | (ffi.cast("char*", self._bam1_t.data))).decode() 267 | 268 | @property 269 | def query_length(self): 270 | """Return query length as record in BAM. 
See `query_sequence`.""" 271 | return self._bam1_t.core.l_qseq 272 | 273 | @property 274 | def query_sequence(self): 275 | """Return the query sequence as recorded in the BAM. 276 | 277 | Includes soft-clipped bases, does not include hard-clipped bases, and 278 | may return an error when sequence is not recorded. 279 | """ 280 | # bam1_seq() define 281 | # (b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname) 282 | raise NotImplementedError("query_sequence not implemented") 283 | 284 | @property 285 | def query_qualities(self): 286 | """Return the query quality array. 287 | 288 | Includes soft-clipped bases as for `query_sequence`. 289 | """ 290 | # bam1_qual define 291 | # ((b)->data + ((b)->core.n_cigar<<2) 292 | # + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) 293 | raise NotImplementedError("query_qualities not implemented") 294 | 295 | @property 296 | def reference_name(self): 297 | """Return the reference name associated with the alignment.""" 298 | if self._bam1_t.core.tid == -1: 299 | return None 300 | elif self.header is None: 301 | raise IndexError( 302 | "Require header information to retrieve reference_name") 303 | else: 304 | raise NotImplementedError( 305 | "Fetching reference_name not implemented") 306 | 307 | @property 308 | def reference_start(self): 309 | """Return the 0-based start position of the alignment.""" 310 | return self._bam1_t.core.pos 311 | 312 | @property 313 | def reference_end(self): 314 | """Return the 0-based (exclusive) end position of the alignment.""" 315 | return libbam.bam_endpos(self._bam1_t) 316 | 317 | @property 318 | def reference_length(self): 319 | """Return the length of the alignment on the reference.""" 320 | return self.reference_end - self.reference_start 321 | 322 | @property 323 | def get_aligned_pairs(self): 324 | """Return aligned query and reference positions.""" 325 | raise NotImplementedError("get_aligned_pairs not implemented") 326 | 327 | @property 328 | def alignment(self): 329 | """Create array 
representing alignment. 330 | 331 | The returned item is of length self.query_length 332 | """ 333 | if not hasattr(self, "_alignment"): 334 | self._alignment = ffi.gc( 335 | libbam.qpos2rpos(self._bam1_t), libbam.free) 336 | return self._alignment 337 | 338 | @property 339 | def mod_sites(self): 340 | """Iterate over all modified bases in read. 341 | 342 | :yields: (read_id, ref. pos., query pos., ref. strand, 343 | mod. strand, canon. base, mod. base, mod. quality) 344 | 345 | The ref. strand is that recorded in the Mm tag from the bam. 346 | """ 347 | mods = ffi.new("hts_base_mod[{}]".format(MAX_MODS)) 348 | pos = ffi.new("int *") 349 | align = self.alignment 350 | libbam.bam_parse_basemod(self._bam1_t, self._mod_state) 351 | n = 1 352 | while n > 0: 353 | n = libbam.bam_next_basemod( 354 | self._bam1_t, self._mod_state, mods, MAX_MODS, pos) 355 | rpos = align[pos[0]] 356 | if n > 0: 357 | for i in range(n): 358 | m = mods[i] 359 | # note m.strand refers to strand recorded in the Mm tag. 
360 | modbase = m.modified_base 361 | if modbase > 0: 362 | modbase = chr(modbase) 363 | else: 364 | modbase = -modbase 365 | yield ModInfo( 366 | self.query_name, rpos, pos[0], self.strand, m.strand, 367 | chr(m.canonical_base), modbase, m.qual) 368 | 369 | 370 | def main(): 371 | """Test entry point.""" 372 | parser = argparse.ArgumentParser( 373 | description="Modified base demo program.", 374 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 375 | parser.add_argument( 376 | "bam", help="Indexed .bam file.") 377 | parser.add_argument( 378 | "chrom", help="Chromosome for which to fetch read") 379 | parser.add_argument( 380 | "start", type=int, 381 | help="Reference start coordinate.") 382 | parser.add_argument( 383 | "end", type=int, 384 | help="Reference end coordinate.") 385 | parser.add_argument( 386 | "--pileup", action="store_true", 387 | help="Create pileup counts rather than per-read modified base data") 388 | parser.add_argument( 389 | "--mod_base", default="m", 390 | help="Modified base to count during pileup.") 391 | parser.add_argument( 392 | "--low_threshold", type=float, default=0.33, 393 | help="Lower threshold for filtering.") 394 | parser.add_argument( 395 | "--high_threshold", type=float, default=0.66, 396 | help="High threshold for filtering.") 397 | args = parser.parse_args() 398 | 399 | with ModBam(args.bam) as bam: 400 | if args.pileup: 401 | codes = ffi.string(libbam.plp_bases).decode() 402 | print("pos\t", end="") 403 | print("\t".join(x for x in codes)) 404 | positions, counts = bam.pileup( 405 | args.chrom, args.start, args.end, mod_base=args.mod_base, 406 | low_threshold=args.low_threshold, 407 | high_threshold=args.high_threshold) 408 | for p, row in zip(positions, counts): 409 | print(p, end='\t') 410 | print("\t".join(str(x) for x in row)) 411 | else: 412 | counts = collections.Counter() 413 | for read in bam.reads(args.chrom, args.start, args.end): 414 | for pos_mod in read.mod_sites: 415 | counts[pos_mod.qual] += 1 416 | for 
k in sorted(counts.keys()): 417 | print(k, counts[k]) 418 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Add comments to keep track of why we are using particular versions 2 | cffi==1.15.0 # 1.15.1 leads to c/_cffi_backend.c:15:10: fatal error: ffi.h: No such file or directory 3 | numpy 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import re 5 | import shutil 6 | import platform 7 | from glob import glob 8 | from setuptools import setup, find_packages, Extension 9 | from setuptools import Distribution, Command 10 | from setuptools.command.install import install 11 | from setuptools.command.build_ext import build_ext 12 | import subprocess 13 | import pkg_resources 14 | 15 | 16 | __pkg_name__ = 'modbampy' 17 | __author__ = 'cwright' 18 | __description__ = 'Accessing modified-base data from BAM files.' 
19 | 20 | # Use readme as long description and say its github-flavour markdown 21 | from os import path 22 | this_directory = path.abspath(path.dirname(__file__)) 23 | kwargs = {'encoding':'utf-8'} if sys.version_info.major == 3 else {} 24 | with open(path.join(this_directory, 'README.md'), **kwargs) as f: 25 | __long_description__ = f.read() 26 | __long_description_content_type__ = 'text/markdown' 27 | 28 | __path__ = os.path.dirname(__file__) 29 | __pkg_path__ = os.path.join(os.path.join(__path__, __pkg_name__)) 30 | 31 | # Get the version number from __init__.py 32 | verstrline = open(os.path.join(__pkg_name__, '__init__.py'), 'r').read() 33 | vsre = r"^__version__ = ['\"]([^'\"]*)['\"]" 34 | mo = re.search(vsre, verstrline, re.M) 35 | if mo: 36 | __version__ = mo.group(1) 37 | else: 38 | raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(__pkg_name__)) 39 | 40 | dir_path = os.path.dirname(__file__) 41 | with open(os.path.join(dir_path, 'requirements.txt')) as fh: 42 | install_requires = [ 43 | str(requirement) for requirement in 44 | pkg_resources.parse_requirements(fh)] 45 | 46 | data_files = [] 47 | extra_requires = {} 48 | extensions = [] 49 | 50 | class HTSBuild(build_ext): 51 | # uses the Makefile to build libhts.a, this will get done before the cffi extension 52 | def run(self): 53 | 54 | def compile_hts(): 55 | subprocess.check_call(['make', os.path.join('htslib', 'libhts.a'), 'pymod.a']) 56 | 57 | self.execute(compile_hts, [], 'Compiling htslib using Makefile') 58 | build_ext.run(self) 59 | 60 | 61 | setup( 62 | name=__pkg_name__, 63 | version=__version__, 64 | url='https://github.com/epi2me-labs/modbam2bed', 65 | author=__author__, 66 | author_email='{}@nanoporetech.com'.format(__author__), 67 | cffi_modules=["build.py:ffibuilder"], 68 | description=__description__, 69 | long_description=__long_description__, 70 | long_description_content_type=__long_description_content_type__, 71 | dependency_links=[], 72 | 
ext_modules=extensions, 73 | install_requires=install_requires, 74 | tests_require=[].extend(install_requires), 75 | extras_require=extra_requires, 76 | # don't include any testing subpackages in dist 77 | packages=find_packages(exclude=['*.test', '*.test.*', 'test.*', 'test']), 78 | package_data={__pkg_name__:[os.path.join('data', '*')]}, 79 | zip_safe=False, 80 | data_files=data_files, 81 | entry_points={ 82 | 'console_scripts': [ 83 | '{0} = {0}:main'.format(__pkg_name__) 84 | ] 85 | }, 86 | cmdclass={ 87 | 'build_ext': HTSBuild 88 | }, 89 | scripts=[] 90 | ) 91 | -------------------------------------------------------------------------------- /src/args.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "htslib/sam.h" 9 | #include "htslib/faidx.h" 10 | #include "args.h" 11 | #include "version.h" 12 | 13 | const char *argp_program_bug_address = "chris.wright@nanoporetech.com"; 14 | static char doc[] = 15 | "modbam2bed -- summarise one or more BAM with modified base tags to bedMethyl.\ 16 | \vModification information store in the BAM files is examine to derive\ 17 | an identity of a possibly modified base. Calls are filtered by the\ 18 | user-provided threshold probability. By default a single-modified base\ 19 | is reported in the output, though the `--combine` option can fuse\ 20 | calls for all modification in a family. Column 5 (\"score\") of the output\ 21 | is calculated as the proportion of bases called as the canonical or modified\ 22 | reference base with respect to the number of spanning reads, scaled to a\ 23 | maximum of 1000. Column 10 is the total read coverage including reads with:\ 24 | canonical base, modified base, undetermined (filtered) base, substituted\ 25 | base (a base other than the canonical or modified base under consideration),\ 26 | and deletions. 
Column 11 is the percentage of reference-base calls identified\ 27 | as being modified (as a proportion of those confidently determined as\ 28 | canonical or modified). Extended output (-e option) can give raw counts\ 29 | of canonical, modified, alternatively modified, and undetermined bases\ 30 | for completeness. See https://github.com/epi2me-labs/modbam2bed for a\ 31 | overly precise explanation of the output.\ 32 | \n\nOutput is to standard output unless multiple motif filters are specified.\ 33 | In such cases the --prefix option controls the output file name."; 34 | static char args_doc[] = " [ ...]"; 35 | static struct argp_option options[] = { 36 | {0, 0, 0, 0, 37 | "General options:"}, 38 | {"region", 'r', "chr:start-end", 0, 39 | "Genomic region to process."}, 40 | {"extended", 'e', 0, 0, 41 | "Output extended bedMethyl including counts of canonical, modified, and filtered bases (in that order)."}, 42 | {"mod_base", 'm', "BASE", 0, 43 | "Modified base of interest, one of: 5mC, 5hmC, 5fC, 5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao. (Or modA, modC, modG, modT, modU, modN for generic modified base)."}, 44 | {"combine", 0x800, 0, 0, 45 | "Create output with combined modified counts: i.e. alternative modified bases within the same family (same canonical base) are included."}, 46 | {"aggregate", 0x600, 0, 0, 47 | "Output additional aggregated (across strand) counts, requires --cpg or --chg."}, 48 | {"threads", 't', "THREADS", 0, 49 | "Number of threads for BAM processing."}, 50 | {"prefix", 'p', "PREFIX", 0, 51 | "Output file prefix. Only used when multiple output filters are given."}, 52 | {"pileup", 'c', 0, 0, 53 | "Output (full) raw base counts rather than BED file."}, 54 | {0, 0, 0, 0, 55 | "Base filtering options:"}, 56 | {"canon_threshold", 'a', "THRESHOLD", 0, 57 | "Deprecated. The option will be removed in a future version. Please use --threshold.", 2}, 58 | {"mod_threshold", 'b', "THRESHOLD", 0, 59 | "Deprecated. 
The option will be removed in a future version. Please use --threshold.", 2}, 60 | {"threshold", 'f', "THRESHOLD", 0, 61 | "Bases with a call probability < THRESHOLD are filtered from results (default 0.66).", 2}, 62 | {"cpg", 0x700, 0, 0, 63 | "Output records filtered to CpG sites.", 2}, 64 | {"chh", 0x400, 0, 0, 65 | "Output records filtered to CHH sites.", 2}, 66 | {"chg", 0x500, 0, 0, 67 | "Output records filtered to CHG sites.", 2}, 68 | {"mask", 'k', 0, 0, 69 | "Respect soft-masking in reference file.", 2}, 70 | {0, 0, 0, 0, 71 | "Read filtering options:"}, 72 | {"max_depth", 'd', "DEPTH", 0, 73 | "Max. per-file depth; avoids excessive memory usage.", 3}, 74 | {"read_group", 'g', "RG", 0, 75 | "Only process reads from given read group.", 3}, 76 | {"tag_name", 0x100, "TN", 0, 77 | "Only process reads with a given tag (see --tag_value).", 3}, 78 | {"tag_value", 0x200, "VAL", 0, 79 | "Only process reads with a given tag value.", 3}, 80 | {"haplotype", 0x300, "VAL", 0, 81 | "Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL.", 3}, 82 | {"map_q", 0x900, "QUAL", 0, 83 | "Filter reads below this mapping quality.", 3}, 84 | { 0 } 85 | }; 86 | 87 | bool file_exists(char* filename) { 88 | struct stat st; 89 | return (stat(filename, &st) == 0); 90 | } 91 | 92 | static int tag_items = 0; 93 | static bool tag_given = false; 94 | static bool hp_given = false; 95 | static error_t parse_opt (int key, char *arg, struct argp_state *state) { 96 | arguments_t *arguments = state->input; 97 | float thresh; 98 | bool found = false; 99 | switch (key) { 100 | case 'a': 101 | case 'b': 102 | argp_error (state, "Options `-a` and `-b` are deprecated, Please use only `-f`. 
These option will be removed in a future version.\n"); 103 | break; 104 | case 'f': 105 | thresh = atof(arg); 106 | if (thresh < 0 || thresh > 1.0) { 107 | argp_error (state, "Threshold parameter must be in (0,1), got %s", arg); 108 | } 109 | arguments->threshold = (int)(thresh * 255); 110 | break; 111 | case 'm': 112 | for (size_t i = 0; i < n_mod_bases; ++i) { 113 | if (!strcmp(mod_bases[i].abbrev, arg)) { 114 | arguments->mod_base = mod_bases[i]; 115 | found = true; 116 | break; 117 | } 118 | } 119 | if (!found) { 120 | argp_error( 121 | state, 122 | "Unrecognised modified base type: %s. ChEBI codes are not supported", arg); 123 | } 124 | break; 125 | case 0x800: 126 | arguments->combine = true; 127 | break; 128 | case 'r': 129 | arguments->region = arg; 130 | break; 131 | case 0x700: 132 | arguments->cpg = true; 133 | break; 134 | case 0x400: 135 | arguments->chh = true; 136 | break; 137 | case 0x500: 138 | arguments->chg = true; 139 | break; 140 | case 0x600: 141 | arguments->accumulated = true; 142 | break; 143 | case 'k': 144 | arguments->mask = true; 145 | break; 146 | case 'e': 147 | arguments->extended = true; 148 | break; 149 | case 'g': 150 | arguments->read_group = arg; 151 | break; 152 | case 'd': 153 | arguments->hts_maxcnt = atoi(arg); 154 | break; 155 | case 0x100: 156 | if (strlen(arg) > 2) { 157 | argp_error(state, "Tag name should be a two-letter code, received: '%s'.", arg); 158 | } 159 | memcpy(arguments->tag_name, arg, 2 *sizeof(char)); 160 | tag_items += 1; 161 | tag_given = true; 162 | break; 163 | case 0x200: 164 | arguments->tag_value = atoi(arg); 165 | tag_items += 1; 166 | tag_given = true; 167 | break; 168 | case 0x300: 169 | memcpy(arguments->tag_name, "HP", 2 * sizeof(char)); 170 | arguments->tag_value = atoi(arg); 171 | tag_items += 2; 172 | hp_given = true; 173 | break; 174 | case 0x900: 175 | arguments->min_mapQ = atoi(arg); 176 | break; 177 | case 't': 178 | arguments->threads = atoi(arg); 179 | break; 180 | case 'p': 181 | 
arguments->prefix = arg; 182 | break; 183 | case 'c': 184 | arguments->pileup = true; 185 | break; 186 | case ARGP_KEY_NO_ARGS: 187 | argp_usage (state); 188 | break; 189 | case ARGP_KEY_ARG: 190 | if (state->arg_num == 0) { 191 | arguments->ref = arg; 192 | if (!file_exists(arg)) { 193 | argp_error(state, "Cannot access reference input file: '%s'.", arg); 194 | } 195 | faidx_t *fai = fai_load(arg); 196 | if (fai == NULL) { 197 | argp_error(state, "Cannot read .fasta(.gz) file: '%s'.", arg); 198 | } 199 | fai_destroy(fai); 200 | break; 201 | } else { 202 | arguments->bam = (const char**)(&state->argv[state->next - 1]); 203 | state->next = state->argc; 204 | break; 205 | } 206 | break; 207 | case ARGP_KEY_END: 208 | if (state->arg_num < 2) 209 | argp_usage (state); 210 | break; 211 | default: 212 | return ARGP_ERR_UNKNOWN; 213 | } 214 | return 0; 215 | } 216 | 217 | static struct argp argp = {options, parse_opt, args_doc, doc}; 218 | 219 | arguments_t parse_arguments(int argc, char** argv) { 220 | arguments_t args; 221 | args.mod_base = default_mod_base; 222 | args.combine = false; 223 | args.threshold = (int)(0.66 * MAX_QUAL); 224 | args.bam = NULL; 225 | args.ref = NULL; 226 | args.region = NULL; 227 | args.read_group = NULL; 228 | args.tag_name[0] = '\0'; 229 | args.tag_value = -1; 230 | args.cpg = false; 231 | args.chh = false; 232 | args.chg = false; 233 | args.mask = false; 234 | args.accumulated = false; 235 | args.extended = false; 236 | args.threads = 1; 237 | args.prefix = "mod-counts"; 238 | args.pileup = false; 239 | args.hts_maxcnt = INT_MAX; 240 | args.min_mapQ = 0; 241 | argp_parse(&argp, argc, argv, 0, 0, &args); 242 | // allow CpG only for C! 
243 | if (args.cpg || args.chh || args.chg) { 244 | if (args.mod_base.base != 'C') { 245 | fprintf(stderr, "ERROR: Options '--cpg/--chh/--chg' can only be used with cytosine modifications."); 246 | exit(1); 247 | }; 248 | } 249 | if (args.cpg + args.chh + args.chh > 1) { 250 | fprintf(stderr, "INFO: Multiple filters given, output will be to files named e.g. '%s.cpg.bed'.\n", args.prefix); 251 | } 252 | if (tag_items % 2 > 0) { 253 | fprintf(stderr, "ERROR: Both or neither of --tag_name and --tag_value must be given.\n"); 254 | exit(1); 255 | } 256 | if (tag_given && hp_given) { 257 | fprintf(stderr, "ERROR: If --haplotype is given neither of --tag_name or --tag_value should be provided.\n"); 258 | exit(1); 259 | } 260 | if (strncmp("5mC", args.mod_base.abbrev, 3) == 0 || strncmp("5hmC", args.mod_base.abbrev, 4)) { 261 | fprintf(stderr, 262 | "WARNING: You have specified either 5mC or 5hmC as a modified base.\n\ 263 | Oxford Nanopore Basecallers jointly call C, 5mC, and 5hmC. If you\n\ 264 | wish to combine calls of these bases into a single 'modified'\n\ 265 | count, please use the `--combine` option. 
The default behaviour\n\ 266 | is that calls of alternative modified bases are added to the\n\ 267 | alternatively-modified count."); 268 | } 269 | return args; 270 | } 271 | -------------------------------------------------------------------------------- /src/args.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_ARGS_H 2 | #define _MODBAMBED_ARGS_H 3 | 4 | #include 5 | 6 | #include "common.h" 7 | 8 | typedef struct arguments { 9 | const char** bam; 10 | char* ref; 11 | char* region; 12 | char* read_group; 13 | char tag_name[2]; 14 | int tag_value; 15 | mod_base mod_base; 16 | bool combine; 17 | bool mask; 18 | bool cpg; 19 | bool chh; 20 | bool chg; 21 | bool extended; 22 | bool accumulated; 23 | int threads; 24 | int threshold; 25 | char* prefix; 26 | bool pileup; 27 | int hts_maxcnt; 28 | int min_mapQ; 29 | } arguments_t; 30 | 31 | arguments_t parse_arguments(int argc, char** argv); 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/bamiter.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "bamiter.h" 5 | #include "common.h" 6 | 7 | 8 | // Initialise BAM file, index and header structures 9 | bam_fset* create_bam_fset(const char* fname) { 10 | bam_fset* fset = xalloc(1, sizeof(bam_fset), "bam fileset"); 11 | fset->fp = hts_open(fname, "rb"); 12 | fset->idx = sam_index_load(fset->fp, fname); 13 | fset->hdr = sam_hdr_read(fset->fp); 14 | if (fset->hdr == 0 || fset->idx == 0 || fset->fp == 0) { 15 | destroy_bam_fset(fset); 16 | fprintf(stderr, "Failed to read .bam file '%s'.", fname); 17 | exit(1); 18 | } 19 | return fset; 20 | } 21 | 22 | // Destory BAM file, index and header structures 23 | void destroy_bam_fset(bam_fset* fset) { 24 | hts_close(fset->fp); 25 | hts_idx_destroy(fset->idx); 26 | sam_hdr_destroy(fset->hdr); 27 | free(fset); 28 | } 29 | 30 | // Initialise multiple 
BAM filesets
31 | set_fsets *create_filesets(const char **bam_files) {
32 |     int nfile = 0; for (; bam_files[nfile]; nfile++);
33 |     set_fsets *sets = xalloc(1, sizeof(set_fsets), "bam file sets");
34 |     sets->fsets = xalloc(nfile, sizeof(bam_fset*), "bam files");
35 |     sets->n = nfile;
36 |     for (size_t i = 0; i < nfile; ++i) {
37 |         sets->fsets[i] = create_bam_fset((const char *) bam_files[i]);
38 |         if (sets->fsets[i] == NULL) {
39 |             for (size_t j = 0; j < i; ++j) {
40 |                 // fixed: was fsets[i] — destroyed the failed (NULL) entry i times
40 |                 // and leaked every previously opened fileset
40 |                 destroy_bam_fset(sets->fsets[j]);
41 |             }
42 |             free(sets->fsets); free(sets);
43 |             return NULL;
44 |         }
45 |     }
46 |     return sets;
47 | }
48 | 
49 | // Destroy multiple BAM filesets
50 | void destroy_filesets(set_fsets *s) {
51 |     for (size_t i = 0; i < s->n; ++i) {
52 |         destroy_bam_fset(s->fsets[i]);
53 |     }
54 |     free(s->fsets); free(s);
55 | }
56 | 
57 | 
58 | /** Set up a bam file for reading (filtered) records.
59 |  *
60 |  * @param bam_file input alignment file.
61 |  * @param chr bam target name.
62 |  * @param start start position of chr to consider.
63 |  * @param end end position of chr to consider.
64 |  * @param read_group by which to filter alignments.
65 |  * @param tag_name by which to filter alignments.
66 |  * @param tag_value associated with tag_name.
67 |  *
68 |  * The return value can be freed with destroy_bam_iter_data.
69 |  *
70 |  */
71 | mplp_data *create_bam_iter_data(
72 |     const bam_fset* bam_set, const char *chr, int start, int end,
73 |     const char *read_group, const char tag_name[2], const int tag_value,
74 |     const int min_mapQ) {
75 | 
76 |     // open bam etc. 
77 | // this is all now deferred to the caller 78 | htsFile *fp = bam_set->fp; 79 | hts_idx_t *idx = bam_set->idx; 80 | sam_hdr_t *hdr = bam_set->hdr; 81 | 82 | // find the target index for query below 83 | int mytid = -1; 84 | for (int i=0; i < hdr->n_targets; ++i) { 85 | if(!strcmp(hdr->target_name[i], chr)) { 86 | mytid = i; 87 | break; 88 | } 89 | } 90 | if (mytid == -1) { 91 | fprintf(stderr, "Failed to find reference sequence '%s' in bam.\n", chr); 92 | return NULL; 93 | } 94 | 95 | // setup bam interator 96 | mplp_data *data = xalloc(1, sizeof(mplp_data), "pileup init data"); 97 | data->fp = fp; data->idx = idx; data->hdr = hdr; 98 | data->iter = bam_itr_queryi(idx, mytid, start, end); 99 | memcpy(data->tag_name, tag_name, 2); data->tag_value = tag_value; 100 | data->min_mapQ = min_mapQ; data->read_group = read_group; 101 | 102 | return data; 103 | } 104 | 105 | /** Clean up auxiliary bam reading data. 106 | * 107 | * @param data auxiliary structure to clean. 108 | * 109 | */ 110 | void destroy_bam_iter_data(mplp_data *data) { 111 | bam_itr_destroy(data->iter); 112 | free(data); 113 | } 114 | 115 | 116 | /** Read a bam record. 117 | * 118 | * @param data an mplp_data encoding the bam file to read with filter options. 119 | * @param b output pointer. 120 | * 121 | */ 122 | int read_bam(void *data, bam1_t *b) { 123 | mplp_data *aux = (mplp_data*) data; 124 | uint8_t *tag; 125 | bool check_tag = (strcmp(aux->tag_name, "") != 0); 126 | bool have_rg = (aux->read_group != NULL); 127 | uint8_t *rg; 128 | char *rg_val; 129 | int ret; 130 | while (1) { 131 | ret = aux->iter ? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); 132 | if (ret<0) break; 133 | // only take primary alignments 134 | if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FQCFAIL | BAM_FDUP)) continue; 135 | // filter by mapping quality 136 | if ((int)b->core.qual < aux->min_mapQ) continue; 137 | // filter by tag 138 | if (check_tag) { 139 | tag = bam_aux_get((const bam1_t*) b, aux->tag_name); 140 | if (tag == NULL){ // tag isn't present or is currupt 141 | if (aux->keep_missing) { 142 | break; 143 | } else { 144 | continue; 145 | } 146 | } 147 | int tag_value = bam_aux2i(tag); 148 | if (errno == EINVAL) continue; // tag was not integer 149 | if (tag_value != aux->tag_value) continue; 150 | } 151 | // filter by RG (read group): 152 | if (have_rg) { 153 | rg = bam_aux_get((const bam1_t*) b, "RG"); 154 | if (rg == NULL) continue; // missing 155 | rg_val = bam_aux2Z(rg); 156 | if (errno == EINVAL) continue; // bad parse 157 | if (strcmp(aux->read_group, rg_val) != 0) continue; // not wanted 158 | } 159 | break; 160 | } 161 | return ret; 162 | } 163 | 164 | 165 | /** Create an map of query position to reference position 166 | * 167 | * @param b alignment record 168 | * 169 | * The length of the returned array is b->core->l_qlen. 170 | */ 171 | int *qpos2rpos(bam1_t *b) { 172 | // we only deal in primary/soft-clipped alignments so length 173 | // ok qseq member is the length of the intact query sequence. 
174 | uint32_t qlen = b->core.l_qseq; 175 | uint32_t *cigar = bam_get_cigar(b); 176 | int *posmap = xalloc(qlen, sizeof(uint32_t), "pos_map"); 177 | for (size_t i = 0; i < qlen; ++i) posmap[i] = -1; // unaligned 178 | int qpos = 0, rpos = b->core.pos; 179 | for (size_t i = 0; i < b->core.n_cigar; ++i){ 180 | uint32_t op = bam_cigar_op(cigar[i]); 181 | uint32_t len = bam_cigar_oplen(cigar[i]); 182 | uint32_t take = bam_cigar_type(op); 183 | if (((take&0x1)>0) & ((take&0x2)>0)) { 184 | // consumes query and ref 185 | for (size_t j = 0; j < len; ++j, ++qpos, ++rpos) { 186 | posmap[qpos] = rpos; 187 | } 188 | } 189 | else if ((take&0x1)>0) { 190 | // consumes query only 191 | qpos += len; 192 | } 193 | else { 194 | // consumes ref 195 | rpos += len; 196 | } 197 | } 198 | return posmap; 199 | } 200 | -------------------------------------------------------------------------------- /src/bamiter.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_BAMITER_H 2 | #define _MODBAMBED_BAMITER_H 3 | 4 | #include 5 | #include "htslib/sam.h" 6 | 7 | // parameters for bam iteration 8 | typedef struct { 9 | htsFile *fp; 10 | hts_idx_t *idx; 11 | sam_hdr_t *hdr; 12 | hts_itr_t *iter; 13 | int min_mapQ; 14 | char tag_name[2]; 15 | int tag_value; 16 | bool keep_missing; 17 | const char *read_group; 18 | } mplp_data; 19 | 20 | 21 | typedef struct { 22 | htsFile *fp; 23 | hts_idx_t *idx; 24 | sam_hdr_t *hdr; 25 | } bam_fset; 26 | 27 | typedef struct set_fsets { 28 | bam_fset **fsets; 29 | size_t n; 30 | } set_fsets; 31 | 32 | 33 | // Initialise BAM file, index and header structures 34 | bam_fset* create_bam_fset(const char* fname); 35 | 36 | // Destory BAM file, index and header structures 37 | void destroy_bam_fset(bam_fset* fset); 38 | 39 | // Initialise multiple BAM filesets 40 | set_fsets *create_filesets(const char **bams); 41 | 42 | // Destroy multiple BAM filesets 43 | void destroy_filesets(set_fsets *s); 44 | 45 | 46 | /** Set 
up a bam file for reading (filtered) records. 47 | * 48 | * @param bam_fset A BAM fileset from create_bam_fset 49 | * @param chr bam target name. 50 | * @param start start position of chr to consider. 51 | * @param end end position of chr to consider. 52 | * @param read_group by which to filter alignments. 53 | * @param tag_name by which to filter alignments. 54 | * @param tag_value associated with tag_name. 55 | * @param min_mapQ minimum mapping quality of reads. 56 | * 57 | * The return value can be freed with destroy_bam_iter_data. 58 | * 59 | */ 60 | mplp_data *create_bam_iter_data( 61 | const bam_fset* fset, const char *chr, int start, int end, 62 | const char *read_group, const char tag_name[2], const int tag_value, 63 | const int min_mapQ); 64 | 65 | /** Clean up auxiliary bam reading data. 66 | * 67 | * @param data auxiliary structure to clean. 68 | * 69 | */ 70 | void destroy_bam_iter_data(mplp_data *data); 71 | 72 | /** Read a bam record. 73 | * 74 | * @param data an mplp_data encoding the bam file to read with filter options. 75 | * @param b output pointer. 76 | * 77 | */ 78 | int read_bam(void *data, bam1_t *b); 79 | 80 | /** Create an map of query position to reference position 81 | * 82 | * @param b alignment record 83 | * 84 | * The length of the returned array is b->core->l_qlen. 85 | */ 86 | int *qpos2rpos(bam1_t *b); 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /src/common.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | 9 | 10 | /** Allocates zero-initialised memory with a message on failure. 11 | * 12 | * @param num number of elements to allocate. 13 | * @param size size of each element. 14 | * @param msg message to describe allocation on failure. 
15 | * @returns pointer to allocated memory 16 | * 17 | */ 18 | void *xalloc(size_t num, size_t size, char* msg){ 19 | void *res = calloc(num, size); 20 | if (res == NULL){ 21 | fprintf(stderr, "Failed to allocate mem for %s\n", msg); 22 | exit(1); 23 | } 24 | return res; 25 | } 26 | 27 | 28 | /** Reallocates memory with a message on failure. 29 | * 30 | * @param ptr pointer to realloc. 31 | * @param size size of each element. 32 | * @param msg message to describe allocation on failure. 33 | * @returns pointer to allocated memory 34 | * 35 | */ 36 | void *xrealloc(void *ptr, size_t size, char* msg){ 37 | void *res = realloc(ptr, size); 38 | if (res == NULL){ 39 | fprintf(stderr, "Failed to reallocate mem for %s\n", msg); 40 | exit(1); 41 | } 42 | return res; 43 | } 44 | 45 | 46 | /** Retrieves a substring. 47 | * 48 | * @param string input string. 49 | * @param postion start position of substring. 50 | * @param length length of substring required. 51 | * @returns string pointer. 52 | * 53 | */ 54 | char *substring(char *string, int position, int length) { 55 | char *ptr; 56 | size_t i; 57 | 58 | ptr = malloc(length + 1); 59 | 60 | for (i = 0 ; i < length ; i++) { 61 | *(ptr + i) = *(string + position); 62 | string++; 63 | } 64 | 65 | *(ptr + i) = '\0'; 66 | return ptr; 67 | } 68 | 69 | 70 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_COMMON_H 2 | #define _MODBAMBED_COMMON_H 3 | 4 | #include 5 | 6 | 7 | typedef struct mod_base { 8 | char *name; 9 | char *abbrev; 10 | char base; 11 | int base_i; // 16bit IUPAC form A:1, C:2, G:4, T:8 12 | int code; // to enable htslib ChEBI support, chars below so simplicity 13 | } mod_base; 14 | 15 | static const size_t n_mod_bases = 16; 16 | static const mod_base mod_bases[] = { 17 | // C mods 18 | {"5-methylcytosine", "5mC", 'C', 2, 'm'}, 19 | {"5-hydroxymethylcytosine", 
"5hmC", 'C', 2, 'h'}, 20 | {"5-formylcytosine", "5fC", 'C', 2, 'f'}, 21 | {"5-carboxylcytosine", "5caC", 'C', 2, 'c'}, 22 | {"Ambiguous C modification", "modC", 'C', 2, 'C'}, 23 | // T mods 24 | {"5-hydroxymethyluracil", "5hmU", 'T', 8, 'g'}, 25 | {"5-formyluracil", "5fU", 'T', 8, 'e'}, 26 | {"5-carboxyluracil", "5caU", 'T', 8, 'b'}, 27 | {"Ambiguous T modification", "modT", 'T', 8, 'T'}, 28 | // A mods 29 | {"6-methyladenine", "6mA", 'A', 1, 'a'}, 30 | {"Ambiguous A modification", "modA", 'A', 1, 'A'}, 31 | // G mods 32 | {"8-Oxoguanine", "8oxoG", 'G', 4, 'o'}, 33 | {"Ambiguous G modification", "modG", 'G', 4, 'G'}, 34 | // U mods 35 | {"Ambiguous U modification", "modU", 'U', 15, 'U'}, // TODO: should 15 (N) be something else? 36 | // N Mods 37 | {"Xanthosine", "Xao", 'N', 15, 'n'}, 38 | {"Ambiguous N modification", "modN", 'N', 15, 'N'}, 39 | }; 40 | static const mod_base default_mod_base = {"5-methylcytosine", "5mC", 'C', 2, 'm'}; 41 | 42 | //0123456789ABCDEF 43 | //=ACMGRSVTWYHKDBN aka seq_nt16_str[] 44 | //=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str 45 | //084C2A6E195D3B7F 46 | static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; 47 | 48 | static const int MAX_QUAL = 255; 49 | 50 | /** Simple integer min/max 51 | * @param a 52 | * @param b 53 | * 54 | * @returns the min/max of a and b 55 | * 56 | */ 57 | static inline int max ( int a, int b ) { return a > b ? a : b; } 58 | static inline int min ( int a, int b ) { return a < b ? a : b; } 59 | 60 | 61 | /** Allocates zero-initialised memory with a message on failure. 62 | * 63 | * @param num number of elements to allocate. 64 | * @param size size of each element. 65 | * @param msg message to describe allocation on failure. 66 | * @returns pointer to allocated memory 67 | * 68 | */ 69 | void *xalloc(size_t num, size_t size, char* msg); 70 | 71 | 72 | /** Retrieves a substring. 73 | * 74 | * @param string input string. 75 | * @param postion start position of substring. 
76 | * @param length length of substring required. 77 | * @returns string pointer. 78 | * 79 | */ 80 | char *substring(char *string, int position, int length); 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /src/counts.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "htslib/sam.h" 13 | #include "htslib/faidx.h" 14 | #include "htslib/thread_pool.h" 15 | 16 | #include "bamiter.h" 17 | #include "common.h" 18 | #include "counts.h" 19 | 20 | #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) 21 | #define bam1_seqi(s, i) (bam_seqi((s), (i))) 22 | #define bam_nt16_rev_table seq_nt16_str 23 | #define bam_nt16_table seq_nt16_table 24 | 25 | 26 | /** Constructs a pileup data structure. 27 | * 28 | * @param buffer_cols maximum number of pileup columns. 29 | * @param rname reference name. 30 | * @see destroy_plp_data 31 | * @returns a plp_data pointer. 32 | * 33 | * The return value can be freed with destroy_plp_data. 34 | * 35 | */ 36 | plp_data create_plp_data(size_t buffer_cols, const char *rname) { 37 | plp_data data = xalloc(1, sizeof(_plp_data), "plp_data"); 38 | data->buffer_cols = buffer_cols; 39 | data->n_cols = 0; 40 | //fprintf(stderr, buffer_cols); 41 | data->matrix = xalloc(featlen * buffer_cols, sizeof(size_t), "matrix"); 42 | data->major = xalloc(buffer_cols, sizeof(size_t), "major"); 43 | data->rname = xalloc(strlen(rname) + 1, sizeof(char), "chr"); 44 | strcpy(data->rname, rname); 45 | return data; 46 | } 47 | 48 | 49 | /** Destroys a pileup data structure. 50 | * 51 | * @param data the object to cleanup. 52 | * @returns void. 
53 | * 54 | */ 55 | void destroy_plp_data(plp_data data) { 56 | free(data->matrix); free(data->major); free(data->rname); free(data); 57 | } 58 | 59 | 60 | /** Prints a pileup data structure. 61 | * 62 | * @param pileup a pileup structure. 63 | * @returns void 64 | * 65 | */ 66 | void print_pileup_data(plp_data pileup){ 67 | fprintf(stdout, "chrom\tpos\t"); 68 | for (size_t j = 0; j < featlen; ++j){ 69 | fprintf(stdout, "%c\t", plp_bases[j]); 70 | } 71 | fprintf(stdout, "depth\n"); 72 | for (size_t j = 0; j < pileup->n_cols; ++j) { 73 | int s = 0; 74 | fprintf(stdout, "%s\t%zu\t", pileup->rname, pileup->major[j]); 75 | for (size_t i = 0; i < featlen; ++i){ 76 | size_t c = pileup->matrix[j * featlen + i]; 77 | s += c; 78 | fprintf(stdout, "%zu\t", c); 79 | } 80 | fprintf(stdout, "%d\n", s); 81 | } 82 | } 83 | 84 | 85 | output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated) { 86 | output_files files = xalloc(1, sizeof(_output_files), "output_files"); 87 | // default to stdout for zero or one filters 88 | files->multi = (int)cpg + chh + chg > 1; 89 | files->take_all = (int)cpg + chh + chg == 0; 90 | files->accumulated = accumulated; 91 | files->fcpg = stdout; 92 | files->fchh = stdout; 93 | files->fchg = stdout; 94 | files->fcpg_acc = NULL; 95 | files->fchh_acc = NULL; 96 | files->fchg_acc = NULL; 97 | files->cpg = cpg; 98 | files->chh = chh; 99 | files->chg = chg; 100 | // use distinct files if more than one filter 101 | if (files->multi) { 102 | char* fname = xalloc(strlen(prefix) + 9, sizeof(char), "fname"); 103 | if (cpg) { 104 | strcpy(fname, prefix); strcat(fname, ".cpg.bed"); 105 | files->fcpg = fopen(fname, "w"); 106 | } 107 | if (chh) { 108 | strcpy(fname, prefix); strcat(fname, ".chh.bed"); 109 | files->fchh = fopen(fname, "w"); 110 | } 111 | if (chg) { 112 | strcpy(fname, prefix); strcat(fname, ".chg.bed"); 113 | files->fchg = fopen(fname, "w"); 114 | } 115 | free(fname); 116 | } 117 | 118 | if (files->accumulated) { 119 | 
char* fname_acc = xalloc(strlen(prefix) + 13, sizeof(char), "fname"); 120 | if (cpg) { 121 | strcpy(fname_acc, prefix); strcat(fname_acc, ".cpg.acc.bed"); 122 | files->fcpg_acc = fopen(fname_acc, "w"); 123 | } 124 | if (chg) { 125 | strcpy(fname_acc, prefix); strcat(fname_acc, ".chg.acc.bed"); 126 | files->fchg_acc = fopen(fname_acc, "w"); 127 | } 128 | free(fname_acc); 129 | } 130 | 131 | // store these in an array for later ease 132 | // [CpG, CHG] 133 | init_output_buffers(files); 134 | files->buf_size = _buf_size; 135 | files->motif_offsets[0] = 1; 136 | files->motif_offsets[1] = 2; 137 | files->motif_acc_files[0] = files->fcpg_acc; 138 | files->motif_acc_files[1] = files->fchg_acc; 139 | return files; 140 | } 141 | 142 | void close_bed_files(output_files files) { 143 | if (files->fcpg != stdout) { fclose(files->fcpg); } 144 | if (files->fchh != stdout) { fclose(files->fchh); } 145 | if (files->fchg != stdout) { fclose(files->fchg); } 146 | if (files->fcpg_acc != NULL) { fclose(files->fcpg_acc); } 147 | if (files->fchh_acc != NULL) { fclose(files->fchh_acc); } 148 | if (files->fchg_acc != NULL) { fclose(files->fchg_acc); } 149 | free(files); 150 | } 151 | 152 | 153 | // Check sequences for motifs 154 | 155 | // CpG 156 | bool extern inline is_cpg_fwd(size_t rpos, int rlen, char* ref){ 157 | return rpos < rlen - 1 && ref[rpos] == 'C' && ref[rpos + 1] == 'G'; 158 | } 159 | bool extern inline is_cpg_rev(size_t rpos, int rlen, char* ref){ 160 | return rpos != 0 && ref[rpos] == 'G' && ref[rpos - 1] == 'C'; 161 | } 162 | // CHN 163 | bool extern inline _is_chn_fwd(size_t rpos, int rlen, char* ref) { 164 | bool is_chn = false; 165 | if (rpos < rlen - 2 && ref[rpos] == 'C') { 166 | char b = ref[rpos + 1]; 167 | // these are all not G 168 | is_chn = (b == 'A' || b == 'C' || b == 'T' || b == 'M' || b == 'W' || b == 'Y' || b == 'H'); 169 | } 170 | return is_chn; 171 | } 172 | bool extern inline _is_chn_rev(size_t rpos, int rlen, char* ref) { 173 | bool is_chn = false; 174 
| if (rpos > 1 && ref[rpos] == 'G') { 175 | char b = ref[rpos - 1]; 176 | // these are all not C 177 | is_chn = (b == 'A' || b == 'G' || b == 'T' || b == 'R' || b == 'W' || b == 'K' || b == 'D'); 178 | } 179 | return is_chn; 180 | } 181 | // CHH 182 | bool extern inline is_chh_fwd(size_t rpos, int rlen, char* ref) { 183 | bool is_chh = _is_chn_fwd(rpos, rlen, ref); 184 | if (is_chh) { 185 | char b = ref[rpos + 2]; 186 | // these are all not G 187 | is_chh = (b == 'A' || b == 'C' || b == 'T' || b == 'M' || b == 'W' || b == 'Y' || b == 'H'); 188 | } 189 | return is_chh; 190 | } 191 | bool extern inline is_chh_rev(size_t rpos, int rlen, char* ref) { 192 | bool is_chh = _is_chn_rev(rpos, rlen, ref); 193 | if (is_chh) { 194 | char b = ref[rpos - 2]; 195 | // these are all not C 196 | is_chh = (b == 'A' || b == 'G' || b == 'T' || b == 'R' || b == 'W' || b == 'K' || b == 'D'); 197 | } 198 | return is_chh; 199 | } 200 | // CHG 201 | bool extern inline is_chg_fwd(size_t rpos, int rlen, char* ref) { 202 | bool is_chg = _is_chn_fwd(rpos, rlen, ref); 203 | if (is_chg) { 204 | is_chg = ref[rpos + 2] == 'G'; 205 | } 206 | return is_chg; 207 | } 208 | bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref) { 209 | bool is_chg = _is_chn_rev(rpos, rlen, ref); 210 | if (is_chg) { 211 | is_chg = ref[rpos - 2] == 'C'; 212 | } 213 | return is_chg; 214 | } 215 | 216 | 217 | void inline print_record( 218 | FILE* fout, const char* rname, size_t start, size_t end, 219 | char* feature, char orient, size_t depth, 220 | bool extended, size_t cd, size_t md, size_t fd, size_t xd, size_t od) { 221 | // https://www.encodeproject.org/data-standards/wgbs/ 222 | // column 11: "Percentage of reads that show methylation at this position in the genome" 223 | // - Seems to disregard possibility of non-C canonical calls 224 | // lets calculate this as proportion of meth:non-meth C 225 | size_t tot = cd + md + od; 226 | float meth = tot == 0 ? 
nanf("") : (100.0f * md) / tot; 227 | // column 5: "Score from 0-1000. Capped number of reads" 228 | // lets go with proportion of (mod or canon):(mod or canon or filtered) 229 | size_t score = depth == 0 ? nanf("") : (1000 * tot) / depth; 230 | 231 | // TODO: don't print when nan? 232 | fprintf(fout, 233 | "%s\t%zu\t%zu\t" 234 | "%s\t%zu\t%c\t" 235 | "%zu\t%zu\t0,0,0\t%zu\t%.2f", 236 | rname, start, end, 237 | feature, score, orient, 238 | start, end, depth, meth); 239 | if (extended) { 240 | fprintf(fout, "\t%zu\t%zu\t%zu\t%zu\t%zu\n", cd, md, fd, xd, od); 241 | } else { 242 | fprintf(fout, "\n"); 243 | } 244 | } 245 | 246 | 247 | void init_output_buffers(output_files bed_files) { 248 | // information regarding motif offset pairing 249 | for (size_t i=0; i < bed_files->buf_size; ++i) { 250 | bed_files->out_buffer[i] = (bed_buffer){-1, false, 0, 0, 0, 0}; 251 | } 252 | } 253 | 254 | void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature) { 255 | // flush accumulation buffers 256 | if (bed_files->accumulated) { 257 | for(size_t ibuf=0; ibuf < bed_files->buf_size; ++ibuf) { 258 | bed_buffer buf = bed_files->out_buffer[ibuf]; 259 | FILE* fout = bed_files->motif_acc_files[ibuf]; 260 | if (buf.pos != -1 && fout != NULL) { 261 | print_record( 262 | fout, chr, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev], 263 | buf.depth, extended, buf.cd, buf.md, buf.fd, buf.xd, buf.od); 264 | } 265 | } 266 | } 267 | } 268 | 269 | /** Prints a pileup data structure as bedmethyl file 270 | * 271 | * @param pileup a pileup counts structure. 272 | * @param ref reference sequence. 273 | * @param rstart starting reference coordinate corresponding to ref. 274 | * @param extended whether to include counts of canonical, modified and filtered bases. 275 | * @param feature name to use for feature column of BED (e.g. 5mC). 276 | * @param canon_base canonical base to match. 277 | * @param output_files file handles and output options. 
278 | * @param out_buffer state for strand accumulation (modified on output). 279 | * @returns void 280 | * 281 | */ 282 | void print_bedmethyl( 283 | plp_data pileup, char *ref, int rstart, bool extended, 284 | char* feature, char canon_base, output_files bed_files) { 285 | // ecoli1 100718 100719 . 4 + 100718 100719 0,0,0 3 0 286 | 287 | // this is a bit naff, we should introspect these indices, or have them 288 | // as data in the header. 289 | size_t ci, mi, fi, xi, oi; 290 | size_t *bases; 291 | bool isrev; 292 | char rc_canon_base = ' '; 293 | size_t cif, cir; 294 | 295 | // TODO: if canon_base were passed as an htslib int this would be cleaner 296 | if (canon_base == 'A') {cif=fwd_A; cir=rev_T; rc_canon_base = 'T';} 297 | else if (canon_base == 'C') {cif=fwd_C; cir=rev_G; rc_canon_base = 'G';} 298 | else if (canon_base == 'G') {cif=fwd_G; cir=rev_C; rc_canon_base = 'C';} 299 | else if (canon_base == 'T') {cif=fwd_T; cir=rev_A; rc_canon_base = 'A';} 300 | else {fprintf(stderr, "ERROR: Unrecognised canonical base: '%c'\n", canon_base); exit(1);} 301 | 302 | int rlen = strlen(ref); 303 | 304 | for (size_t i = 0; i < pileup->n_cols; ++i) { 305 | size_t pos = pileup->major[i]; 306 | size_t rpos = pos - rstart; 307 | char rbase = ref[rpos]; 308 | bool is_cpg = false; 309 | bool is_chh = false; 310 | bool is_chg = false; 311 | if (rbase == canon_base) { 312 | if (!bed_files->take_all) { 313 | if (!( 314 | (bed_files->cpg && (is_cpg = is_cpg_fwd(rpos, rlen, ref))) 315 | || (bed_files->chh && (is_chh = is_chh_fwd(rpos, rlen, ref))) 316 | || (bed_files->chg && (is_chg = is_chg_fwd(rpos, rlen, ref))) 317 | ) ) { continue; } 318 | } 319 | isrev = 0; mi = fwd_mod; fi = fwd_filt; xi = fwd_nocall; oi = fwd_other; ci = cif; 320 | bases = (size_t *) fwdbases; 321 | } else if (rbase == rc_canon_base) { 322 | if (!bed_files->take_all) { 323 | if (!( 324 | (bed_files->cpg && (is_cpg = is_cpg_rev(rpos, rlen, ref))) 325 | || (bed_files->chh && (is_chh = is_chh_rev(rpos, rlen, 
ref))) 326 | || (bed_files->chg && (is_chg = is_chg_rev(rpos, rlen, ref))) 327 | ) ) { continue; } 328 | } 329 | isrev = 1; mi = rev_mod; fi = rev_filt; xi = rev_nocall; oi = rev_other; ci = cir; 330 | bases = (size_t *)revbases; 331 | } 332 | else { 333 | continue; 334 | } 335 | // calculate depth on strand 336 | size_t depth = 0; 337 | for (size_t j = 0; j < numbases; ++j) { 338 | depth += pileup->matrix[i * featlen + bases[j]]; 339 | } 340 | size_t cd = pileup->matrix[i * featlen + ci]; 341 | size_t md = pileup->matrix[i * featlen + mi]; 342 | size_t fd = pileup->matrix[i * featlen + fi]; 343 | size_t xd = pileup->matrix[i * featlen + xi]; 344 | size_t od = pileup->matrix[i * featlen + oi]; 345 | 346 | // choose output for this locus, the motifs are mutually exclusive so 347 | // no need to loop 348 | FILE* fout = stdout; 349 | if (bed_files->multi) { 350 | if (is_cpg) { fout = bed_files->fcpg; } 351 | else if (is_chh) { fout = bed_files->fchh; } 352 | else if (is_chg) { fout = bed_files->fchg; } 353 | } 354 | print_record( 355 | fout, pileup->rname, pos, pos + 1, feature, "+-"[isrev], 356 | depth, extended, cd, md, fd, xd, od); 357 | 358 | // strand accumulated 359 | if (bed_files->accumulated && (is_cpg || is_chg)) { 360 | size_t ibuf, motif_offset; 361 | bool do_output; 362 | if (is_cpg) { 363 | ibuf = 0; do_output = bed_files->cpg; 364 | } else { // chg 365 | ibuf = 1; do_output = bed_files->chh; 366 | } 367 | motif_offset = bed_files->motif_offsets[ibuf]; 368 | fout = bed_files->motif_acc_files[ibuf]; 369 | if (do_output) { 370 | assert(fout != NULL); 371 | bed_buffer buf = bed_files->out_buffer[ibuf]; 372 | if (buf.pos == -1) { 373 | bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd, xd, od}; 374 | } else if (pos - buf.pos == motif_offset ) { // paired 375 | assert(buf.isrev != isrev); // shouldn't happen, they can't be same 376 | buf.depth += depth; 377 | buf.cd += cd; 378 | buf.md += md; 379 | buf.fd += fd; 380 | buf.xd += xd; 381 
| buf.od += od; 382 | print_record( 383 | fout, pileup->rname, buf.pos, buf.pos + motif_offset + 1, feature, '.', 384 | buf.depth, extended, buf.cd, buf.md, buf.fd, buf.xd, buf.od); 385 | bed_files->out_buffer[ibuf] = (bed_buffer){-1, false, 0, 0, 0, 0, 0, 0}; 386 | } else { // unrelated 387 | print_record( 388 | fout, pileup->rname, buf.pos, buf.pos + 1, feature, "+-"[buf.isrev], 389 | buf.depth, extended, buf.cd, buf.md, buf.fd, buf.xd, buf.od); 390 | bed_files->out_buffer[ibuf] = (bed_buffer){pos, isrev, depth, cd, md, fd, xd, od}; 391 | } 392 | } 393 | } 394 | 395 | } // position loop 396 | 397 | } 398 | 399 | 400 | // Control client data for pileup: in this case the mod base data 401 | int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { 402 | hts_base_mod_state *m = hts_base_mod_state_alloc(); 403 | bam_parse_basemod(b, m); cd->p = m; 404 | return 0; 405 | } 406 | 407 | int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) { 408 | hts_base_mod_state_free(cd->p); 409 | return 0; 410 | } 411 | 412 | 413 | // TODO: this is taken from sam.c, its here so we can introspec some things 414 | // for which there's no public interface. A little spicey to redefine 415 | // this, but we do what we can. 416 | // https://github.com/samtools/htslib/issues/1550 417 | #define MAX_BASE_MOD 256 418 | struct hts_base_mod_state { 419 | int type[MAX_BASE_MOD]; // char or minus-CHEBI 420 | int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) 421 | char strand[MAX_BASE_MOD]; // strand of modification; + or - 422 | int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod 423 | char *MM[MAX_BASE_MOD]; // next pos delta (string) 424 | char *MMend[MAX_BASE_MOD]; // end of pos-delta string 425 | uint8_t *ML[MAX_BASE_MOD]; // next qual 426 | int MLstride[MAX_BASE_MOD]; // bytes between quals for this type 427 | int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? 
428 | int seq_pos; // current position along sequence 429 | int nmods; // used array size (0 to MAX_BASE_MOD-1). 430 | }; 431 | 432 | 433 | // Query if a specific MM subtag is present 434 | bool query_mod_subtag(hts_base_mod_state *state, int qtype, int qcanonical, char qstrand, int qimplicit) { 435 | bool found = false; 436 | for (size_t i=0; inmods; ++i) { 437 | if ((state->type[i] == qtype || state->type[i] == -qtype) 438 | && state->canonical[i] == qcanonical 439 | // although strand is typed char and documented as + or -, its actually 0/1 440 | && "+-"[state->strand[i]] == qstrand 441 | && state->implicit[i] == qimplicit) { 442 | found = true; 443 | break; 444 | } 445 | } 446 | return found; 447 | } 448 | 449 | /** Generates base counts from a region of a bam. 450 | * 451 | * @param bam_file input aligment file. 452 | * @param chr bam target name. 453 | * @param start start position of chr to consider. 454 | * @param end end position of chr to consider. 455 | * @param read_group by which to filter alignments. 456 | * @param tag_name by which to filter alignments. 457 | * @param tag_value associated with tag_name 458 | * @param threshold probability filter for excluding calls from counts. 459 | * @param mb BAM code for modified base to report. (e.g. h for 5hmC), or a ChEBI code. 460 | * @param combine combine all modified bases corresponding to same canonical base as mb 461 | * @param max_depth maximum depth of pileup. 462 | * @param min_mapQ minimum mapping quality of reads. 463 | * @returns a pileup data pointer. 464 | * 465 | * The return value can be freed with destroy_plp_data. 
466 | * 467 | */ 468 | plp_data calculate_pileup( 469 | const set_fsets *fsets, const char *chr, int start, int end, 470 | const char *read_group, const char tag_name[2], const int tag_value, 471 | int threshold, mod_base mb, bool combine, int max_depth, int min_mapQ) { 472 | 473 | static bool shown_second_strand_warning = false; 474 | 475 | // counting mod calls other than the one asked for 476 | int rev_in_family = rev_other; 477 | int fwd_in_family = fwd_other; 478 | if (combine) { rev_in_family = rev_mod; fwd_in_family = fwd_mod; } 479 | 480 | // setup bam reading 481 | size_t nfile = fsets->n; 482 | mplp_data **data = xalloc(fsets->n, sizeof(mplp_data*), "bam files"); 483 | for (size_t i = 0; i < nfile; ++i) { 484 | data[i] = create_bam_iter_data( 485 | fsets->fsets[i], chr, start, end, read_group, tag_name, tag_value, min_mapQ); 486 | if (data[i] == NULL) { 487 | // TODO: clean-up all j 0)) { 508 | const char *c_name = data[0]->hdr->target_name[tid]; 509 | if (strcmp(c_name, chr) != 0) continue; 510 | if (pos < start) continue; 511 | if (pos >= end) break; 512 | 513 | pileup->major[n_cols] = pos; // dont need insert columns for this 514 | 515 | // go through all files, and all reads in each 516 | for (size_t file = 0; file < nfile; ++file) { 517 | for (int i = 0; i < n_plp[file]; ++i) { 518 | const bam_pileup1_t *p = plp[file] + i; 519 | if (p->is_refskip) continue; 520 | 521 | // ONT calls are "query based", this means an attempt at a mod call is 522 | // made only if the first-pass canon basecall was the base of interest. 523 | // They are NOT "reference based": a mod call being attempted when the 524 | // query position aligns to a reference position containing the 525 | // of-interest base. (Actually reading between the lines of the spec 526 | // discussions, there was an implied assumption that mod calls are 527 | // always query based). 528 | // 529 | // There are two modes: 530 | // i) "." 
- implicit = 1; Unlisted positions are assumed canonical 531 | // ii) "?" - implicit = 0; Nothing can really be said about unlisted 532 | // 533 | // Case i) is trivial and easy to handle: no mod calls, assume canonical. 534 | // This is like just not having a tag at all. If the above found no mods, 535 | // any query base (ACGT) is assumed canonical 536 | // 537 | // Case ii) is a bit more icky for us. Before deciding canon/no-call we 538 | // need to know if there was even a tag present, e.g. C+m for 5mC. For 539 | // canon base types other than that relating to our mod base, we make 540 | // no claims about modification status: all forms are lumped together. 541 | // 542 | // For the most part ONT callers output `?` and have a call for every 543 | // of-interest base. There are two cases where this isn't true: 544 | // i) Guppy elided some low prob calls (as in the `.` mode) 545 | // ii) callers which specialise to CpG (so don't have an entry for every C) 546 | // 547 | // To complicate things further we can have tags such as "G-m" indicating 548 | // methylation on the second strand of the sequenced read. Such tags ought 549 | // not to occur without a corresponding "C+m" tag: in a simple case this 550 | // would imply a caller had called methylation on the strand that wasn't 551 | // sequenced but not on the strand that was sequenced. A more realistic 552 | // situation would be making calls only on the second strands of duplex reads. 
553 | // 554 | // Here we simplify our lives by restricting to the case of skipping any 555 | // such second strand tags, for the reasons above but also primarily 556 | // because ideally the second strand tag should be jointly interpreted 557 | // with the first strand tag: 558 | // to detect hemimethylation 559 | // understand and correctly report depth 560 | // made hard by them being on different positions 561 | 562 | int base_i = -1; // index into counts matrix 563 | int base_j = bam1_seqi(bam1_seq(p->b), p->qpos); 564 | if (p->is_del) { 565 | // deletions are interesting for counting depth 566 | base_i = bam_is_rev(p->b) ? rev_del : fwd_del; 567 | } else if (!( 568 | (base_j == mb.base_i && !bam_is_rev(p->b)) 569 | || (seqi_rc[base_j] == mb.base_i && bam_is_rev(p->b)))) { 570 | // e.g. if query we're looking for 5mC and qbase in {A,T} 571 | // we'll just count a plain A/T 572 | // NOTE: this test assumes only first strand subtags (e.g. C+m, not C-m) 573 | base_i = num2countbase[bam_is_rev(p->b) ? base_j + 16: base_j]; 574 | } else { 575 | // We have the correct query base for the orientation of the alignment 576 | // so now look for modified bases. 
577 | size_t n_mods = 256; 578 | hts_base_mod_state *mod_state = p->cd.p; 579 | hts_base_mod allmod[n_mods]; 580 | int nm = bam_mods_at_qpos(p->b, p->qpos, mod_state, allmod, n_mods); 581 | if (nm < 0 ) continue; // ignore reads which give error 582 | hts_base_mod mod; 583 | int our_mod = -1; 584 | int best_mod = -1; 585 | int best_score = 0; 586 | int canon_score = MAX_QUAL; // we subtract from this below 587 | if (nm > 0) { 588 | for (int k = 0; k < nm && k < n_mods; ++k) { 589 | mod = allmod[k]; 590 | if (mod.strand == 1) { // second strand tag 591 | if (!shown_second_strand_warning) { 592 | fprintf(stderr, "WARNING: Skipping second strand tag."); 593 | shown_second_strand_warning = true; 594 | } 595 | continue; 596 | } 597 | // our mod 598 | if (mb.code == mod.modified_base || mb.code == -mod.modified_base) { 599 | our_mod = k; 600 | } 601 | // any mod in the family 602 | if (mod.canonical_base == mb.base) { 603 | if (mod.qual > best_score) { best_mod = k; best_score = mod.qual; } 604 | canon_score -= mod.qual; 605 | } 606 | } 607 | } 608 | 609 | // Now analyse scores. Note: ignoring the old lowthreshold here. 610 | if (best_mod != -1) { 611 | // we found some mods, lets not worry about funny mixes 612 | // of calls and no calls i.e. were assuming we have a call 613 | // for all the mods present (implicit non-mod doesn't matter here therefore). 614 | if (canon_score > threshold) { // implied canon score 615 | base_i = num2countbase[bam_is_rev(p->b) ? base_j + 16 : base_j]; 616 | } 617 | else if (best_mod == our_mod) { // the mod requested 618 | base_i = (best_score > threshold) ? 619 | (bam_is_rev(p->b) ? rev_mod : fwd_mod) : 620 | (bam_is_rev(p->b) ? rev_filt : fwd_filt); 621 | } 622 | else { // some other mod in the family 623 | base_i = (best_score > threshold) ? 624 | (bam_is_rev(p->b) ? rev_in_family : fwd_in_family) : // either mod or other depending on combine 625 | (bam_is_rev(p->b) ? 
rev_filt : fwd_filt); 626 | } 627 | } 628 | else { 629 | // we didn't find any mods in the family 630 | // In the case of explicit `?` 631 | // tag we should not assume canonical, otherwise we can. 632 | // NOTE: we don't look for second strand `-` tags. 633 | // or a mess of `?` and `.` for alternative mods 634 | if (query_mod_subtag(mod_state, mb.code, mb.base_i, '+', 0)) { 635 | // we had an explicit tag, but no call for this position 636 | base_i = bam_is_rev(p->b) ? rev_nocall : fwd_nocall; 637 | } 638 | else { 639 | // for everything else theres canonical 640 | base_i = num2countbase[bam_is_rev(p->b) ? base_j + 16 : base_j]; 641 | } 642 | } 643 | } 644 | if (base_i != -1) { // not an ambiguity code 645 | pileup->matrix[major_col + base_i] += 1; 646 | } // read loop 647 | } // file loop 648 | } 649 | major_col += featlen; 650 | n_cols++; 651 | } 652 | pileup->n_cols = n_cols; 653 | 654 | free(plp); 655 | free(n_plp); 656 | bam_mplp_destroy(mplp); 657 | for (size_t i = 0; i < nfile; ++i) { 658 | destroy_bam_iter_data(data[i]); 659 | } 660 | free(data); 661 | 662 | return pileup; 663 | } 664 | 665 | 666 | -------------------------------------------------------------------------------- /src/counts.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_COUNTS_H 2 | #define _MODBAMBED_COUNTS_H 3 | 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | 9 | static const int _INT_MAX = INT_MAX; 10 | 11 | // medaka-style feature data 12 | typedef struct _plp_data { 13 | size_t buffer_cols; 14 | size_t n_cols; 15 | char *rname; 16 | size_t *matrix; 17 | size_t *major; 18 | } _plp_data; 19 | typedef _plp_data *plp_data; 20 | 21 | typedef struct bed_buffer { 22 | int pos; 23 | bool isrev; 24 | size_t depth, cd, md, fd, xd, od; 25 | } bed_buffer; 26 | 27 | // files open for writing outputs 28 | // this buf_size is silly, but its to work around CFFI sillyness 29 | static const size_t _buf_size = 2; 30 | typedef struct 
_output_files { 31 | bool multi; 32 | bool take_all; 33 | bool accumulated; 34 | bool cpg; 35 | bool chh; 36 | bool chg; 37 | FILE *fcpg; 38 | FILE *fchh; 39 | FILE *fchg; 40 | FILE *fcpg_acc; 41 | FILE *fchh_acc; 42 | FILE *fchg_acc; 43 | size_t buf_size; 44 | bed_buffer out_buffer[2]; 45 | size_t motif_offsets[2]; 46 | FILE* motif_acc_files[2]; 47 | } _output_files; 48 | typedef _output_files *output_files; 49 | 50 | 51 | output_files open_bed_files(char* prefix, bool cpg, bool chh, bool chg, bool accumulated); 52 | void close_bed_files(output_files); 53 | // reset state of buffers (to handle loci split by thread blocks) 54 | void init_output_buffers(output_files bed_files); 55 | void flush_output_buffers(output_files bed_files, const char* chr, bool extended, char* feature); 56 | 57 | 58 | // Check sequences for motifs 59 | // CpG 60 | bool extern inline is_cpg_fwd(size_t rpos, int rlen, char* ref); 61 | bool extern inline is_cpg_rev(size_t rpos, int rlen, char* ref); 62 | // CHN 63 | bool extern inline _is_chn_fwd(size_t rpos, int rlen, char* ref); 64 | bool extern inline _is_chn_rev(size_t rpos, int rlen, char* ref); 65 | // CHH 66 | bool extern inline is_chh_fwd(size_t rpos, int rlen, char* ref); 67 | bool extern inline is_chh_rev(size_t rpos, int rlen, char* ref); 68 | // CHG 69 | bool extern inline is_chg_fwd(size_t rpos, int rlen, char* ref); 70 | bool extern inline is_chg_rev(size_t rpos, int rlen, char* ref); 71 | 72 | // medaka-style base encoding - augmented with (a) modified base counts 73 | static const char plp_bases[] = "acgtACGTdDmMfoOfFxX"; // o: "other mod", f:"filtered", x:"no call" 74 | 75 | enum plp_index { 76 | rev_A, rev_C, rev_G, rev_T, 77 | fwd_A, fwd_C, fwd_G, fwd_T, 78 | rev_del, fwd_del, 79 | rev_mod, fwd_mod, 80 | rev_other, fwd_other, 81 | rev_filt, fwd_filt, 82 | rev_nocall, fwd_nocall, 83 | featlen 84 | }; 85 | static const size_t fwdbases[] = 86 | {fwd_A, fwd_C, fwd_G, fwd_T, fwd_del, fwd_mod, fwd_other, fwd_filt, fwd_nocall}; 87 
| static const size_t revbases[] = 88 | {rev_A, rev_C, rev_G, rev_T, rev_del, rev_mod, rev_other, rev_filt, rev_nocall}; 89 | static const size_t numbases = featlen / 2; 90 | 91 | // convert 16bit IUPAC (+16 for strand) to plp_bases index 92 | // e.g. G=4 => fwd_G => plp_bases[6] 93 | static const int num2countbase[32] = { 94 | -1, fwd_A, fwd_C, -1, fwd_G, -1, -1, -1, 95 | fwd_T, -1, -1, -1, -1, -1, -1, -1, 96 | -1, rev_A, rev_C, -1, rev_G, -1, -1, -1, 97 | rev_T, -1, -1, -1, -1, -1, -1, -1, 98 | }; 99 | 100 | 101 | /** Constructs a pileup data structure. 102 | * 103 | * @param buffer_cols maximum number of pileup columns. 104 | * @param rname reference name. 105 | * @see destroy_plp_data 106 | * @returns a plp_data pointer. 107 | * 108 | * The return value can be freed with destroy_plp_data. 109 | * 110 | */ 111 | plp_data create_plp_data(size_t buffer_cols, const char *rname); 112 | 113 | 114 | /** Destroys a pileup data structure. 115 | * 116 | * @param data the object to cleanup. 117 | * @returns void. 118 | * 119 | */ 120 | void destroy_plp_data(plp_data data); 121 | 122 | 123 | /** Prints a pileup data structure. 124 | * 125 | * @param pileup a pileup counts structure. 126 | * @returns void 127 | * 128 | */ 129 | void print_pileup_data(plp_data pileup); 130 | 131 | 132 | /** Prints a pileup data structure as bedmethyl file 133 | * 134 | * @param pileup a pileup counts structure. 135 | * @param ref reference sequence. 136 | * @param rstart starting reference coordinate corresponding to ref. 137 | * @param extended whether to include counts of canonical, modified and filtered bases. 138 | * @param feature name to use for feature column of BED (e.g. 5mC). 139 | * @param canon_base canonical base to match. 140 | * @param bed_files output file handles (and filters). 
141 | * @returns void 142 | * 143 | */ 144 | void print_bedmethyl( 145 | plp_data pileup, char *ref, int rstart, bool extended, 146 | char *feature, char canon_base, output_files bed_files); 147 | 148 | 149 | /** Generates base counts from a region of a bam. 150 | * 151 | * @param bam_file input aligment file. 152 | * @param chr bam target name. 153 | * @param start start position of chr to consider. 154 | * @param end end position of chr to consider. 155 | * @param read_group by which to filter alignments. 156 | * @param tag_name by which to filter alignments. 157 | * @param tag_value associated with tag_name 158 | * @param lowthreshold highest probability to call base as canonical. 159 | * @param highthreshold lowest probablity to call base as modified. 160 | * @param mod_base a mod_base instance 161 | * @param combine combine all modified bases corresponding to same canonical base as mb 162 | * @param max_depth maximum depth of pileup. 163 | * @param min_mapQ 164 | * @returns a pileup data pointer. 165 | * 166 | * The return value can be freed with destroy_plp_data. 
167 | * 168 | */ 169 | plp_data calculate_pileup( 170 | const set_fsets *fsets, const char *chr, int start, int end, 171 | const char *read_group, const char tag_name[2], const int tag_value, 172 | int threshold, mod_base mb, bool combine, int max_depth, int min_mapQ); 173 | 174 | #endif 175 | -------------------------------------------------------------------------------- /src/modbam2bed.c: -------------------------------------------------------------------------------- 1 | // modbam2bed program 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "htslib/sam.h" 11 | #include "htslib/faidx.h" 12 | #include "htslib/thread_pool.h" 13 | 14 | #include "bamiter.h" 15 | #include "common.h" 16 | #include "counts.h" 17 | #include "args.h" 18 | 19 | 20 | typedef struct twarg { 21 | arguments_t args; 22 | const char *chr; 23 | int start; 24 | int end; 25 | } twarg; 26 | 27 | 28 | void *pileup_worker(void *arg) { 29 | twarg j = *(twarg *)arg; 30 | set_fsets *files = create_filesets(j.args.bam); 31 | if (files == NULL) { free(arg); return NULL; } 32 | plp_data pileup = calculate_pileup( 33 | files, j.chr, j.start, j.end, 34 | j.args.read_group, j.args.tag_name, j.args.tag_value, 35 | j.args.threshold, j.args.mod_base, j.args.combine, 36 | j.args.hts_maxcnt, j.args.min_mapQ); 37 | destroy_filesets(files); 38 | free(arg); 39 | return pileup; 40 | } 41 | 42 | 43 | /* Process and print a single region using a threadpool 44 | * 45 | * @param args program arguments. 46 | * @param chr reference sequence to process. 47 | * @param start reference coordinate to process (0-based). 48 | * @param end reference coordiate to process (exclusive). 49 | * @param ref reference sequence. 
50 | * 51 | */ 52 | #ifdef NOTHREADS 53 | void process_region(arguments_t args, const char *chr, int start, int end, char *ref, output_files bed_files) { 54 | fprintf(stderr, "Processing: %s:%d-%d\n", chr, start, end); 55 | set_fsets* files = create_filesets(j.args.bam); 56 | if (files == NULL) return; 57 | plp_data pileup = calculate_pileup( 58 | args.bam, chr, start, end, 59 | args.read_group, args.tag_name, args.tag_value, 60 | args.threshold, args.mod_base, args.combine, 61 | args.hts_maxcnt, args.min_mapQ); 62 | if (pileup == NULL) return; 63 | 64 | init_output_buffers(bed_files); 65 | if (args.pileup) { 66 | print_pileup_data(pileup); 67 | } else { 68 | print_bedmethyl(pileup, ref, 0, args.extended, args.mod_base.abbrev, args.mod_base.base, bed_files); 69 | } 70 | flush_output_buffers(bed_files, chr, args.extended, args.mod_base.abbrev); 71 | destroy_plp_data(pileup); 72 | } 73 | #else 74 | void process_region(arguments_t args, const char *chr, int start, int end, char *ref, output_files bed_files) { 75 | fprintf(stderr, "Processing: %s:%d-%d\n", chr, start, end); 76 | // create thread pool 77 | hts_tpool *p = hts_tpool_init(args.threads); 78 | hts_tpool_process *q = hts_tpool_process_init(p, 2 * args.threads, 0); 79 | hts_tpool_result *r; 80 | const int width = 1000000; 81 | 82 | init_output_buffers(bed_files); 83 | int nregs = 1 + (end - start) / width; float done = 0; 84 | for (int rstart = start; rstart < end; rstart += width) { 85 | twarg *tw_args = xalloc(1, sizeof(*tw_args), "thread worker args"); // freed in worker 86 | tw_args->args = args; 87 | tw_args->chr = chr; tw_args->start = rstart; tw_args->end=min(rstart + width, end); 88 | int blk; 89 | do { 90 | blk = hts_tpool_dispatch2(p, q, pileup_worker, tw_args, 1); 91 | if ((r = hts_tpool_next_result(q))) { 92 | plp_data res = (plp_data)hts_tpool_result_data(r); 93 | if (res != NULL) { 94 | if (args.pileup) { 95 | print_pileup_data(res); 96 | } else { 97 | print_bedmethyl( 98 | res, ref, 0, 99 | 
args.extended, args.mod_base.abbrev, args.mod_base.base, bed_files); 100 | } 101 | destroy_plp_data(res); 102 | done++; 103 | fprintf(stderr, "\r%.1f %%", 100*done/nregs); 104 | } 105 | hts_tpool_delete_result(r, 0); 106 | } 107 | } while (blk == -1); 108 | } 109 | 110 | // wait for jobs, then collect. 111 | hts_tpool_process_flush(q); 112 | while ((r = hts_tpool_next_result(q))) { 113 | plp_data res = (plp_data)hts_tpool_result_data(r); 114 | if (res != NULL) { 115 | if (args.pileup) { 116 | print_pileup_data(res); 117 | } else { 118 | print_bedmethyl( 119 | res, ref, 0, 120 | args.extended, args.mod_base.abbrev, args.mod_base.base, bed_files); 121 | } 122 | destroy_plp_data(res); 123 | done++; 124 | fprintf(stderr, "\r%.1f %%", 100*done/nregs); 125 | } 126 | hts_tpool_delete_result(r, 0); 127 | } 128 | 129 | // finalise any remaining singleton strands 130 | flush_output_buffers(bed_files, chr, args.extended, args.mod_base.abbrev); 131 | 132 | fprintf(stderr, "\r100 %% "); 133 | fprintf(stderr, "\n"); 134 | // clean up pool 135 | hts_tpool_process_destroy(q); 136 | hts_tpool_destroy(p); 137 | } 138 | #endif 139 | 140 | 141 | int main(int argc, char *argv[]) { 142 | clock_t begin = clock(); 143 | arguments_t args = parse_arguments(argc, argv); 144 | fprintf( 145 | stderr, "Analysing: %s (%s, %c>%c)\n", 146 | args.mod_base.name, args.mod_base.abbrev, args.mod_base.base, args.mod_base.code); 147 | #ifdef NOTHREADS 148 | if (args.threads != 1) { 149 | fprintf( 150 | stderr, 151 | "--threads set to %d, but threading not supported by this build.\n", args.threads); 152 | } 153 | #endif 154 | 155 | // large basecaller runs can produce more files than a single 156 | // process can open, check this ahead of time. 
157 | #ifndef WASM 158 | struct rlimit reslimit; 159 | int nfile = 0; for (; args.bam[nfile]; nfile++); 160 | if (getrlimit(RLIMIT_NOFILE, &reslimit) == 0) { 161 | if (nfile * args.threads > reslimit.rlim_cur - 100) { 162 | fprintf(stderr, 163 | "ERROR: Too many BAM files provided (%i). Try running " 164 | "samtools merge on subsets of files to produce fewer files", nfile); 165 | exit(EXIT_FAILURE); 166 | } 167 | } 168 | #endif 169 | 170 | // open output files, sort out filter options 171 | output_files bed_files = open_bed_files( 172 | args.prefix, args.cpg, args.chh, args.chg, args.accumulated); 173 | 174 | // load ref sequence 175 | faidx_t *fai = fai_load(args.ref); 176 | if (fai == NULL) { 177 | fprintf(stderr, 178 | "ERROR: Failed to parse reference file\n"); 179 | exit(EXIT_FAILURE); 180 | } 181 | if (args.region == NULL) { 182 | // process all regions 183 | int nseq = faidx_nseq(fai); 184 | for (int i = 0; i < nseq; ++i) { 185 | const char *chr = faidx_iseq(fai, i); 186 | int len = faidx_seq_len(fai, chr); 187 | int alen; 188 | char *ref = faidx_fetch_seq(fai, chr, 0, len, &alen); 189 | if (!args.mask) { 190 | for (size_t i=0; i 1) 54 | 55 | def test_040_nonexisting_chrom(self): 56 | with ModBam(test_bam) as bam: 57 | reads = list(bam.reads("ecoli1xx", 0, 4000000)) 58 | assert(reads == []) 59 | -------------------------------------------------------------------------------- /test/test_motifs.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import unittest 3 | 4 | NOT_G = 'ACTMWYH' 5 | NOT_C = 'AGTRWKD' 6 | MAYBE_G = 'GRSKVDBN' 7 | MAYBE_C = 'CMQYVHBN' 8 | 9 | 10 | from modbampy import libbam 11 | 12 | class MotifTest(unittest.TestCase): 13 | 14 | def test_001_is_cpg(self): 15 | is_cpg_fwd = libbam.is_cpg_fwd 16 | is_cpg_rev = libbam.is_cpg_rev 17 | 18 | # forward 19 | assert is_cpg_fwd(3, 8, b"AAACGAAA") == True 20 | assert is_cpg_fwd(3, 8, b"AAACTAAA") == False 21 | # reverse 22 | assert 
is_cpg_rev(4, 8, b"AAACGAAA") == True 23 | assert is_cpg_rev(4, 8, b"AAACTAAA") == False 24 | # end, but overrun 25 | assert is_cpg_fwd(6, 7, b"AAAAAACG") == False 26 | assert is_cpg_fwd(6, 8, b"AAAAAACG") == True 27 | # don't break 28 | assert is_cpg_rev(0, 8, b"AAAAAACG") == False 29 | 30 | 31 | def test_010_is_chh(self): 32 | is_chh_fwd = libbam.is_chh_fwd 33 | is_chh_rev = libbam.is_chh_rev 34 | 35 | # forward 36 | for b1, b2 in itertools.product(NOT_G, repeat=2): 37 | assert is_chh_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == True 38 | for b1, b2 in itertools.product(MAYBE_G, repeat=2): 39 | assert is_chh_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == False 40 | for b1, b2 in itertools.product(NOT_G, MAYBE_G): 41 | assert is_chh_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == False 42 | assert is_chh_fwd(3, 8, f"AAAC{b2}{b1}AA".encode()) == False 43 | 44 | # reverse 45 | for b1, b2 in itertools.product(NOT_C, repeat=2): 46 | assert is_chh_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == True 47 | for b1, b2 in itertools.product(MAYBE_C, repeat=2): 48 | assert is_chh_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == False 49 | for b1, b2 in itertools.product(NOT_C, MAYBE_C): 50 | assert is_chh_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == False 51 | assert is_chh_rev(5, 8, f"AAA{b2}{b1}GAA".encode()) == False 52 | 53 | # end, but overrun 54 | assert is_chh_fwd(5, 7, b"AAAAACHH") == False 55 | assert is_chh_fwd(5, 8, b"AAAAACHH") == True 56 | 57 | # don't break 58 | for i in 0, 1: 59 | assert is_chh_rev(5, 7, b"AAAAACHH") == False 60 | 61 | 62 | def test_020_is_chg(self): 63 | is_chg_fwd = libbam.is_chg_fwd 64 | is_chg_rev = libbam.is_chg_rev 65 | 66 | # forward 67 | for b1 in NOT_G: 68 | assert is_chg_fwd(3, 8, f"AAAC{b1}GAA".encode()) == True 69 | for b1 in MAYBE_G: 70 | assert libbam.is_chg_fwd(3, 8, f"AAAC{b1}GAA".encode()) == False 71 | for b1, b2 in itertools.product(MAYBE_G, NOT_G + MAYBE_G): 72 | assert is_chg_fwd(3, 8, f"AAAC{b1}{b2}AA".encode()) == False 73 | 74 | # reverse 75 | for b1 in 
NOT_C: 76 | assert is_chg_rev(5, 8, f"AAAC{b1}GAA".encode()) == True 77 | for b1 in MAYBE_C: 78 | assert is_chg_rev(5, 8, f"AAAC{b1}GAA".encode()) == False 79 | for b1, b2 in itertools.product(NOT_C + MAYBE_C, MAYBE_C): 80 | assert is_chg_rev(5, 8, f"AAA{b1}{b2}GAA".encode()) == False 81 | 82 | # end, but overrun 83 | assert is_chg_fwd(5, 7, b"AAAAACHG") == False 84 | assert is_chg_fwd(5, 8, b"AAAAACHG") == True 85 | 86 | # don't break 87 | for i in 0, 1: 88 | assert is_chg_rev(5, 7, b"AAAAACHG") == False 89 | -------------------------------------------------------------------------------- /test_data/400ecoli.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/400ecoli.bam -------------------------------------------------------------------------------- /test_data/400ecoli.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/400ecoli.bam.bai -------------------------------------------------------------------------------- /test_data/ecoli.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/ecoli.fasta.gz -------------------------------------------------------------------------------- /test_data/tag_codes.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/tag_codes.bam -------------------------------------------------------------------------------- /test_data/tag_codes.bam.bai: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/epi2me-labs/modbam2bed/d5b4d4800a4ee05040e89e386304d7334f13eb60/test_data/tag_codes.bam.bai --------------------------------------------------------------------------------