├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── feature_request.md │ └── bug_report.md └── workflows │ ├── dnmtools_distcheck_ubuntu.yml │ ├── dnmtools_build_ubuntu.yml │ ├── dnmtools_build_macos.yml │ ├── dnmtools_release_linux.yml │ └── dnmtools_release_macos.yml ├── docs ├── content │ ├── environment.yaml │ ├── Makefile │ ├── merge-bsrate.md │ ├── cleanhp.md │ ├── liftfilter.md │ ├── hmr-rep.md │ ├── allelic.md │ ├── amrtester.md │ ├── radmerge.md │ ├── selectsites.md │ ├── entropy.md │ ├── radadjust.md │ ├── states.md │ ├── uniq.md │ ├── diff.md │ ├── sym.md │ ├── fastlift.md │ ├── counts-nano.md │ ├── hypermr.md │ ├── multistat.md │ ├── visualization.md │ ├── guessprotocol.md │ ├── levels.md │ └── amrfinder.md ├── requirements.txt ├── dnmtools_bash_completion ├── README.md └── mkdocs.yml ├── data ├── reads_1.fq.gz ├── reads_2.fq.gz ├── araTha1_simulated.counts.gz ├── pmd_test_data.counts.sym.gz ├── make_full_license_info_header.sh ├── radmeth_test_design.txt ├── tRex1_promoters.bed ├── config.h.in ├── md5sum.txt ├── methylome_a.counts.sym └── methylome_b.counts.sym ├── pipeline ├── config.yaml ├── runconfig.yaml └── Dockerfile ├── iwyu.json ├── .readthedocs.yaml ├── .gitmodules ├── .clang-format ├── test_scripts ├── test_abismalidx.test ├── test_hmr.test ├── test_sym.test ├── test_levels.test ├── test_pmd.test ├── test_hypermr.test ├── test_diff.test ├── test_xcounts.test ├── test_unxcounts.test ├── test_amrfinder.test ├── test_bsrate.test ├── test_radmeth.test ├── test_uniq.test ├── test_selectsites.test ├── test_counts.test ├── test_states.test ├── test_roi.test ├── test_format.test ├── test_mlml.test └── test_abismal.test ├── src ├── common │ ├── dnmtools_utils.hpp │ ├── dnmtools_gaussinv.hpp │ ├── dnmtools_utils.cpp │ ├── CMakeLists.txt │ ├── numerical_utils.hpp │ ├── dnmt_error.hpp │ ├── Smoothing.hpp │ ├── BetaBin.hpp │ ├── xcounts_utils.hpp │ ├── numerical_utils.cpp │ ├── Epiread.hpp │ ├── counts_header.hpp │ ├── Interval.cpp │ ├── 
EmissionDistribution.hpp │ ├── bsutils.cpp │ ├── Interval.hpp │ ├── bsutils.hpp │ ├── Interval6.hpp │ ├── EpireadStats.hpp │ ├── Interval6.cpp │ ├── ThreeStateHMM.hpp │ ├── Epiread.cpp │ └── BetaBin.cpp ├── radmeth │ ├── radmeth_optimize_params.hpp │ ├── radmeth_optimize_series.hpp │ ├── radmeth_optimize_gamma.hpp │ ├── radmeth_utils.hpp │ ├── CMakeLists.txt │ ├── README.md │ └── radmeth_design.hpp ├── mlml │ └── CMakeLists.txt ├── utils │ ├── CMakeLists.txt │ └── lift-filter.cpp ├── analysis │ └── CMakeLists.txt ├── amrfinder │ └── CMakeLists.txt └── CMakeLists.txt ├── CPPLINT.cfg ├── .cppcheck_suppress ├── cmake ├── FindLIBDEFLATE.cmake └── static_analysis.cmake ├── CMakeLists.txt ├── autogen.sh ├── Dockerfile ├── .clang-tidy ├── MAINTAINERS.md └── README.md /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /docs/content/environment.yaml: -------------------------------------------------------------------------------- 1 | name: docs 2 | dependencies: 3 | - Jinja2>3.1.4 4 | -------------------------------------------------------------------------------- /data/reads_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/reads_1.fq.gz -------------------------------------------------------------------------------- /data/reads_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/reads_2.fq.gz -------------------------------------------------------------------------------- /data/araTha1_simulated.counts.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/araTha1_simulated.counts.gz 
-------------------------------------------------------------------------------- /data/pmd_test_data.counts.sym.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/pmd_test_data.counts.sym.gz -------------------------------------------------------------------------------- /data/make_full_license_info_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | input=$1 4 | 5 | echo 'static const char *license_text = R"(' 6 | cat "$input" 7 | echo ')";' 8 | -------------------------------------------------------------------------------- /pipeline/config.yaml: -------------------------------------------------------------------------------- 1 | dnmtools_dir: '/home/username/bin' 2 | trim_galore_dir: '/usr/bin' 3 | samtools_dir: '/usr/bin' 4 | scratch_dir: '/tmp' 5 | threads: 6 6 | -------------------------------------------------------------------------------- /data/radmeth_test_design.txt: -------------------------------------------------------------------------------- 1 | base sex factor 2 | sample_FA1 1 1 1 3 | sample_FA2 1 1 1 4 | sample_FB1 1 1 0 5 | sample_FB2 1 1 0 6 | sample_MA1 1 0 1 7 | sample_MA2 1 0 1 8 | sample_MB1 1 0 0 9 | sample_MB2 1 0 0 10 | -------------------------------------------------------------------------------- /pipeline/runconfig.yaml: -------------------------------------------------------------------------------- 1 | outfiles: 2 | - 'test.hmr' 3 | - 'test.bsrate' 4 | - 'test.hypermr' 5 | - 'test.levels' 6 | - 'test.pmd' 7 | - 'test.amr' 8 | 9 | genome_fasta_file: '/home/username/data/genome.fa' 10 | paired: True 11 | cpg_only: True 12 | -------------------------------------------------------------------------------- /iwyu.json: -------------------------------------------------------------------------------- 1 | [ 2 | { "include": ["", "private", "", "public"] }, 3 | { "include": 
["@[\"<]htslib/kstring.h[\">]", "private", "", "public"] }, 4 | { "include": ["@[\"<]htslib/hts.h[\">]", "private", "", "public"] }, 5 | ] 6 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-20.04 4 | tools: 5 | python: "3.9" 6 | 7 | mkdocs: 8 | configuration: docs/mkdocs.yml 9 | fail_on_warning: false 10 | 11 | python: 12 | install: 13 | - requirements: docs/requirements.txt 14 | 15 | formats: 16 | - pdf 17 | - epub 18 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/smithlab_cpp"] 2 | path = src/smithlab_cpp 3 | url = ../smithlab_cpp.git 4 | ignore = dirty 5 | [submodule "src/abismal"] 6 | path = src/abismal 7 | url = ../abismal.git 8 | ignore = dirty 9 | [submodule "src/bamxx"] 10 | path = src/bamxx 11 | url = ../bamxx.git 12 | ignore = dirty 13 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | ColumnLimit: 80 3 | IndentWidth: 2 4 | AlwaysBreakAfterReturnType: All 5 | ContinuationIndentWidth: 2 6 | ConstructorInitializerIndentWidth: 2 7 | BraceWrapping: 8 | BeforeElse: true 9 | BeforeCatch: true 10 | BreakBeforeBraces: Custom 11 | BreakConstructorInitializers: AfterColon 12 | SpacesBeforeTrailingComments: 2 13 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Jinja2>=3.1.4 2 | mkdocs>=1.3.1 3 | babel>=2.9.0 4 | click>=7.0 5 | Markdown>=3.2.1,<3.4 6 | PyYAML>=5.2 7 | watchdog>=2.0.0 8 | mdx_gh_links>=0.2 9 | ghp-import>=1.0 10 | pyyaml_env_tag>=0.1 11 
| mkdocs-redirects>=1.0.1 12 | importlib_metadata>=4.3 13 | packaging>=20.5 14 | mergedeep>=1.3.4 15 | pygments>=2.12 16 | pymdown-extensions 17 | mkdocs-material 18 | -------------------------------------------------------------------------------- /data/tRex1_promoters.bed: -------------------------------------------------------------------------------- 1 | chr1 42178 44225 2 | chr1 45867 47867 3 | chr1 113357 115357 4 | chr1 195388 197388 5 | chr1 288263 290263 6 | chr1 320602 322602 7 | chr1 332945 334945 8 | chr1 456998 458998 9 | chr1 481945 483998 10 | chr2 144282 146282 11 | chr2 243609 245609 12 | chr2 270268 272434 13 | chr2 323639 325760 14 | chr2 373828 376724 15 | chr2 495275 497290 16 | -------------------------------------------------------------------------------- /test_scripts/test_abismalidx.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/tRex1.fa 4 | outfile=tests/tRex1.idx 5 | if [[ -e "${infile}" ]]; then 6 | ./dnmtools abismalidx ${infile} ${outfile} 7 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 8 | if [[ "${x}" != "OK" ]]; then 9 | exit 1; 10 | fi 11 | else 12 | echo "${infile} not found; skipping remaining tests"; 13 | exit 77; 14 | fi 15 | -------------------------------------------------------------------------------- /test_scripts/test_hmr.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/reads.counts.sym 4 | outfile=tests/reads.hmr 5 | if [[ -e "${infile}" ]]; then 6 | ./dnmtools hmr -v -o ${outfile} ${infile} 7 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 8 | if [[ "${x}" != "OK" ]]; then 9 | exit 1; 10 | fi 11 | else 12 | echo "${infile} not found; skipping remaining tests"; 13 | exit 77; 14 | fi 15 | -------------------------------------------------------------------------------- /test_scripts/test_sym.test: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/reads.counts 4 | outfile=tests/reads.counts.sym 5 | if [[ -e "${infile}" ]]; then 6 | ./dnmtools sym -o ${outfile} ${infile} 7 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 8 | if [[ "${x}" != "OK" ]]; then 9 | exit 1; 10 | fi 11 | else 12 | echo "${infile} not found; skipping remaining tests"; 13 | exit 77; 14 | fi 15 | -------------------------------------------------------------------------------- /test_scripts/test_levels.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/reads.counts 4 | outfile=tests/reads.levels 5 | if [[ -e "${infile}" ]]; then 6 | ./dnmtools levels -v -o ${outfile} ${infile} 7 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 8 | if [[ "${x}" != "OK" ]]; then 9 | exit 1; 10 | fi 11 | else 12 | echo "${infile} not found; skipping remaining tests"; 13 | exit 77; 14 | fi 15 | -------------------------------------------------------------------------------- /docs/dnmtools_bash_completion: -------------------------------------------------------------------------------- 1 | _dnmtools() 2 | { 3 | local cur prev opts 4 | COMPREPLY=() 5 | cur="${COMP_WORDS[COMP_CWORD]}" 6 | prev="${COMP_WORDS[COMP_CWORD-1]}" 7 | 8 | opts=`dnmtools | grep "^ " | awk '{print $1}' | tr -d ':'` 9 | 10 | case $prev in 11 | dnmtools) 12 | COMPREPLY=( $(compgen -f -W "${opts}" -- "$cur") ) 13 | ;; 14 | esac 15 | return 0 16 | } 17 | complete -F _dnmtools -o default dnmtools 18 | -------------------------------------------------------------------------------- /test_scripts/test_pmd.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/pmd_test_data.counts.sym.gz 4 | outfile=tests/methylome.pmd 5 | if [[ -e "${infile}" ]]; then 6 | ./dnmtools pmd 
-o ${outfile} ${infile} 7 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 8 | if [[ "${x}" != "OK" ]]; then 9 | exit 1; 10 | fi 11 | else 12 | echo "${infile} not found; skipping remaining tests"; 13 | exit 77; 14 | fi 15 | -------------------------------------------------------------------------------- /test_scripts/test_hypermr.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/araTha1_simulated.counts.gz 4 | outfile=tests/araTha1_simulated.hypermr 5 | if [[ -e "${infile}" ]]; then 6 | ./dnmtools hypermr -o ${outfile} ${infile} 7 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 8 | if [[ "${x}" != "OK" ]]; then 9 | exit 1; 10 | fi 11 | else 12 | echo "${infile} not found; skipping test"; 13 | exit 77; 14 | fi 15 | -------------------------------------------------------------------------------- /test_scripts/test_diff.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/methylome_a.counts.sym 4 | infile2=tests/methylome_b.counts.sym 5 | outfile=tests/methylome_ab.diff 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then # diff needs both inputs; skip (77) unless both exist 7 | ./dnmtools diff -o ${outfile} ${infile1} ${infile2} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | else 13 | echo "input missing; skipping test"; 14 | exit 77; 15 | fi 16 | -------------------------------------------------------------------------------- /test_scripts/test_xcounts.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/reads.counts 4 | infile2=tests/tRex1.fa 5 | outfile=tests/reads.xcounts 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools xcounts -c ${infile2} -o ${outfile} ${infile1} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | 
cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | else 13 | echo "xcounts input file(s) not found; skipping test"; 14 | exit 77; 15 | fi 16 | -------------------------------------------------------------------------------- /test_scripts/test_unxcounts.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/reads.xcounts 4 | infile2=tests/tRex1.fa 5 | outfile=tests/reads.unxcounts 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools unxcounts -c ${infile2} -o ${outfile} ${infile1} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | else 13 | echo "unxcounts input file not found; skipping remaining tests"; 14 | exit 77; 15 | fi 16 | -------------------------------------------------------------------------------- /test_scripts/test_amrfinder.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/two_epialleles.states 4 | infile2=tests/tRex1.fa 5 | outfile=tests/two_epialleles.amr 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools amrfinder -v -c ${infile2} -o ${outfile} ${infile1} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | else 13 | echo "amrfinder input file(s) not found; skipping test"; 14 | exit 77; 15 | fi 16 | -------------------------------------------------------------------------------- /test_scripts/test_bsrate.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/reads.fmt.srt.uniq.sam 4 | infile2=tests/tRex1.fa 5 | outfile=tests/reads.bsrate 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools bsrate -c ${infile2} -o ${outfile} ${infile1} 8 | x=$(md5sum -c tests/md5sum.txt | grep 
"${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | else 13 | echo "${infile1} and ${infile2} not found; skipping dependent tests"; 14 | exit 77; 15 | fi 16 | -------------------------------------------------------------------------------- /test_scripts/test_radmeth.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/radmeth_test_table.txt 4 | infile2=tests/radmeth_test_design.txt 5 | outfile=tests/radmeth_test_output.txt 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools radmeth -o ${outfile} -f factor ${infile2} ${infile1} 8 | x=$(wc -l tests/radmeth_test_output.txt | awk '$1 == 17903 {print "OK"}') 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | else 13 | echo "radmeth input file(s) not found; skipping test"; 14 | exit 77; 15 | fi 16 | -------------------------------------------------------------------------------- /test_scripts/test_uniq.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/reads.fmt.srt.sam 4 | outfile1=tests/reads.fmt.srt.uniq.sam 5 | outfile2=tests/reads.ustats 6 | if [[ -e "${infile}" ]]; then 7 | ./dnmtools uniq -v -S ${outfile2} ${infile} ${outfile1} 8 | x1=$(md5sum -c tests/md5sum.txt | grep "${outfile1}:" | cut -d ' ' -f 2) 9 | x2=$(md5sum -c tests/md5sum.txt | grep "${outfile2}:" | cut -d ' ' -f 2) 10 | if [[ "${x1}" != "OK" || "${x2}" != "OK" ]]; then 11 | exit 1; 12 | fi 13 | else 14 | echo "${infile} not found; skipping dependent tests"; 15 | exit 77; 16 | fi 17 | -------------------------------------------------------------------------------- /test_scripts/test_selectsites.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/tRex1_promoters.bed 4 | infile2=tests/reads.counts 5 | outfile=tests/reads.counts.select 6 | if [[ -e 
"${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools selectsites -o ${outfile} ${infile1} ${infile2} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | elif [[ ! -e "${infile1}" ]]; then 13 | echo "${infile1} not found; skipping remaining tests"; 14 | exit 77; 15 | else # if [[ ! -e "${infile2}" ]] 16 | echo "${infile2} not found; skipping remaining tests"; 17 | exit 77; 18 | fi 19 | -------------------------------------------------------------------------------- /test_scripts/test_counts.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/reads.fmt.srt.uniq.sam 4 | infile2=tests/tRex1.fa 5 | outfile=tests/reads.counts 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools counts -v -o ${outfile} -c ${infile2} ${infile1} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | elif [[ ! -e "${infile1}" ]]; then 13 | echo "${infile1} not found; skipping remaining tests"; 14 | exit 77; 15 | else # if [[ ! -e "${infile2}" ]] 16 | echo "${infile2} not found; skipping remaining tests"; 17 | exit 77; 18 | fi 19 | -------------------------------------------------------------------------------- /test_scripts/test_states.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/reads.fmt.srt.uniq.sam 4 | infile2=tests/tRex1.fa 5 | outfile=tests/reads.epiread 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools states -v -o ${outfile} -c ${infile2} ${infile1} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | elif [[ ! -e "${infile1}" ]]; then 13 | echo "${infile1} not found; skipping remaining tests"; 14 | exit 77; 15 | else # if [[ ! -e "${infile2}" ]] 16 | echo "${infile2} not found; 
skipping remaining tests"; 17 | exit 77; 18 | fi 19 | -------------------------------------------------------------------------------- /test_scripts/test_roi.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/reads.counts.sym 4 | infile2=tests/tRex1_promoters.bed 5 | outfile=tests/tRex1_promoters.roi.bed 6 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then 7 | ./dnmtools roi -v -M -o ${outfile} ${infile2} ${infile1} 8 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 9 | if [[ "${x}" != "OK" ]]; then 10 | exit 1; 11 | fi 12 | elif [[ ! -e "${infile1}" ]]; then 13 | echo "${infile1} not found; skipping remaining tests"; 14 | exit 77; 15 | else # if [[ ! -e "${infile2}" ]] 16 | echo "${infile2} not found; skipping remaining tests"; 17 | exit 77; 18 | fi 19 | -------------------------------------------------------------------------------- /pipeline/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | ## Copy this file to the directory where you have the dnmtools binary. Make sure it was built on linux x86_64. 
4 | 5 | FROM ubuntu:22.04 6 | 7 | # install pipeline dependencies 8 | RUN apt-get update 9 | RUN apt-get install -y libgsl-dev libhts-dev libgomp1 samtools libcurl4 trim-galore sra-toolkit rsync 10 | RUN rm -rf /var/lib/apt/lists/* 11 | RUN rsync -a hgdownload.soe.ucsc.edu::genome/admin/exe/linux.x86_64/bedToBigBed /usr/bin 12 | RUN rsync -a hgdownload.soe.ucsc.edu::genome/admin/exe/linux.x86_64/wigToBigWig /usr/bin 13 | 14 | # install dnmtools; the binary must be built for Ubuntu 15 | COPY dnmtools /usr/bin 16 | -------------------------------------------------------------------------------- /data/config.h.in: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * This program is free software: you can redistribute it and/or modify it 4 | * under the terms of the GNU General Public License as published by the Free 5 | * Software Foundation, either version 3 of the License, or (at your option) 6 | * any later version. 7 | * 8 | * This program is distributed in the hope that it will be useful, but WITHOUT 9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | * more details. 12 | */ 13 | 14 | #define PROJECT_NAME "@PROJECT_NAME@" 15 | #define VERSION "@PROJECT_VERSION@" 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /docs/content/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/content/merge-bsrate.md: -------------------------------------------------------------------------------- 1 | # merge-bsrate - Combine bisulfite conversion rate statistics files 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools merge-bsrate [OPTIONS] ... 6 | ``` 7 | 8 | ## Description 9 | 10 | Given several bisulfite conversion summary statistics generated using 11 | the [bsrate](../bsrate) program, the `merge-bsrate` utility 12 | combines their information. 
This is usually useful if your dataset has 13 | been split into multiple files and processed in parallel, after which 14 | one would like to combine the summaries of separate runs. 15 | 16 | ## Options 17 | 18 | ```txt 19 | -o -output 20 | ``` 21 | output file (default : STDOUT) 22 | 23 | ```txt 24 | -v -verbose 25 | ``` 26 | print more run info to STDERR as the program runs. 27 | -------------------------------------------------------------------------------- /test_scripts/test_format.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile=tests/reads.sam 4 | outfile1=tests/reads.fmt.sam 5 | outfile2=tests/reads.fmt.srt.sam 6 | cmd=samtools 7 | if [[ -e "${infile}" ]]; then 8 | ./dnmtools format -f abismal ${infile} ${outfile1} 9 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile1}:" | cut -d ' ' -f 2) 10 | ### ADS: only want to check the first output here; any failure 11 | ### later will result in a skip for subsequent tests. 12 | if [[ "${x}" != "OK" ]]; then 13 | exit 1; 14 | fi 15 | else 16 | echo "${infile} not found; skipping remaining tests"; 17 | exit 77; 18 | fi 19 | 20 | if [[ -e $(type -P "${cmd}") ]]; then 21 | samtools sort --no-PG -O SAM -o ${outfile2} ${outfile1}; 22 | else 23 | echo "${cmd} not found" 24 | fi 25 | -------------------------------------------------------------------------------- /test_scripts/test_mlml.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | archive=tests/mlml_test_data.tgz 4 | if [[ ! 
-e "${archive}" ]] ; then 5 | echo "input missing; skipping test"; 6 | exit 77; 7 | fi 8 | 9 | infile1=tests/bs.counts 10 | infile2=tests/tab.counts 11 | infile3=tests/oxbs.counts 12 | outfile=tests/mlml.out 13 | tar -xf ${archive} -C tests 14 | if [[ -e "${infile1}" && -e "${infile2}" && -e "${infile3}" ]]; then # all three inputs are passed to mlml below 15 | ./dnmtools mlml -o ${outfile} -bsseq ${infile1} -tabseq ${infile2} -oxbsseq ${infile3} 16 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 17 | rm -f ${infile1} ${infile2} ${infile3} 18 | if [[ "${x}" != "OK" ]]; then 19 | exit 1; 20 | fi 21 | else 22 | echo "input missing; skipping test"; 23 | rm -f ${infile1} ${infile2} ${infile3} 24 | exit 77; 25 | fi 26 | -------------------------------------------------------------------------------- /src/common/dnmtools_utils.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2019-2023 Andrew D. Smith 2 | * 3 | * Authors: Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 
14 | */ 15 | 16 | #ifndef DNMTOOLS_UTILS_HPP 17 | #define DNMTOOLS_UTILS_HPP 18 | 19 | #include 20 | 21 | auto 22 | get_command_line(const int argc, 23 | char *argv[]) -> std::string; // NOLINT(*-c-arrays) 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /test_scripts/test_abismal.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | infile1=tests/reads_1.fq.gz 4 | infile2=tests/reads_2.fq.gz 5 | infile3=tests/tRex1.idx 6 | outfile1=tests/reads.sam 7 | outfile2=tests/reads.mstats 8 | if [[ -e "${infile1}" && -e "${infile2}" && -e "${infile3}" ]]; then 9 | ./dnmtools abismal -s ${outfile2} -o ${outfile1} \ 10 | -i ${infile3} ${infile1} ${infile2}; 11 | x1=$(md5sum -c tests/md5sum.txt | grep "${outfile1}:" | cut -d ' ' -f 2) 12 | x2=$(md5sum -c tests/md5sum.txt | grep "${outfile2}:" | cut -d ' ' -f 2) 13 | if [[ "${x1}" != "OK" || "${x2}" != "OK" ]]; then 14 | exit 1; 15 | fi 16 | elif [[ ! -e "${infile1}" || ! -e "${infile2}" ]]; then 17 | echo "missing fastq input file(s); skipping remaining tests"; 18 | exit 77; 19 | else ## if [[ ! 
-e "${infile3}" ]]; then 20 | echo "missing index file; skipping remaining tests"; 21 | exit 77; 22 | fi 23 | -------------------------------------------------------------------------------- /docs/content/cleanhp.md: -------------------------------------------------------------------------------- 1 | # cleanhp - Remove hairpin reads 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools cleanhp [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | ## Options 11 | 12 | ```txt 13 | -o, -output 14 | ``` 15 | output filename prefix [required] 16 | ```txt 17 | -s, -stat 18 | ``` 19 | stats output filename [required] 20 | ```txt 21 | -h, -hairpin 22 | ``` 23 | maximum hairpin rate 24 | ```txt 25 | -check 26 | ``` 27 | check for hairpin contamination 28 | ```txt 29 | -n, -nreads 30 | ``` 31 | number of reads in initial check 32 | ```txt 33 | -c, -cutoff 34 | ``` 35 | cutoff for calling an inverse duplication (default: 0.95) 36 | ```txt 37 | -i, -ignore 38 | ``` 39 | length of read name suffix to ignore when matching 40 | ```txt 41 | -v, -verbose 42 | ``` 43 | print more run info to the terminal while the program is running 44 | ```txt 45 | -h, -hist 46 | ``` 47 | write a histogram of hairpin matches to this file 48 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # DNMTools documentation 2 | 3 | This is the documentation for DNMTools that uses 4 | [mkdocs](https://mkdocs.readthedocs.io) to generate readthedocs pages. 5 | The public web version of this documentation is available at 6 | [dnmtools.readthedocs.io](https://dnmtools.readthedocs.io), but for 7 | users who wish to see the documentation on a web browser offline, you 8 | can build the documentation locally as described below. 
9 | 10 | ### Dependencies 11 | 12 | To build the documentation locally, install mkdocs 13 | 14 | ``` 15 | pip install -U mkdocs 16 | ``` 17 | 18 | ### Local compilation 19 | 20 | Build the HTML documentation by running 21 | ``` 22 | mkdocs build -f docs/mkdocs.yml 23 | ``` 24 | which will create a `site` directory where markdown files are 25 | converted to HTML 26 | 27 | Create a local host for the HTML documentation by running 28 | 29 | ``` 30 | mkdocs serve -f docs/mkdocs.yml 31 | ``` 32 | 33 | This will create the documentation, usually at http://localhost:8000 . 34 | -------------------------------------------------------------------------------- /docs/content/liftfilter.md: -------------------------------------------------------------------------------- 1 | # liftfilter - merge lifted entries to the same position 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools liftfilter [OPTIONS] -o 6 | ``` 7 | 8 | ## Description 9 | 10 | The [fastlift](../fastlift) program may report multiple mm9 11 | sites mapped to a same position in hg19. In this situation, we may 12 | either collapse read counts at those mm9 sites, or keep the data for 13 | only one mm9 site. We can use the lift-filter program to achieve these 14 | two options. Use 15 | 16 | ```shell 17 | $ dnmtools liftfilter -o output-filtered.meth input.meth 18 | ``` 19 | 20 | to merge data from mm9 sites lifted to the same hg19 position. Use the 21 | option `-u` to keep the first record of duplicated sites. 22 | 23 | ## Options 24 | 25 | ```txt 26 | -o, -output 27 | ``` 28 | Output processed methcount [required] 29 | ```txt 30 | -u, -unique 31 | ``` 32 | keep unique sites 33 | ```txt 34 | -v, -verbose 35 | ``` 36 | print more information to STDERR as the program runs. 
37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /src/radmeth/radmeth_optimize_params.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * Author: Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) 8 | * any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for 13 | * more details. 14 | */ 15 | 16 | #ifndef RADMETH_OPTIMIZE_PARAMS_HPP 17 | #define RADMETH_OPTIMIZE_PARAMS_HPP 18 | 19 | #include 20 | 21 | namespace radmeth_optimize_params { 22 | inline double tolerance = 1e-4; 23 | inline double stepsize = 0.01; 24 | inline std::uint32_t max_iter = 250; 25 | }; // namespace radmeth_optimize_params 26 | 27 | #endif // RADMETH_OPTIMIZE_PARAMS_HPP 28 | -------------------------------------------------------------------------------- /src/radmeth/radmeth_optimize_series.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D. 2 | * 3 | * Author: Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) 8 | * any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | * more details. 14 | */ 15 | 16 | #ifndef RADMETH_OPTIMIZE_SERIES_HPP 17 | #define RADMETH_OPTIMIZE_SERIES_HPP 18 | 19 | #include 20 | #include 21 | 22 | template struct Regression; 23 | 24 | void 25 | fit_regression_model(Regression &r, 26 | std::vector &p_estimates, 27 | double &dispersion_estimate); 28 | 29 | #endif // RADMETH_OPTIMIZE_SERIES_HPP 30 | -------------------------------------------------------------------------------- /CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | # This file is part of dnmtools 2 | # 3 | # Copyright (C) 2023-2025 Andrew D. Smith 4 | # 5 | # Authors: Andrew D. 
Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | set noparent 17 | filter=-runtime/references 18 | filter=-build/include_subdir 19 | filter=-build/include_order 20 | filter=-build/c++11 21 | filter=-build/c++17 22 | # Formatting below handled by clang-format 23 | filter=-whitespace/line_length 24 | filter=-whitespace/newline 25 | filter=-readability/braces 26 | filter=-whitespace/semicolon 27 | filter=-whitespace/indent 28 | filter=-whitespace/braces 29 | filter=-whitespace/parens 30 | filter=-readability/nolint 31 | -------------------------------------------------------------------------------- /.github/workflows/dnmtools_distcheck_ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: DNMTools distcheck (Ubuntu) 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: [ "master" ] 7 | pull_request: 8 | branches: [ "master" ] 9 | 10 | jobs: 11 | distcheck: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: recursive 17 | - name: Install dependencies 18 | run: | 19 | sudo apt-get update 20 | sudo apt-get install -y \ 21 | libgsl-dev \ 22 | libcurl4-gnutls-dev \ 23 | libdeflate-dev \ 24 | liblzma-dev \ 25 | zlib1g-dev \ 26 | libbz2-dev 27 | - name: Build and install htslib (for recent version) 28 | run: | 29 | git clone --recursive https://github.com/samtools/htslib.git 30 | cd htslib 31 | make -j4 32 | sudo make install prefix=/usr 33 | - name: Generate configure script 34 | run: 
./autogen.sh 35 | - name: configure 36 | run: ./configure 37 | - name: Generate the source archive 38 | run: make dist 39 | - name: make distcheck 40 | run: make -j4 distcheck 41 | -------------------------------------------------------------------------------- /src/common/dnmtools_gaussinv.hpp: -------------------------------------------------------------------------------- 1 | /* Code from GSL, see copyright below. 2 | */ 3 | 4 | /* cdf/gsl_cdf.h 5 | * 6 | * Copyright (C) 2002 Jason H. Stover. 7 | * 8 | * This program is free software; you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation; either version 3 of the License, or (at 11 | * your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, but 14 | * WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | * General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program; if not, write to the Free Software Foundation, 20 | * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 21 | */ 22 | 23 | /* Author: J. Stover */ 24 | 25 | double dnmt_gsl_cdf_ugaussian_Pinv(const double P); 26 | double dnmt_gsl_cdf_ugaussian_Qinv(const double Q); 27 | 28 | double dnmt_gsl_cdf_gaussian_P(const double x, const double sigma); 29 | double dnmt_gsl_cdf_gaussian_Q(const double x, const double sigma); 30 | -------------------------------------------------------------------------------- /.cppcheck_suppress: -------------------------------------------------------------------------------- 1 | # This file is part of dnmtools 2 | # 3 | # Copyright (C) 2023-2025 Andrew D. Smith 4 | # 5 | # Authors: Andrew D.
Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | missingIncludeSystem 18 | constVariablePointer 19 | checkersReport 20 | unknownMacro 21 | unmatchedSuppression 22 | # Ignore unused function because it's too hard to get right 23 | unusedFunction 24 | # Ignore unused struct member because this won't go unnoticed anyway 25 | unusedStructMember 26 | # Ignore missing includes because if they are real things won't build 27 | missingInclude 28 | # Exclude external files 29 | *:*smithlab_cpp* 30 | *:*popcnt.hpp 31 | # Problem caused by external files 32 | toomanyconfigs 33 | # More problems caused by external files -- with too many ifdefs 34 | normalCheckLevelMaxBranches 35 | -------------------------------------------------------------------------------- /src/radmeth/radmeth_optimize_gamma.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * Author: Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) 8 | * any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | * more details. 
14 | */ 15 | 16 | #ifndef RADMETH_OPTIMIZE_GAMMA_HPP 17 | #define RADMETH_OPTIMIZE_GAMMA_HPP 18 | 19 | #include 20 | #include 21 | 22 | template struct Regression; 23 | 24 | void 25 | fit_regression_model_gamma(Regression &r, 26 | std::vector &p_estimates, 27 | double &dispersion_estimate); 28 | 29 | void 30 | fit_regression_model_gamma(Regression &r, 31 | std::vector &p_estimates, 32 | double &dispersion_estimate); 33 | 34 | #endif // RADMETH_OPTIMIZE_GAMMA_HPP 35 | -------------------------------------------------------------------------------- /src/common/dnmtools_utils.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2019-2023 Andrew D. Smith 2 | * 3 | * Authors: Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 
14 | */ 15 | 16 | #include "dnmtools_utils.hpp" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | using std::copy; 24 | using std::ostream_iterator; 25 | using std::ostringstream; 26 | using std::string; 27 | 28 | auto 29 | get_command_line(const int argc, 30 | char *argv[]) -> std::string { // NOLINT(*-c-arrays) 31 | if (argc == 0) 32 | return std::string{}; 33 | std::ostringstream cmd; 34 | cmd << '"'; 35 | // NOLINTBEGIN(*-pointer-arithmetic) 36 | copy(argv, argv + (argc - 1), ostream_iterator(cmd, " ")); 37 | cmd << argv[argc - 1] << '"'; 38 | // NOLINTEND(*-pointer-arithmetic) 39 | return cmd.str(); 40 | } 41 | -------------------------------------------------------------------------------- /data/md5sum.txt: -------------------------------------------------------------------------------- 1 | ae05a28de5643a512386e767b3aa963a tests/araTha1_simulated.hypermr 2 | 0048de3fc412cb12ec2e070c8151f86f tests/methylome_ab.diff 3 | 86ca23015535cf3295c0da3587a95f22 tests/radmeth_test_output.txt 4 | 75777c209bf820ab700801d87a0a3615 tests/reads.bsrate 5 | e73facd597c3b903cbfe29afa9f58371 tests/reads.counts 6 | 56575da7d3af9b696258512142903d1e tests/reads.counts.select 7 | 0f72560aa101e85679783a1ecaf80615 tests/reads.epiread 8 | 9dbd476424d48a8d0f043dfc00af0d23 tests/reads.fmt.srt.sam 9 | 4085cc74b003a918b4a4743fca7922a4 tests/reads.hmr 10 | d8856f9731af76b8a4ab3cc7d667cdb2 tests/reads.ustats 11 | bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx 12 | ec6a686617cad31e9f7a37a3d378e6ed tests/two_epialleles.states 13 | 93e38b20d162062a5d147c4290095a13 tests/mlml.out 14 | d947fe3d61ef7b1564558a69608f0e64 tests/methylome.pmd 15 | d41d8cd98f00b204e9800998ecf8427e tests/two_epialleles.amr 16 | 001b9d966f62fa439b24cf2198cc3de5 tests/reads.counts.sym 17 | 2b8a0406015458be51b8b1c9e58b3602 tests/tRex1_promoters.roi.bed 18 | 33640b24cb64ad3179f364af5a887f95 tests/reads.fmt.sam 19 | b5a63997c57dcde5c3a6635f7beb2cce tests/reads.fmt.srt.uniq.sam 20 | 
3ac2b51545740bafd1a548ba7f73e739 tests/reads.xcounts 21 | 054fe804a32063c80862fbee30f74579 tests/reads.unxcounts 22 | 830157684f1ddbf1f1c37d354188dc2b tests/reads.sam 23 | 8dbcdabecb6cfe6aebb73c94c605f696 tests/reads.mstats 24 | 490723e9af084c8f957f5a265cf02994 tests/reads.levels 25 | -------------------------------------------------------------------------------- /docs/content/hmr-rep.md: -------------------------------------------------------------------------------- 1 | # hmr-rep - Hypomethylated regions across replicates 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools hmr-rep [OPTIONS] ... 6 | ``` 7 | 8 | ## Description 9 | 10 | This program is similar to [hmr](../hmr), but it identifies 11 | HMRs in a set of replicate methylomes. Methylation must be provided in 12 | the [counts](../counts) format. This program assumes 13 | only data at CpG sites and that strands are collapsed so only the 14 | positive site appears in the file (e.g. using 15 | [sym](../sym)). 16 | 17 | ## Options 18 | 19 | ```txt 20 | -o, -out 21 | ``` 22 | output file (default: STDOUT) 23 | ```txt 24 | -d, -desert 25 | ``` 26 | maximum distance between covered CpGs in HMR (default: 1000) 27 | 28 | ```txt 29 | -i, -itr 30 | ``` 31 | max number of iterations (default: 100) 32 | ```txt 33 | -v, -verbose 34 | ``` 35 | print more run info to STDERR while the program is running. 
36 | ```txt 37 | -post-hypo 38 | ``` 39 | output file for single-CpG posterior hypomethylation probability (default: none) 40 | 41 | ```txt 42 | -post-meth 43 | ``` 44 | output file for single-CpG posterior methylation probability (default: none) 45 | 46 | ```txt 47 | -P, -params-in 48 | ``` 49 | HMM parameter file (override training step) 50 | ```txt 51 | -p, -params-out 52 | ``` 53 | write HMM parameters to this file (default: none) 54 | ```txt 55 | -s, -seed 56 | ``` 57 | specify random seed (default: 408) 58 | -------------------------------------------------------------------------------- /docs/content/allelic.md: -------------------------------------------------------------------------------- 1 | # allelic - Single-site ASM scoring 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools allelic [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | The program `allelicmeth` calculates allele specific methylation scores 11 | for each CpG site. Input files should be the epiread files (.epiread 12 | suffix) produced using [states](../states). In the output file, each row 13 | represents a CpG pair made by any CpG and its previous CpG, the first 14 | three columns indicate the positions of the CpG site, the fourth 15 | column is the name including the number of reads covering the CpG 16 | pair, the fifth column is the score for ASM, and the last four columns 17 | record the number of reads of four different methylation combinations 18 | of the CpG pair: methylated methylated (mm), methylated unmethylated 19 | (mu), unmethylated methylated (um), or unmethylated unmethylated (uu).
20 | The following command will calculate allele specific methylation 21 | scores using the allelicmeth component of dnmtools: 22 | 23 | ```shell 24 | $ dnmtools allelic -c /path/to/genome.fa -o output.allelic input.epiread 25 | ``` 26 | 27 | ## Options 28 | 29 | ```txt 30 | -o, -output 31 | ``` 32 | output file name (default: STDOUT) 33 | ```txt 34 | -c, -chrom 35 | ``` 36 | FASTA file or directory of chromosomes containing FASTA files [required] 37 | 38 | ```txt 39 | -v, -verbose 40 | ``` 41 | print more run info to STDERR while the program is running. 42 | -------------------------------------------------------------------------------- /src/radmeth/radmeth_utils.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * Author: Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) 8 | * any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | * more details. 
14 | */ 15 | 16 | #ifndef RADMETH_UTILS_HPP 17 | #define RADMETH_UTILS_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | [[nodiscard]] std::string 26 | format_duration(const std::chrono::duration elapsed); 27 | 28 | struct file_progress { 29 | double one_thousand_over_filesize{}; 30 | std::size_t prev_offset{}; 31 | explicit file_progress(const std::string &filename); 32 | void 33 | operator()(std::ifstream &in); // cppcheck-suppress constParameterReference 34 | }; 35 | 36 | [[nodiscard]] double 37 | llr_test(const double null_loglik, const double full_loglik); 38 | 39 | [[nodiscard]] inline double 40 | overdispersion_factor(const std::uint32_t n_samples, const double dispersion) { 41 | return (n_samples - 1) / (dispersion + 1); 42 | } 43 | 44 | #endif // RADMETH_UTILS_HPP 45 | -------------------------------------------------------------------------------- /src/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along with 14 | # this program. If not, see . 
15 | 16 | find_package(GSL REQUIRED) 17 | 18 | file(GLOB cpp_files "*.cpp") 19 | 20 | set(LIBRARY_OBJECTS "") 21 | foreach(cpp_file ${cpp_files}) 22 | get_filename_component(BASE_NAME ${cpp_file} NAME_WE) 23 | add_library(${BASE_NAME} OBJECT ${cpp_file}) 24 | target_link_libraries(${BASE_NAME} PRIVATE 25 | bamxx 26 | smithlab_cpp 27 | GSL::gsl 28 | ) 29 | target_include_directories(${BASE_NAME} PUBLIC 30 | ${CMAKE_BINARY_DIR} 31 | ) 32 | list(APPEND LIBRARY_OBJECTS ${BASE_NAME}) 33 | endforeach() 34 | 35 | # Create static library linking the individual objects 36 | add_library(dnmtools_objs STATIC) 37 | target_include_directories(dnmtools_objs PUBLIC 38 | ${CMAKE_CURRENT_SOURCE_DIR} 39 | ) 40 | target_link_libraries(dnmtools_objs PUBLIC 41 | ${LIBRARY_OBJECTS} 42 | smithlab_cpp 43 | bamxx 44 | ) 45 | -------------------------------------------------------------------------------- /src/common/numerical_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011-2022 University of Southern California 3 | * Andrew D Smith and Qiang Song 4 | * Author: Qiang Song and Andrew D. Smith 5 | * 6 | * This is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | */ 16 | 17 | #ifndef NUMERICAL_UTILS_HPP 18 | #define NUMERICAL_UTILS_HPP 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | inline double 25 | log_sum_log(const double p, const double q) { 26 | if (p == 0) { 27 | return q; 28 | } 29 | else if (q == 0) { 30 | return p; 31 | } 32 | const double larger = (p > q) ? p : q; 33 | const double smaller = (p > q) ? q : p; 34 | return larger + log1p(exp(smaller - larger)); 35 | } 36 | 37 | inline double 38 | log_sum_log(const double p, const double q, const double r) { 39 | return log_sum_log(log_sum_log(p, q), r); 40 | } 41 | 42 | double 43 | log_sum_log_vec(const std::vector &vals, const size_t limit); 44 | 45 | double 46 | log_sum_log(const std::vector::const_iterator &begin, 47 | const std::vector::const_iterator &end); 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /src/common/dnmt_error.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2023 Andrew D. Smith 2 | * 3 | * Authors: Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 
14 | */ 15 | 16 | #ifndef DNMT_ERROR_HPP 17 | #define DNMT_ERROR_HPP 18 | 19 | #include // for int64_t 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | struct dnmt_error : public std::exception { 26 | int64_t err; // error possibly from HTSlib 27 | int the_errno; // ERRNO at time of construction 28 | std::string msg; // the message 29 | std::string the_what; // to report 30 | dnmt_error(const int64_t _err, const std::string &_msg) : 31 | err{_err}, the_errno{errno}, msg{_msg} { 32 | std::ostringstream oss; 33 | oss << "[error: " << err << "][" << "ERRNO: " << the_errno << "]" 34 | << "[" << strerror(the_errno) << "][" << msg << "]"; 35 | the_what = oss.str(); 36 | } 37 | explicit dnmt_error(const std::string &_msg) : dnmt_error(0, _msg) {} 38 | const char * 39 | what() const noexcept override { 40 | return the_what.c_str(); 41 | } 42 | }; 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/common/Smoothing.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2008-2022 Cold Spring Harbor Laboratory 3 | Authors: Andrew D. Smith 4 | 5 | This file is part of dnmtools. 6 | 7 | dnmtools is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | dnmtools is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License 18 | along with dnmtools; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #ifndef SMOOTHING_HPP 23 | #define SMOOTHING_HPP 24 | 25 | #include 26 | 27 | void 28 | KernelSmoothing(const double bandwidth, 29 | const std::vector &x_values, 30 | const std::vector &y_values, 31 | const std::vector &x_target, 32 | std::vector &y_target); 33 | 34 | void 35 | LocalLinearRegression(const double bandwidth, 36 | const std::vector &x_values, 37 | const std::vector &y_values, 38 | const std::vector &x_target, 39 | std::vector &y_target); 40 | 41 | 42 | void 43 | KernelSmoothing(const double bandwidth, 44 | const std::vector &y_vals, 45 | std::vector &y_target); 46 | 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /docs/content/amrtester.md: -------------------------------------------------------------------------------- 1 | # amrtester - resolve epi-alleles 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools amrtester [OPTIONS] 6 | ``` 7 | 8 | In addition to [amrfinder](../amrfinder), which uses a sliding 9 | window, the `amrtester` program tests for allele-specific methylation 10 | in a given set of genomic intervals. The program can be run like this: 11 | 12 | ```shell 13 | $ dnmtools amrtester -o output.amr -c /path/to/genome.fa intervals.bed input.epiread 14 | ``` 15 | 16 | This program works very similarly to `amrfinder`, but does not have 17 | options related to the sliding window. This program outputs a score 18 | for each input interval, and when the likelihood ratio test is used, 19 | the score is the p-value, which can easily be filtered later. 20 | 21 | ## Options 22 | 23 | ```txt 24 | -o, -output 25 | ``` 26 | The name of the output file. If no file name is provided, the output 27 | will be written to standard output. 
Due to the size of this output, a 28 | file should be specified unless the output will be piped to another 29 | command or program. The output file contains genomic intervals in BED 30 | format, with intervals corresponding to those provided as input. 31 | 32 | ```txt 33 | -c, -chrom 34 | ``` 35 | FASTA file or directory of chromosomes containing FASTA files [required] 36 | ```txt 37 | -i, -itr 38 | ``` 39 | max iterations 40 | ```txt 41 | -v, -verbose 42 | ``` 43 | print more run info 44 | ```txt 45 | -P, -progress 46 | ``` 47 | print more run info to STDERR while the program is running. 48 | ```txt 49 | -b, -bic 50 | ``` 51 | use Bayesian Information Criterion (BIC) to compare models 52 | -------------------------------------------------------------------------------- /src/common/BetaBin.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2011-2022 University of Southern California 3 | Authors: Andrew D. Smith, Song Qiang 4 | 5 | This file is part of dnmtools. 6 | 7 | dnmtools is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | dnmtools is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 
16 | */ 17 | 18 | #ifndef BETABIN_HPP 19 | #define BETABIN_HPP 20 | 21 | #include // IWYU pragma: keep 22 | #include 23 | #include 24 | #include 25 | 26 | struct betabin { 27 | betabin(); 28 | betabin(const double a, const double b); 29 | explicit betabin(const std::string &str); 30 | double 31 | operator()(const std::pair &val) const; 32 | double 33 | log_likelihood(const std::pair &val) const; 34 | double 35 | sign(const double x); 36 | double 37 | invpsi(const double tolerance, const double x); 38 | double 39 | movement(const double curr, const double prev); 40 | void 41 | fit(const std::vector &vals_a, const std::vector &vals_b, 42 | const std::vector &p); 43 | std::string 44 | tostring() const; 45 | double alpha{}; 46 | double beta{}; 47 | double lnbeta_helper{}; 48 | 49 | static const double tolerance; 50 | }; 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: DNMTools 2 | strict: true 3 | 4 | docs_dir: content 5 | 6 | theme: readthedocs 7 | nav: 8 | - Home: 'index.md' 9 | - 'Installation': 'quickstart.md' 10 | - 'DNMTools on GitHub' : https://github.com/smithlabcode/dnmtools 11 | - Methylome construction: 12 | - 'abismal': 'abismal.md' 13 | - 'format': 'format.md' 14 | - 'uniq': 'uniq.md' 15 | - 'bsrate' : 'bsrate.md' 16 | - 'counts' : 'counts.md' 17 | - 'sym': 'sym.md' 18 | - 'levels' : 'levels.md' 19 | - Methylome analysis: 20 | - 'hmr' : 'hmr.md' 21 | - 'hmr-rep' : 'hmr-rep.md' 22 | - 'hypermr' : 'hypermr.md' 23 | - 'entropy' : 'entropy.md' 24 | - 'multistat' : 'multistat.md' 25 | - 'pmd' : 'pmd.md' 26 | - 'roi' : 'roi.md' 27 | - 'mlml' : 'mlml.md' 28 | - Allele-specific methylation: 29 | - 'states' : 'states.md' 30 | - 'allelic' : 'allelic.md' 31 | - 'amrfinder' : 'amrfinder.md' 32 | - 'amrtester' : 'amrtester.md' 33 | - Differential methylation: 34 | - 'diff' : 'diff.md' 35 | - 'dmr' : 
'dmr.md' 36 | - 'radmeth' : 'radmeth.md' 37 | - 'radadjust' : 'radadjust.md' 38 | - 'radmerge' : 'radmerge.md' 39 | - Methylation visualisation: 40 | - 'fastlift': 'fastlift.md' 41 | - 'liftfilter': 'liftfilter.md' 42 | - General-purpose tools: 43 | - 'cleanhp': 'cleanhp.md' 44 | - 'guessprotocol': 'guessprotocol.md' 45 | - 'merge-bsrate': 'merge-bsrate.md' 46 | - 'merge': 'merge.md' 47 | - 'selectsites': 'selectsites.md' 48 | - Visualization: 49 | - 'Visualization' : 'visualization.md' 50 | - Other: 51 | - 'Cytosine contexts' : 'cytosine_contexts.md' 52 | -------------------------------------------------------------------------------- /cmake/FindLIBDEFLATE.cmake: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: GPL-3.0-or-later; (c) 2025 Andrew D Smith (author) 2 | #[=======================================================================[.rst: 3 | FindLIBDEFLATE 4 | -------------- 5 | 6 | Find the native libdeflate includes and library. 
7 | 8 | #]=======================================================================] 9 | 10 | # FindLIBDEFLATE.cmake 11 | # Custom CMake module to find libdeflate 12 | 13 | # Support preference of static libs by adjusting CMAKE_FIND_LIBRARY_SUFFIXES 14 | # ADS: this is taken from the FindBoost.cmake file 15 | if(LIBDEFLATE_USE_STATIC_LIBS) 16 | set(_libdeflate_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES 17 | ${CMAKE_FIND_LIBRARY_SUFFIXES} 18 | ) 19 | if(WIN32) 20 | list(INSERT CMAKE_FIND_LIBRARY_SUFFIXES 0 .lib .a) 21 | else() 22 | set(CMAKE_FIND_LIBRARY_SUFFIXES .a) 23 | endif() 24 | endif() 25 | 26 | find_path(LIBDEFLATE_INCLUDE_DIR NAMES libdeflate.h) 27 | find_library(LIBDEFLATE_LIBRARY NAMES deflate libdeflate) 28 | 29 | include(FindPackageHandleStandardArgs) 30 | find_package_handle_standard_args(LIBDEFLATE 31 | REQUIRED_VARS LIBDEFLATE_LIBRARY LIBDEFLATE_INCLUDE_DIR 32 | VERSION_VAR LIBDEFLATE_VERSION 33 | ) 34 | 35 | if(LIBDEFLATE_FOUND AND NOT TARGET LIBDEFLATE::LIBDEFLATE) 36 | add_library(LIBDEFLATE::LIBDEFLATE UNKNOWN IMPORTED) 37 | set_target_properties(LIBDEFLATE::LIBDEFLATE PROPERTIES 38 | INTERFACE_INCLUDE_DIRECTORIES "${LIBDEFLATE_INCLUDE_DIR}" 39 | IMPORTED_LOCATION "${LIBDEFLATE_LIBRARY}" 40 | ) 41 | endif() 42 | 43 | # Restore the original find library ordering 44 | # ADS: this is taken from the FindBoost.cmake file 45 | if(LIBDEFLATE_USE_STATIC_LIBS) 46 | set(CMAKE_FIND_LIBRARY_SUFFIXES 47 | ${_libdeflate_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES} 48 | ) 49 | endif() 50 | -------------------------------------------------------------------------------- /src/common/xcounts_utils.hpp: -------------------------------------------------------------------------------- 1 | /* xcounts_utils: code for doing things with xcounts format and some 2 | * for counts format that is common to several tools. 3 | * 4 | * Copyright (C) 2023-2024 Andrew D. Smith 5 | * 6 | * Authors: Andrew D.
Smith 7 | * 8 | * This program is free software: you can redistribute it and/or 9 | * modify it under the terms of the GNU General Public License as 10 | * published by the Free Software Foundation, either version 3 of the 11 | * License, or (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, but 14 | * WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | * General Public License for more details. 17 | */ 18 | 19 | #ifndef XCOUNTS_UTILS_HPP 20 | #define XCOUNTS_UTILS_HPP 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | struct xcounts_entry { 29 | std::uint64_t pos{}; // absolute position 30 | std::uint32_t n_meth{}; 31 | std::uint32_t n_unmeth{}; 32 | 33 | [[nodiscard]] std::uint32_t 34 | n_reads() const { 35 | return n_meth + n_unmeth; 36 | } 37 | 38 | [[nodiscard]] double 39 | frac() const { 40 | return static_cast(n_meth) / n_reads(); 41 | } 42 | }; 43 | 44 | inline std::ostream & 45 | operator<<(std::ostream &o, const xcounts_entry &e) { 46 | return o << e.pos << '\t' << e.n_meth << '\t' << e.n_unmeth; 47 | } 48 | 49 | std::unordered_map> 50 | read_xcounts_by_chrom(const std::int32_t n_threads, 51 | const std::string &xcounts_file); 52 | 53 | bool 54 | get_is_xcounts_file(const std::string &filename); 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /docs/content/radmerge.md: -------------------------------------------------------------------------------- 1 | # radmerge - Merge CpGs to differentially methylated regions 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools radmerge [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | After running [radmeth](../radmeth) followed by 11 | [radadjust](../radadjust), it is possible to further join individually 12 | differentially methylated CpGs into differentially methylated 13 | regions. 
This can be achieved with the command 14 | 15 | ```shell 16 | $ dnmtools radmerge -p 0.01 radmeth-input.bed > output-dmrs.bed 17 | ``` 18 | 19 | The current algorithm is conservative: it joins neighboring 20 | differentially methylated sites with p-value below 0.01 (set by the -p 21 | parameter). The output format is 22 | 23 | ```txt 24 | chrom start end dmr num-sites meth-diff 25 | ``` 26 | 27 | Above, `num-sites` and `meth-diff` are the number of significantly 28 | differentially methylated CpGs in the DMR and the estimated 29 | methylation difference, respectively. Example output might look like 30 | this: 31 | 32 | ```txt 33 | chr1 57315 57721 dmr 10 -0.498148 34 | chr1 58263 59009 dmr 27 -0.521182 35 | chr1 138522 139012 dmr 13 -0.443182 36 | chr1 149284 149444 dmr 7 -0.430453 37 | chr1 274339 275254 dmr 18 -0.520114 38 | ``` 39 | 40 | Note that in addition to being conservative, the work done by 41 | `radmerge` is very simple, and does not consider genomic distance 42 | between neighboring sites. It will merge consecutive significant sites 43 | into one interval no matter how distant are those sites on a 44 | chromosome. 45 | 46 | ## Options 47 | 48 | ```txt 49 | -o, -output 50 | ``` 51 | Output file (default: stdout). 52 | 53 | ```txt 54 | -p, -cutoff 55 | ``` 56 | P-value cutoff (default: 0.01). 57 | -------------------------------------------------------------------------------- /src/mlml/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 
7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along with 14 | # this program. If not, see . 15 | 16 | file(GLOB cpp_files "*.cpp") 17 | 18 | # Gather all the object files that will be put in the static library 19 | # and prepare to compile them. 20 | set(LIBRARY_OBJECTS "") 21 | foreach(cpp_file ${cpp_files}) 22 | get_filename_component(BASE_NAME ${cpp_file} NAME_WE) 23 | add_library(${BASE_NAME} OBJECT ${cpp_file}) 24 | target_link_libraries(${BASE_NAME} PUBLIC 25 | dnmtools_objs 26 | smithlab_cpp 27 | HTSLIB::HTSLIB 28 | ) 29 | ## Below is to make sure 'config.h' is visible for includes and any 30 | ## of the headers for configured libraries 31 | target_include_directories(${BASE_NAME} PUBLIC 32 | ${PROJECT_BINARY_DIR} 33 | ) 34 | list(APPEND LIBRARY_OBJECTS ${BASE_NAME}) 35 | endforeach() 36 | 37 | # Create static library linking the individual objects 38 | add_library(dnmtools_mlml_objs STATIC) 39 | target_include_directories(dnmtools_mlml_objs PUBLIC 40 | ${CMAKE_BINARY_DIR} 41 | ${CMAKE_CURRENT_SOURCE_DIR} 42 | ) 43 | target_link_libraries(dnmtools_mlml_objs PUBLIC 44 | ${LIBRARY_OBJECTS} 45 | smithlab_cpp 46 | ) 47 | -------------------------------------------------------------------------------- /src/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 
7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along with 14 | # this program. If not, see . 15 | 16 | file(GLOB cpp_files "*.cpp") 17 | 18 | # Gather all the object files that will be put in the static library 19 | # and prepare to compile them. 20 | set(LIBRARY_OBJECTS "") 21 | foreach(cpp_file ${cpp_files}) 22 | get_filename_component(BASE_NAME ${cpp_file} NAME_WE) 23 | add_library(${BASE_NAME} OBJECT ${cpp_file}) 24 | target_link_libraries(${BASE_NAME} PUBLIC 25 | dnmtools_objs 26 | bamxx 27 | smithlab_cpp 28 | HTSLIB::HTSLIB 29 | ) 30 | ## Below is to make sure 'config.h' is visible for includes and any 31 | ## of the headers for configured libraries 32 | target_include_directories(${BASE_NAME} PUBLIC 33 | ${PROJECT_BINARY_DIR} 34 | ) 35 | list(APPEND LIBRARY_OBJECTS ${BASE_NAME}) 36 | endforeach() 37 | 38 | # Create static library linking the individual objects 39 | add_library(dnmtools_utils_objs STATIC) 40 | target_include_directories(dnmtools_utils_objs PUBLIC 41 | ${CMAKE_BINARY_DIR} 42 | ${CMAKE_CURRENT_SOURCE_DIR} 43 | ) 44 | target_link_libraries(dnmtools_utils_objs PUBLIC 45 | ${LIBRARY_OBJECTS} 46 | smithlab_cpp 47 | bamxx 48 | ) 49 | -------------------------------------------------------------------------------- /src/analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 
7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along with 14 | # this program. If not, see . 15 | 16 | file(GLOB cpp_files "*.cpp") 17 | 18 | # Gather all the object files that will be put in the static library 19 | # and prepare to compile them. 20 | set(LIBRARY_OBJECTS "") 21 | foreach(cpp_file ${cpp_files}) 22 | get_filename_component(BASE_NAME ${cpp_file} NAME_WE) 23 | add_library(${BASE_NAME} OBJECT ${cpp_file}) 24 | target_link_libraries(${BASE_NAME} PUBLIC 25 | dnmtools_objs 26 | bamxx 27 | smithlab_cpp 28 | HTSLIB::HTSLIB 29 | ) 30 | ## Below is to make sure 'config.h' is visible for includes and any 31 | ## of the headers for configured libraries 32 | target_include_directories(${BASE_NAME} PUBLIC 33 | ${PROJECT_BINARY_DIR} 34 | ) 35 | list(APPEND LIBRARY_OBJECTS ${BASE_NAME}) 36 | endforeach() 37 | 38 | # Create static library linking the individual objects 39 | add_library(dnmtools_analysis_objs STATIC) 40 | target_include_directories(dnmtools_analysis_objs PUBLIC 41 | ${CMAKE_BINARY_DIR} 42 | ${CMAKE_CURRENT_SOURCE_DIR} 43 | ) 44 | target_link_libraries(dnmtools_analysis_objs PUBLIC 45 | ${LIBRARY_OBJECTS} 46 | smithlab_cpp 47 | bamxx 48 | ) 49 | -------------------------------------------------------------------------------- /src/radmeth/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 
7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along with 14 | # this program. If not, see . 15 | 16 | file(GLOB cpp_files "*.cpp") 17 | 18 | # Gather all the object files that will be put in the static library 19 | # and prepare to compile them. 20 | set(LIBRARY_OBJECTS "") 21 | foreach(cpp_file ${cpp_files}) 22 | get_filename_component(BASE_NAME ${cpp_file} NAME_WE) 23 | add_library(${BASE_NAME} OBJECT ${cpp_file}) 24 | target_link_libraries(${BASE_NAME} PUBLIC 25 | dnmtools_objs 26 | bamxx 27 | smithlab_cpp 28 | HTSLIB::HTSLIB 29 | ) 30 | ## Below is to make sure 'config.h' is visible for includes and any 31 | ## of the headers for configured libraries 32 | target_include_directories(${BASE_NAME} PUBLIC 33 | ${PROJECT_BINARY_DIR} 34 | ) 35 | list(APPEND LIBRARY_OBJECTS ${BASE_NAME}) 36 | endforeach() 37 | 38 | # Create static library linking the individual objects 39 | add_library(dnmtools_radmeth_objs STATIC) 40 | target_include_directories(dnmtools_radmeth_objs PUBLIC 41 | ${CMAKE_BINARY_DIR} 42 | ${CMAKE_CURRENT_SOURCE_DIR} 43 | ) 44 | target_link_libraries(dnmtools_radmeth_objs PUBLIC 45 | ${LIBRARY_OBJECTS} 46 | smithlab_cpp 47 | bamxx 48 | Threads::Threads 49 | ) 50 | -------------------------------------------------------------------------------- /src/amrfinder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 
7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along with 14 | # this program. If not, see . 15 | 16 | file(GLOB cpp_files "*.cpp") 17 | 18 | # Gather all the object files that will be put in the static library 19 | # and prepare to compile them. 20 | set(LIBRARY_OBJECTS "") 21 | foreach(cpp_file ${cpp_files}) 22 | get_filename_component(BASE_NAME ${cpp_file} NAME_WE) 23 | add_library(${BASE_NAME} OBJECT ${cpp_file}) 24 | target_link_libraries(${BASE_NAME} PUBLIC 25 | dnmtools_objs 26 | bamxx 27 | smithlab_cpp 28 | HTSLIB::HTSLIB 29 | ) 30 | ## Below is to make sure 'config.h' is visible for includes and any 31 | ## of the headers for configured libraries 32 | target_include_directories(${BASE_NAME} PUBLIC 33 | ${PROJECT_BINARY_DIR} 34 | ) 35 | list(APPEND LIBRARY_OBJECTS ${BASE_NAME}) 36 | endforeach() 37 | 38 | # Create static library linking the individual objects 39 | add_library(dnmtools_amrfinder_objs STATIC) 40 | target_include_directories(dnmtools_amrfinder_objs PUBLIC 41 | ${CMAKE_BINARY_DIR} 42 | ${CMAKE_CURRENT_SOURCE_DIR} 43 | ) 44 | target_link_libraries(dnmtools_amrfinder_objs PUBLIC 45 | ${LIBRARY_OBJECTS} 46 | smithlab_cpp 47 | bamxx 48 | Threads::Threads 49 | ) 50 | -------------------------------------------------------------------------------- /src/common/numerical_utils.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011-2022 University of Southern California 3 | * Andrew D Smith and Qiang Song 4 | * Author: Qiang Song and Andrew D. 
Smith 5 | * 6 | * This is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | */ 16 | 17 | #include "numerical_utils.hpp" 18 | 19 | #include 20 | #include 21 | #include // IWYU pragma: keep 22 | #include 23 | 24 | double 25 | log_sum_log_vec(const std::vector &vals, const size_t limit) { 26 | const auto x = std::max_element( 27 | std::cbegin(vals), std::cbegin(vals) + static_cast(limit)); 28 | const double max_val = *x; 29 | const std::size_t max_idx = std::distance(std::cbegin(vals), x); 30 | double sum = 1.0; 31 | for (std::size_t i = 0; i < limit; ++i) 32 | if (i != max_idx) 33 | sum += std::exp(vals[i] - max_val); // cppcheck-suppress useStlAlgorithm 34 | return max_val + std::log(sum); 35 | } 36 | 37 | double 38 | log_sum_log(const std::vector::const_iterator &begin, 39 | const std::vector::const_iterator &end) { 40 | const auto max_itr = std::max_element(begin, end); 41 | const double max_val = *max_itr; 42 | double sum = 1.0; 43 | for (auto itr = begin; itr < end; ++itr) 44 | if (itr != max_itr) 45 | sum += std::exp(*itr - max_val); // cppcheck-suppress useStlAlgorithm 46 | return max_val + std::log(sum); 47 | } 48 | -------------------------------------------------------------------------------- /src/radmeth/README.md: -------------------------------------------------------------------------------- 1 | RADMeth: Regression Analysis of Differential Methylation 2 | ======================================================== 3 | 4 | RADMeth: Regression Analysis of Differential Methylation is software for 5 | 
computing individual differentially methylated sites and genomic regions in 6 | whole genome bisulfite sequencing (WGBS) data. 7 | 8 | Contact Information 9 | ------------------- 10 | 11 | Egor Dolzhenko 12 | dolzhenk@usc.edu 13 | http://smithlabresearch.org/ 14 | 15 | Installation 16 | ------------ 17 | *Before attempting to compile RADMeth please make sure that GNU Scientific 18 | Library (http://www.gnu.org/software/gsl/) is installed on your system* 19 | Alternatively, you can download pre-compiled binaries for either Linux or Mac 20 | from http://smithlabresearch.org/software/radmeth/ 21 | 22 | To compile RADMeth, enter the program's root directory (e.g. radmeth/) and 23 | execute 24 | 25 | > make 26 | 27 | After the compilation, the binaries can be found in radmeth/bin/ 28 | 29 | Usage 30 | ----- 31 | 32 | Please see the manual, which can be obtained at 33 | http://smithlabresearch.org/software/radmeth/ 34 | 35 | License 36 | ------- 37 | Copyright (C) 2013 University of Southern California and 38 | Egor Dolzhenko 39 | Andrew D Smith 40 | 41 | Authors: Andrew D. Smith and Egor Dolzhenko 42 | 43 | This program is free software: you can redistribute it and/or modify 44 | it under the terms of the GNU General Public License as published by 45 | the Free Software Foundation, either version 3 of the License, or 46 | (at your option) any later version. 47 | 48 | This program is distributed in the hope that it will be useful, 49 | but WITHOUT ANY WARRANTY; without even the implied warranty of 50 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 51 | GNU General Public License for more details. 
52 | -------------------------------------------------------------------------------- /docs/content/selectsites.md: -------------------------------------------------------------------------------- 1 | # selectsites - get subsets of cytosines from counts files 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools selectsites [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | In many cases, we may be interested in analyzing only a subset of 11 | cytosines or CpGs in a sample. Some instances of these cases include 12 | calculating average methylation levels in (1) annotated regions, such 13 | as promoter regions or repeats or (2) regions defined by the data 14 | itself, such as HMRs or PMDs. 15 | 16 | A possible solution to subset these regions is to convert the counts file to 17 | BED format, intersect it with a BED file of the regions of interest (using 18 | [bedtools](https://bedtools.readthedocs.io)), then convert it back to 19 | counts. The program selectsites simplifies these operations. It takes a 20 | [counts](../counts) format file and a set of intervals in a BED file and 21 | produces a subset of the entries in the counts file included in the BED 22 | regions. We can select entries in `input.counts` contained in any interval in 23 | `regions.bed` using the following command. 24 | 25 | ```shell 26 | $ dnmtools selectsites -o output.counts regions.bed input.counts 27 | ``` 28 | 29 | ## Options 30 | 31 | ```txt 32 | -o, -output 33 | ``` 34 | Name of output file (default: STDOUT) 35 | 36 | ```txt 37 | -p, -preload 38 | ``` 39 | Preload sites (use for large target intervals). 40 | 41 | ```txt 42 | -v, -verbose 43 | ``` 44 | Print more run info to STDERR while the program is running. 45 | 46 | ```txt 47 | -d, -disk 48 | ``` 49 | Process sites on disk (fast if target intervals are few). 50 | 51 | ```txt 52 | -S, -summary 53 | ``` 54 | Write summary to this file. 55 | 56 | ```txt 57 | -z, -zip 58 | ``` 59 | The output file will be in gzip compressed format. 
60 | 61 | ```txt 62 | -relaxed 63 | ``` 64 | Allow additional columns in the input file. 65 | -------------------------------------------------------------------------------- /src/common/Epiread.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2011-2022 University of Southern California and 2 | * Andrew D. Smith and Fang Fang 3 | * 4 | * Authors: Fang Fang and Andrew D. Smith 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | */ 16 | 17 | #ifndef EPIREAD 18 | #define EPIREAD 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | struct epiread { 26 | std::string chr{}; 27 | size_t pos{}; 28 | std::string seq{}; 29 | epiread() = default; 30 | explicit epiread(const std::string &line); 31 | epiread(const size_t p, const std::string &s) : pos(p), seq(s) {} 32 | epiread(const std::string &c, const size_t p, const std::string &s) : 33 | chr(c), pos(p), seq(s) {} 34 | 35 | bool 36 | operator<(const epiread &other) const { 37 | return (chr < other.chr || (chr == other.chr && pos < other.pos)); 38 | } 39 | size_t 40 | end() const { 41 | return pos + seq.length(); 42 | } 43 | size_t 44 | length() const { 45 | return seq.length(); 46 | } 47 | }; 48 | 49 | std::istream & 50 | operator>>(std::istream &in, epiread &er); 51 | std::ostream & 52 | operator<<(std::ostream &out, const epiread &er); 53 | 54 | size_t 55 | adjust_read_offsets(std::vector &reads); 56 | 57 | size_t 58 | get_n_cpgs(const std::vector &reads); 59 | 60 
| bool 61 | validate_epiread_file(const std::string &filename); 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along with 14 | # this program. If not, see . 15 | 16 | # Packages 17 | if(USE_LIBDEFLATE) 18 | find_package(LIBDEFLATE REQUIRED) 19 | endif() 20 | if(BUILD_NANOPORE) 21 | find_package(HTSLIB 1.20 REQUIRED) 22 | add_compile_definitions(BUILD_NANOPORE) 23 | else() 24 | find_package(HTSLIB REQUIRED) 25 | endif() 26 | find_package(Threads REQUIRED) 27 | find_package(ZLIB REQUIRED) 28 | 29 | # Subdirs 30 | if(NOT TARGET smithlab_cpp) 31 | add_subdirectory(smithlab_cpp) 32 | endif() 33 | if(NOT TARGET bamxx) 34 | add_subdirectory(bamxx) 35 | endif() 36 | add_subdirectory(common) 37 | add_subdirectory(radmeth) 38 | add_subdirectory(utils) 39 | add_subdirectory(analysis) 40 | add_subdirectory(amrfinder) 41 | add_subdirectory(abismal) 42 | add_subdirectory(mlml) 43 | 44 | add_executable(dnmtools dnmtools.cpp) 45 | # ADS: below, for config.h 46 | target_include_directories(dnmtools PUBLIC ${CMAKE_BINARY_DIR}) 47 | target_link_libraries(dnmtools PUBLIC 48 | dnmtools_objs 49 | abismal_objs 50 | dnmtools_analysis_objs 51 | dnmtools_utils_objs 52 | dnmtools_radmeth_objs 
53 | dnmtools_mlml_objs 54 | dnmtools_amrfinder_objs 55 | bamxx 56 | HTSLIB::HTSLIB 57 | ZLIB::ZLIB 58 | Threads::Threads 59 | ) 60 | if(USE_LIBDEFLATE) 61 | target_link_libraries(dnmtools PUBLIC 62 | LIBDEFLATE::LIBDEFLATE 63 | ) 64 | endif() 65 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This file is part of dnmtools 2 | # 3 | # Copyright (C) 2025 Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it under the 8 | # terms of the GNU General Public License as published by the Free Software 9 | # Foundation, either version 3 of the License, or (at your option) any later 10 | # version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but WITHOUT 13 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 | # more details. 
16 | 17 | # to find the version of cmake do 18 | # $ cmake --version 19 | cmake_minimum_required(VERSION 3.28) 20 | project( 21 | dnmtools 22 | VERSION 1.5.1 23 | DESCRIPTION 24 | "Tools for analyzing DNA methylation data" 25 | HOMEPAGE_URL https://github.com/smithlabcode/dnmtools 26 | LANGUAGES CXX) 27 | 28 | # Set language version used 29 | set(CMAKE_CXX_STANDARD 17) 30 | set(CMAKE_CXX_STANDARD_REQUIRED on) 31 | set(CMAKE_CXX_EXTENSIONS off) # prevents std=gnu++17 32 | set(CMAKE_EXPORT_COMPILE_COMMANDS on) 33 | 34 | include(CheckIncludeFileCXX) 35 | include(CheckFunctionExists) 36 | include(CheckCXXCompilerFlag) 37 | 38 | include(GNUInstallDirs) 39 | 40 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") 41 | 42 | configure_file(data/config.h.in config.h) 43 | 44 | if(ENABLE_LTO) 45 | # Turn on LTO if we are building for distribution 46 | include(CheckIPOSupported) 47 | check_ipo_supported(RESULT result OUTPUT output) 48 | if(result) 49 | set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) 50 | else() 51 | message(FATAL_ERROR "IPO is not supported: ${output}") 52 | endif() 53 | endif() 54 | 55 | if(STATIC_ANALYSIS) 56 | include(cmake/static_analysis.cmake) 57 | endif() 58 | 59 | # ADS: set the most stringent warnings we can 60 | add_compile_options( 61 | -Wall 62 | -Wextra 63 | -Wpedantic 64 | -Werror 65 | -Wfatal-errors 66 | ) 67 | 68 | add_subdirectory(src) 69 | -------------------------------------------------------------------------------- /.github/workflows/dnmtools_build_ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: DNMTools build (Ubuntu) 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: [ "master" ] 7 | pull_request: 8 | branches: [ "master" ] 9 | 10 | jobs: 11 | build-with-gcc: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: recursive 17 | - name: Install dependencies 18 | run: | 19 | sudo apt-get update 20 | sudo apt-get install -y \ 
21 | libgsl-dev \ 22 | libcurl4-gnutls-dev \ 23 | libdeflate-dev \ 24 | liblzma-dev \ 25 | zlib1g-dev \ 26 | libbz2-dev 27 | - name: Build and install htslib (for recent version) 28 | run: | 29 | git clone --recursive https://github.com/samtools/htslib.git 30 | cd htslib 31 | make -j4 32 | sudo make install prefix=/usr 33 | - name: Generate configure script 34 | run: ./autogen.sh 35 | - name: Configure for GCC 36 | run: ./configure CXX="g++" 37 | - name: Build with g++ 38 | run: make -j4 39 | - name: Test the g++ build 40 | run: make -j4 check 41 | build-with-clang: 42 | runs-on: ubuntu-latest 43 | steps: 44 | - uses: actions/checkout@v4 45 | with: 46 | submodules: recursive 47 | - name: Install dependencies 48 | run: | 49 | sudo apt-get update 50 | sudo apt-get install -y \ 51 | libgsl-dev \ 52 | libcurl4-gnutls-dev \ 53 | libdeflate-dev \ 54 | liblzma-dev \ 55 | zlib1g-dev \ 56 | libbz2-dev 57 | - name: Build and install htslib (for recent version) 58 | run: | 59 | git clone --recursive https://github.com/samtools/htslib.git 60 | cd htslib 61 | make -j4 62 | sudo make install prefix=/usr 63 | - name: Generate configure script 64 | run: ./autogen.sh 65 | - name: Configure for Clang 66 | run: ./configure CXX="clang++" 67 | - name: Build with clang++ 68 | run: make -j4 69 | - name: Test the clang++ build 70 | run: make -j4 check 71 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run 'autoreconf -i' to generate 'configure', 'Makefile.in', etc., 4 | # including in the subdirectories of dnmtools src (for the git 5 | # submodules). 6 | # 7 | # The first time this is run on a new cloned git repo the configure 8 | # script will not be present, only the configure.ac and 9 | # Makefile.am. 
The rest must be generated by `autoreconf -i` and this 10 | # must happen in the `src/smithlab_cpp`, 11 | # `src/abismal/src/smithlab_cpp` and `src/abismal` subdirs. Running 12 | # `autoreconf -i` in some of these directories will move recursively 13 | # into others, but this is not guaranteed. This script will do each 14 | # separately. 15 | # 16 | # If you are working with a distribution (file ending with ".tar.gz" 17 | # or similar) then this script should not be needed, and should not be 18 | # present, as all the files should already exist. You should only run 19 | # this script if you know what you are doing with autoreconf. 20 | # 21 | # This script will only work with an argument to confirm the help 22 | # message has been read. 23 | 24 | runautoreconf() { 25 | autoreconf -i src/abismal/src/smithlab_cpp; 26 | autoreconf -i src/abismal; 27 | autoreconf -i src/smithlab_cpp; 28 | autoreconf -i; 29 | } 30 | 31 | if test -d .git && test "$(basename "${PWD}")" = "dnmtools" 32 | then 33 | runautoreconf 34 | exit 0 35 | else 36 | echo " It seems you are either attempting to run this script " 37 | echo " from the wrong directory, or in a source tree that was " 38 | echo " not obtained by cloning the dnmtools git repo. " 39 | echo " " 40 | echo " ./autogen.sh generates the configure script in the " 41 | echo " relevant subdirectories. Only run this if you know " 42 | echo " what you are doing with autoreconf and are simply " 43 | echo " avoiding doing that. If you just want to use the " 44 | echo " software, download a release and this script will " 45 | echo " not be needed. 
" 46 | exit 1 47 | fi 48 | -------------------------------------------------------------------------------- /.github/workflows/dnmtools_build_macos.yml: -------------------------------------------------------------------------------- 1 | name: DNMTools build (macOS) 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: [ "master" ] 7 | pull_request: 8 | branches: [ "master" ] 9 | 10 | jobs: 11 | ## ADS: removing this because of 'brownout' on github runners 12 | # build-with-gcc-on-x86: 13 | # runs-on: macos-13 14 | # steps: 15 | # - uses: actions/checkout@v4 16 | # with: 17 | # submodules: recursive 18 | # - name: Update Homebrew 19 | # run: brew update 20 | # - name: Install autotools 21 | # run: brew install automake 22 | # - name: Install dependencies 23 | # run: brew install htslib gsl 24 | # - name: Generate configure script 25 | # run: ./autogen.sh 26 | # - name: configure with g++-14 27 | # run: ./configure CXX="g++-14" CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib" 28 | # - name: make 29 | # run: make -j4 30 | build-with-gcc-on-arm64: 31 | runs-on: macos-15 32 | steps: 33 | - uses: actions/checkout@v4 34 | with: 35 | submodules: recursive 36 | - name: Update Homebrew 37 | run: brew update 38 | - name: Install autotools 39 | run: brew install automake 40 | - name: Install dependencies 41 | run: brew install htslib gsl 42 | - name: Generate configure script 43 | run: ./autogen.sh 44 | - name: configure with g++-14 45 | run: ./configure CXX="g++-14" CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib" 46 | - name: make 47 | run: make -j4 48 | build-with-clang-on-arm64: 49 | runs-on: macos-15 50 | steps: 51 | - uses: actions/checkout@v4 52 | with: 53 | submodules: recursive 54 | - name: Update Homebrew 55 | run: brew update 56 | - name: Install autotools 57 | run: brew install automake 58 | - name: Install dependencies 59 | run: brew install htslib gsl 60 | - name: Generate configure script 61 | run: ./autogen.sh 
62 | - name: configure with clang++ 63 | run: ./configure CXX="clang++" CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib" 64 | - name: make 65 | run: make -j4 66 | -------------------------------------------------------------------------------- /docs/content/entropy.md: -------------------------------------------------------------------------------- 1 | # entropy - Computing methylation entropy 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools entropy [OPTIONS] 6 | ``` 7 | ## Description 8 | The concept of methylation entropy was introduced into epigenetics 9 | study to characterize the randomness of methylation patterns over 10 | several consecutive CpG sites (Xie et al, 2011). The `methentropy` 11 | program processes epireads and calculates the methylation entropy 12 | value in sliding windows of specified number of CpGs. Two input files 13 | are required. 14 | 15 | * (1) either a genome in FASTA format or a directory containing FASTA 16 | chromosome files 17 | 18 | * (2) an epiread file as produced by the 19 | [states](../states) program. The input epiread file 20 | needs to be sorted, first by chromosome, then by position. It can 21 | be done with the following command. 22 | 23 | ```shell 24 | $ LC_ALL=C sort -k1,1 -k2,2g input.epiread -o input-sorted.epiread 25 | ``` 26 | 27 | Use the `-w` option to specify the desired number of CpGs in the 28 | sliding window; if unspecified, the default value is 4. In cases where 29 | symmetric patterns are considered the same, specify option -F; this 30 | will cause the majority state in each epiread to be forced into 31 | "methylated", and the minority to "unmethylated". The processed 32 | epireads will then be used for entropy calculation. To run the 33 | program, type command: 34 | ```shell 35 | $ dnmtools entropy -w 5 -v -o output.meth /path/to/genome.fa input-sorted.epiread 36 | ``` 37 | 38 | The output format is the same as [counts](../counts) 39 | output. 
The first 3 columns indicate the genomic location of the 40 | center CpG in each sliding window, the 5th column contains the entropy 41 | values, and the 6th column shows the number of reads used for each 42 | sliding window. Below is an output example. 43 | 44 | ```txt 45 | chr1 483 + CpG 2.33914 27 46 | chr1 488 + CpG 2.05298 23 47 | chr1 492 + CpG 1.4622 24 48 | chr1 496 + CpG 1.8784 35 49 | ``` 50 | 51 | ## Options 52 | ```txt 53 | -w, -window 54 | ``` 55 | number of CpGs in sliding window (default: 4) 56 | ```txt 57 | -F, -flip 58 | ``` 59 | flip read majority state to meth 60 | ```txt 61 | -o, -output 62 | ``` 63 | Name of output file (default: STDOUT) 64 | ```txt 65 | -v, -verbose 66 | ``` 67 | print more run info to STDERR while the program is running. 68 | 69 | -------------------------------------------------------------------------------- /src/common/counts_header.hpp: -------------------------------------------------------------------------------- 1 | /* counts_header: code for writing, detecting and skipping the header 2 | * of counts format files that is common to several tools. 3 | * 4 | * Copyright (C) 2023 Andrew D. Smith 5 | * 6 | * Authors: Andrew D. Smith 7 | * 8 | * This program is free software: you can redistribute it and/or 9 | * modify it under the terms of the GNU General Public License as 10 | * published by the Free Software Foundation, either version 3 of the 11 | * License, or (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, but 14 | * WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | * General Public License for more details.
17 | */ 18 | 19 | #ifndef COUNTS_HEADER_HPP 20 | #define COUNTS_HEADER_HPP 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | namespace bamxx { 27 | struct bam_header; 28 | } 29 | namespace bamxx { 30 | struct bgzf_file; 31 | } 32 | 33 | std::unordered_map 34 | write_counts_header_from_chrom_sizes( 35 | const std::vector &chrom_names, 36 | const std::vector &chrom_sizes, bamxx::bgzf_file &out); 37 | 38 | std::unordered_map 39 | write_counts_header_from_file(const std::string &header_file, 40 | bamxx::bgzf_file &out); 41 | 42 | // returns -1 on failure, 0 on success 43 | int 44 | get_chrom_sizes_for_counts_header(const std::int32_t n_threads, 45 | const std::string &filename, 46 | std::vector &chrom_names, 47 | std::vector &chrom_sizes); 48 | 49 | void 50 | write_counts_header_from_bam_header(const bamxx::bam_header &hdr, 51 | bamxx::bgzf_file &out); 52 | 53 | bool 54 | write_counts_header_line(std::string line, bamxx::bgzf_file &out); 55 | 56 | bamxx::bgzf_file & 57 | skip_counts_header(bamxx::bgzf_file &in); 58 | 59 | bool 60 | get_has_counts_header(const std::string &filename); 61 | 62 | inline bool 63 | is_counts_header_version_line(const std::string &line) { 64 | const auto version_line = "#DNMTOOLS"; 65 | return line.compare(0, 9, version_line) == 0; 66 | } 67 | 68 | template 69 | inline bool 70 | is_counts_header_line(T &line) { 71 | return line[0] == '#'; 72 | } 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /src/radmeth/radmeth_design.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * Author: Andrew D Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) 8 | * any later version. 
9 | * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | * more details. 14 | */ 15 | 16 | #ifndef RADMETH_DESIGN_HPP 17 | #define RADMETH_DESIGN_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | struct Design { 27 | std::vector factor_names; 28 | std::vector sample_names; 29 | std::vector> matrix; // samples=rows, factors=cols 30 | std::vector> tmatrix; // factors=rows, samples=cols 31 | std::vector> groups; // combs of fact levels 32 | std::vector group_id; // assign group to sample 33 | 34 | [[nodiscard]] static Design 35 | read_design(const std::string &design_filename); 36 | 37 | [[nodiscard]] std::size_t 38 | n_factors() const { 39 | return std::size(factor_names); 40 | } 41 | 42 | [[nodiscard]] std::size_t 43 | n_groups() const { 44 | return std::size(groups); 45 | } 46 | 47 | [[nodiscard]] std::size_t 48 | n_samples() const { 49 | return std::size(sample_names); 50 | } 51 | 52 | [[nodiscard]] Design 53 | drop_factor(const std::uint32_t factor_idx); 54 | 55 | void 56 | order_samples(const std::vector &ordered_names); 57 | 58 | [[nodiscard]] std::uint32_t 59 | get_test_factor_idx(const std::string &test_factor) const; 60 | 61 | [[nodiscard]] bool 62 | has_two_values(const std::size_t test_factor) const; 63 | }; 64 | 65 | std::istream & 66 | operator>>(std::istream &is, Design &design); 67 | 68 | std::ostream & 69 | operator<<(std::ostream &os, const Design &design); 70 | 71 | void 72 | ensure_sample_order(const std::string &table_filename, Design &design); 73 | 74 | [[nodiscard]] std::vector 75 | get_sample_names_from_header(const std::string &header); 76 | 77 | [[nodiscard]] bool 78 | consistent_sample_names(const Design &design, const std::string &header); 79 | 80 | #endif // RADMETH_DESIGN_HPP 81 | 
-------------------------------------------------------------------------------- /src/common/Interval.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * This is free software; you can redistribute it and/or modify it under the 4 | * terms of the GNU General Public License as published by the Free Software 5 | * Foundation; either version 2 of the License, or (at your option) any later 6 | * version. 7 | * 8 | * This is distributed in the hope that it will be useful, but WITHOUT ANY 9 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 11 | * details. 12 | * 13 | * You should have received a copy of the GNU General Public License along 14 | * with this software; if not, write to the Free Software Foundation, Inc., 51 15 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | */ 17 | 18 | #include "Interval.hpp" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | // Parse a "chrom start stop" record from [c, c_end); fields are separated by spaces or tabs. Returns false if a field is missing or start/stop fail to parse. 28 | auto 29 | Interval::initialize(const char *c, const char *c_end) -> bool { 30 | constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; }; 31 | constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; }; 32 | 33 | bool failed = false; 34 | if (c == c_end) return false; // empty input: field_s + 1 below would already be past c_end 35 | // NOLINTBEGIN(*-pointer-arithmetic) 36 | auto field_s = c; 37 | auto field_e = std::find_if(field_s + 1, c_end, is_sep); 38 | if (field_e == c_end) 39 | failed = true; 40 | 41 | // chrom 42 | { 43 | const std::uint32_t d = std::distance(field_s, field_e); 44 | chrom = std::string{field_s, d}; 45 | } 46 | if (field_e == c_end) return false; // only one field: field_e + 1 would step past c_end 47 | // start 48 | field_s = std::find_if(field_e + 1, c_end, not_sep); 49 | field_e = std::find_if(field_s, c_end, is_sep); // field_s is a non-separator or c_end, so no +1 is needed 50 | failed = failed || (field_e == c_end); 51 | { 52 | const auto [ptr, ec] = std::from_chars(field_s, field_e, start); 53 | failed = failed || ec != 
std::errc{}; 54 | } 55 | if (field_e == c_end) return false; // no stop field: field_e + 1 would step past c_end 56 | // stop 57 | field_s = std::find_if(field_e + 1, c_end, not_sep); 58 | field_e = std::find_if(field_s, c_end, is_sep); // safe when field_s == c_end; from_chars then reports the empty field 59 | { 60 | const auto [ptr, ec] = std::from_chars(field_s, field_e, stop); 61 | failed = failed || ec != std::errc{}; 62 | } 63 | // NOLINTEND(*-pointer-arithmetic) 64 | 65 | return !failed; 66 | } 67 | 68 | [[nodiscard]] auto 69 | read_intervals(const std::string &intervals_file) -> std::vector { 70 | std::ifstream in(intervals_file); 71 | if (!in) 72 | throw std::runtime_error("failed to open file: " + intervals_file); 73 | std::string line; 74 | std::vector intervals; 75 | while (getline(in, line)) 76 | intervals.emplace_back(line); 77 | return intervals; 78 | } 79 | -------------------------------------------------------------------------------- /docs/content/radadjust.md: -------------------------------------------------------------------------------- 1 | # radadjust - Correct p-values of individual CpGs 2 | 3 | ## Synopsis 4 | ```console 5 | $ dnmtools radadjust [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | This program adjusts the p-value of individual CpGs in the output of 11 | [radmeth](../radmeth). A typical application 12 | that takes the regression output as input and combines the p-values of 13 | 200 neighboring CpGs is done as follows. 14 | ```console 15 | $ dnmtools radadjust -bins 1:200:1 input.bed >output-adjusted.bed 16 | ``` 17 | 18 | Here, the only required parameter, besides the input file, is `-bins` 19 | whose value is set to `1:200:1` (which is also the default value). This 20 | means that for each `n = 1, 2, ...199`, `radmeth-adjust` computes the 21 | correlation between p-values of CpGs located at distance n from each 22 | other. These correlations are used during significance combination 23 | step. In addition, bin sizes determine the window for combining 24 | significance.
In contrast, if `-bins` is set to `1:15:5`, then the 25 | correlation is computed separately for p-values corresponding to CpGs 26 | at distances `[1, 5)`, `[5, 10)`, and `[10, 15)` from one another. The 27 | first five columns and the last four columns of `radmeth-adjust` have 28 | the same meaning as those output by radmeth regression. The 6th column 29 | gives the modified p-value based on the original p-value of the site 30 | and the p-values of its neighbors. The 7th column gives the 31 | FDR-corrected p-value. Then the last four columns correspond to the 32 | total read counts and methylated read counts of the case group and 33 | control group, respectively. Here is what the `output-adjusted.bed` 34 | file looks like for our example dataset: 35 | 36 | ```txt 37 | chr1 108 + CpG 0.157971 0.099290 0.353466 18 4 20 15 38 | chr1 114 + CpG 0.559191 0.099290 0.353466 21 3 41 10 39 | chr1 160 + CpG 0.095112 0.099290 0.353466 32 24 39 17 40 | chr1 309 + CpG 0.239772 0.122248 0.368902 33 17 19 13 41 | chr1 499 + CpG 0.770140 0.204467 0.419872 43 22 29 15 42 | ``` 43 | 44 | After completing the previous steps, individual differentially 45 | methylated sites can be obtained with 'awk'. To get all CpGs with 46 | FDR-corrected p-value below 0.01, run 47 | 48 | ```console 49 | $ awk '$7 <= 0.01' output-adjusted.bed >output-significant.bed 50 | ``` 51 | 52 | ## Options 53 | 54 | ```txt 55 | -o, -out 56 | ``` 57 | Name of the output file (default: stdout). 58 | 59 | ```txt 60 | -b, -bins 61 | ``` 62 | Correlation bin specification string (default is 1:200:1). 63 | 64 | ```txt 65 | -v, -verbose 66 | ``` 67 | Print more information while the command is running.
68 | -------------------------------------------------------------------------------- /docs/content/states.md: -------------------------------------------------------------------------------- 1 | # states - Allele-specific methylation file format 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools states [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | All programs that calculate statistics related to ASM must take the 11 | linked states of CpG sites within reads into account. Using full read 12 | sequences for this purpose is inefficient, so we defined an 13 | intermediate format, "epiread," for this purpose. The `states` command 14 | will convert a BAM or SAM file of mapped reads into a "states" file in 15 | the format used by `amrfinder` and `amrtester`. 16 | 17 | The epiread format consists of three columns. The first column is the 18 | chromosome name for the mapped read, the second is the "index" of the 19 | first CpG in the read. The index `x` indicates that the first CpG site 20 | in the read corresponds to the `x`'th (starting from 0) CpG site in 21 | the chromosome. Therefore, these are not nucleotide positions in the 22 | genome. The final column in the epiread format is the sequence of 23 | methylation states within the read. This sequence of states is 24 | composed of 3 possible letters: C if the corresponding letter at that 25 | CpG site in the mapped read is a C, and similar for T. Within this 26 | state sequence, letters in mapped reads at positions corresponding to 27 | CpG sites that are neither C nor T are encoded as N. Aside from the 28 | "N" this is effectively a binary encoding of methylation states. 
29 | 30 | Here is an example showing how some lines of an epiread format file might 31 | look: 32 | ```txt 33 | chr1 1460 CCCCCCCC 34 | chr1 1460 CCC 35 | chr1 1461 TCTTNNNNTTCT 36 | chr1 1468 CCCC 37 | chr1 1469 CCC 38 | chr1 1469 CCCT 39 | chr1 1469 CCC 40 | chr1 1469 CCCCCCT 41 | chr1 1469 CCC 42 | chr1 1470 CCCC 43 | chr1 1471 CCCNNNNNNTCCC 44 | chr1 1472 CCC 45 | ``` 46 | Those epireads with the "N" in the middle correspond to paired-end 47 | reads with ends that are joined. It is important to use these as one 48 | fragment because linking methylation states within a fragment, over as 49 | large a distance as possible, helps the inference methods within both 50 | `amrfinder` and `amrtester`. 51 | 52 | The following is an example of how to run the `states` command: 53 | ```shell 54 | $ dnmtools states -c /path/to/genome.fa -o output.epiread input.sam 55 | ``` 56 | 57 | ## Options 58 | 59 | ```txt 60 | -o, -output 61 | ``` 62 | The name of the output file. 63 | 64 | ```txt 65 | -c, -chrom 66 | ``` 67 | FASTA file of chromosomes containing FASTA files [required]. 68 | 69 | ```txt 70 | -v, -verbose 71 | ``` 72 | Print information to the terminal while the program runs. 73 | 74 | ```txt 75 | -z, -zip 76 | ``` 77 | Write output in gzip compressed format. 78 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Define a base to inherit from so ARGs can be collected here 2 | FROM alpine:latest as base 3 | ARG NUM_JOBS=16 4 | ARG HTSLIB_VERSION=1.21 5 | ARG SAMTOOLS_VERSION=1.21 6 | 7 | # All builds from source are installed to their own directory so the 8 | # essential file from them can be retrieved separately as needed. 
9 | 10 | # Build htslib and samtools from source 11 | FROM base as build_htslib 12 | RUN apk update 13 | WORKDIR /build 14 | 15 | RUN apk add --no-cache \ 16 | build-base \ 17 | autoconf \ 18 | automake \ 19 | gsl-dev \ 20 | zlib-dev \ 21 | bzip2-dev \ 22 | xz-dev \ 23 | ncurses-dev \ 24 | wget 25 | ENV HTSLIB=htslib-${HTSLIB_VERSION} 26 | ENV SAMTOOLS=samtools-${SAMTOOLS_VERSION} 27 | RUN cd /build \ 28 | && wget -nv https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/${HTSLIB}.tar.bz2 \ 29 | && tar -xf ${HTSLIB}.tar.bz2 \ 30 | && cd ${HTSLIB} \ 31 | && autoreconf -i \ 32 | && ./configure --prefix=$(pwd) \ 33 | && make -j${NUM_JOBS} \ 34 | && make install 35 | RUN cd /build \ 36 | && wget -nv https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/${SAMTOOLS}.tar.bz2 \ 37 | && tar -xf ${SAMTOOLS}.tar.bz2 \ 38 | && cd ${SAMTOOLS} \ 39 | && ./configure --prefix=$(pwd) \ 40 | && make -j${NUM_JOBS} \ 41 | && make install 42 | 43 | # Build dnmtools 44 | FROM base as build_dnmtools 45 | RUN apk update 46 | WORKDIR /build 47 | ENV HTSLIB=htslib-${HTSLIB_VERSION} 48 | ENV SAMTOOLS=samtools-${SAMTOOLS_VERSION} 49 | RUN apk add --no-cache \ 50 | build-base \ 51 | autoconf \ 52 | automake \ 53 | gsl-dev \ 54 | zlib-dev \ 55 | bzip2-dev \ 56 | xz-dev \ 57 | wget \ 58 | gzip \ 59 | bash 60 | 61 | # Copying the install from within the build tree helps keep things 62 | # smaller than installing earlier and copying the /usr or /usr/local 63 | COPY --from=build_htslib /build/${HTSLIB}/lib /usr/lib 64 | COPY --from=build_htslib /build/${HTSLIB}/include /usr/include 65 | COPY --from=build_htslib /build/${SAMTOOLS}/bin /usr/bin 66 | 67 | RUN mkdir /build/dnmtools 68 | COPY . 
/build/dnmtools 69 | RUN cd /build/dnmtools \ 70 | && ./autogen.sh \ 71 | && ./configure --prefix=$(pwd) \ 72 | && make -j${NUM_JOBS} \ 73 | && make -j${NUM_JOBS} check \ 74 | && make -j${NUM_JOBS} distcheck \ 75 | && make install 76 | 77 | # Build a light-weight image just with binaries 78 | FROM base 79 | ENV HTSLIB=htslib-${HTSLIB_VERSION} 80 | RUN apk update 81 | WORKDIR /build 82 | RUN apk add \ 83 | gsl-dev \ 84 | zlib-dev \ 85 | bzip2-dev \ 86 | xz-dev \ 87 | libstdc++ \ 88 | libgomp 89 | COPY --from=build_htslib /build/${HTSLIB}/lib /usr/lib 90 | COPY --from=build_dnmtools /build/dnmtools/dnmtools /usr/bin 91 | 92 | ENTRYPOINT ["dnmtools"] 93 | -------------------------------------------------------------------------------- /src/common/EmissionDistribution.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2022 Andrew D Smith 3 | 4 | Authors: Andrew D. Smith and Benjamin E. Decato 5 | 6 | This file is part of dnmtools. 7 | 8 | dnmtools is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 2 of the License, or 11 | (at your option) any later version. 12 | 13 | dnmtools is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | */ 18 | 19 | #ifndef EM_DTN 20 | #define EM_DTN 21 | 22 | #include // IWYU pragma: keep 23 | #include 24 | #include 25 | #include 26 | 27 | /** Emission distributions for methylation should be modeled either as 28 | * Beta or Beta Binomial. Since they will be used simultaneously, it is 29 | * helpful to have an abstraction so that we can put them in the same 30 | * container. 
31 | */ 32 | class EmissionDistribution { 33 | public: 34 | EmissionDistribution(); 35 | virtual ~EmissionDistribution(); 36 | EmissionDistribution(const double a, const double b); 37 | explicit EmissionDistribution(const std::string &str); // explicit, matching the string constructors of Beta and BetaBinomial 38 | virtual double 39 | operator()(const std::pair &val) const = 0; 40 | virtual double 41 | log_likelihood(const std::pair &val) const = 0; 42 | std::string 43 | tostring() const; 44 | double 45 | getalpha() const { // read-only accessor; const so it can be called through const references 46 | return alpha; 47 | } 48 | double 49 | getbeta() const { // read-only accessor; const so it can be called through const references 50 | return beta; 51 | } 52 | void 53 | fit(const std::vector &vals_a, const std::vector &vals_b, 54 | const std::vector &p); 55 | 56 | protected: 57 | double 58 | sign(const double x); 59 | double 60 | invpsi(const double tolerance, const double x); 61 | double 62 | movement(const double curr, const double prev); 63 | double alpha{}; 64 | double beta{}; 65 | double lnbeta_helper{}; 66 | 67 | static constexpr double tolerance = 1e-10; 68 | }; 69 | 70 | class Beta : public EmissionDistribution { 71 | public: 72 | Beta(); 73 | Beta(const double a, const double b); 74 | explicit Beta(const std::string &str); 75 | double 76 | operator()(const std::pair &val) const override; 77 | double 78 | log_likelihood(const std::pair &val) const override; 79 | }; 80 | 81 | class BetaBinomial : public EmissionDistribution { 82 | public: 83 | BetaBinomial(); 84 | BetaBinomial(const double a, const double b); 85 | explicit BetaBinomial(const std::string &str); 86 | double 87 | operator()(const std::pair &val) const override; 88 | double 89 | log_likelihood(const std::pair &val) const override; 90 | }; 91 | 92 | #endif 93 | -------------------------------------------------------------------------------- /src/common/bsutils.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2018-2025 Andrew D. Smith 2 | * 3 | * Author: Andrew D.
Smith 4 | * 5 | * This program is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) 8 | * any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | * more details. 14 | */ 15 | 16 | #include "bsutils.hpp" 17 | #include "dnmtools_gaussinv.hpp" 18 | 19 | #include "Interval6.hpp" 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | void 33 | wilson_ci_for_binomial(const double alpha, const double n, const double p_hat, 34 | double &lower, double &upper) { 35 | if (n <= 0.0) { // protection 36 | lower = 0.0; 37 | upper = 1.0; 38 | return; 39 | } 40 | const double z = dnmt_gsl_cdf_ugaussian_Pinv(1 - alpha / 2); 41 | const double denom = 1 + z * z / n; 42 | const double first_term = p_hat + z * z / (2 * n); 43 | const double discriminant = 44 | std::max(0.0, p_hat * (1 - p_hat) / n + z * z / (4 * n * n)); 45 | lower = std::max(0.0, (first_term - z * std::sqrt(discriminant)) / denom); 46 | upper = std::min(1.0, (first_term + z * std::sqrt(discriminant)) / denom); 47 | } 48 | 49 | void 50 | adjust_region_ends(const std::vector> &clusters, 51 | std::vector ®ions) { 52 | assert(std::size(clusters) == std::size(regions)); 53 | for (std::size_t i = 0; i < std::size(regions); ++i) { 54 | auto max_pos = regions[i].stop; 55 | auto min_pos = regions[i].start; 56 | for (std::size_t j = 0; j < std::size(clusters[i]); ++j) { 57 | max_pos = std::max(clusters[i][j].stop, max_pos); 58 | min_pos = std::min(clusters[i][j].start, min_pos); 59 | } 60 | regions[i].stop = max_pos; 61 | regions[i].start = min_pos; 62 | } 
63 | } 64 | // For each mapped location, append to 'lookup' the index in 'names' of the entry equal to that location's name; throws std::runtime_error when the name is absent from 'names'. 65 | void 66 | relative_sort(const std::vector &mapped_locations, 67 | const std::vector &names, 68 | std::vector &lookup) { 69 | std::unordered_map names_map; 70 | for (std::size_t i = 0; i < std::size(names); ++i) 71 | names_map[names[i]] = i; 72 | for (std::size_t i = 0; i < std::size(mapped_locations); ++i) { 73 | const auto j = names_map.find(mapped_locations[i].name); 74 | if (j == std::cend(names_map)) // report the name that was looked up; i indexes mapped_locations, not names 75 | throw std::runtime_error("read sequence not found for: " + mapped_locations[i].name); 76 | lookup.push_back(j->second); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/common/Interval.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * This is free software; you can redistribute it and/or modify it under the 4 | * terms of the GNU General Public License as published by the Free Software 5 | * Foundation; either version 2 of the License, or (at your option) any later 6 | * version. 7 | * 8 | * This is distributed in the hope that it will be useful, but WITHOUT ANY 9 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 11 | * details.
12 | * 13 | * You should have received a copy of the GNU General Public License along 14 | * with this software; if not, write to the Free Software Foundation, Inc., 51 15 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | */ 17 | 18 | #ifndef INTERVAL_HPP_ 19 | #define INTERVAL_HPP_ 20 | 21 | #include 22 | // #include // ADS: needs c++20 23 | #include 24 | #include // std::size 25 | #include 26 | #include 27 | #include 28 | // An interval on a named chromosome: chrom plus unsigned 32-bit start and stop. 29 | struct Interval { 30 | std::string chrom; 31 | std::uint32_t start{}; 32 | std::uint32_t stop{}; 33 | 34 | Interval() = default; 35 | Interval(const std::string &chrom, const std::uint32_t start, 36 | const std::uint32_t stop) : chrom{chrom}, start{start}, stop{stop} {} 37 | // Build from a text line; throws std::runtime_error when initialize() rejects it. 38 | explicit Interval(const std::string &line) { 39 | if (!initialize(line.data(), line.data() + std::size(line))) 40 | throw std::runtime_error("bad interval line: " + line); 41 | } 42 | auto 43 | initialize(const char *, const char *) -> bool; 44 | // Lexicographic order on (chrom, start, stop). 45 | [[nodiscard]] auto 46 | operator<(const Interval &rhs) const { 47 | return (chrom < rhs.chrom || 48 | (chrom == rhs.chrom && 49 | (start < rhs.start || (start == rhs.start && stop < rhs.stop)))); 50 | } 51 | // Equal only when chrom, start and stop all match. 52 | [[nodiscard]] auto 53 | operator==(const Interval &rhs) const { 54 | return chrom == rhs.chrom && start == rhs.start && stop == rhs.stop; 55 | } 56 | 57 | // auto 58 | // operator<=>(const Interval &) const = default; 59 | }; 60 | 61 | inline auto 62 | operator<<(std::ostream &os, const Interval &x) -> std::ostream & { 63 | return os << x.chrom << "\t" << x.start << "\t" << x.stop; 64 | } 65 | 66 | [[nodiscard]] inline auto 67 | to_string(const Interval &x) -> std::string { 68 | return x.chrom + "\t" + std::to_string(x.start) + "\t" + 69 | std::to_string(x.stop); 70 | } 71 | 72 | // ADS: need to bump to c++20 for this 73 | // 74 | // template <> struct std::formatter : std::formatter { 75 | // auto 76 | // format(const Interval &i, format_context &ctx) const { 77 | // static constexpr auto fmt = 
"{}\t{}\t{}"; 78 | // return std::formatter::format( 79 | // std::format(fmt, i.chrom, i.start, i.stop), ctx); 80 | // } 81 | // }; 82 | 83 | [[nodiscard]] inline auto 84 | size(const Interval &x) { 85 | return x.stop > x.start ? x.stop - x.start : 0ul; 86 | } 87 | 88 | [[nodiscard]] auto 89 | read_intervals(const std::string &intervals_file) -> std::vector; 90 | 91 | #endif // INTERVAL_HPP_ 92 | -------------------------------------------------------------------------------- /docs/content/uniq.md: -------------------------------------------------------------------------------- 1 | # uniq - ensure reads are not duplicates 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools uniq [OPTIONS] [out-sorted.sam] 6 | ``` 7 | 8 | ## Description 9 | 10 | The `uniq` command removes PCR duplicates. Before calculating 11 | methylation level, you should now remove duplicate reads, which in 12 | wgbs data are typically identified by their mapping to identical 13 | genomic locations. These reads are most likely PCR clones rather than 14 | representations of distinct DNA molecules. The command `uniq` remove 15 | such duplicates. It collects duplicate reads and/or fragments that 16 | have identical sequences and are mapped to the same genomic location 17 | (same chromosome, same start and end positions, and same strand), and 18 | chooses a random one to be the representative of the original DNA 19 | sequence. 20 | 21 | *Note* As of dnmtools v1.2.5, the option to use the sequence of reads 22 | when deciding if two reads are duplicates has been removed. In the 23 | context of analyzing bisulfite sequencing reads, this has the danger 24 | of introducing bias in downstream analyses. Also, in the same version 25 | the test for sorted order of reads cannot be disabled. Empirical tests 26 | showed very little improvement to speed when disabling this test. 27 | 28 | The `uniq` command can take reads sorted by (chrom, start, end, 29 | strand). 
If the reads in the input file are not sorted, run the 30 | following sort command using [samtools](https://samtools.github.io): 31 | 32 | ```shell 33 | $ samtools sort -o reads_sorted.bam reads.bam 34 | ``` 35 | 36 | Next, execute the following command to remove duplicate reads: 37 | 38 | ```shell 39 | $ dnmtools uniq -S duplicate-removal-stats.txt reads_sorted.bam reads_uniq.bam 40 | ``` 41 | 42 | ## Options 43 | 44 | ```txt 45 | -t, -threads 46 | ``` 47 | The number of threads to use. These threads are used for I/O, and are 48 | most helpful when the input and output are both BAM, where the threads 49 | can really speed things up. 50 | 51 | ```txt 52 | -S, -summary 53 | ``` 54 | Save statistics on duplication rates to this file. The statistics are not 55 | reported unless a file is specified here. This option is correct as of v1.4.0. 56 | 57 | ```txt 58 | -hist 59 | ``` 60 | Output a histogram of duplication frequencies into the specified file 61 | for library complexity analysis. 62 | 63 | ```txt 64 | -B, -bam 65 | ``` 66 | The output is in BAM format. This is an option to help prevent 67 | accidentally writing BAM format to the terminal or through a pipe that 68 | expects plain text, e.g., SAM. 69 | 70 | ```txt 71 | -stdout 72 | ``` 73 | Write the output to standard out. This is not done by default even 74 | without an output file given, because of the danger of writing BAM to 75 | the terminal or through a pipe unexpectedly. It is possible to write 76 | BAM redirected or through a pipe, but the `-stdout` argument is 77 | required. 78 | 79 | ```txt 80 | -s, -seed 81 | ``` 82 | Random number seed. Affects which read is kept among duplicates. The 83 | default seed is 408. This option is typically only used for testing. 84 | 85 | ```txt 86 | -v, -verbose 87 | ``` 88 | Report more information while the program is running. 
89 | -------------------------------------------------------------------------------- /.github/workflows/dnmtools_release_linux.yml: -------------------------------------------------------------------------------- 1 | name: DNMTools release (Linux) 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | CONTAINER: andrewdavidsmith/transferase-build 8 | 9 | jobs: 10 | linux-releases: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Start docker container 14 | # Pull the container, run it in detached mode, mount the workspace 15 | run: | 16 | docker pull $CONTAINER 17 | docker run --name build-container \ 18 | -d -v ${{ github.workspace }}:/workspace $CONTAINER tail -f /dev/null 19 | - uses: actions/checkout@v4 20 | with: 21 | submodules: recursive 22 | - name: Get version number 23 | id: get-vn 24 | run: | 25 | awk '/AC_INIT/ {print "vn="$2}' configure.ac | sed "s/\[//; s/\]//; s/,//" >> "$GITHUB_OUTPUT" 26 | env: 27 | GH_TOKEN: ${{ github.token }} 28 | - name: Configure and build 29 | # Build static libdeflate and htslib, build dnmtools, then package it as a gzip-compressed tarball 30 | env: SCRIPT: | 31 | export DEBIAN_FRONTEND=noninteractive && \ 32 | apt-get update && apt-get install --no-install-recommends -y automake libgsl-dev && \ 33 | find /usr -name libz.so -exec rm {} \; && \ 34 | find /usr -name libgsl\*.so -exec rm {} \; && \ 35 | git clone https://github.com/ebiggers/libdeflate.git && \ 36 | cd libdeflate && \ 37 | cmake -B build \ 38 | -DLIBDEFLATE_BUILD_GZIP=off \ 39 | -DLIBDEFLATE_BUILD_TESTS=off \ 40 | -DLIBDEFLATE_BUILD_SHARED_LIB=off \ 41 | -DCMAKE_VERBOSE_MAKEFILE=on \ 42 | -DCMAKE_BUILD_TYPE=Release && \ 43 | cmake --build build -j4 && \ 44 | cmake --install build --prefix=/usr/local && \ 45 | cd ..
&& \ 46 | git clone --recursive https://github.com/samtools/htslib.git && \ 47 | cd htslib && \ 48 | autoreconf -i && \ 49 | mkdir build && cd build && \ 50 | ../configure \ 51 | --disable-bz2 \ 52 | --disable-libcurl \ 53 | --disable-lzma \ 54 | --disable-ref-cache \ 55 | --with-libdeflate && \ 56 | make -j4 CFLAGS="-Wall -O2 -fvisibility=hidden" libhts.a && \ 57 | cp libhts.a /usr/local/lib/ && \ 58 | cp -r ../htslib /usr/local/include/ && \ 59 | cd /workspace && \ 60 | autoreconf -i && \ 61 | mkdir build && cd build && \ 62 | ../configure --with-libdeflate && \ 63 | ../data/make_full_license_info_header.sh ../data/LICENSE > license.h && \ 64 | echo "#define INCLUDE_FULL_LICENSE_INFO 1" >> config.h && \ 65 | make -j4 LDFLAGS="-static-libgcc -static-libstdc++ -s" && \ 66 | tar -czf dnmtools-${{ steps.get-vn.outputs.vn }}-Linux.tar.gz dnmtools 67 | run: | 68 | docker exec build-container bash -c "$SCRIPT" 69 | - name: Upload the binary 70 | uses: actions/upload-artifact@v4 71 | with: 72 | name: dnmtools-${{ steps.get-vn.outputs.vn }}-Linux.tar.gz 73 | path: build/dnmtools-${{ steps.get-vn.outputs.vn }}-Linux.tar.gz 74 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: 'cert-*,cppcoreguidelines-*,performance-*,clang-diagnostic-*,clang-analyzer-*,-clang-diagnostic-unqualified-std-cast-call,-clang-diagnostic-unknown-warning-option,-clang-analyzer-unix.BlockInCriticalSection,-cppcoreguidelines-pro-type-vararg' 2 | WarningsAsErrors: '*' 3 | HeaderFileExtensions: 4 | - '' 5 | - h 6 | - hh 7 | - hpp 8 | - hxx 9 | ImplementationFileExtensions: 10 | - c 11 | - cc 12 | - cpp 13 | - cxx 14 | HeaderFilterRegex: '' 15 | ExcludeHeaderFilterRegex: 'OptionParser.hpp' 16 | FormatStyle: none 17 | CheckOptions: 18 | cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU' 19 | cert-err33-c.AllowCastToVoid: 'true' 20 | cert-err33-c.CheckedFunctions:
'::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;' 21 | cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false' 22 | 
cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false' 23 | cppcoreguidelines-non-private-member-variables-in-classes.IgnorePublicMemberVariables: 'true' 24 | google-readability-braces-around-statements.ShortStatementLines: '1' 25 | google-readability-function-size.StatementThreshold: '800' 26 | google-readability-namespace-comments.ShortNamespaceLines: '10' 27 | google-readability-namespace-comments.SpacesBeforeComments: '2' 28 | llvm-else-after-return.WarnOnConditionVariables: 'false' 29 | llvm-else-after-return.WarnOnUnfixable: 'false' 30 | llvm-qualified-auto.AddConstToQualified: 'false' 31 | SystemHeaders: 'false' 32 | -------------------------------------------------------------------------------- /docs/content/diff.md: -------------------------------------------------------------------------------- 1 | # diff - compute methylation difference probabilities 2 | 3 | ## Synopsis 4 | ```console 5 | $ dnmtools diff [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | Suppose that we want to compare two methylomes: `input-a.meth` and 11 | `input-b.meth`. Both these files would have been produced by the 12 | [counts](../counts) command. We start by calculating the differential 13 | methylation score (probability) for each CpG site using the `diff` 14 | command: 15 | 16 | ```console 17 | $ dnmtools diff -o output.diff input-a.meth input-b.meth 18 | ``` 19 | 20 | Here are the first few lines of the output: 21 | 22 | ```txt 23 | chr1 3000826 + CpG 0.609908 16 7 21 11 24 | chr1 3001006 + CpG 0.874119 21 18 15 22 25 | chr1 3001017 + CpG 0.888384 20 19 15 25 26 | chr1 3001276 + CpG 0.010825 3 20 12 16 27 | ``` 28 | 29 | The first four columns are the same as the counts input files. The 5th 30 | column gives the probability that the methylation level at each given 31 | site is lower in `input-a.meth` than `input-b.meth`. (For the other 32 | direction, you can either swap the order of the two input files or 33 | just subtract the probability from 1.0.) 
The method used to calculate 34 | this probability is explained by Altham (see reference below), and is 35 | simply a one-directional version of Fisher's exact test. The remaining 36 | columns in the output give the number of methylated reads of each CpG 37 | in `input-a.meth`, number of unmethylated reads in `input-a.meth`, 38 | number of methylated reads in `input-b.meth`, and number of 39 | unmethylated reads in `input-b.meth`, respectively. 40 | 41 | The two input files must have all sites within each chromosome 42 | consecutive, have the same chromosome order, and have sites sorted in 43 | increasing order within each chromosome. The order of chromosomes does 44 | not matter (e.g., chr10 may precede chr2, or chr2 may precede chr10). 45 | 46 | **Warning** the order of the samples/methylomes given as input, the 47 | "a" and "b", matters. It is probably a good idea to include this order 48 | in the output file name, for example as `output_a_lt_b.diff`. 49 | 50 | The output from the `diff` command is used as input for the 51 | [dmr](../dmr) program, but may also form the basis of visualization if 52 | you want to plot differential methylation probabilities, for example 53 | along the genome in a genome browser. 54 | 55 | Reference: 56 | ```txt 57 | Patricia M. E. Altham (1969) 58 | Exact bayesian analysis of a 2x2 contingency table, and Fisher's "exact" significance test 59 | Journal of the Royal Statistical Society, Series B (Methodological) 60 | 31(2):261-269 61 | ``` 62 | 63 | ## Options 64 | 65 | ```txt 66 | -p, -pseudo 67 | ``` 68 | The pseudocount to use (default: 1). 69 | 70 | ```txt 71 | -A, -nonzero-only 72 | ``` 73 | Process only sites with coverage in both samples. 74 | 75 | ```txt 76 | -o, -out 77 | ``` 78 | The name of the output file. If no file name is provided, the output 79 | will be written to standard output. Due to the size of this output, a 80 | file name should be specified unless the output will be piped to 81 | another command or program.
/// True iff `c` is an upper- or lower-case 'C'.
inline bool
is_cytosine(char c) {
  return (c == 'c' || c == 'C');
}

/// True iff `c` is an upper- or lower-case 'G'.
inline bool
is_guanine(char c) {
  return (c == 'g' || c == 'G');
}

/// True iff `c` is an upper- or lower-case 'T'.
inline bool
is_thymine(char c) {
  return (c == 't' || c == 'T');
}

/// True iff `c` is an upper- or lower-case 'A'.
inline bool
is_adenine(char c) {
  return (c == 'a' || c == 'A');
}

//// CONFIDENCE INTERVALS //**************////////////////////////
void
wilson_ci_for_binomial(const double alpha, const double n, const double p_hat,
                       double &lower, double &upper);

/// True iff positions `i` and `i + 1` of `s` both exist and hold a cytosine
/// followed by a guanine (case-insensitive).
///
/// Returns false, without touching `s`, when `i` is the last position or
/// beyond it -- including when `s` is empty.
inline bool
is_cpg(const std::string &s, size_t i) {
  const auto n = s.length();
  // `n > 1` must be tested first: with an empty string `n - 1` wraps around
  // to the largest size_t, which would let any `i` through and index `s`
  // out of bounds.
  return n > 1 && i < n - 1 && is_cytosine(s[i]) && is_guanine(s[i + 1]);
}
/// Distributes `regions` (and the parallel `seqs`) among the `big_regions`
/// they overlap.
///
/// After the call, `sep_regions` and `sep_seqs` each have one entry per big
/// region; entry `i` receives, in input order, every not-yet-consumed region
/// on the same chromosome as `big_regions[i]` whose start is below that big
/// region's end, after skipping regions that sort before it (smaller
/// chromosome name, or same chromosome and end <= the big region's start).
/// Each region is consumed at most once, by a single forward scan, so the
/// result is only meaningful when both inputs are sorted consistently.
///
/// `T` must provide `get_chrom()`, `get_start()` and `get_end()`.
/// `seqs[k]` is the payload for `regions[k]`, so `seqs` must be at least as
/// long as `regions` (checked with `assert` only).
///
/// NOTE(review): the template parameter list and element types were
/// reconstructed from how the body uses the arguments -- confirm against
/// the upstream header.
template <class T, class U>
static void
separate_regions(const std::vector<T> &big_regions,
                 const std::vector<T> &regions, const std::vector<U> &seqs,
                 std::vector<std::vector<T>> &sep_regions,
                 std::vector<std::vector<U>> &sep_seqs) {
  size_t rr_id = 0;
  const size_t n_regions = regions.size();
  assert(n_regions <= seqs.size());

  const size_t n_big_regions = big_regions.size();
  sep_regions.resize(n_big_regions);
  sep_seqs.resize(n_big_regions);
  for (size_t i = 0; i < n_big_regions; ++i) {
    const std::string current_chrom(big_regions[i].get_chrom());
    const size_t current_start = big_regions[i].get_start();
    const size_t current_end = big_regions[i].get_end();
    // skip regions that lie entirely before the current big region
    while (rr_id < n_regions && (regions[rr_id].get_chrom() < current_chrom ||
                                 (regions[rr_id].get_chrom() == current_chrom &&
                                  regions[rr_id].get_end() <= current_start)))
      ++rr_id;
    // collect regions that start inside the current big region
    while (rr_id < n_regions && (regions[rr_id].get_chrom() == current_chrom &&
                                 regions[rr_id].get_start() < current_end)) {
      sep_regions[i].push_back(regions[rr_id]);
      sep_seqs[i].push_back(seqs[rr_id]);
      ++rr_id;
    }
  }
}
11 | 12 | The workflow is triggered either manually or automatically by a tag event of 13 | type `v*.*.*`, which is intended for new releases. Currently, publishing the 14 | images can happen only to commits tagged by a version number. This is intended 15 | to associate every docker image with a version number. This means that there is 16 | no option to push the image for the latest commit if it is not tagged by 17 | a version number. 18 | 19 | ### Automatic build and publish in a tag event 20 | 21 | In a tag event of type `v*.*.*`, such as new release or retagging of version 22 | number, this workflow is triggered to build and publish the image for the 23 | tagged version number. The published image is tagged with SHA hash and the 24 | version number. It is also tagged with `latest` if the version number is the 25 | latest. 26 | 27 | ### Manual build (and publish) 28 | 29 | Manual trigger is intended to test the image build processes as well as publish 30 | an image for an existing version. In 31 | [Actions](https://github.com/smithlabcode/dnmtools/actions), go to `Docker image 32 | build` under `All workflows` and click `Run workflow` and choose from the 33 | following options: 34 | 35 | 1. `Build latest commit`: for testing for the latest commit 36 | 2. `Build existing version`: for testing a particular version 37 | 3. `Build + push existing version`: for publishing a particular version 38 | 39 | For options 2 and 3, specify the version number in the form `v*.*.*`. If not 40 | specified, the workflow will assume the latest version. 41 | 42 | ### Use scenarios 43 | 44 | **Before a new release**: It is a good idea to test image building before a new 45 | release. Manually trigger the workflow with option 1. If it builds with no 46 | issues, make a new release and the image will automatically be built and 47 | published. 48 | 49 | **Publish an existing version**: It is possible to publish a docker image for an 50 | existing version by option 3 in the manual trigger.
First, test build using 51 | option 2, and then publish using option 3. The published image is tagged with 52 | SHA hash and the version number. It is also tagged with `latest` if the version 53 | number is the latest. If option 3 is deployed with a version number for which 54 | a docker image already exists, it will simply rebuild and update the existing 55 | image. 56 | 57 | **Deleting an image**: If you have owner access to `smithlabcode`, you can 58 | delete an image by going 59 | [here](https://github.com/smithlabcode/dnmtools/pkgs/container/dnmtools/versions) 60 | and manually deleting a version. 61 | 62 | 63 | 64 | ## Installation 65 | The image can be pulled by one of the following commands. 66 | 67 | ```bash 68 | docker pull ghcr.io/smithlabcode/dnmtools:latest 69 | docker pull ghcr.io/smithlabcode/dnmtools:[7-DIGIT SHA] 70 | docker pull ghcr.io/smithlabcode/dnmtools:v[VERSION NUMBER] #(e.g. v1.4.2) 71 | ``` 72 | 73 | -------------------------------------------------------------------------------- /src/common/Interval6.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * This is free software; you can redistribute it and/or modify it under the 4 | * terms of the GNU General Public License as published by the Free Software 5 | * Foundation; either version 2 of the License, or (at your option) any later 6 | * version. 7 | * 8 | * This is distributed in the hope that it will be useful, but WITHOUT ANY 9 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 11 | * details.
/// A six-field interval record: chrom, start, stop, name, score, strand.
struct Interval6 {
  std::string chrom;
  std::uint32_t start{};
  std::uint32_t stop{};
  std::string name;
  double score{};
  char strand{};

  Interval6() = default;

  /// Field-by-field construction.
  Interval6(const std::string &chrom, const std::uint32_t start,
            const std::uint32_t stop, const std::string &name,
            const double score, const char strand) :
    chrom{chrom}, start{start}, stop{stop}, name{name}, score{score},
    strand{strand} {}

  /// Parses one text line; throws std::runtime_error if `initialize`
  /// rejects it.
  explicit Interval6(const std::string &line) {
    const char *const first = line.data();
    const bool parsed = initialize(first, first + std::size(line));
    if (!parsed)
      throw std::runtime_error("bad interval6 line: " + line);
  }

  /// Fills this object from the characters in [begin, end); returns true on
  /// success. Defined out of line.
  auto
  initialize(const char *, const char *) -> bool;

  /// Orders by chrom, then start, then stop; the remaining fields do not
  /// take part in the comparison.
  auto
  operator<(const Interval6 &rhs) const {
    if (chrom < rhs.chrom)
      return true;
    if (!(chrom == rhs.chrom))
      return false;
    if (start < rhs.start)
      return true;
    return start == rhs.start && stop < rhs.stop;
  }

  // A defaulted operator<=> would need c++20:
  // auto
  // operator<=>(const Interval6 &) const = default;
};

/// Writes the six fields separated by single tabs, with no trailing newline.
inline auto
operator<<(std::ostream &os, const Interval6 &x) -> std::ostream & {
  os << x.chrom << "\t" << x.start << "\t" << x.stop;
  os << "\t" << x.name << "\t" << x.score << "\t" << x.strand;
  return os;
}

/// Returns exactly the text that operator<< produces on a fresh
/// std::ostringstream.
[[nodiscard]] inline auto
to_string(const Interval6 &x) -> std::string {
  std::ostringstream buffer;
  buffer << x;
  return buffer.str();
}

// A std::formatter specialization for Interval6 would need c++20.

/// Width of the interval: stop - start, or 0 when stop does not exceed
/// start.
[[nodiscard]] inline auto
size(const Interval6 &x) {
  if (x.stop > x.start)
    return static_cast<unsigned long>(x.stop - x.start);
  return 0ul;
}

/// Reads one Interval6 per line from the named file.
[[nodiscard]] auto
read_intervals6(const std::string &intervals_file) -> std::vector<Interval6>;
chr1 540 + CpG 0.898191 6414 34 | chr1 564 + CpG 0.893356 6367 35 | chr1 569 + CpG 0.89719 6371 36 | chr1 572 + CpG 0.89482 6332 37 | chr1 577 + CpG 0.892193 6289 38 | chr1 583 + CpG 0.894065 6268 39 | chr1 585 + CpG 0.894627 6254 40 | chr1 588 + CpG 0.896248 6236 41 | chr1 594 + CpG 0.896346 6213 42 | chr1 602 + CpG 0.893856 6152 43 | chr1 606 + CpG 0.900572 6115 44 | chr1 609 + CpG 0.889762 6105 45 | chr1 612 + CpG 0.90954 6069 46 | chr1 617 + CpG 0.89103 6020 47 | chr1 620 + CpG 0.897577 5985 48 | chr1 631 + CpG 0.896323 5874 49 | chr1 633 + CpG 0.895214 5850 50 | chr1 642 + CpG 0.900296 5737 51 | chr1 650 + CpG 0.902435 5709 52 | chr1 654 + CpG 0.896709 5683 53 | chr1 660 + CpG 0.897639 5676 54 | chr1 665 + CpG 0.886054 5643 55 | chr1 673 + CpG 0.900411 5593 56 | chr1 679 + CpG 0.892864 5535 57 | chr1 681 + CpG 0.895913 5505 58 | chr1 684 + CpG 0.906811 5462 59 | chr1 702 + CpG 0.893238 5339 60 | chr1 705 + CpG 0.893273 5322 61 | chr1 708 + CpG 0.89059 5292 62 | chr1 710 + CpG 0.895027 5268 63 | chr1 713 + CpG 0.896526 5267 64 | chr1 729 + CpG 0.891296 5170 65 | chr1 731 + CpG 0.894326 5129 66 | chr1 737 + CpG 0.101157 5101 67 | chr1 745 + CpG 0.098996 4980 68 | chr1 755 + CpG 0.103188 4768 69 | chr1 757 + CpG 0.0993447 4731 70 | chr1 760 + CpG 0.0984832 4681 71 | chr1 766 + CpG 0.100824 4612 72 | chr1 779 + CpG 0.097355 4499 73 | chr1 785 + CpG 0.104054 4440 74 | chr1 787 + CpG 0.0980481 4406 75 | chr1 792 + CpG 0.104547 4333 76 | chr1 799 + CpG 0.0990355 4251 77 | chr1 801 + CpG 0.0969194 4220 78 | chr1 804 + CpG 0.884496 4199 79 | chr1 816 + CpG 0.89358 4003 80 | chr1 824 + CpG 0.893299 3880 81 | chr1 828 + CpG 0.892152 3848 82 | chr1 831 + CpG 0.890568 3838 83 | chr1 834 + CpG 0.891522 3798 84 | chr1 839 + CpG 0.897553 3719 85 | chr1 845 + CpG 0.899183 3670 86 | chr1 853 + CpG 0.898612 3531 87 | chr1 857 + CpG 0.900296 3380 88 | chr1 860 + CpG 0.896175 3294 89 | chr1 863 + CpG 0.892756 3189 90 | chr1 868 + CpG 0.891703 3001 91 | chr1 874 + CpG 0.886834 2757 
92 | chr1 882 + CpG 0.907975 2445 93 | chr1 886 + CpG 0.880694 2305 94 | chr1 889 + CpG 0.882969 2196 95 | chr1 892 + CpG 0.896952 2067 96 | chr1 894 + CpG 0.889332 2006 97 | chr1 897 + CpG 0.886603 1896 98 | chr1 903 + CpG 0.896429 1680 99 | chr1 911 + CpG 0.881223 1406 100 | chr1 915 + CpG 0.868526 1255 101 | chr1 918 + CpG 0.887457 1164 102 | chr1 921 + CpG 0.887417 1057 103 | chr1 923 + CpG 0.872802 967 104 | chr1 926 + CpG 0.875887 846 105 | chr1 932 + CpG 0.88853 619 106 | chr1 940 + CpG 0.865714 350 107 | chr1 944 + CpG 0.884058 207 108 | chr1 947 + CpG 0.708738 103 109 | -------------------------------------------------------------------------------- /data/methylome_b.counts.sym: -------------------------------------------------------------------------------- 1 | chr1 163 + CpG 0.896375 2538 2 | chr1 206 + CpG 0.897481 3414 3 | chr1 232 + CpG 0.888131 4398 4 | chr1 278 + CpG 0.894575 4866 5 | chr1 296 + CpG 0.892725 5127 6 | chr1 310 + CpG 0.893081 5275 7 | chr1 322 + CpG 0.899683 5363 8 | chr1 324 + CpG 0.892228 5391 9 | chr1 350 + CpG 0.898259 5573 10 | chr1 356 + CpG 0.884881 5655 11 | chr1 358 + CpG 0.899364 5664 12 | chr1 367 + CpG 0.887803 5731 13 | chr1 388 + CpG 0.8979 5906 14 | chr1 402 + CpG 0.887257 6058 15 | chr1 404 + CpG 0.889255 6077 16 | chr1 422 + CpG 0.892903 6200 17 | chr1 434 + CpG 0.898734 6320 18 | chr1 442 + CpG 0.896819 6319 19 | chr1 448 + CpG 0.89785 6373 20 | chr1 461 + CpG 0.105519 6378 21 | chr1 467 + CpG 0.0939797 6395 22 | chr1 473 + CpG 0.0959203 6422 23 | chr1 485 + CpG 0.0894118 6375 24 | chr1 488 + CpG 0.101708 6381 25 | chr1 496 + CpG 0.0996085 6385 26 | chr1 502 + CpG 0.0979121 6322 27 | chr1 514 + CpG 0.100904 6303 28 | chr1 517 + CpG 0.0972134 6316 29 | chr1 520 + CpG 0.0994906 6282 30 | chr1 522 + CpG 0.0937997 6290 31 | chr1 535 + CpG 0.100511 6258 32 | chr1 537 + CpG 0.090749 6248 33 | chr1 540 + CpG 0.103028 6241 34 | chr1 564 + CpG 0.0994068 6237 35 | chr1 569 + CpG 0.100256 6244 36 | chr1 572 + CpG 0.0916251 6221 
37 | chr1 577 + CpG 0.0930195 6203 38 | chr1 583 + CpG 0.101648 6188 39 | chr1 585 + CpG 0.0988014 6174 40 | chr1 588 + CpG 0.888853 6154 41 | chr1 594 + CpG 0.895356 6116 42 | chr1 602 + CpG 0.892945 6109 43 | chr1 606 + CpG 0.894495 6085 44 | chr1 609 + CpG 0.899095 6075 45 | chr1 612 + CpG 0.906008 6075 46 | chr1 617 + CpG 0.890079 6068 47 | chr1 620 + CpG 0.893045 6068 48 | chr1 631 + CpG 0.896924 6015 49 | chr1 633 + CpG 0.906344 5990 50 | chr1 642 + CpG 0.891856 5992 51 | chr1 650 + CpG 0.895422 5919 52 | chr1 654 + CpG 0.892596 5875 53 | chr1 660 + CpG 0.901209 5790 54 | chr1 665 + CpG 0.892863 5731 55 | chr1 673 + CpG 0.910198 5668 56 | chr1 679 + CpG 0.897883 5621 57 | chr1 681 + CpG 0.889067 5607 58 | chr1 684 + CpG 0.89372 5589 59 | chr1 702 + CpG 0.895077 5423 60 | chr1 705 + CpG 0.892007 5380 61 | chr1 708 + CpG 0.895394 5363 62 | chr1 710 + CpG 0.894124 5327 63 | chr1 713 + CpG 0.891525 5310 64 | chr1 729 + CpG 0.892229 5057 65 | chr1 731 + CpG 0.901375 5019 66 | chr1 737 + CpG 0.889204 4937 67 | chr1 745 + CpG 0.892381 4804 68 | chr1 755 + CpG 0.898156 4664 69 | chr1 757 + CpG 0.888985 4648 70 | chr1 760 + CpG 0.893792 4623 71 | chr1 766 + CpG 0.900198 4539 72 | chr1 779 + CpG 0.897518 4352 73 | chr1 785 + CpG 0.89578 4289 74 | chr1 787 + CpG 0.895231 4257 75 | chr1 792 + CpG 0.897337 4169 76 | chr1 799 + CpG 0.898918 4066 77 | chr1 801 + CpG 0.899803 4052 78 | chr1 804 + CpG 0.897532 4011 79 | chr1 816 + CpG 0.895012 3829 80 | chr1 824 + CpG 0.903985 3739 81 | chr1 828 + CpG 0.898031 3707 82 | chr1 831 + CpG 0.892002 3676 83 | chr1 834 + CpG 0.905847 3643 84 | chr1 839 + CpG 0.889659 3607 85 | chr1 845 + CpG 0.893179 3548 86 | chr1 853 + CpG 0.889873 3387 87 | chr1 857 + CpG 0.887658 3249 88 | chr1 860 + CpG 0.89147 3142 89 | chr1 863 + CpG 0.889328 3036 90 | chr1 868 + CpG 0.89277 2863 91 | chr1 874 + CpG 0.892424 2640 92 | chr1 882 + CpG 0.899573 2340 93 | chr1 886 + CpG 0.890511 2192 94 | chr1 889 + CpG 0.892601 2095 95 | chr1 892 + CpG 0.898949 
1999 96 | chr1 894 + CpG 0.891316 1923 97 | chr1 897 + CpG 0.873894 1808 98 | chr1 903 + CpG 0.893949 1603 99 | chr1 911 + CpG 0.883738 1359 100 | chr1 915 + CpG 0.870968 1209 101 | chr1 918 + CpG 0.889488 1113 102 | chr1 921 + CpG 0.892644 1006 103 | chr1 923 + CpG 0.884289 942 104 | chr1 926 + CpG 0.890361 830 105 | chr1 932 + CpG 0.893142 627 106 | chr1 940 + CpG 0.830357 336 107 | chr1 944 + CpG 0.0909091 187 108 | chr1 947 + CpG 0.120482 83 109 | -------------------------------------------------------------------------------- /src/common/EpireadStats.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2011-2022 University of Southern California and 2 | * Andrew D. Smith and Fang Fang 3 | * 4 | * Authors: Fang Fang and Andrew D. Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 
/// A read reduced to a start position and a string of per-site states.
struct small_epiread {
  std::uint32_t pos{};  // index of the first site covered by `seq`
  std::string seq{};    // one character per covered site

  small_epiread(const std::uint32_t p, const std::string &s) : pos{p}, seq{s} {}

  /// One past the last covered position: pos + length(), in 32-bit
  /// arithmetic.
  std::uint32_t
  end() const {
    return pos + length();
  }

  /// Number of characters in `seq`, as a 32-bit value. The narrowing from
  /// std::size_t is spelled out so it is deliberate rather than implicit.
  std::uint32_t
  length() const {
    return static_cast<std::uint32_t>(std::size(seq));
  }

  /// Number of characters in `seq`, at full width.
  std::size_t
  size() const {
    return std::size(seq);
  }
};
test_asm_bic(max_itr, crct_for_read_count, 89 | low_prob, high_prob, reads) 90 | : test_asm_lrt(max_itr, crct_for_read_count, 91 | low_prob, high_prob, reads); 92 | is_significant = use_bic ? score < 0.0 : score < critical_value; 93 | return score; 94 | } 95 | 96 | double low_prob{0.25}; 97 | double high_prob{0.75}; 98 | double critical_value{0.01}; 99 | size_t max_itr{10}; 100 | bool use_bic{false}; 101 | bool crct_for_read_count{true}; 102 | }; 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /src/common/Interval6.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2025 Andrew D Smith 2 | * 3 | * This is free software; you can redistribute it and/or modify it under the 4 | * terms of the GNU General Public License as published by the Free Software 5 | * Foundation; either version 2 of the License, or (at your option) any later 6 | * version. 7 | * 8 | * This is distributed in the hope that it will be useful, but WITHOUT ANY 9 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 11 | * details. 
12 | * 13 | * You should have received a copy of the GNU General Public License along 14 | * with this software; if not, write to the Free Software Foundation, Inc., 51 15 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | */ 17 | 18 | #include "Interval6.hpp" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | auto 30 | Interval6::initialize(const char *c, const char *c_end) -> bool { 31 | constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; }; 32 | constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; }; 33 | 34 | bool failed = false; 35 | 36 | // NOLINTBEGIN(*-pointer-arithmetic) 37 | auto field_s = c; 38 | auto field_e = std::find_if(field_s + 1, c_end, is_sep); 39 | if (field_e == c_end) 40 | failed = true; 41 | 42 | // chrom 43 | { 44 | const std::uint32_t d = std::distance(field_s, field_e); 45 | chrom = std::string{field_s, d}; 46 | } 47 | 48 | // start 49 | field_s = std::find_if(field_e + 1, c_end, not_sep); 50 | field_e = std::find_if(field_s + 1, c_end, is_sep); 51 | failed = failed || (field_e == c_end); 52 | { 53 | const auto [ptr, ec] = std::from_chars(field_s, field_e, start); 54 | failed = failed || (ptr == field_s); 55 | } 56 | 57 | // stop 58 | field_s = std::find_if(field_e + 1, c_end, not_sep); 59 | field_e = std::find_if(field_s + 1, c_end, is_sep); 60 | failed = failed || (field_e == c_end); 61 | { 62 | const auto [ptr, ec] = std::from_chars(field_s, field_e, stop); 63 | failed = failed || (ptr == field_s); 64 | } 65 | 66 | // name 67 | field_s = std::find_if(field_e + 1, c_end, not_sep); 68 | field_e = std::find_if(field_s + 1, c_end, is_sep); 69 | failed = failed || (field_e == c_end); 70 | name = std::string(field_s, std::distance(field_s, field_e)); 71 | 72 | // score 73 | field_s = std::find_if(field_e + 1, c_end, not_sep); 74 | field_e = std::find_if(field_s + 1, c_end, is_sep); 75 | failed = failed || (field_e == 
c_end); 76 | { 77 | #ifdef __APPLE__ 78 | const int ret = std::sscanf(field_s, "%lf", &score); 79 | failed = failed || (ret < 1); 80 | #else 81 | const auto [ptr, ec] = std::from_chars(field_s, field_e, score); 82 | failed = failed || ec != std::errc{}; 83 | #endif 84 | } 85 | 86 | // strand (no stop; just one char and maybe end of line) 87 | field_s = std::find_if(field_e + 1, c_end, not_sep); 88 | failed = failed || (field_s == c_end); 89 | strand = *field_s; 90 | failed = failed || (strand != '-' && strand != '+'); 91 | // NOLINTEND(*-pointer-arithmetic) 92 | 93 | return !failed; 94 | } 95 | 96 | [[nodiscard]] auto 97 | read_intervals6(const std::string &intervals_file) -> std::vector { 98 | std::ifstream in(intervals_file); 99 | if (!in) 100 | throw std::runtime_error("failed to open file: " + intervals_file); 101 | std::string line; 102 | std::vector intervals; 103 | while (getline(in, line)) 104 | intervals.emplace_back(line); 105 | return intervals; 106 | } 107 | -------------------------------------------------------------------------------- /src/common/ThreeStateHMM.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2011-2023 University of Southern California 3 | Andrew D. Smith and Song Qiang 4 | 5 | Authors: Andrew D. Smith, Song Qiang 6 | 7 | This file is part of dnmtools. 8 | 9 | dnmtools is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 2 of the License, or 12 | (at your option) any later version. 13 | 14 | dnmtools is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 
// Three-state hidden Markov model whose states are named hypo, HYPER and
// HYPO (the same names as the STATE_LABELS enumerators) and whose emissions
// are betabin distributions, one per state.
//
// NOTE(review): the element types of the containers below are not visible in
// this view of the header; the comments describe only what the names and
// signatures show -- confirm details against the implementation file.
class ThreeStateHMM {
public:
  // obs: observation sequence; res: presumably the reset points that split
  // the sequence into independent segments (see `reset_points`) -- TODO
  // confirm; tol / max_itr: convergence tolerance and iteration cap;
  // v: verbose flag.
  ThreeStateHMM(std::vector> &obs,
                const std::vector &res, const double tol,
                const size_t max_itr, const bool v);

  // Install the three emission distributions and the transition table `tr`.
  void
  set_parameters(const betabin &hypo_em, const betabin &HYPER_em,
                 const betabin &HYPO_em,
                 const std::vector> &tr);

  // Copy the current emission distributions and transition table out through
  // the reference parameters.
  void
  get_parameters(betabin &hypo_em, betabin &HYPER_em, betabin &HYPO_em,
                 std::vector> &tr) const;

  // Parameter training; the meaning of the returned double is defined in the
  // implementation (presumably a log-likelihood -- TODO confirm).
  double
  BaumWelchTraining();

  // Decode states from posteriors; return value as for BaumWelchTraining
  // (TODO confirm).
  double
  PosteriorDecoding();

  // Decode states with the Viterbi algorithm over the whole sequence.
  double
  ViterbiDecoding();

  // Output accessor for per-position state posteriors.
  void
  get_state_posteriors(std::vector &scores) const;

  // Output accessor for the per-position decoded classes.
  void
  get_classes(std::vector &classes) const;

  // private:
  // (the access specifier is commented out, so everything below is public)

  //////////// methods ////////////
  double
  single_iteration();
  // The `start`/`end` pairs below delimit one segment of the observation
  // sequence (presumably half-open, between reset points -- TODO confirm).
  double
  forward_algorithm(const size_t start, const size_t end);
  double
  backward_algorithm(const size_t start, const size_t end);
  double
  hypo_segment_log_likelihood(const size_t start, const size_t end);
  double
  HYPER_segment_log_likelihood(const size_t start, const size_t end);
  double
  HYPO_segment_log_likelihood(const size_t start, const size_t end);

  void
  estimate_state_posterior(const size_t start, const size_t end);
  void
  estimate_posterior_trans_prob(const size_t start, const size_t end);
  void
  estimate_parameters();
  void
  update_observation_likelihood();

  // Segment-level overload of the public ViterbiDecoding().
  double
  ViterbiDecoding(const size_t start, const size_t end);

  std::vector> observations;
  std::vector reset_points;
  std::vector meth_lp, unmeth_lp;
  // One per-position likelihood vector for each of the three states.
  std::vector hypo_log_likelihood, HYPER_log_likelihood,
    HYPO_log_likelihood;

  // HMM internal data
  betabin hypo_emission, HYPER_emission, HYPO_emission;

  // Start and end terms, one value per state (Triplet has hypo/HYPER/HYPO).
  Triplet lp_start{};
  Triplet lp_end{};
  std::vector> trans;

  std::vector forward;
  std::vector backward;
  std::vector hypo_posteriors, HYPER_posteriors, HYPO_posteriors;
  // Per-position quantities named after state pairs. Only seven of the nine
  // possible pairs have a member: there is no hypo_HYPO and no HYPO_hypo.
  std::vector hypo_hypo, hypo_HYPER, HYPER_hypo, HYPER_HYPER,
    HYPER_HYPO, HYPO_HYPER, HYPO_HYPO;

  // result
  std::vector classes;
  std::vector state_posteriors;

  // parameters
  double tolerance{};
  size_t max_iterations{};
  bool VERBOSE{};
};
17 | */ 18 | 19 | #include "MSite.hpp" 20 | #include "OptionParser.hpp" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | using std::cerr; 31 | using std::endl; 32 | using std::runtime_error; 33 | using std::string; 34 | using std::vector; 35 | 36 | static bool 37 | same_chrom_pos_strand(const MSite &a, const MSite &b) { 38 | return a.pos == b.pos && a.chrom == b.chrom && a.strand == b.strand; 39 | } 40 | 41 | int 42 | main_lift_filter(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) 43 | try { 44 | string pfile; 45 | bool VERBOSE = false; 46 | bool UNIQUE = false; 47 | 48 | /****************** COMMAND LINE OPTIONS ********************/ 49 | OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) 50 | "Process duplicated sites from fast-liftover output", 51 | ""); 52 | opt_parse.add_opt("output", 'o', "Output processed methcount", true, pfile); 53 | opt_parse.add_opt("unique", 'u', "keep unique sites", false, UNIQUE); 54 | opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE); 55 | 56 | vector leftover_args; 57 | opt_parse.parse(argc, argv, leftover_args); 58 | if (argc == 1 || opt_parse.help_requested()) { 59 | cerr << opt_parse.help_message() << '\n'; 60 | return EXIT_SUCCESS; 61 | } 62 | if (opt_parse.about_requested()) { 63 | cerr << opt_parse.about_message() << '\n'; 64 | return EXIT_SUCCESS; 65 | } 66 | if (opt_parse.option_missing()) { 67 | cerr << opt_parse.option_missing_message() << '\n'; 68 | return EXIT_SUCCESS; 69 | } 70 | if (leftover_args.empty()) { 71 | cerr << opt_parse.help_message() << '\n'; 72 | return EXIT_SUCCESS; 73 | } 74 | const string mfile(leftover_args.front()); 75 | /****************** END COMMAND LINE OPTIONS *****************/ 76 | 77 | std::ifstream in(mfile); 78 | if (!in) 79 | throw runtime_error("cannot open input file: " + mfile); 80 | 81 | std::ofstream out(pfile); 82 | // if (!of) 83 | // throw runtime_error("cannot open output 
file: " + pfile); 84 | // std::ostream out(of.rdbuf()); 85 | 86 | // read first site 87 | MSite curr_site; 88 | if (!(in >> curr_site)) 89 | throw runtime_error("failed reading: " + mfile); 90 | 91 | MSite next_site; 92 | bool site_is_unique = true; 93 | while (in >> next_site) { 94 | if (same_chrom_pos_strand(curr_site, next_site)) { 95 | site_is_unique = false; 96 | curr_site.add(next_site); 97 | } 98 | else { 99 | if (!UNIQUE || site_is_unique) 100 | out << curr_site << '\n'; 101 | site_is_unique = true; 102 | curr_site = next_site; 103 | } 104 | } 105 | if (!UNIQUE || site_is_unique) 106 | out << curr_site << '\n'; 107 | } 108 | catch (const std::exception &e) { 109 | std::cerr << e.what() << '\n'; 110 | return EXIT_FAILURE; 111 | } 112 | return EXIT_SUCCESS; 113 | } 114 | -------------------------------------------------------------------------------- /docs/content/sym.md: -------------------------------------------------------------------------------- 1 | # sym - collapse counts for symmetric CpGs sites 2 | 3 | ## Synopsis 4 | ```console 5 | $ dnmtools sym [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | Many of our tools were designed for data vertebrate species. In these 11 | species, the methylation levels at CpG sites tends to be symmetric, 12 | the same on each strand. Of course there are exceptions. But in many 13 | analysis settings, combining data from both strands for the same CpG 14 | site is a good idea. Assume you have output from 15 | [counts](../counts). The `sym` command will merge data on both strands 16 | for each CpG site. It takes files having the same format as output by 17 | counts with either all cytosines or CpGs only (generated with `-n` 18 | option when running counts). 19 | ```console 20 | $ dnmtools sym -o human_esc_CpG.meth human_esc.meth 21 | ``` 22 | The above command will merge all CpG pairs while also discarding sites 23 | with an indication that the CpG has mutated. 
Note that as long as one 24 | site of the pair is mutated, the pair is discarded. This is the 25 | default mode. If you want to keep those mutated sites, run the 26 | following: 27 | ```console 28 | $ dnmtools sym -m -o human_esc_CpG.meth human_esc.meth 29 | ``` 30 | 31 | Here is an example to show what `sym` actually does with the data. 32 | First, the following is several lines of output generated by 33 | [counts](../counts). This partial output includes sites in multiple 34 | contexts, and among them 4 are CpG sites: 35 | 36 | ```txt 37 | chr10 11473 + CHH 0 3 38 | chr10 11474 + CXG 0 13 39 | chr10 11476 - CXG 0 22 40 | chr10 11477 + CpG 0.181818 11 41 | chr10 11478 - CpG 0.391304 23 42 | chr10 11479 - CCG 0 22 43 | chr10 11481 - CHH 0 23 44 | chr10 11483 + CCG 0 11 45 | chr10 11484 + CpG 0.909091 11 46 | chr10 11485 - CpG 0.913043 23 47 | chr10 11486 - CCG 0 19 48 | chr10 11487 - CHH 0 20 49 | chr10 11489 - CHH 0.105263 19 50 | ``` 51 | 52 | The first CpG site above is at position 11477 on chr10, and there is 53 | another one immediately following it on the opposite strand. These are 54 | the two C in the same CpG site. The first one is covered by 11 reads, 55 | and among those 2 indicate methylation (a C in the reads). This is 56 | obtained by 0.181818 x 11. The next CpG has a "-" for the strand, so 57 | it refers to the G on the positive reference strand, which is the same 58 | as the C on the opposite strand for that site. This one is covered by 59 | 23 reads, 9 of which indicate methylation (0.391304 x 23). For this 60 | one CpG dinucleotide, the total methylation observations are 2 + 9 = 61 | 11, and the total reads are 11 + 23 = 34. Therefore, the methylation 62 | level for the dinucleotide is 11/34 = 0.3235294. 
The `sym` command 63 | would produce the following: 64 | 65 | ```txt 66 | chr10 11477 + CpG 0.323529 34 67 | chr10 11484 + CpG 0.911765 34 68 | ``` 69 | 70 | By chance, the other CpG site in the partial output above had the same 71 | number, 34, of reads covering the site when counting both 72 | strands. Notice that non-CpG sites are removed. Your input/output 73 | might look slightly different in your terminal, as the format involves 74 | tabs and not spaces. 75 | 76 | ## Options 77 | 78 | ```txt 79 | -o, -output 80 | ``` 81 | The name of the output file (default: stdout). The format is 82 | the same as output by [counts](../counts). 83 | 84 | ```txt 85 | -m, -muts 86 | ``` 87 | Include mutated CpG sites among the output, i.e. entries with an "x" 88 | terminating the fourth column of each line of input. 89 | 90 | ```txt 91 | -v, -verbose 92 | ``` 93 | Report more information while the program is running. 94 | -------------------------------------------------------------------------------- /docs/content/fastlift.md: -------------------------------------------------------------------------------- 1 | # fastlift - Mapping methylomes between species 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools fastlift -i -f -t 6 | ``` 7 | 8 | ## Description 9 | 10 | Mapping methylomes between species builds on the 11 | [liftOver tool](http://genome.ucsc.edu/cgi-bin/hgLiftOver) provided by 12 | [UCSC Genome Browser](https://genome.ucsc.edu). However it is time 13 | consuming to convert each methcounts output file from one assembly 14 | to another using the UCSC liftOver tool, given that they all should 15 | have the same locations but different read counts. Therefore, we use 16 | liftOver to generate an index file between two assemblies, and provide 17 | the `fast-liftover` tool. Suppose we have downloaded the `liftOver` tool 18 | and the chain file `mm9ToHg19.over.chain.gz` from the UCSC Genome 19 | Browser website. 
If we have a methcounts file `mm9.meth` of 20 | CpG sites or all cytosines in mm9. Entries in `mm9.meth` 21 | look like 22 | 23 | ```txt 24 | chr1 3005765 + CpG 0.166667 6 25 | chr1 3005846 + CpG 0.5 10 26 | chr1 3005927 + CpG 0 9 27 | ``` 28 | 29 | We would like to lift it over to the human genome hg19, and generate 30 | an index file `mm9-hg19.index` to facilitate later lift-over 31 | operations from mm9 to hg19, and keep a record of unlifted mm9 32 | cytosine positions in the file `mm9-hg19.unlifted`. First, convert the 33 | [counts](../counts) file `mm9.meth` to the 34 | BED file `mm9-cpg.bed` file for liftOver using the following command. 35 | 36 | ```shell 37 | $ awk '{print $1"\t"$2"\t"$2+1"\t",$1":"$2":"$2+1":+\t0\t+"}' mm9.meth >mm9-cpg.bed 38 | ``` 39 | 40 | The output file `mm9-cpg.bed` should look like this: 41 | 42 | ```txt 43 | chr1 3005765 3005766 chr1:3005765:3005766:+ 0 + 44 | chr1 3005846 3005847 chr1:3005846:3005847:+ 0 + 45 | chr1 3005927 3005928 chr1:3005927:3005928:+ 0 + 46 | ``` 47 | 48 | Note that the fourth column is the genomic location data linked with 49 | colons. 50 | 51 | Then, run UCSC Genome Browser tool `liftOver` as follows: 52 | 53 | ```shell 54 | $ liftOver mm9-cpg.bed mm9ToHg19.over.chain.gz mm9-hg19.index mm9-hg19.unlifted 55 | ``` 56 | 57 | The generated index file `mm9-hg19.index` will be a BED format file in 58 | hg19 coordinates, with entries like 59 | 60 | ```txt 61 | chr8 56539820 56539821 chr1:3005765:3005766:+ 0 - 62 | chr8 56539547 56539548 chr1:3005846:3005847:+ 0 - 63 | chr8 56539209 56539210 chr1:3005927:3005928:+ 0 - 64 | ``` 65 | 66 | where the 4th column contains the genomic position of the cytosine 67 | site in mm9 coordinates. 68 | 69 | Next, convert the file `mm9-hg19.index` to a tab-separated input to be 70 | passed onto the fast-liftover tool as follows. 
71 | 72 | ```shell 73 | $ tr ':' '\t' < mm9-hg19.index > mm9-hg19-fastliftover.index 74 | ``` 75 | 76 | After the index file is converted, we can use the `fast-liftover` 77 | program on any mm9 methcounts file to lift it to hg19: 78 | 79 | ```shell 80 | $ dnmtools fastlift -i mm9-hg19-fastliftover.index -f mm9.meth -t hg19-lift.meth 81 | ``` 82 | 83 | The `-p` option should be specified to report positions on the 84 | positive strand of the target assembly. Before using the lifted 85 | methcounts file, make sure it is sorted properly. 86 | 87 | ```shell 88 | $ LC_ALL=C sort -k1,1 -k2,2g -k3,3 hg19-lift.meth -o hg19-lift-sorted.meth 89 | ``` 90 | 91 | ## Options 92 | ```txt 93 | -i, -indexfile 94 | ``` 95 | index file [required] 96 | ```txt 97 | -f, -from 98 | ``` 99 | Original file [required] 100 | ```txt 101 | -t, -to 102 | ``` 103 | Lifted-over output file [required] 104 | ```txt 105 | -u, -unlifted 106 | ``` 107 | (optional) File for unlifted sites 108 | ```txt 109 | -p, -plus-strand 110 | ``` 111 | (optional) Report sites on + strand 112 | ```txt 113 | -v, -verbose 114 | ``` 115 | print more run info to STDERR as the program runs 116 | 117 | -------------------------------------------------------------------------------- /docs/content/counts-nano.md: -------------------------------------------------------------------------------- 1 | # counts-nano - compute single-site methylation from nanopore data 2 | 3 | ## Synopsis 4 | ```console 5 | $ dnmtools counts-nano [OPTIONS] -c 6 | ``` 7 | 8 | ## Description 9 | 10 | The `counts-nano` command introduced in v1.5.0 is designed specifically to 11 | generate DNMTools [counts](../counts) format files from nanopore data called 12 | for the `5mCG_5hmCG` modification. Currently this is only supported for 13 | methylation and hydroxymethylation called at CpG sites. 14 | 15 | More documentation will come as this tool evolves, but for now: 16 | 17 | - Most behavior is very similar to what you will find from [counts](../counts).
18 | - Mutation information is not estimated by `counts-nano`. 19 | - Currently this only works for CpG sites and when the only modified sites are 20 | marked as `C+m?` or `C+h?` in the `MM` field of each BAM/SAM read record. 21 | - The first 6 columns of the output are the same as explained in the 22 | [counts](../counts) format, except that the fraction in the 5th column combines 23 | 5mC and 5hmC. The 7th column is for 5hmC alone and the 8th is for 5mC alone. 24 | - The methylation levels will not result in integer values when multiplied by 25 | the number of reads because probabilities on modifications are used, so 26 | methylation levels for each site are expected values (the best estimates we 27 | can make), and do not use arbitrary cutoffs. 28 | - Several other commands in DNMTools have been modified to use this form of 29 | expected methylation level, and behave as previously for bisulfite 30 | sequencing data, but have updated behavior when the data is from 31 | nanopore. The user does not need to specify the technology used. 32 | - Some commands need to use a `-relaxed` flag to work with the additional 33 | columns in the output from `counts-nano` compared with `counts`. For 34 | commands without this option, simply do `cut -f1-6` on the output of 35 | `counts-nano` to remove those. 36 | 37 | ## Options 38 | 39 | ```txt 40 | -o, -output 41 | ``` 42 | Output file name. The default is to write output to the terminal, 43 | which is not useful unless commands are piped. 44 | 45 | ```txt 46 | -c, -chrom 47 | ``` 48 | Reference genome file, which must be in FASTA format. This is 49 | required. 50 | 51 | ```txt 52 | -t, -threads 53 | ``` 54 | 55 | The number of threads to use. This is only really helpful if the input is BAM 56 | (not helpful for SAM), and the output is to be zipped (see `-z` below). These 57 | threads are used to decompress BAM input and compress gzip output. If only one 58 | of these conditions holds, using more threads can still help.
Because most 59 | computation in `counts-nano` is processing reads sequentially, using too many 60 | threads will have decreasing returns. 61 | 62 | ```txt 63 | -z, -zip 64 | ``` 65 | 66 | The output should be zipped (in gzip format). This is not deduced by the 67 | filename, but specifying this argument should be accompanied by using a `.gz` 68 | filename suffix for the output. 69 | 70 | ```txt 71 | -n, -cpg-only 72 | ``` 73 | 74 | Print only CpG context cytosines. This significantly reduces the output size 75 | in most genomes. Note that using this option does not merge data as symmetric 76 | CpGs. 77 | 78 | ```txt 79 | -sym 80 | ``` 81 | 82 | This will turn on `-n, -cpg-only` automatically and will output symmetric CpG 83 | sites, with each level including all counts and methylation levels as a 84 | (weighted) average of both strands. 85 | 86 | ```txt 87 | -H, -header 88 | ``` 89 | 90 | Add a header to the output file to identify the reference genome. This will be 91 | in the form of "comment" lines beginning with `#`. This is not required for most 92 | downstream processing, but is used by commands that check for consistency with 93 | a reference genome. 94 | 95 | ```txt 96 | -v, -verbose 97 | ``` 98 | 99 | Report more information while the program is running. 100 | 101 | ```txt 102 | -progress 103 | ``` 104 | Show progress while the program is running. 105 | -------------------------------------------------------------------------------- /docs/content/hypermr.md: -------------------------------------------------------------------------------- 1 | # hypermr - Detecting hypermethylated regions 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools hypermr [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | The plant genomes, exemplified by *A. 
thaliana*, are devoid of DNA 11 | methylation by default, with genic regions and transposons being 12 | hyper-methylated, which we termed HyperMRs to stress their difference 13 | from hypo-methylated regions in mammalian methylomes. DNA methylation 14 | in plants has been associated with expression regulation and 15 | transposon repression, and therefore characterizing HyperMRs is of 16 | much biological relevance. In addition to plants, hydroxymethylation 17 | tends to appear in a small fraction of the mammalian genome, and 18 | therefore it makes sense to identify hyper-hydroxymethylated regions. 19 | 20 | The first kind of HyperMR analysis involves finding continuous blocks 21 | of hyper-methylated CpGs with the hmr program. Since the 22 | [hmr](../hmr) program is designed to find hypo-methylated 23 | regions, one can use it to identify HyperMRs by inverting the 24 | methylation levels in the methcounts output file as follows: 25 | 26 | ```shell 27 | $ awk '{$5=1-$5; print $0}' input.meth > input_inverted.meth 28 | ``` 29 | 30 | Next one may use the hmr program to find "valleys" in the inverted 31 | Arabidopsis methylome, which are the hyper-methylated regions in the 32 | original methylome. The command is invoked as below 33 | 34 | ```shell 35 | $ dnmtools hmr -o output.hmr input_inverted.meth 36 | ``` 37 | 38 | This kind of HyperMR analysis produces continuous blocks of 39 | hyper-methylated CpGs. However in some regions, intragenic regions in 40 | particular, such continuous blocks of hyper-methylated CpGs are 41 | separated by a few unmethylated CpGs, which have distinct sequence 42 | preference when compared to those CpGs in the majority of unmethylated 43 | genome. The blocks of hyper-methylated CpGs and gap CpGs together form 44 | composite HyperMRs. The hypermr program, which implements a 45 | three-state HMM, is used to identify such HyperMRs. 
Suppose the 46 | [counts](../counts) output file is Col0 Meth.bed, to 47 | find HyperMRs from this dataset, run 48 | 49 | ```shell 50 | $ dnmtools hypermr -o output.hypermr input.meth 51 | ``` 52 | 53 | The output file is a 6-column 54 | [BED](https://en.wikipedia.org/wiki/BED_(file_format)) file. The 55 | first three columns give the chromosome, starting position and ending 56 | position of that HyperMR. The fourth column starts with the `hyper:`, 57 | followed by the number of CpGs within this HyperMR. The fifth column 58 | is the accumulative methylation level of all CpGs. The last column 59 | indicates the strand, which is always +. 60 | 61 | Lastly, it is worth noting that plants exhibit significantly more 62 | methylation in the non-CpG context, and therefore inclusion of non-CpG 63 | methylation in the calling of hyper-methylated regions could possibly 64 | be informative. We suggest separating each cytosine context from the 65 | methcounts output file as illustrated in the previous section (via 66 | grep) and calling HyperMRs separately for each context. 67 | 68 | ## Options 69 | 70 | ```txt 71 | -o, -out 72 | ``` 73 | output BED file (default: STDOUT) 74 | 75 | ```txt 76 | -s, -scores 77 | ``` 78 | output file for posterior scores 79 | 80 | ```txt 81 | -t, -tolerance 82 | ``` 83 | tolerance (default: 0) 84 | 85 | ```txt 86 | -d, -desert 87 | ``` 88 | maximum distance between covered CpGs in HyperMR (default: 1000) 89 | 90 | ```txt 91 | -i, -itr 92 | ``` 93 | max number of iterations (default: 100) 94 | 95 | ```txt 96 | -V, -viterbi 97 | ``` 98 | Use Viterbi decoding 99 | 100 | ```txt 101 | -M, -min-meth 102 | ``` 103 | min cumulative methylation level in HypeMR (default: 4) 104 | ```txt 105 | -v, -verbose 106 | ``` 107 | print more run info to STDERR while the program is running. 
108 | ```txt 109 | -P, -params-in 110 | ``` 111 | HMM parameters input file 112 | ```txt 113 | -p, -params-out 114 | ``` 115 | HMM parameters output file 116 | 117 | -------------------------------------------------------------------------------- /src/common/Epiread.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2011-2022 University of Southern California and 2 | * Andrew D. Smith and Fang Fang 3 | * 4 | * Authors: Fang Fang and Andrew D. Smith 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | */ 16 | 17 | #include "Epiread.hpp" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | size_t 30 | adjust_read_offsets(std::vector &reads) { 31 | size_t first_read_offset = std::numeric_limits::max(); 32 | for (size_t i = 0; i < reads.size(); ++i) 33 | first_read_offset = std::min(reads[i].pos, first_read_offset); 34 | for (size_t i = 0; i < reads.size(); ++i) 35 | reads[i].pos -= first_read_offset; 36 | return first_read_offset; 37 | } 38 | 39 | size_t 40 | get_n_cpgs(const std::vector &reads) { 41 | size_t n_cpgs = 0; 42 | for (size_t i = 0; i < reads.size(); ++i) 43 | n_cpgs = std::max(n_cpgs, reads[i].end()); 44 | return n_cpgs; 45 | } 46 | 47 | std::istream & 48 | operator>>(std::istream &in, epiread &er) { 49 | std::string buffer; 50 | if (getline(in, buffer)) { 51 | std::istringstream is(buffer); 52 | if (!(is >> er.chr >> er.pos >> er.seq)) 53 | throw std::runtime_error("malformed epiread line:\n" + buffer); 54 | } 55 | return in; 56 | } 57 | 58 | std::ostream & 59 | operator<<(std::ostream &out, const epiread &er) { 60 | return out << er.chr << '\t' << er.pos << '\t' << er.seq; 61 | } 62 | 63 | bool 64 | validate_epiread_file(const std::string &filename) { 65 | const size_t max_lines_to_validate = 10000; 66 | std::ifstream in(filename); 67 | if (!in) 68 | throw std::runtime_error("failed to open file: " + filename); 69 | 70 | std::string c, s, other; 71 | size_t p = 0; 72 | 73 | size_t n_lines = 0; 74 | std::string line; 75 | while (getline(in, line) && n_lines++ < max_lines_to_validate) { 76 | std::istringstream iss(line); 77 | if (!(iss >> c >> p >> s) || iss >> other) 78 | return false; 79 | } 80 | return true; 81 | } 82 | 83 | epiread::epiread(const std::string &line) { 84 | constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; }; 85 | constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; }; 86 | 87 | using 
std::distance; 88 | using std::find_if; 89 | using std::from_chars; 90 | 91 | bool failed = false; 92 | 93 | // NOLINTBEGIN(*-pointer-arithmetic) 94 | 95 | const auto c = line.data(); 96 | const auto c_end = c + line.size(); 97 | 98 | auto field_s = c; 99 | auto field_e = find_if(field_s + 1, c_end, is_sep); 100 | if (field_e == c_end) 101 | failed = true; 102 | 103 | chr = std::string{field_s, static_cast(distance(field_s, field_e))}; 104 | 105 | field_s = find_if(field_e + 1, c_end, not_sep); 106 | field_e = find_if(field_s + 1, c_end, is_sep); 107 | failed = failed || (field_e == c_end); 108 | 109 | const auto [ptr, ec] = from_chars(field_s, field_e, pos); 110 | failed = failed || (ptr == field_s); 111 | 112 | field_s = find_if(field_e + 1, c_end, not_sep); 113 | field_e = find_if(field_s + 1, c_end, is_sep); 114 | failed = failed || (field_e != c_end); 115 | 116 | seq = std::string{field_s, static_cast(distance(field_s, field_e))}; 117 | 118 | if (failed) { 119 | throw std::runtime_error("bad epiread line: " + line); 120 | // ADS: the value below would work for a flag 121 | // pos = std::numeric_limits::max(); 122 | } 123 | 124 | // NOLINTEND(*-pointer-arithmetic) 125 | } 126 | -------------------------------------------------------------------------------- /cmake/static_analysis.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2025 Andrew D Smith 2 | # 3 | # This program is free software: you can redistribute it and/or modify it 4 | # under the terms of the GNU General Public License as published by the Free 5 | # Software Foundation, either version 3 of the License, or (at your option) 6 | # any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, but WITHOUT 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11 | # more details. 
# StaticAnalysis
#
# Wires the requested static analysis tools into the build through CMake's
# CMAKE_CXX_<TOOL> variables. Each tool is enabled only if it is requested
# (RUN_CPPCHECK, RUN_IWYU, RUN_CPPLINT, RUN_CLANG_TIDY) and can be found; a
# tool that is missing is reported and skipped rather than failing the
# configure step. Every tool is invoked through the path located by
# find_program, so the binary that was checked is the binary that runs.
message(STATUS "Enabling static analysis")
# If no specific static analysis is requested, do them all
if(NOT RUN_CPPCHECK AND NOT RUN_IWYU AND
    NOT RUN_CPPLINT AND NOT RUN_CLANG_TIDY)
  set(RUN_CPPCHECK on)
  set(RUN_IWYU on)
  set(RUN_CPPLINT on)
  set(RUN_CLANG_TIDY on)
endif()

set(STATIC_ANALYSIS_CHECKS "")
if(RUN_CPPCHECK)
  list(APPEND STATIC_ANALYSIS_CHECKS "cppcheck")
endif()
if(RUN_CPPLINT)
  list(APPEND STATIC_ANALYSIS_CHECKS "cpplint")
endif()
if(RUN_IWYU)
  list(APPEND STATIC_ANALYSIS_CHECKS "iwyu")
endif()
if(RUN_CLANG_TIDY)
  list(APPEND STATIC_ANALYSIS_CHECKS "clang-tidy")
endif()

message(STATUS "Requested static analysis: ${STATIC_ANALYSIS_CHECKS}")

# cpplint: all options are in the config file
if("cpplint" IN_LIST STATIC_ANALYSIS_CHECKS)
  find_program(FOUND_CPPLINT cpplint)
  if(FOUND_CPPLINT)
    message(STATUS "Enabling cpplint analysis")
    set(CMAKE_CXX_CPPLINT "${FOUND_CPPLINT}" --quiet)
  else()
    message(STATUS "Could not find cpplint; disabling cpplint")
  endif()
endif()

# include-what-you-use: config is a mappings file
if("iwyu" IN_LIST STATIC_ANALYSIS_CHECKS)
  find_program(FOUND_IWYU include-what-you-use)
  if(FOUND_IWYU)
    message(STATUS "Enabling include-what-you-use analysis")
    set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE
      "${FOUND_IWYU}"
      -Xiwyu
      --comment_style=none
      -Xiwyu
      --quoted_includes_first
      -Xiwyu
      --mapping_file=${PROJECT_SOURCE_DIR}/iwyu.json
    )
  else()
    message(STATUS "Could not find iwyu; disabling iwyu")
  endif()
endif()

# cppcheck: options on the command line as there is no config file
if("cppcheck" IN_LIST STATIC_ANALYSIS_CHECKS)
  find_program(FOUND_CPPCHECK cppcheck)
  if(FOUND_CPPCHECK)
    message(STATUS "Enabling cppcheck analysis")
    set(CMAKE_CXX_CPPCHECK
      "${FOUND_CPPCHECK}"
      --quiet
      --enable=all
      --inline-suppr
      --max-configs=1
      --suppressions-list=${PROJECT_SOURCE_DIR}/.cppcheck_suppress
    )
  else()
    message(STATUS "Could not find cppcheck; disabling cppcheck")
  endif()
endif()

# clang-tidy: need to make sure version is at least 20
if("clang-tidy" IN_LIST STATIC_ANALYSIS_CHECKS)
  find_program(CLANG_TIDY_EXECUTABLE NAMES clang-tidy)
  # Minimum required version
  set(MIN_CLANG_TIDY_VERSION "20.0.0")
  if(CLANG_TIDY_EXECUTABLE)
    execute_process(
      COMMAND "${CLANG_TIDY_EXECUTABLE}" --version
      OUTPUT_VARIABLE CLANG_TIDY_VERSION_OUTPUT
      ERROR_QUIET
      OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    # Take only the dotted number that follows the word "version"; other
    # digits in the output (vendor strings, URLs, commit hashes) are ignored.
    set(CLANG_TIDY_VERSION "")
    if(CLANG_TIDY_VERSION_OUTPUT MATCHES "version ([0-9]+(\\.[0-9]+)*)")
      set(CLANG_TIDY_VERSION "${CMAKE_MATCH_1}")
    endif()
    # Compare the version numbers
    if(NOT CLANG_TIDY_VERSION STREQUAL "" AND
        CLANG_TIDY_VERSION VERSION_GREATER_EQUAL MIN_CLANG_TIDY_VERSION)
      message(STATUS "Enabling clang-tidy (version: ${CLANG_TIDY_VERSION})")
      set(CMAKE_CXX_CLANG_TIDY
        "${CLANG_TIDY_EXECUTABLE}"
        --quiet
        --allow-no-checks
        -p ${PROJECT_BINARY_DIR}
      )
    else()
      message(STATUS "Not enabling clang-tidy (min version not found)")
    endif()
  else()
    message(STATUS "Could not find clang-tidy; disabling clang-tidy")
  endif()
endif()
using the `-tabular` flag to make a data frame) 14 | and using the `-radmeth` flag to remove suffixes that are not used in 15 | this program. In other words, `multistat` takes a data frame as input 16 | and produces a data frame as output. 17 | 18 | The input of `multistat` starts with a line with `2n` column names, with each 19 | column name appearing sequentially twice. The file is then followed by a set of 20 | lines containing `2n+1` elements. Each sample contains two columns. The first 21 | column is the number of reads that cover the CpG in the sample, and the second 22 | column is the number of CpGs that are methylated among the reads. 23 | 24 | Here is a visual example of a file called `input-tabular.tsv` with four samples 25 | (D083a, D083b, D091a and D091b): 26 | 27 | ```txt 28 | D083a D083a D083b D083b D091a D091a D091b D091b 29 | chr1:10468:+:CpG 3 0 2 0 2 0 1 0 30 | chr1:10470:+:CpG 6 0 3 0 4 0 3 0 31 | chr1:10483:+:CpG 7 0 3 0 5 0 3 1 32 | chr1:10488:+:CpG 7 0 3 0 5 0 3 0 33 | chr1:10492:+:CpG 7 0 2 0 5 0 3 0 34 | chr1:10496:+:CpG 6 0 4 0 5 0 4 0 35 | chr1:10524:+:CpG 6 2 4 0 7 0 5 1 36 | chr1:10541:+:CpG 4 0 4 0 7 2 5 0 37 | chr1:10562:+:CpG 3 0 3 0 6 0 4 0 38 | chr1:10570:+:CpG 2 0 3 0 6 0 4 0 39 | chr1:10576:+:CpG 2 0 3 0 6 0 4 0 40 | ``` 41 | 42 | Note that, if you do not add the `-radmeth` flag when running `merge`, 43 | the tabular output may contain suffixes `_R` and `_M` on the column 44 | names (e.g. `D083a_R` and `D083a_M` corresponding to the "Reads" and 45 | "Methylated" columns). You can remove these to make the input proper 46 | by running 47 | 48 | ```shell 49 | $ sed -i '1s/_[MR]//g' input-tabular.tsv 50 | ``` 51 | 52 | `multistat` also requires an input BED file representing the genomic 53 | intervals of interest. The regions must be sorted by chromosome, position and 54 | strand. If they are not, you can add the `-s` flag to sort the file prior to 55 | running the program. 
Note that for very large BED files, this may take a long 56 | time. Given an input file `regions.bed`, you can sort it in one of the two 57 | following ways: 58 | 59 | ```shell 60 | $ LC_ALL=C sort -k1,1 -k2,2n -k3,3n -k6,6 -o regions.bed regions.bed 61 | ``` 62 | 63 | ```shell 64 | $ bedtools sort -i regions.bed 65 | ``` 66 | 67 | Finally, to create a file `data-frame.tsv` with methylation levels (which can be 68 | [weighted, unweighted or fractional](../levels) methylation), run 69 | 70 | ```shell 71 | $ dnmtools multistat -o data-frame.tsv regions.bed input-tabular.tsv 72 | ``` 73 | 74 | ## Options 75 | 76 | ```txt 77 | -o, -output 78 | ``` 79 | 80 | Name of output file (default: STDOUT) 81 | 82 | ```txt 83 | -N, -numeric 84 | ``` 85 | 86 | print numeric values only (not NAs), guaranteeing that the output 87 | contains as many rows as there are regions in the BED input. 88 | 89 | ```txt 90 | -L, -preload 91 | ``` 92 | 93 | Load all CpG sites 94 | 95 | ```txt 96 | -s, -sort 97 | ``` 98 | 99 | sort data if needed 100 | 101 | 102 | ```txt 103 | -l, -level 104 | ``` 105 | 106 | the level to report as score column in bed format output (w, u or f), 107 | corresponding to weighted, unweighted or fractional methylation (default: w) 108 | 109 | ```txt 110 | -M, -more-levels 111 | ``` 112 | 113 | report more methylation information 114 | 115 | ```txt 116 | -v, -verbose 117 | ``` 118 | 119 | print more run info to STDERR 120 | -------------------------------------------------------------------------------- /docs/content/visualization.md: -------------------------------------------------------------------------------- 1 | # Visualizing methylome data 2 | 3 | Here we explain how to visualize data using the UCSC Genome 4 | Browser. When we refer to the genome browser below, we mean the UCSC 5 | kind. 6 | 7 | ## Single-site methylation levels 8 | 9 | Here we are concerned with individual sites. 
These need not be CpG 10 | sites -- they could be any/all cytosines, but we will assume they are 11 | CpGs throughout our explanation. 12 | 13 | To view the methylation level at individual CpG sites in a genome 14 | browser, the data should be converted into bigWig format. The starting 15 | point should be a "counts" file, as output from the 16 | [counts](../counts) command. The bigWig format is intended for the 17 | "wiggle" tracks, which show information associated with individual 18 | genomic positions, but in the bigWig format this information is 19 | encoded concisely and is not for direct human viewing. The same 20 | approach is used to build files that show the coverage at individual 21 | CpG sites. 22 | 23 | To create methylation level tracks or read coverage tracks, follow 24 | these steps: 25 | 26 | * Download the `wigToBigWig` program from the UCSC Genome Browser 27 | directory of [binaries](http://hgdownload.cse.ucsc.edu/admin/exe/). 28 | 29 | * Use the `fetchChromSizes` script, from the same directory, to get 30 | the `.chrom.sizes` file for the database (reference genome) you are 31 | working with (e.g., hg38). Note: this is the file mentioned below as 32 | `hg19.chrom.sizes` for the hg19 reference genome. 33 | 34 | * To create a bigWig format track for methylation levels at CpG 35 | sites, convert the symmetric methylation file ([counts](../counts) 36 | format) as follows: 37 | ```console 38 | $ awk '{print $1,$2,$2+1,$5}' sample.meth | wigToBigWig /dev/stdin hg19.chrom.sizes sample.meth.bw 39 | ``` 40 | In the command above, the first part selects the appropriate columns 41 | to generate bedgraph format, and then the second part converts this 42 | directly into a bigWig format file, which is not human-readable. 
43 | 44 | * To create a bigWig format track for read coverage at CpG sites, use the 45 | following command, which is very similar to the previous one above: 46 | ```console 47 | $ awk '{print $1,$2,$2+1,$6}' sample.meth | wigToBigWig /dev/stdin hg19.chrom.sizes sample.reads.bw 48 | ``` 49 | 50 | If the `wigToBigWig` or `fetchChromSizes` programs are not 51 | executable when downloaded, try the following: 52 | ```console 53 | $ chmod +x wigToBigWig 54 | $ chmod +x fetchChromSizes 55 | ``` 56 | 57 | ## The identified features 58 | 59 | This refers to the HMRs, the AMRs, the PMDs, and possibly the 60 | HyperMRs. These are contiguous genomic intervals. It happens that for 61 | an individual set of these features, as identified using dnmtools, no 62 | two features will overlap. This fact isn't relevant here, though. 63 | 64 | We will assume you want to make browser tracks for HMRs. The same 65 | procedure also works for [AMRs](../amrfinder), [PMDs](../pmd), or 66 | [DMRs](../dmr). To do so, follow these steps: 67 | 68 | * Download the `bedToBigBed` program from the UCSC Genome Browser 69 | directory of [binaries](http://hgdownload.cse.ucsc.edu/admin/exe/). 70 | 71 | * Use the `fetchChromSizes` script, from the same directory, to get 72 | the `.chrom.sizes` file for the database (reference genome) you are 73 | working with (e.g., hg38). Note: this is the file mentioned below as 74 | `hg19.chrom.sizes` for the hg19 reference genome. 75 | 76 | * Modify and use the following commands: PMDs, HMRs and AMRs may have 77 | a score greater than 1000 in the 5th column, in which case 78 | `bedToBigBed` will output an error. Also, HMR file `sample.bed` may 79 | have a non-integer score in the 5th column. 
The following script 80 | rounds the 5th column and prints 1000 if the score is greater than 81 | 1000: 82 | ```console 83 | $ awk -v OFS="\t" '{if ($5>1000) print $1,$2,$3,$4,"1000"; \ 84 | else print $1,$2,$3,$4,int($5)}' sample.bed > sample.for_bigbed 85 | ``` 86 | In the above command, since the HMRs are not stranded, we do not keep 87 | the 6th column. Keeping the 6th column would make all the HMRs appear 88 | as though they have a direction and they would all appear to be on the + 89 | strand. This would be visually misleading (and somewhat annoying). 90 | 91 | * Generate the `.bb` track file using the command below: 92 | ```console 93 | $ bedToBigBed sample.for_bigbed hg19.chrom.sizes output.bb 94 | ``` 95 | -------------------------------------------------------------------------------- /.github/workflows/dnmtools_release_macos.yml: -------------------------------------------------------------------------------- 1 | name: DNMTools release (macOS) 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build-macos-binaries: 8 | strategy: 9 | matrix: 10 | os: [macos-13, macos-14] 11 | runs-on: ${{ matrix.os }} 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | submodules: recursive 16 | - name: Make dnmtools dependency directories 17 | run: sudo mkdir -p /opt/dnmtools/lib /opt/dnmtools/include 18 | - name: Install dependency headers and static libs 19 | run: | 20 | brew install zlib gsl automake 21 | sudo cp $(brew --prefix zlib)/lib/*.a /opt/dnmtools/lib 22 | sudo cp $(brew --prefix gsl)/lib/*.a /opt/dnmtools/lib 23 | sudo cp -r $(brew --prefix zlib)/include/* /opt/dnmtools/include 24 | sudo cp -r $(brew --prefix gsl)/include/* /opt/dnmtools/include 25 | - name: Build and install libdeflate 26 | run: | 27 | git clone https://github.com/ebiggers/libdeflate.git && \ 28 | cd libdeflate && \ 29 | cmake -B build \ 30 | -DLIBDEFLATE_BUILD_GZIP=off \ 31 | -DLIBDEFLATE_BUILD_TESTS=off \ 32 | -DLIBDEFLATE_BUILD_SHARED_LIB=off \ 33 | -DCMAKE_VERBOSE_MAKEFILE=on 
\ 34 | -DCMAKE_BUILD_TYPE=Release && \ 35 | cmake --build build -j4 && \ 36 | sudo cmake --install build --prefix=/opt/dnmtools 37 | - name: Build and install HTSlib 38 | run: | 39 | git clone --recursive https://github.com/samtools/htslib.git 40 | cd htslib 41 | sudo cp -r htslib /opt/dnmtools/include 42 | autoreconf -i 43 | mkdir build && cd build 44 | ../configure \ 45 | --disable-bz2 \ 46 | --disable-libcurl \ 47 | --disable-lzma \ 48 | --disable-ref-cache \ 49 | --with-libdeflate \ 50 | LDFLAGS="-L/opt/dnmtools/lib" CPPFLAGS="-I/opt/dnmtools/include" 51 | make -j4 CFLAGS="-Wall -O2 -fvisibility=hidden" libhts.a 52 | sudo cp libhts.a /opt/dnmtools/lib 53 | - name: Build dnmtools 54 | run: | 55 | ./autogen.sh 56 | mkdir build && cd build 57 | ../configure --with-libdeflate \ 58 | CXX=g++-14 \ 59 | LDFLAGS="-L/opt/dnmtools/lib -static-libgcc -static-libstdc++ -Wl,-dead_strip" \ 60 | CPPFLAGS="-I/opt/dnmtools/include" 61 | ../data/make_full_license_info_header.sh ../data/LICENSE > license.h 62 | echo "#define INCLUDE_FULL_LICENSE_INFO 1" >> config.h 63 | make -j4 64 | - name: Rename the binary 65 | run: mv build/dnmtools dnmtools_$(uname -m) 66 | - name: Get version number 67 | id: vars 68 | run: | 69 | awk '/AC_INIT/ {print "vn="$2}' configure.ac | \ 70 | sed "s/\[//; s/\]//; s/,//" >> "$GITHUB_OUTPUT" 71 | uname -m | awk '{print "arch="$0}' >> "$GITHUB_OUTPUT" 72 | env: 73 | GH_TOKEN: ${{ github.token }} 74 | - name: Upload the binary 75 | uses: actions/upload-artifact@v4 76 | with: 77 | name: dnmtools-${{ steps.vars.outputs.arch }} 78 | path: | 79 | dnmtools_${{ steps.vars.outputs.arch }} 80 | make-lipo: 81 | needs: build-macos-binaries 82 | runs-on: macos-15 83 | steps: 84 | - uses: actions/checkout@v4 85 | - name: Get version number 86 | id: vn 87 | run: awk '/AC_INIT/ {print "vn="$2}' configure.ac | sed "s/\[//; s/\]//; s/,//" >> "$GITHUB_OUTPUT" 88 | env: 89 | GH_TOKEN: ${{ github.token }} 90 | - name: Download artifacts 91 | uses: 
actions/download-artifact@v4 92 | with: 93 | path: binaries 94 | pattern: dnmtools-* 95 | merge-multiple: false 96 | - name: Create universal binary 97 | run: | 98 | lipo -create \ 99 | binaries/dnmtools-*/dnmtools_* \ 100 | -output dnmtools 101 | chmod +x dnmtools 102 | tar -cf dnmtools-${{ steps.vn.outputs.vn }}-macOS.tar.gz dnmtools 103 | - name: Upload the lipo binary 104 | uses: actions/upload-artifact@v4 105 | with: 106 | name: dnmtools-${{ steps.vn.outputs.vn }}-macOS.tar.gz 107 | path: dnmtools-${{ steps.vn.outputs.vn }}-macOS.tar.gz 108 | -------------------------------------------------------------------------------- /docs/content/guessprotocol.md: -------------------------------------------------------------------------------- 1 | # guessprotocol - Identify bisulfite sequencing protocol 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools guessprotocol [OPTIONS] [] 6 | ``` 7 | 8 | ## Description 9 | 10 | Mapping a WGBS dataset requires knowledge of the sequencing protocol 11 | generated to process the data. This may not be properly documented 12 | where the data was obtained, so we created this command to guess the 13 | protocol based on the nucleotide content in the input FASTQ file (or 14 | files, for paired-end). 15 | 16 | The `guessprotocol` tool uses two models of nucleotide content 17 | following bisulfite conversion and applies this model to each 18 | read. One model is for WGBS, and the other is for PBAT. For each read, 19 | both models are applied, and the result is a probability for whether 20 | the read (or read pair) was generated using WGBS or PBAT. Once the 21 | requested number of reads is processed, the aggregate results for all 22 | reads are used to guess whether the protocol used to generate the data 23 | was WGBS, PBAT or rPBAT. The criteria are roughly as follows: if most 24 | of the reads look like they are from WGBS, then we conclude WGBS. If 25 | most of the reads look like they are from PBAT, then we conclude 26 | PBAT. 
If the result is more towards the middle, then we conclude 27 | rPBAT. 28 | 29 | More details: the number of As, Cs, Gs and Ts differs depending on 30 | WGBS (traditional WGBS or MethylC-seq), PBAT -- post bisulfite adaptor 31 | tagging, or rPBAT (random PBAT). 32 | 33 | * For WGBS, a single-end sequenced read should be T-rich, and if the 34 | data is paired-end, read1 is T-rich and read2 is A-rich. 35 | * For PBAT, a single-end sequenced read should be A-rich, and if the 36 | data is paired-end, read1 is A-rich and read2 is T-rich. 37 | * For rPBAT, we have a random mix of the above situations. However, in 38 | practice it seems almost never to be 50% each. 39 | 40 | In most cases, when the data is WGBS or PBAT, it is very obvious which 41 | is the protocol used. 42 | 43 | As of dnmtools v1.4.1, `guessprotocol` will always make a conclusion, 44 | but includes a confidence level. 45 | 46 | The output of `guessprotocol` is useful prior to mapping. For example, 47 | it can be used to decide whether or not to map with the `-R` flag (for 48 | "random PBAT") when using 49 | [abismal](https://github.com/smithlabcode/abismal). 50 | 51 | For paired-end data, `guessprotocol` ensures reads are mates by 52 | finding identical read names. Some datasets finish the read name with 53 | identifiers like ".1" on end 1 and ".2" on end 2, thus making the read 54 | names technically different at the last two characters. You can tell 55 | the program to ignore a certain suffix size (like size 2 in this 56 | example) when matching read names using the `-i` flag. 57 | 58 | The output includes the following values in a YAML format: 59 | * `protocol`: this is the guessed protocol (wgbs, pbat or rpbat) based 60 | on the content of the reads. 61 | * `confidence`: indicates the level of confidence in the guess for the 62 | protocol (values: low or high). 63 | * `layout`: indicates whether the supplied reads were paired or 64 | single-ended. 
65 | * `n_reads_wgbs`: the average number of reads (for single-ended reads) 66 | or read pairs (for paired reads) where read1 is determined by the 67 | model to be T-rich. 68 | * `n_reads`: the number of evaluated reads or read pairs. 69 | * `wgbs_fraction`: the probability that a read (for single-ended 70 | reads) or the read1 of a read pair (for paired reads) is T-rich. 71 | 72 | ## Options 73 | ``` 74 | -n, -nreads 75 | ``` 76 | Number of reads to check. The program stops after collecting 77 | statistics for the first `n` reads (default: 1,000,000). Fewer than 78 | the default are usually sufficient, but increase this value if you 79 | suspect reads at the start of the file might be problematic. 80 | 81 | ```txt 82 | -i -ignore 83 | ``` 84 | Length of the read name suffix to ignore when matching read names to 85 | ensure mates are correctly synchronized when the data is paired-end. 86 | 87 | ``` 88 | -b, -bisulfite 89 | ``` 90 | Assumed bisulfite conversion rate for the models (default: 0.98). 91 | 92 | ``` 93 | -H, -human 94 | ``` 95 | Use human genome nucleotide frequencies. A good assumption for samples 96 | from a mammal. 97 | 98 | ``` 99 | -o, -output 100 | ``` 101 | The output file name. 102 | 103 | ``` 104 | -v, -verbose 105 | ``` 106 | Report available information during the run. 107 | -------------------------------------------------------------------------------- /src/common/BetaBin.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2011-2022 University of Southern California 3 | Authors: Andrew D. Smith, Song Qiang 4 | 5 | This file is part of dnmtools. 6 | 7 | dnmtools is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 
11 | 12 | dnmtools is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | */ 17 | 18 | #include "BetaBin.hpp" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | using std::cerr; 33 | using std::max; 34 | using std::min; 35 | using std::pair; 36 | using std::setprecision; 37 | using std::setw; 38 | using std::string; 39 | using std::vector; 40 | 41 | ////////////////////////////////////////////// 42 | ////// struct betabin ////// 43 | ////////////////////////////////////////////// 44 | 45 | const double betabin::tolerance = 1e-10; 46 | 47 | betabin::betabin() : alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {} 48 | 49 | betabin::betabin(const double a, const double b) : 50 | alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {} 51 | 52 | betabin::betabin(const string &str) { 53 | std::istringstream iss(str, std::istringstream::in); 54 | string name; 55 | iss >> name >> alpha >> beta; 56 | if (name != "betabin" || alpha < 0 || beta < 0) { 57 | cerr << "betabin::betabin: " 58 | << "bad string representation of betabin distribution: " << str 59 | << '\n'; 60 | throw "bad string representation of betabin distribution"; 61 | } 62 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta); 63 | } 64 | 65 | string 66 | betabin::tostring() const { 67 | std::ostringstream os; 68 | os << "betabin " << setprecision(4) << alpha << " " << setprecision(4) 69 | << beta; 70 | return os.str(); 71 | } 72 | 73 | double 74 | betabin::operator()(const pair &val) const { 75 | const std::uint32_t x = static_cast(val.first); 76 | const std::uint32_t n = static_cast(val.first + val.second); 77 | return gsl_sf_lnchoose(n, x) + gsl_sf_lnbeta(alpha + x, beta + val.second) - 78 | lnbeta_helper; 79 | } 80 | 81 
| double 82 | betabin::log_likelihood(const pair &val) const { 83 | const std::uint32_t x = static_cast(val.first); 84 | const std::uint32_t n = static_cast(val.first + val.second); 85 | return gsl_sf_lnchoose(n, x) + gsl_sf_lnbeta(alpha + x, beta + val.second) - 86 | lnbeta_helper; 87 | } 88 | 89 | double 90 | betabin::sign(const double x) { 91 | return (x >= 0) ? 1.0 : -1.0; 92 | } 93 | 94 | double 95 | betabin::invpsi(const double tolerance, const double x) { 96 | double L = 1.0; 97 | double Y = std::exp(x); 98 | while (L > tolerance) { 99 | Y += L * sign(x - gsl_sf_psi(Y)); 100 | L /= 2.0; // NOLINT(*-avoid-magic-numbers) 101 | } 102 | return Y; 103 | } 104 | 105 | double 106 | betabin::movement(const double curr, const double prev) { 107 | return std::abs(curr - prev) / std::max(std::fabs(curr), std::fabs(prev)); 108 | } 109 | 110 | void 111 | betabin::fit(const vector &vals_a, const vector &vals_b, 112 | const vector &p) { 113 | static constexpr auto initial_param_vals = 0.01; 114 | const double p_total = std::accumulate(p.begin(), p.end(), 0.0); 115 | const double alpha_rhs = 116 | std::inner_product(std::cbegin(vals_a), std::cend(vals_a), std::cbegin(p), 117 | 0.0) / 118 | p_total; 119 | const double beta_rhs = 120 | std::inner_product(std::cbegin(vals_b), std::cend(vals_b), std::cbegin(p), 121 | 0.0) / 122 | p_total; 123 | double prev_alpha = 0.0, prev_beta = 0.0; 124 | alpha = beta = initial_param_vals; 125 | 126 | while (movement(alpha, prev_alpha) > tolerance && 127 | movement(beta, prev_beta) > tolerance) { 128 | prev_alpha = alpha; 129 | prev_beta = beta; 130 | alpha = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + alpha_rhs); 131 | beta = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + beta_rhs); 132 | } 133 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta); 134 | } 135 | -------------------------------------------------------------------------------- /docs/content/levels.md: 
-------------------------------------------------------------------------------- 1 | # levels - global methylation summary statistics 2 | 3 | ## Synopsis 4 | 5 | ```console 6 | $ dnmtools levels [OPTIONS] 7 | ``` 8 | 9 | ## Description 10 | 11 | The `levels` command computes global summary statistics for the output 12 | of [counts](../counts). Example output is below. It computes multiple 13 | summary statistics related to the quantity of data (e.g., coverage of 14 | sites) and methylation (e.g., global average methylation). These 15 | summary statistics are also provided by context. The contexts are 16 | explained [here](../cytosine_contexts). These are not exclusive 17 | categories, and include: 18 | 19 | * cytosines, all of them, on either strand 20 | * cpg sites, on either strand 21 | * symmetric cpg sites (strands combined) 22 | * the CHH context 23 | * the CCG context 24 | * the CXG context (we "invented" this one) 25 | 26 | The summary statistics computed include: 27 | 28 | * `total_sites` the total number of sites counted for this context 29 | * `sites_covered` among the total above, those with at least one read 30 | * `total_c` among the observations in reads, how many are C 31 | * `total_t` among the observations in reads, how many are T 32 | * `max_depth` the most coverage of any site for this context 33 | * `mutations` number of sites for this context marked as mutated 34 | * `called_meth` number of sites "called" methylated 35 | * `called_unmeth` number of sites "called" unmethylated 36 | * `mean_agg` the sum of methylation levels for all sites 37 | * `coverage` total data informing on sites for this context 38 | * `sites_covered_fraction` fraction of sites covered 39 | * `mean_depth` among all sites, the mean coverage by reads 40 | * `mean_depth_covered` among all covered sites, the mean coverage 41 | * `mean_meth` the mean of the methylation levels for covered sites 42 | * `mean_meth_weighted` the mean weighted by coverage 43 | * `fractional_meth` the 
fraction of "called" sites "called" methylated 44 | 45 | (If you want more information on these, please ask.) 46 | 47 | Among the above, many are included because they are needed for 48 | calculating the "derived" statistics. For example, the `mean_agg` 49 | is used in the numerator for `mean_meth`, where the denominator is 50 | the number of covered sites. Why keep those raw statistics? Because 51 | it's essential if two different `levels` output files are combined. 52 | 53 | The final three values are the "levels" and are described in Schultz 54 | et al. (2012): 55 | ```txt 56 | "Leveling" the playing field for analyses of single-base resolution DNA methylomes 57 | Schultz, Schmitz & Ecker (TIG 2012) 58 | ``` 59 | 60 | Note: the `fractional_meth` level we calculate is inspired by, but 61 | different from, the paper. What we do is use a binomial test to 62 | determine significantly hyper/hypomethylated sites, and only use the 63 | subset of significant sites to calculate the `fractional_meth` level. 64 | 65 | This command should provide flexibility to compare methylation data 66 | with publications that calculate averages in different ways. The sample 67 | output below only shows the results for cytosines and CpGs in the 68 | sample, but similar output is generated for symmetric CpGs and 69 | cytosines in the CHH, CCG, and CXG contexts. 
70 | 71 | ```yaml 72 | cytosines: 73 | total_sites: 1200559022 74 | sites_covered: 797100353 75 | total_c: 417377038 76 | total_t: 4048558428 77 | max_depth: 30662 78 | mutations: 3505469 79 | called_meth: 44229556 80 | called_unmeth: 750163257 81 | mean_agg: 4.40429e+07 82 | coverage: 4465935466 83 | sites_covered_fraction: 0.663941 84 | mean_depth: 3.71988 85 | mean_depth_covered: 5.60273 86 | mean_meth: 0.055254 87 | mean_meth_weighted: 0.093458 88 | fractional_meth: 0.055677 89 | cpg: 90 | total_sites: 58803590 91 | sites_covered: 47880982 92 | total_c: 261807401 93 | total_t: 84403225 94 | max_depth: 30080 95 | mutations: 381675 96 | called_meth: 38861909 97 | called_unmeth: 7152004 98 | mean_agg: 3.69282e+07 99 | coverage: 346210626 100 | sites_covered_fraction: 0.814253 101 | mean_depth: 5.88758 102 | mean_depth_covered: 7.23065 103 | mean_meth: 0.771250 104 | mean_meth_weighted: 0.756208 105 | ``` 106 | 107 | You can run the `levels` command as follows: 108 | ```console 109 | $ dnmtools levels -o output.levels input.meth 110 | ``` 111 | 112 | ## Options 113 | 114 | ```console 115 | -o, -output 116 | ``` 117 | Output file in YAML format (default: stdout). 118 | 119 | ```console 120 | -a, -alpha 121 | ``` 122 | Alpha for confidence interval (default: 0.95). 123 | 124 | ```console 125 | -v, -verbose 126 | ``` 127 | Report more information while the program is running. 
128 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub Downloads](https://img.shields.io/github/downloads/smithlabcode/dnmtools/total?style=social)](https://github.com/smithlabcode/dnmtools/releases) 2 | [![Install with Conda](https://anaconda.org/bioconda/dnmtools/badges/version.svg)](https://anaconda.org/bioconda/dnmtools) 3 | [![Install with Conda](https://anaconda.org/bioconda/dnmtools/badges/platforms.svg)](https://anaconda.org/bioconda/dnmtools) 4 | [![Install with Conda](https://anaconda.org/bioconda/dnmtools/badges/downloads.svg)](https://anaconda.org/bioconda/dnmtools) 5 | [![Documentation Status](https://readthedocs.org/projects/dnmtools/badge/?version=latest)](https://dnmtools.readthedocs.io/en/latest/?badge=latest) 6 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 7 | 8 | DNMTools is a set of tools for analyzing DNA methylation data from 9 | high-throughput sequencing experiments, especially whole genome bisulfite 10 | sequencing (WGBS), but also reduced representation bisulfite sequencing 11 | (RRBS). These tools focus on overcoming the computing challenges imposed by 12 | the scale of genome-wide DNA methylation data, which usually arise in the early 13 | parts of data analysis. 14 | 15 | **Nanopore** As of v1.5.0, DNMTools has functionality to start analysis with a 16 | BAM file from Nanopore sequencing with 5mC and 5hmC calls at CpG sites. 17 | 18 | ## Usage 19 | 20 | The documentation for DNMTools can be found 21 | [here](https://dnmtools.readthedocs.io). 22 | 23 | ## Installation 24 | 25 | - **Linux** 26 | [binary](https://github.com/smithlabcode/dnmtools/releases/download/v1.5.1/dnmtools-1.5.1-Linux.tar.gz). 27 | Should work on any Linux distribution since roughly 2017. 
28 | 29 | - **Mac** 30 | [binary](https://github.com/smithlabcode/dnmtools/releases/download/v1.5.1/dnmtools-1.5.1-macOS.tar.gz). 31 | Should work on any Mac hardware and macOS-13 (Ventura) or newer. 32 | 33 | - **Conda** 34 | ```console 35 | conda install -c bioconda dnmtools 36 | ``` 37 | 38 | - **Source** 39 | [dnmtools-1.5.1.tar.gz](https://github.com/smithlabcode/dnmtools/releases/download/v1.5.1/dnmtools-1.5.1.tar.gz). Dependencies: 40 | [GSL](http://www.gnu.org/software/gsl), 41 | [HTSlib](https://github.com/samtools/htslib), 42 | [libdeflate](https://github.com/ebiggers/libdeflate) and 43 | [ZLib](https://github.com/madler/zlib). Installing HTSlib as a package 44 | should also give you ZLib and libdeflate. System-specific details below. 45 | 46 | Build DNMTools like this: 47 | ```console 48 | tar -xf dnmtools-1.5.1.tar.gz 49 | cd dnmtools-1.5.1 50 | ./configure --prefix=$HOME 51 | make 52 | make install 53 | ``` 54 | 55 | To get dependencies and a compiler on your system (these might change with OS/package updates): 56 | 57 | Ubuntu/Debian 58 | ```console 59 | apt-get install build-essential htslib-dev libgsl-dev 60 | ``` 61 | 62 | RedHat/Fedora 63 | ```console 64 | dnf install @c-development @development-tools htslib-devel gsl-devel awk 65 | ``` 66 | 67 | Homebrew (see notes below) 68 | ```console 69 | brew install gcc htslib gsl 70 | ``` 71 | 72 | Conda (see notes below) 73 | ```console 74 | conda create -n build-env -c conda-forge -c bioconda \ 75 | gcc gxx make autoconf automake htslib gsl zlib binutils && \ 76 | conda activate build-env 77 | ``` 78 | 79 | Notes: If you use only Homebrew or only Conda to set up your environment, you 80 | could need additional dependencies, and some of what I listed you might 81 | already have. You might need to set additional environment variables or run 82 | configure differently. 
For example with Homebrew: 83 | ```console 84 | ./configure CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib" 85 | ``` 86 | 87 | ## Contact 88 | 89 | Andrew D. Smith 90 | andrewds@usc.edu 91 | 92 | ## Copyright and License Information 93 | 94 | Copyright (C) 2022-2025 95 | 96 | Andrew D. Smith and Guilherme de Sena Brandine 97 | 98 | Authors of DNMTools: Andrew D. Smith and Guilherme de Sena Brandine 99 | 100 | Essential contributors: Ben Decato, Meng Zhou, Liz Ji, Terence Li, Jenny Qu, 101 | Qiang Song, Fang Fang and Masaru Nakajima 102 | 103 | This is free software: you can redistribute it and/or modify it under the 104 | terms of the GNU General Public License as published by the Free Software 105 | Foundation, either version 3 of the License, or (at your option) any later 106 | version. 107 | 108 | This software is distributed in the hope that it will be useful, but WITHOUT 109 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 110 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 111 | details. 112 | -------------------------------------------------------------------------------- /docs/content/amrfinder.md: -------------------------------------------------------------------------------- 1 | # amrfinder - Compute allelically methylated regions (AMRs) 2 | 3 | ## Synopsis 4 | ```shell 5 | $ dnmtools amrfinder [OPTIONS] 6 | ``` 7 | 8 | ## Description 9 | 10 | The program `amrfinder` scans the genome using a sliding window to 11 | identify AMRs. For a genomic interval, two statistical models are 12 | fitted to the reads mapped, respectively. One model (single-allele 13 | model) assumes the two alleles have the same methylation state, and 14 | the other (two-allele model) represents different methylation states 15 | for the two alleles. Comparing the likelihood of the two models, the 16 | interrogated genomic interval may be classified as an AMR. 
The 17 | following command shows an example of running the program amrfinder, which 18 | takes as input an epireads file generated from 19 | [states](../states). 20 | 21 | ```shell 22 | $ dnmtools amrfinder -c /path/to/genome.fa -o output.amr input.epiread 23 | ``` 24 | 25 | There are several options for running amrfinder. 26 | 27 | * The `-b` switches from 28 | using a likelihood ratio test to BIC as the criterion for calling an 29 | AMR. 30 | 31 | * The `-i` option changes the number of iterations used in the EM 32 | procedure when fitting the models. 33 | 34 | * The `-w` option changes the size of 35 | the sliding window, which is in terms of CpGs. The default of 10 CpGs 36 | per window has worked well for us. 37 | 38 | * The `-m` indicates the minimum 39 | coverage per CpG site required for a window to be tested as an AMR. 40 | The default requires 4 reads on average, and any lower will probably 41 | lead to unreliable results. 42 | 43 | * The `-g` parameter is used to indicate the minimum distance between 44 | any two identified AMRs; AMRs are often fragmented, as coverage 45 | fluctuates, and spacing between CpGs means their linkage cannot be 46 | captured by the model. If two AMRs are any closer than this value, they 47 | are merged. The default is 1000, and it seems to work well in 48 | practice, not joining things that appear as though they should be 49 | distinct. In the current version of the program, at the end of the 50 | procedure, any AMRs whose size in terms of base-pairs is less than 51 | half the "gap" size are eliminated. This is a hack that has produced 52 | excellent results, but will eventually be eliminated (hopefully soon). 53 | 54 | * The `-C` parameter specifies the critical value for keeping windows 55 | as AMRs, and is only useful when the likelihood ratio test is 56 | used; for BIC, windows are retained if the BIC for the two-allele model 57 | is less than that for the single-allele model. 
amrfinder calculates a 58 | false discovery rate to correct for multiple testing, and therefore 59 | most p-values that pass the test will be significantly below the 60 | critical value. 61 | 62 | * The `-h` option produces FDR-adjusted p-values according to a 63 | step-up procedure and then compares them directly to the given 64 | critical value, which allows further use of the p-values without 65 | multiple testing correction. 66 | 67 | * The `-f` option omits multiple testing correction entirely by not applying 68 | a correction to the p-values or using a false discovery rate cutoff 69 | to select AMRs. 70 | 71 | ## Options 72 | 73 | ```txt 74 | -o, -output 75 | ``` 76 | The name of the output file. If no file name is provided, the output 77 | will be written to standard output. Due to the size of this output, a 78 | file should be specified unless the output will be piped to another 79 | command or program. The output file contains genomic intervals in BED 80 | format. 81 | 82 | ```txt 83 | -c, -chrom 84 | ``` 85 | FASTA file, or directory containing the FASTA files of the chromosomes. This 86 | parameter is required. 87 | 88 | ```txt 89 | -i, -itr 90 | ``` 91 | The maximum number of iterations when training (default: 10). 92 | 93 | ```txt 94 | -w, -window 95 | ``` 96 | Size of sliding window (default: 10 CpG sites). 97 | 98 | ```txt 99 | -m, -min-cov 100 | ``` 101 | Minimum coverage per CpG to test in each window (default: 4). 102 | 103 | ```txt 104 | -g, -gap 105 | ``` 106 | Minimum allowed gap, in bp, between AMRs (default: 1000). 107 | 108 | ```txt 109 | -C, -crit 110 | ``` 111 | Critical p-value cutoff (default: 0.01). 112 | 113 | ```txt 114 | -f, -nofdr 115 | ``` 116 | Omits the FDR multiple testing correction. 117 | 118 | ```txt 119 | -h, -pvals 120 | ``` 121 | Adjusts p-values using Hochberg step-up. 122 | 123 | ```txt 124 | -b, -bic 125 | ``` 126 | Use Bayesian Information Criterion (BIC) to compare models.
127 | 128 | ```txt 129 | -v, -verbose 130 | ``` 131 | Print more information while the command is running. 132 | 133 | ```txt 134 | -P, -progress 135 | ``` 136 | Print progress info while the command is running. 137 | --------------------------------------------------------------------------------