├── .github
    ├── ISSUE_TEMPLATE
    │   ├── config.yml
    │   ├── feature_request.md
    │   └── bug_report.md
    └── workflows
    │   ├── dnmtools_distcheck_ubuntu.yml
    │   ├── dnmtools_build_ubuntu.yml
    │   ├── dnmtools_build_macos.yml
    │   ├── dnmtools_release_linux.yml
    │   └── dnmtools_release_macos.yml
├── docs
    ├── content
    │   ├── environment.yaml
    │   ├── Makefile
    │   ├── merge-bsrate.md
    │   ├── cleanhp.md
    │   ├── liftfilter.md
    │   ├── hmr-rep.md
    │   ├── allelic.md
    │   ├── amrtester.md
    │   ├── radmerge.md
    │   ├── selectsites.md
    │   ├── entropy.md
    │   ├── radadjust.md
    │   ├── states.md
    │   ├── uniq.md
    │   ├── diff.md
    │   ├── sym.md
    │   ├── fastlift.md
    │   ├── counts-nano.md
    │   ├── hypermr.md
    │   ├── multistat.md
    │   ├── visualization.md
    │   ├── guessprotocol.md
    │   ├── levels.md
    │   └── amrfinder.md
    ├── requirements.txt
    ├── dnmtools_bash_completion
    ├── README.md
    └── mkdocs.yml
├── data
    ├── reads_1.fq.gz
    ├── reads_2.fq.gz
    ├── araTha1_simulated.counts.gz
    ├── pmd_test_data.counts.sym.gz
    ├── make_full_license_info_header.sh
    ├── radmeth_test_design.txt
    ├── tRex1_promoters.bed
    ├── config.h.in
    ├── md5sum.txt
    ├── methylome_a.counts.sym
    └── methylome_b.counts.sym
├── pipeline
    ├── config.yaml
    ├── runconfig.yaml
    └── Dockerfile
├── iwyu.json
├── .readthedocs.yaml
├── .gitmodules
├── .clang-format
├── test_scripts
    ├── test_abismalidx.test
    ├── test_hmr.test
    ├── test_sym.test
    ├── test_levels.test
    ├── test_pmd.test
    ├── test_hypermr.test
    ├── test_diff.test
    ├── test_xcounts.test
    ├── test_unxcounts.test
    ├── test_amrfinder.test
    ├── test_bsrate.test
    ├── test_radmeth.test
    ├── test_uniq.test
    ├── test_selectsites.test
    ├── test_counts.test
    ├── test_states.test
    ├── test_roi.test
    ├── test_format.test
    ├── test_mlml.test
    └── test_abismal.test
├── src
    ├── common
    │   ├── dnmtools_utils.hpp
    │   ├── dnmtools_gaussinv.hpp
    │   ├── dnmtools_utils.cpp
    │   ├── CMakeLists.txt
    │   ├── numerical_utils.hpp
    │   ├── dnmt_error.hpp
    │   ├── Smoothing.hpp
    │   ├── BetaBin.hpp
    │   ├── xcounts_utils.hpp
    │   ├── numerical_utils.cpp
    │   ├── Epiread.hpp
    │   ├── counts_header.hpp
    │   ├── Interval.cpp
    │   ├── EmissionDistribution.hpp
    │   ├── bsutils.cpp
    │   ├── Interval.hpp
    │   ├── bsutils.hpp
    │   ├── Interval6.hpp
    │   ├── EpireadStats.hpp
    │   ├── Interval6.cpp
    │   ├── ThreeStateHMM.hpp
    │   ├── Epiread.cpp
    │   └── BetaBin.cpp
    ├── radmeth
    │   ├── radmeth_optimize_params.hpp
    │   ├── radmeth_optimize_series.hpp
    │   ├── radmeth_optimize_gamma.hpp
    │   ├── radmeth_utils.hpp
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── radmeth_design.hpp
    ├── mlml
    │   └── CMakeLists.txt
    ├── utils
    │   ├── CMakeLists.txt
    │   └── lift-filter.cpp
    ├── analysis
    │   └── CMakeLists.txt
    ├── amrfinder
    │   └── CMakeLists.txt
    └── CMakeLists.txt
├── CPPLINT.cfg
├── .cppcheck_suppress
├── cmake
    ├── FindLIBDEFLATE.cmake
    └── static_analysis.cmake
├── CMakeLists.txt
├── autogen.sh
├── Dockerfile
├── .clang-tidy
├── MAINTAINERS.md
└── README.md


/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | 


--------------------------------------------------------------------------------
/docs/content/environment.yaml:
--------------------------------------------------------------------------------
1 | name: docs
2 | dependencies:
3 |   - Jinja2>3.1.4
4 | 


--------------------------------------------------------------------------------
/data/reads_1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/reads_1.fq.gz


--------------------------------------------------------------------------------
/data/reads_2.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/reads_2.fq.gz


--------------------------------------------------------------------------------
/data/araTha1_simulated.counts.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/araTha1_simulated.counts.gz


--------------------------------------------------------------------------------
/data/pmd_test_data.counts.sym.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/dnmtools/HEAD/data/pmd_test_data.counts.sym.gz


--------------------------------------------------------------------------------
/data/make_full_license_info_header.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # Emit the file named by $1 wrapped in a raw string literal named license_text.
4 | license_file=$1
5 | 
6 | printf '%s\n' 'static const char *license_text = R"('
7 | cat "$license_file"
8 | printf '%s\n' ')";'
9 | 


--------------------------------------------------------------------------------
/pipeline/config.yaml:
--------------------------------------------------------------------------------
1 | dnmtools_dir: '/home/username/bin'
2 | trim_galore_dir: '/usr/bin'
3 | samtools_dir: '/usr/bin'
4 | scratch_dir: '/tmp'
5 | threads: 6
6 | 


--------------------------------------------------------------------------------
/data/radmeth_test_design.txt:
--------------------------------------------------------------------------------
 1 | base	sex	factor
 2 | sample_FA1	1	1	1
 3 | sample_FA2	1	1	1
 4 | sample_FB1	1	1	0
 5 | sample_FB2	1	1	0
 6 | sample_MA1	1	0	1
 7 | sample_MA2	1	0	1
 8 | sample_MB1	1	0	0
 9 | sample_MB2	1	0	0
10 | 


--------------------------------------------------------------------------------
/pipeline/runconfig.yaml:
--------------------------------------------------------------------------------
 1 | outfiles:
 2 |   - 'test.hmr'
 3 |   - 'test.bsrate'
 4 |   - 'test.hypermr'
 5 |   - 'test.levels'
 6 |   - 'test.pmd'
 7 |   - 'test.amr'
 8 | 
 9 | genome_fasta_file: '/home/username/data/genome.fa'
10 | paired: True
11 | cpg_only: True
12 | 


--------------------------------------------------------------------------------
/iwyu.json:
--------------------------------------------------------------------------------
1 | [
2 |     { "include": ["<pstl/glue_algorithm_defs.h>", "private", "<algorithm>", "public"] },
3 |     { "include": ["@[\"<]htslib/kstring.h[\">]", "private", "<htslib/sam.h>", "public"] },
4 |     { "include": ["@[\"<]htslib/hts.h[\">]", "private", "<htslib/sam.h>", "public"] },
5 | ]
6 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | build:
 3 |   os: ubuntu-20.04
 4 |   tools:
 5 |     python: "3.9"
 6 | 
 7 | mkdocs:
 8 |   configuration: docs/mkdocs.yml
 9 |   fail_on_warning: false
10 | 
11 | python:
12 |   install:
13 |     - requirements: docs/requirements.txt
14 | 
15 | formats:
16 |   - pdf
17 |   - epub
18 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "src/smithlab_cpp"]
 2 | 	path = src/smithlab_cpp
 3 | 	url = ../smithlab_cpp.git
 4 | 	ignore = dirty
 5 | [submodule "src/abismal"]
 6 | 	path = src/abismal
 7 | 	url = ../abismal.git
 8 | 	ignore = dirty
 9 | [submodule "src/bamxx"]
10 | 	path = src/bamxx
11 | 	url = ../bamxx.git
12 | 	ignore = dirty
13 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: LLVM
 2 | ColumnLimit: 80
 3 | IndentWidth: 2
 4 | AlwaysBreakAfterReturnType: All
 5 | ContinuationIndentWidth: 2
 6 | ConstructorInitializerIndentWidth: 2
 7 | BraceWrapping:
 8 |   BeforeElse: true
 9 |   BeforeCatch: true
10 | BreakBeforeBraces: Custom
11 | BreakConstructorInitializers: AfterColon
12 | SpacesBeforeTrailingComments: 2
13 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
 1 | Jinja2>=3.1.4
 2 | mkdocs>=1.3.1
 3 | babel>=2.9.0
 4 | click>=7.0
 5 | Markdown>=3.2.1,<3.4
 6 | PyYAML>=5.2
 7 | watchdog>=2.0.0
 8 | mdx_gh_links>=0.2
 9 | ghp-import>=1.0
10 | pyyaml_env_tag>=0.1
11 | mkdocs-redirects>=1.0.1
12 | importlib_metadata>=4.3
13 | packaging>=20.5
14 | mergedeep>=1.3.4
15 | pygments>=2.12
16 | pymdown-extensions
17 | mkdocs-material
18 | 


--------------------------------------------------------------------------------
/data/tRex1_promoters.bed:
--------------------------------------------------------------------------------
 1 | chr1    42178   44225
 2 | chr1    45867   47867
 3 | chr1    113357  115357
 4 | chr1    195388  197388
 5 | chr1    288263  290263
 6 | chr1    320602  322602
 7 | chr1    332945  334945
 8 | chr1    456998  458998
 9 | chr1    481945  483998
10 | chr2    144282  146282
11 | chr2    243609  245609
12 | chr2    270268  272434
13 | chr2    323639  325760
14 | chr2    373828  376724
15 | chr2    495275  497290
16 | 


--------------------------------------------------------------------------------
/test_scripts/test_abismalidx.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools abismalidx` on the test genome and verify the output checksum
 4 | # against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile=tests/tRex1.fa
 8 | outfile=tests/tRex1.idx
 9 | 
10 | # Guard: without the input there is nothing to test.
11 | if [[ ! -e "${infile}" ]]; then
12 |     echo "${infile} not found; skipping remaining tests"
13 |     exit 77
14 | fi
15 | 
16 | ./dnmtools abismalidx ${infile} ${outfile}
17 | 
18 | # Keep only the status word reported for our output file.
19 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
20 | [[ "${status}" == "OK" ]] || exit 1
21 | 


--------------------------------------------------------------------------------
/test_scripts/test_hmr.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools hmr` on the symmetric counts file and verify the output
 4 | # checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile=tests/reads.counts.sym
 8 | outfile=tests/reads.hmr
 9 | 
10 | # Guard: without the input there is nothing to test.
11 | if [[ ! -e "${infile}" ]]; then
12 |     echo "${infile} not found; skipping remaining tests"
13 |     exit 77
14 | fi
15 | 
16 | ./dnmtools hmr -v -o ${outfile} ${infile}
17 | 
18 | # Keep only the status word reported for our output file.
19 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
20 | [[ "${status}" == "OK" ]] || exit 1
21 | 


--------------------------------------------------------------------------------
/test_scripts/test_sym.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools sym` on the counts file and verify the output checksum
 4 | # against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile=tests/reads.counts
 8 | outfile=tests/reads.counts.sym
 9 | 
10 | # Guard: without the input there is nothing to test.
11 | if [[ ! -e "${infile}" ]]; then
12 |     echo "${infile} not found; skipping remaining tests"
13 |     exit 77
14 | fi
15 | 
16 | ./dnmtools sym -o ${outfile} ${infile}
17 | 
18 | # Keep only the status word reported for our output file.
19 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
20 | [[ "${status}" == "OK" ]] || exit 1
21 | 


--------------------------------------------------------------------------------
/test_scripts/test_levels.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools levels` on the counts file and verify the output checksum
 4 | # against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile=tests/reads.counts
 8 | outfile=tests/reads.levels
 9 | 
10 | # Guard: without the input there is nothing to test.
11 | if [[ ! -e "${infile}" ]]; then
12 |     echo "${infile} not found; skipping remaining tests"
13 |     exit 77
14 | fi
15 | 
16 | ./dnmtools levels -v -o ${outfile} ${infile}
17 | 
18 | # Keep only the status word reported for our output file.
19 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
20 | [[ "${status}" == "OK" ]] || exit 1
21 | 


--------------------------------------------------------------------------------
/docs/dnmtools_bash_completion:
--------------------------------------------------------------------------------
 1 | # Bash completion for dnmtools: when the word before the cursor is
 2 | # `dnmtools`, offer file names plus the first field of every line of
 3 | # `dnmtools` output that starts with four spaces (colons stripped).
 4 | _dnmtools()
 5 | {
 6 |   local word before subcommands
 7 |   COMPREPLY=()
 8 |   word="${COMP_WORDS[COMP_CWORD]}"
 9 |   before="${COMP_WORDS[COMP_CWORD-1]}"
10 | 
11 |   # Collected on every invocation, exactly as the case-based version did.
12 |   subcommands=$(dnmtools | grep "^    " | awk '{print $1}' | tr -d ':')
13 | 
14 |   if [[ "${before}" == "dnmtools" ]]; then
15 |       COMPREPLY=( $(compgen -f -W "${subcommands}" -- "$word") )
16 |   fi
17 |   return 0
18 | }
19 | complete -F _dnmtools -o default dnmtools
20 | 


--------------------------------------------------------------------------------
/test_scripts/test_pmd.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools pmd` on the PMD test data and verify the output checksum
 4 | # against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile=tests/pmd_test_data.counts.sym.gz
 8 | outfile=tests/methylome.pmd
 9 | 
10 | # Guard: without the input there is nothing to test.
11 | if [[ ! -e "${infile}" ]]; then
12 |     echo "${infile} not found; skipping remaining tests"
13 |     exit 77
14 | fi
15 | 
16 | ./dnmtools pmd -o ${outfile} ${infile}
17 | 
18 | # Keep only the status word reported for our output file.
19 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
20 | [[ "${status}" == "OK" ]] || exit 1
21 | 


--------------------------------------------------------------------------------
/test_scripts/test_hypermr.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools hypermr` on the simulated counts file and verify the output
 4 | # checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile=tests/araTha1_simulated.counts.gz
 8 | outfile=tests/araTha1_simulated.hypermr
 9 | 
10 | # Guard: without the input there is nothing to test.
11 | if [[ ! -e "${infile}" ]]; then
12 |     echo "${infile} not found; skipping test"
13 |     exit 77
14 | fi
15 | 
16 | ./dnmtools hypermr -o ${outfile} ${infile}
17 | 
18 | # Keep only the status word reported for our output file.
19 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
20 | [[ "${status}" == "OK" ]] || exit 1
21 | 


--------------------------------------------------------------------------------
/test_scripts/test_diff.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools diff` on the two methylome counts files and verify the output
 4 | # checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/methylome_a.counts.sym
 8 | infile2=tests/methylome_b.counts.sym
 9 | outfile=tests/methylome_ab.diff
10 | # Both inputs are passed to the command below, so both must exist; testing
11 | # with `||` would run the command with one file missing and report a failure
12 | # instead of a skip.
13 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then
14 |     ./dnmtools diff -o ${outfile} ${infile1} ${infile2}
15 |     # Keep only the status word reported for our output file.
16 |     x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
17 |     if [[ "${x}" != "OK" ]]; then
18 |         exit 1;
19 |     fi
20 | else
21 |     echo "input missing; skipping test";
22 |     exit 77;
23 | fi
24 | 


--------------------------------------------------------------------------------
/test_scripts/test_xcounts.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools xcounts` on the counts file with the test genome and verify
 4 | # the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/reads.counts
 8 | infile2=tests/tRex1.fa
 9 | outfile=tests/reads.xcounts
10 | 
11 | # Guard: both inputs are needed by the command below.
12 | if [[ ! -e "${infile1}" || ! -e "${infile2}" ]]; then
13 |     echo "xcounts input file(s) not found; skipping test"
14 |     exit 77
15 | fi
16 | 
17 | ./dnmtools xcounts -c ${infile2} -o ${outfile} ${infile1}
18 | 
19 | # Keep only the status word reported for our output file.
20 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
21 | [[ "${status}" == "OK" ]] || exit 1
22 | 


--------------------------------------------------------------------------------
/test_scripts/test_unxcounts.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools unxcounts` on the xcounts file with the test genome and
 4 | # verify the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/reads.xcounts
 8 | infile2=tests/tRex1.fa
 9 | outfile=tests/reads.unxcounts
10 | 
11 | # Guard: both inputs are needed by the command below.
12 | if [[ ! -e "${infile1}" || ! -e "${infile2}" ]]; then
13 |     echo "unxcounts input file not found; skipping remaining tests"
14 |     exit 77
15 | fi
16 | 
17 | ./dnmtools unxcounts -c ${infile2} -o ${outfile} ${infile1}
18 | 
19 | # Keep only the status word reported for our output file.
20 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
21 | [[ "${status}" == "OK" ]] || exit 1
22 | 


--------------------------------------------------------------------------------
/test_scripts/test_amrfinder.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools amrfinder` on the states file with the test genome and verify
 4 | # the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/two_epialleles.states
 8 | infile2=tests/tRex1.fa
 9 | outfile=tests/two_epialleles.amr
10 | 
11 | # Guard: both inputs are needed by the command below.
12 | if [[ ! -e "${infile1}" || ! -e "${infile2}" ]]; then
13 |     echo "amrfinder input file(s) not found; skipping test"
14 |     exit 77
15 | fi
16 | 
17 | ./dnmtools amrfinder -v -c ${infile2} -o ${outfile} ${infile1}
18 | 
19 | # Keep only the status word reported for our output file.
20 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
21 | [[ "${status}" == "OK" ]] || exit 1
22 | 


--------------------------------------------------------------------------------
/test_scripts/test_bsrate.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools bsrate` on the deduplicated SAM file with the test genome and
 4 | # verify the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/reads.fmt.srt.uniq.sam
 8 | infile2=tests/tRex1.fa
 9 | outfile=tests/reads.bsrate
10 | 
11 | # Guard: both inputs are needed by the command below.
12 | if [[ ! -e "${infile1}" || ! -e "${infile2}" ]]; then
13 |     echo "${infile1} and ${infile2} not found; skipping dependent tests"
14 |     exit 77
15 | fi
16 | 
17 | ./dnmtools bsrate -c ${infile2} -o ${outfile} ${infile1}
18 | 
19 | # Keep only the status word reported for our output file.
20 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
21 | [[ "${status}" == "OK" ]] || exit 1
22 | 


--------------------------------------------------------------------------------
/test_scripts/test_radmeth.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools radmeth` with the test design and table, then check that the
 4 | # output file has exactly 17903 lines (no checksum is used for this test).
 5 | # Exit status: 0 = pass, 1 = line-count mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/radmeth_test_table.txt
 8 | infile2=tests/radmeth_test_design.txt
 9 | outfile=tests/radmeth_test_output.txt
10 | 
11 | # Guard: both inputs are needed by the command below.
12 | if [[ ! -e "${infile1}" || ! -e "${infile2}" ]]; then
13 |     echo "radmeth input file(s) not found; skipping test"
14 |     exit 77
15 | fi
16 | 
17 | ./dnmtools radmeth -o ${outfile} -f factor ${infile2} ${infile1}
18 | 
19 | # awk prints "OK" only when the line count matches the expected value.
20 | status=$(wc -l ${outfile} | awk '$1 == 17903 {print "OK"}')
21 | [[ "${status}" == "OK" ]] || exit 1
22 | 


--------------------------------------------------------------------------------
/test_scripts/test_uniq.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools uniq` on the sorted SAM file and verify the checksums of both
 4 | # the deduplicated SAM output and the statistics file.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile=tests/reads.fmt.srt.sam
 8 | outfile1=tests/reads.fmt.srt.uniq.sam
 9 | outfile2=tests/reads.ustats
10 | 
11 | # Guard: without the input there is nothing to test.
12 | if [[ ! -e "${infile}" ]]; then
13 |     echo "${infile} not found; skipping dependent tests"
14 |     exit 77
15 | fi
16 | 
17 | ./dnmtools uniq -v -S ${outfile2} ${infile} ${outfile1}
18 | 
19 | # Keep only the status word reported for each output file.
20 | sam_status=$(md5sum -c tests/md5sum.txt | grep "${outfile1}:" | cut -d ' ' -f 2)
21 | stats_status=$(md5sum -c tests/md5sum.txt | grep "${outfile2}:" | cut -d ' ' -f 2)
22 | [[ "${sam_status}" == "OK" && "${stats_status}" == "OK" ]] || exit 1
23 | 


--------------------------------------------------------------------------------
/test_scripts/test_selectsites.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools selectsites` with the promoter intervals and the counts file,
 4 | # then verify the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/tRex1_promoters.bed
 8 | infile2=tests/reads.counts
 9 | outfile=tests/reads.counts.select
10 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then
11 |     ./dnmtools selectsites -o ${outfile} ${infile1} ${infile2}
12 |     # Keep only the status word reported for our output file.
13 |     x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
14 |     if [[ "${x}" != "OK" ]]; then
15 |         exit 1;
16 |     fi
17 | elif [[ ! -e "${infile1}" ]]; then
18 |     # Name the file that is actually absent; testing `-e infile1` here would
19 |     # report the file that exists.
20 |     echo "${infile1} not found; skipping remaining tests";
21 |     exit 77;
22 | else
23 |     # infile1 exists in this branch, so infile2 is the missing one.
24 |     echo "${infile2} not found; skipping remaining tests";
25 |     exit 77;
26 | fi
27 | 


--------------------------------------------------------------------------------
/test_scripts/test_counts.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools counts` on the deduplicated SAM file with the test genome and
 4 | # verify the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/reads.fmt.srt.uniq.sam
 8 | infile2=tests/tRex1.fa
 9 | outfile=tests/reads.counts
10 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then
11 |     ./dnmtools counts -v -o ${outfile} -c ${infile2} ${infile1}
12 |     # Keep only the status word reported for our output file.
13 |     x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
14 |     if [[ "${x}" != "OK" ]]; then
15 |         exit 1;
16 |     fi
17 | elif [[ ! -e "${infile1}" ]]; then
18 |     # Name the file that is actually absent; testing `-e infile1` here would
19 |     # report the file that exists.
20 |     echo "${infile1} not found; skipping remaining tests";
21 |     exit 77;
22 | else
23 |     # infile1 exists in this branch, so infile2 is the missing one.
24 |     echo "${infile2} not found; skipping remaining tests";
25 |     exit 77;
26 | fi
27 | 


--------------------------------------------------------------------------------
/test_scripts/test_states.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools states` on the deduplicated SAM file with the test genome and
 4 | # verify the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/reads.fmt.srt.uniq.sam
 8 | infile2=tests/tRex1.fa
 9 | outfile=tests/reads.epiread
10 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then
11 |     ./dnmtools states -v -o ${outfile} -c ${infile2} ${infile1}
12 |     # Keep only the status word reported for our output file.
13 |     x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
14 |     if [[ "${x}" != "OK" ]]; then
15 |         exit 1;
16 |     fi
17 | elif [[ ! -e "${infile1}" ]]; then
18 |     # Name the file that is actually absent; testing `-e infile1` here would
19 |     # report the file that exists.
20 |     echo "${infile1} not found; skipping remaining tests";
21 |     exit 77;
22 | else
23 |     # infile1 exists in this branch, so infile2 is the missing one.
24 |     echo "${infile2} not found; skipping remaining tests";
25 |     exit 77;
26 | fi
27 | 


--------------------------------------------------------------------------------
/test_scripts/test_roi.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools roi` with the promoter intervals and the symmetric counts
 4 | # file, then verify the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/reads.counts.sym
 8 | infile2=tests/tRex1_promoters.bed
 9 | outfile=tests/tRex1_promoters.roi.bed
10 | if [[ -e "${infile1}" && -e "${infile2}" ]]; then
11 |     ./dnmtools roi -v -M -o ${outfile} ${infile2} ${infile1}
12 |     # Keep only the status word reported for our output file.
13 |     x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
14 |     if [[ "${x}" != "OK" ]]; then
15 |         exit 1;
16 |     fi
17 | elif [[ ! -e "${infile1}" ]]; then
18 |     # Name the file that is actually absent; testing `-e infile1` here would
19 |     # report the file that exists.
20 |     echo "${infile1} not found; skipping remaining tests";
21 |     exit 77;
22 | else
23 |     # infile1 exists in this branch, so infile2 is the missing one.
24 |     echo "${infile2} not found; skipping remaining tests";
25 |     exit 77;
26 | fi
27 | 


--------------------------------------------------------------------------------
/pipeline/Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax=docker/dockerfile:1
 2 | 
 3 | ## Copy this file to the directory where you have the dnmtools binary. Make sure it was built on linux x86_64.
 4 | 
 5 | FROM ubuntu:22.04
 6 | 
 7 | # install pipeline dependencies
 8 | # (update, install and list cleanup share one RUN: a cached `apt-get update`
 9 | # layer can go stale when the package list changes, and removing the apt
10 | # lists in a later layer does not shrink the image)
11 | RUN apt-get update \
12 |     && apt-get install -y libgsl-dev libhts-dev libgomp1 samtools libcurl4 \
13 |                           trim-galore sra-toolkit rsync \
14 |     && rm -rf /var/lib/apt/lists/*
15 | 
16 | # fetch the bedToBigBed and wigToBigWig executables from the UCSC download server
17 | RUN rsync -a hgdownload.soe.ucsc.edu::genome/admin/exe/linux.x86_64/bedToBigBed /usr/bin
18 | RUN rsync -a hgdownload.soe.ucsc.edu::genome/admin/exe/linux.x86_64/wigToBigWig /usr/bin
19 | 
20 | # install dnmtools; the binary must be built for Ubuntu
21 | COPY dnmtools /usr/bin
22 | 


--------------------------------------------------------------------------------
/data/config.h.in:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * This program is free software: you can redistribute it and/or modify it
 4 |  * under the terms of the GNU General Public License as published by the Free
 5 |  * Software Foundation, either version 3 of the License, or (at your option)
 6 |  * any later version.
 7 |  *
 8 |  * This program is distributed in the hope that it will be useful, but WITHOUT
 9 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11 |  * more details.
12 |  */
13 | 
14 | #define PROJECT_NAME "@PROJECT_NAME@"
15 | #define VERSION "@PROJECT_VERSION@"
16 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/docs/content/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | # NOTE(review): docs/README.md describes building the docs with mkdocs; confirm whether this Sphinx makefile is still used.
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/content/merge-bsrate.md:
--------------------------------------------------------------------------------
 1 | # merge-bsrate - Combine bisulfite conversion rate statistics files
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools merge-bsrate [OPTIONS] <file-1.bsrate> <file-2.bsrate> ...
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | Given several bisulfite conversion summary statistics generated using
11 | the [bsrate](../bsrate) program, the `merge-bsrate` utility
12 | combines their information. This is usually useful if your dataset has
13 | been split into multiple files and processed in parallel, after which
14 | one would like to combine the summaries of separate runs.
15 | 
16 | ## Options
17 | 
18 | ```txt
19 |  -o -output
20 | ```
21 | output file (default : STDOUT)
22 | 
23 | ```txt
24 |  -v -verbose
25 | ```
26 | print more run info to STDERR as the program runs.
27 | 


--------------------------------------------------------------------------------
/test_scripts/test_format.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools format` on the mapped reads, verify the checksum of the
 4 | # formatted SAM file, then sort it with samtools for the later tests.
 5 | # Exit status: 1 = checksum mismatch, 77 = skipped (input missing); otherwise
 6 | # the status of the final sort (or 0 when samtools is unavailable).
 7 | 
 8 | infile=tests/reads.sam
 9 | outfile1=tests/reads.fmt.sam
10 | outfile2=tests/reads.fmt.srt.sam
11 | cmd=samtools
12 | 
13 | # Guard: without the input there is nothing to test.
14 | if [[ ! -e "${infile}" ]]; then
15 |     echo "${infile} not found; skipping remaining tests"
16 |     exit 77
17 | fi
18 | 
19 | ./dnmtools format -f abismal ${infile} ${outfile1}
20 | 
21 | ### ADS: only want to check the first output here; any failure
22 | ### later will result in a skip for subsequent tests.
23 | status=$(md5sum -c tests/md5sum.txt | grep "${outfile1}:" | cut -d ' ' -f 2)
24 | [[ "${status}" == "OK" ]] || exit 1
25 | 
26 | # The sorted file is produced only when the sorter is on PATH; a missing
27 | # sorter is reported but does not fail this test.
28 | if [[ ! -e $(type -P "${cmd}") ]]; then
29 |     echo "${cmd} not found"
30 | else
31 |     ${cmd} sort --no-PG -O SAM -o ${outfile2} ${outfile1}
32 | fi
33 | 


--------------------------------------------------------------------------------
/test_scripts/test_mlml.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Unpack the mlml test archive, run `dnmtools mlml` on the three extracted
 4 | # counts files, and verify the output checksum against tests/md5sum.txt.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | archive=tests/mlml_test_data.tgz
 8 | if [[ ! -e "${archive}" ]] ; then
 9 |     echo "input missing; skipping test";
10 |     exit 77;
11 | fi
12 | 
13 | infile1=tests/bs.counts
14 | infile2=tests/tab.counts
15 | infile3=tests/oxbs.counts
16 | outfile=tests/mlml.out
17 | tar -xf ${archive} -C tests
18 | # All three files are passed to mlml below, so all three must exist; testing
19 | # with `||` would run the command after a partial extraction and report a
20 | # failure instead of a skip.
21 | if [[ -e "${infile1}" && -e "${infile2}" && -e "${infile3}" ]]; then
22 |     ./dnmtools mlml -o ${outfile} -bsseq ${infile1} -tabseq ${infile2} -oxbsseq ${infile3}
23 |     # Keep only the status word reported for our output file.
24 |     x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
25 |     # Remove the extracted inputs before reporting the result.
26 |     rm -f ${infile1} ${infile2} ${infile3}
27 |     if [[ "${x}" != "OK" ]]; then
28 |         exit 1;
29 |     fi
30 | else
31 |     echo "input missing; skipping test";
32 |     # Clean up whatever part of the archive was extracted.
33 |     rm -f ${infile1} ${infile2} ${infile3}
34 |     exit 77;
35 | fi
36 | 


--------------------------------------------------------------------------------
/src/common/dnmtools_utils.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2019-2023 Andrew D. Smith
 2 |  *
 3 |  * Authors: Andrew D. Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify
 6 |  * it under the terms of the GNU General Public License as published by
 7 |  * the Free Software Foundation, either version 3 of the License, or
 8 |  * (at your option) any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful,
11 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |  * GNU General Public License for more details.
14 |  */
15 | 
16 | #ifndef DNMTOOLS_UTILS_HPP
17 | #define DNMTOOLS_UTILS_HPP
18 | 
19 | #include <string>
20 | // Declaration only. Presumably joins the argc entries of argv into one string -- TODO confirm against the definition.
21 | auto
22 | get_command_line(const int argc,
23 |                  char *argv[]) -> std::string;  // NOLINT(*-c-arrays)
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/test_scripts/test_abismal.test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Run `dnmtools abismal` on the paired fastq files with the test index and
 4 | # verify the checksums of both the SAM output and the statistics file.
 5 | # Exit status: 0 = pass, 1 = checksum mismatch, 77 = skipped (input missing).
 6 | 
 7 | infile1=tests/reads_1.fq.gz
 8 | infile2=tests/reads_2.fq.gz
 9 | infile3=tests/tRex1.idx
10 | outfile1=tests/reads.sam
11 | outfile2=tests/reads.mstats
12 | 
13 | # Guard: a missing fastq file is reported first, even if the index is
14 | # missing as well.
15 | if [[ ! -e "${infile1}" || ! -e "${infile2}" ]]; then
16 |     echo "missing fastq input file(s); skipping remaining tests"
17 |     exit 77
18 | fi
19 | 
20 | # Guard: the fastq files exist, so only the index can be missing here.
21 | if [[ ! -e "${infile3}" ]]; then
22 |     echo "missing index file; skipping remaining tests"
23 |     exit 77
24 | fi
25 | 
26 | ./dnmtools abismal -s ${outfile2} -o ${outfile1} \
27 |            -i ${infile3} ${infile1} ${infile2}
28 | 
29 | # Keep only the status word reported for each output file.
30 | sam_status=$(md5sum -c tests/md5sum.txt | grep "${outfile1}:" | cut -d ' ' -f 2)
31 | stats_status=$(md5sum -c tests/md5sum.txt | grep "${outfile2}:" | cut -d ' ' -f 2)
32 | [[ "${sam_status}" == "OK" && "${stats_status}" == "OK" ]] || exit 1
33 | 


--------------------------------------------------------------------------------
/docs/content/cleanhp.md:
--------------------------------------------------------------------------------
 1 | # cleanhp - Remove hairpin reads
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools cleanhp [OPTIONS] <read-1.fastq> <read-2.fastq>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | ## Options
11 | 
12 | ```txt
13 |  -o, -output
14 | ```
15 | output filename prefix [required]
16 | ```txt
17 |  -s, -stat
18 | ```
19 | stats output filename [required]
20 | ```txt
21 |  -h, -hairpin
22 | ```
23 | maximum hairpin rate
24 | ```txt
25 |  -check
26 | ```
27 | check for hairpin contamination
28 | ```txt
29 |  -n, -nreads
30 | ```
31 | number of reads in initial check
32 | ```txt
33 |  -c, -cutoff
34 | ```
35 | cutoff for calling an inverse duplication (default: 0.95)
36 | ```txt
37 |  -i, -ignore
38 | ```
39 | length of read name suffix to ignore when matching
40 | ```txt
41 |  -v, -verbose
42 | ```
43 | print more run info to the terminal while the program is running
44 | ```txt
45 |  -h, -hist
46 | ```
47 | write a histogram of hairpin matches to this file
48 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # DNMTools documentation
 2 | 
 3 | This is the documentation for DNMTools that uses
 4 | [mkdocs](https://mkdocs.readthedocs.io) to generate readthedocs pages.
 5 | The public web version of this documentation is available at
 6 | [dnmtools.readthedocs.io](https://dnmtools.readthedocs.io), but for
 7 | users who wish to see the documentation on a web browser offline, you
 8 | can build the documentation locally as described below.
 9 | 
10 | ### Dependencies
11 | 
12 | To build the documentation locally, install mkdocs
13 | 
14 | ```
15 | pip install -U mkdocs
16 | ```
17 | 
18 | ### Local compilation
19 | 
20 | Build the HTML documentation by running
21 | ```
22 | mkdocs build -f docs/mkdocs.yml
23 | ```
24 | which will create a `site` directory where markdown files are
25 | converted to HTML
26 | 
27 | Create a local host for the HTML documentation by running
28 | 
29 | ```
30 | mkdocs serve -f docs/mkdocs.yml
31 | ```
32 | 
33 | This will serve the documentation, usually at http://localhost:8000.
34 | 


--------------------------------------------------------------------------------
/docs/content/liftfilter.md:
--------------------------------------------------------------------------------
 1 | # liftfilter - merge lifted entries to the same position
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools liftfilter [OPTIONS] -o <output.meth> <input.meth>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | The [fastlift](../fastlift) program may report multiple mm9
11 | sites mapped to the same position in hg19.  In this situation, we may
12 | either collapse read counts at those mm9 sites, or keep the data for
13 | only one mm9 site. The `liftfilter` program supports both of these
14 | options. Use
15 | 
16 | ```shell
17 | $ dnmtools liftfilter -o output-filtered.meth input.meth
18 | ```
19 | 
20 | to merge data from mm9 sites lifted to the same hg19 position. Use the
21 | option `-u` to keep the first record of duplicated sites.
22 | 
23 | ## Options
24 | 
25 | ```txt
26 |  -o, -output
27 | ```
28 | Output processed methcount [required]
29 | ```txt
30 |  -u, -unique
31 | ```
32 | keep only the first record of duplicated sites instead of merging them
33 | ```txt
34 |  -v, -verbose
35 | ```
36 | print more information to STDERR as the program runs.
37 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/src/radmeth/radmeth_optimize_params.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * Author: Andrew D. Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify it
 6 |  * under the terms of the GNU General Public License as published by the Free
 7 |  * Software Foundation, either version 3 of the License, or (at your option)
 8 |  * any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful, but WITHOUT
11 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13 |  * more details.
14 |  */
15 | 
16 | #ifndef RADMETH_OPTIMIZE_PARAMS_HPP
17 | #define RADMETH_OPTIMIZE_PARAMS_HPP
18 | 
19 | #include <cstdint>
20 | 
21 | namespace radmeth_optimize_params {  // non-const inline variables: one shared, modifiable instance program-wide
22 | inline double tolerance = 1e-4;       // NOTE(review): presumably the convergence threshold -- confirm at use sites
23 | inline double stepsize = 0.01;        // NOTE(review): presumably the optimizer step size -- confirm at use sites
24 | inline std::uint32_t max_iter = 250;  // NOTE(review): presumably the iteration cap -- confirm at use sites
25 | }  // namespace radmeth_optimize_params
26 | 
27 | #endif  // RADMETH_OPTIMIZE_PARAMS_HPP
28 | 


--------------------------------------------------------------------------------
/src/radmeth/radmeth_optimize_series.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * Author: Andrew D. Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify it
 6 |  * under the terms of the GNU General Public License as published by the Free
 7 |  * Software Foundation, either version 3 of the License, or (at your option)
 8 |  * any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful, but WITHOUT
11 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13 |  * more details.
14 |  */
15 | 
16 | #ifndef RADMETH_OPTIMIZE_SERIES_HPP
17 | #define RADMETH_OPTIMIZE_SERIES_HPP
18 | 
19 | #include <cstdint>
20 | #include <vector>
21 | 
22 | template <typename T> struct Regression;
23 | 
24 | void  // NOTE(review): presumably fits r and reports results via p_estimates / dispersion_estimate -- confirm in the .cpp
25 | fit_regression_model(Regression<std::uint32_t> &r,
26 |                      std::vector<double> &p_estimates,
27 |                      double &dispersion_estimate);
28 | 
29 | #endif  // RADMETH_OPTIMIZE_SERIES_HPP
30 | 


--------------------------------------------------------------------------------
/CPPLINT.cfg:
--------------------------------------------------------------------------------
 1 | # This file is part of dnmtools
 2 | #
 3 | # Copyright (C) 2023-2025 Andrew D. Smith
 4 | #
 5 | # Authors: Andrew D. Smith
 6 | #
 7 | # This is free software: you can redistribute it and/or modify it
 8 | # under the terms of the GNU General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This software is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | set noparent
17 | filter=-runtime/references
18 | filter=-build/include_subdir
19 | filter=-build/include_order
20 | filter=-build/c++11
21 | filter=-build/c++17
22 | # Formatting below handled by clang-format
23 | filter=-whitespace/line_length
24 | filter=-whitespace/newline
25 | filter=-readability/braces
26 | filter=-whitespace/semicolon
27 | filter=-whitespace/indent
28 | filter=-whitespace/braces
29 | filter=-whitespace/parens
30 | filter=-readability/nolint
31 | 


--------------------------------------------------------------------------------
/.github/workflows/dnmtools_distcheck_ubuntu.yml:
--------------------------------------------------------------------------------
 1 | name: DNMTools distcheck (Ubuntu)
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     branches: [ "master" ]
 7 |   pull_request:
 8 |     branches: [ "master" ]
 9 | 
10 | jobs:
11 |   distcheck:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |     - uses: actions/checkout@v4
15 |       with:
16 |         submodules: recursive
17 |     - name: Install dependencies
18 |       run: |
19 |         sudo apt-get update
20 |         sudo apt-get install -y \
21 |         libgsl-dev \
22 |         libcurl4-gnutls-dev \
23 |         libdeflate-dev \
24 |         liblzma-dev \
25 |         zlib1g-dev \
26 |         libbz2-dev
27 |     - name: Build and install htslib (for recent version)
28 |       run: |
29 |         git clone --recursive https://github.com/samtools/htslib.git
30 |         cd htslib
31 |         make -j4
32 |         sudo make install prefix=/usr
33 |     - name: Generate configure script
34 |       run: ./autogen.sh
35 |     - name: configure
36 |       run: ./configure
37 |     - name: Generate the source archive
38 |       run: make dist
39 |     - name: make distcheck
40 |       run: make -j4 distcheck
41 | 


--------------------------------------------------------------------------------
/src/common/dnmtools_gaussinv.hpp:
--------------------------------------------------------------------------------
 1 | /* Code from GSL, see copyright below.
 2 |  */
 3 | 
 4 | /* cdf/gsl_cdf.h
 5 |  *
 6 |  * Copyright (C) 2002 Jason H. Stover.
 7 |  *
 8 |  * This program is free software; you can redistribute it and/or modify
 9 |  * it under the terms of the GNU General Public License as published by
10 |  * the Free Software Foundation; either version 3 of the License, or (at
11 |  * your option) any later version.
12 |  *
13 |  * This program is distributed in the hope that it will be useful, but
14 |  * WITHOUT ANY WARRANTY; without even the implied warranty of
15 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 |  * General Public License for more details.
17 |  *
18 |  * You should have received a copy of the GNU General Public License
19 |  * along with this program; if not, write to the Free Software Foundation,
20 |  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 |  */
22 | 
23 | /* Author:  J. Stover */
24 | 
25 | double dnmt_gsl_cdf_ugaussian_Pinv(const double P);  // cf. GSL gsl_cdf_ugaussian_Pinv: inverse of the lower-tail unit-Gaussian CDF (presumably same contract)
26 | double dnmt_gsl_cdf_ugaussian_Qinv(const double Q);  // cf. GSL gsl_cdf_ugaussian_Qinv: inverse of the upper-tail unit-Gaussian CDF (presumably same contract)
27 | // The two functions below take the standard deviation sigma explicitly (cf. GSL gsl_cdf_gaussian_P / _Q).
28 | double dnmt_gsl_cdf_gaussian_P(const double x, const double sigma);  // presumably the lower tail, as in GSL -- confirm in the .cpp
29 | double dnmt_gsl_cdf_gaussian_Q(const double x, const double sigma);  // presumably the upper tail, as in GSL -- confirm in the .cpp
30 | 


--------------------------------------------------------------------------------
/.cppcheck_suppress:
--------------------------------------------------------------------------------
 1 | # This file is part of dnmtools
 2 | #
 3 | # Copyright (C) 2023-2025 Andrew D. Smith
 4 | #
 5 | # Authors: Andrew D. Smith
 6 | #
 7 | # This is free software: you can redistribute it and/or modify it
 8 | # under the terms of the GNU General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This software is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | missingIncludeSystem
18 | constVariablePointer
19 | checkersReport
20 | unknownMacro
21 | unmatchedSuppression
22 | # Ignore unused function because it's too hard to get right
23 | unusedFunction
24 | # Ignore unused struct member because this won't go unnoticed anyway
25 | unusedStructMember
26 | # Ignore missing includes because if they are real things won't build
27 | missingInclude
28 | # Exclude external files
29 | *:*smithlab_cpp*
30 | *:*popcnt.hpp
31 | # Problem caused by external files
32 | toomanyconfigs
33 | # More problems caused by external files -- with too many ifdefs
34 | normalCheckLevelMaxBranches
35 | 


--------------------------------------------------------------------------------
/src/radmeth/radmeth_optimize_gamma.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * Author: Andrew D. Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify it
 6 |  * under the terms of the GNU General Public License as published by the Free
 7 |  * Software Foundation, either version 3 of the License, or (at your option)
 8 |  * any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful, but WITHOUT
11 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13 |  * more details.
14 |  */
15 | 
16 | #ifndef RADMETH_OPTIMIZE_GAMMA_HPP
17 | #define RADMETH_OPTIMIZE_GAMMA_HPP
18 | 
19 | #include <cstdint>
20 | #include <vector>
21 | 
22 | template <typename T> struct Regression;
23 | 
24 | void  // overload for Regression<std::uint32_t>; NOTE(review): presumably reports results via the two reference outputs -- confirm
25 | fit_regression_model_gamma(Regression<std::uint32_t> &r,
26 |                            std::vector<double> &p_estimates,
27 |                            double &dispersion_estimate);
28 | 
29 | void  // overload for Regression<double>; NOTE(review): presumably reports results via the two reference outputs -- confirm
30 | fit_regression_model_gamma(Regression<double> &r,
31 |                            std::vector<double> &p_estimates,
32 |                            double &dispersion_estimate);
33 | 
34 | #endif  // RADMETH_OPTIMIZE_GAMMA_HPP
35 | 


--------------------------------------------------------------------------------
/src/common/dnmtools_utils.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2019-2023 Andrew D. Smith
 2 |  *
 3 |  * Authors: Andrew D. Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify
 6 |  * it under the terms of the GNU General Public License as published by
 7 |  * the Free Software Foundation, either version 3 of the License, or
 8 |  * (at your option) any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful,
11 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |  * GNU General Public License for more details.
14 |  */
15 | 
16 | #include "dnmtools_utils.hpp"
17 | 
18 | #include <algorithm>
19 | #include <iterator>
20 | #include <sstream>
21 | #include <string>
22 | 
23 | using std::copy;
24 | using std::ostream_iterator;
25 | using std::ostringstream;
26 | using std::string;
27 | 
28 | auto  // returns argv[0..argc) joined by single spaces and wrapped in double quotes
29 | get_command_line(const int argc,
30 |                  char *argv[]) -> std::string {  // NOLINT(*-c-arrays)
31 |   if (argc <= 0)  // nothing to join; also keeps argv + (argc - 1) inside the array
32 |     return std::string{};
33 |   std::ostringstream cmd;
34 |   cmd << '"';
35 |   // NOLINTBEGIN(*-pointer-arithmetic)
36 |   std::copy(argv, argv + (argc - 1), std::ostream_iterator<const char *>(cmd, " "));  // all but the last, each followed by a space
37 |   cmd << argv[argc - 1] << '"';  // last argument gets the closing quote instead of a space
38 |   // NOLINTEND(*-pointer-arithmetic)
39 |   return cmd.str();
40 | }
41 | 


--------------------------------------------------------------------------------
/data/md5sum.txt:
--------------------------------------------------------------------------------
 1 | ae05a28de5643a512386e767b3aa963a  tests/araTha1_simulated.hypermr
 2 | 0048de3fc412cb12ec2e070c8151f86f  tests/methylome_ab.diff
 3 | 86ca23015535cf3295c0da3587a95f22  tests/radmeth_test_output.txt
 4 | 75777c209bf820ab700801d87a0a3615  tests/reads.bsrate
 5 | e73facd597c3b903cbfe29afa9f58371  tests/reads.counts
 6 | 56575da7d3af9b696258512142903d1e  tests/reads.counts.select
 7 | 0f72560aa101e85679783a1ecaf80615  tests/reads.epiread
 8 | 9dbd476424d48a8d0f043dfc00af0d23  tests/reads.fmt.srt.sam
 9 | 4085cc74b003a918b4a4743fca7922a4  tests/reads.hmr
10 | d8856f9731af76b8a4ab3cc7d667cdb2  tests/reads.ustats
11 | bcbf01be810cbf4051292813eb6b9225  tests/tRex1.idx
12 | ec6a686617cad31e9f7a37a3d378e6ed  tests/two_epialleles.states
13 | 93e38b20d162062a5d147c4290095a13  tests/mlml.out
14 | d947fe3d61ef7b1564558a69608f0e64  tests/methylome.pmd
15 | d41d8cd98f00b204e9800998ecf8427e  tests/two_epialleles.amr
16 | 001b9d966f62fa439b24cf2198cc3de5  tests/reads.counts.sym
17 | 2b8a0406015458be51b8b1c9e58b3602  tests/tRex1_promoters.roi.bed
18 | 33640b24cb64ad3179f364af5a887f95  tests/reads.fmt.sam
19 | b5a63997c57dcde5c3a6635f7beb2cce  tests/reads.fmt.srt.uniq.sam
20 | 3ac2b51545740bafd1a548ba7f73e739  tests/reads.xcounts
21 | 054fe804a32063c80862fbee30f74579  tests/reads.unxcounts
22 | 830157684f1ddbf1f1c37d354188dc2b  tests/reads.sam
23 | 8dbcdabecb6cfe6aebb73c94c605f696  tests/reads.mstats
24 | 490723e9af084c8f957f5a265cf02994  tests/reads.levels
25 | 


--------------------------------------------------------------------------------
/docs/content/hmr-rep.md:
--------------------------------------------------------------------------------
 1 | # hmr-rep - Hypomethylated regions across replicates
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools hmr-rep [OPTIONS] <input-1.meth> <input-2.meth> ...
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | This program is similar to [hmr](../hmr), but it identifies
11 | HMRs in a set of replicate methylomes. Methylation must be provided in
12 | the [counts](../counts) format. This program assumes
13 | only data at CpG sites and that strands are collapsed so only the
14 | positive site appears in the file (e.g. using
15 | [sym](../sym)).
16 | 
17 | ## Options
18 | 
19 | ```txt
20 |  -o, -out
21 | ```
22 | output file (default: STDOUT)
23 | ```txt
24 |  -d, -desert
25 | ```
26 | maximum distance between covered CpGs in HMR (default: 1000)
27 | 
28 | ```txt
29 |  -i, -itr
30 | ```
31 | max number of iterations (default: 100)
32 | ```txt
33 |  -v, -verbose
34 | ```
35 | print more run info to STDERR while the program is running.
36 | ```txt
37 |  -post-hypo
38 | ```
39 | output file for single-CpG posterior hypomethylation probability (default: none)
40 | 
41 | ```txt
42 |  -post-meth
43 | ```
44 | output file for single-CpG posterior methylation probability (default: none)
45 | 
46 | ```txt
47 |  -P, -params-in
48 | ```
49 | HMM parameter file (override training step)
50 | ```txt
51 |  -p, -params-out
52 | ```
53 | write HMM parameters to this file (default: none)
54 | ```txt
55 |  -s, -seed
56 | ```
57 | specify random seed (default: 408)
58 | 


--------------------------------------------------------------------------------
/docs/content/allelic.md:
--------------------------------------------------------------------------------
 1 | # allelic - Single-site ASM scoring
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools allelic [OPTIONS] <input.epiread>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | The program `allelicmeth` calculates allele-specific methylation scores
11 | for each CpG site. Input files should be the epiread files (.epiread
12 | suffix) produced using [states](../states). In the output file, each row
13 | represents a CpG pair made by any CpG and its previous CpG, the first
14 | three columns indicate the positions of the CpG site, the fourth
15 | column is the name including the number of reads covering the CpG
16 | pair, the fifth column is the score for ASM, and the last four columns
17 | record the number of reads of four different methylation combinations
18 | of the CpG pair: methylated methylated (mm), methylated unmethylated
19 | (mu), unmethylated methylated (um), or unmethylated unmethylated (uu).
20 | The following command will calculate allele specific methylation
21 | scores using the allelicmeth component of dnmtools:
22 | 
23 | ```shell
24 | $ dnmtools allelic -c /path/to/genome.fa -o output.allelic input.epiread
25 | ```
26 | 
27 | ## Options
28 | 
29 | ```txt
30 |  -o, -output
31 | ```
32 | output file name (default: STDOUT)
33 | ```txt
34 |  -c, -chrom
35 | ```
36 | FASTA file or directory of chromosomes containing FASTA files [required]
37 | 
38 | ```txt
39 |  -v, -verbose
40 | ```
41 | print more run info to STDERR while the program is running.
42 | 


--------------------------------------------------------------------------------
/src/radmeth/radmeth_utils.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * Author: Andrew D. Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify it
 6 |  * under the terms of the GNU General Public License as published by the Free
 7 |  * Software Foundation, either version 3 of the License, or (at your option)
 8 |  * any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful, but WITHOUT
11 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13 |  * more details.
14 |  */
15 | 
16 | #ifndef RADMETH_UTILS_HPP
17 | #define RADMETH_UTILS_HPP
18 | 
19 | #include <chrono>
20 | #include <cstddef>
21 | #include <cstdint>
22 | #include <fstream>
23 | #include <string>
24 | 
25 | [[nodiscard]] std::string  // text form of an elapsed time given in seconds (duration<double>); the exact format is set in the .cpp
26 | format_duration(const std::chrono::duration<double> elapsed);
27 | 
28 | struct file_progress {  // NOTE(review): appears to track read progress through one file; implementation is not in this header
29 |   double one_thousand_over_filesize{};  // presumably 1000 / (size of the file) -- confirm in the constructor
30 |   std::size_t prev_offset{};  // presumably the stream offset seen on the previous call -- confirm in the .cpp
31 |   explicit file_progress(const std::string &filename);
32 |   void
33 |   operator()(std::ifstream &in);  // cppcheck-suppress constParameterReference
34 | };
35 | 
36 | [[nodiscard]] double  // NOTE(review): presumably a likelihood-ratio test on the two log-likelihoods -- confirm what is returned
37 | llr_test(const double null_loglik, const double full_loglik);
38 | 
39 | [[nodiscard]] inline double  // returns (n_samples - 1) / (dispersion + 1), evaluated entirely in double
40 | overdispersion_factor(const std::uint32_t n_samples, const double dispersion) {
41 |   return (static_cast<double>(n_samples) - 1.0) / (dispersion + 1.0);  // convert first: no unsigned wrap-around when n_samples == 0
42 | }
43 | 
44 | #endif  // RADMETH_UTILS_HPP
45 | 


--------------------------------------------------------------------------------
/src/common/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2025 Andrew D Smith
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify it
 4 | # under the terms of the GNU General Public License as published by the Free
 5 | # Software Foundation, either version 3 of the License, or (at your option)
 6 | # any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful, but WITHOUT
 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 | # more details.
12 | #
13 | # You should have received a copy of the GNU General Public License along with
14 | # this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | find_package(GSL REQUIRED)  # configuration stops with an error if GSL is not found
17 | # Collect every .cpp file in this directory (the pattern is relative to the current source dir).
18 | file(GLOB cpp_files "*.cpp")
19 | # Build each source as its own OBJECT library, named after the file without directory or extension.
20 | set(LIBRARY_OBJECTS "")
21 | foreach(cpp_file ${cpp_files})
22 |   get_filename_component(BASE_NAME ${cpp_file} NAME_WE)
23 |   add_library(${BASE_NAME} OBJECT ${cpp_file})
24 |   target_link_libraries(${BASE_NAME} PRIVATE
25 |     bamxx
26 |     smithlab_cpp
27 |     GSL::gsl
28 |   )
29 |   target_include_directories(${BASE_NAME} PUBLIC
30 |     ${CMAKE_BINARY_DIR}  # top-level build dir; presumably where a generated config.h lives -- confirm
31 |   )
32 |   list(APPEND LIBRARY_OBJECTS ${BASE_NAME})  # remembered so the static library below can link them all
33 | endforeach()
34 | 
35 | # Create static library linking the individual objects
36 | add_library(dnmtools_objs STATIC)  # no sources of its own; content comes from the OBJECT libraries linked below
37 | target_include_directories(dnmtools_objs PUBLIC
38 |   ${CMAKE_CURRENT_SOURCE_DIR}  # PUBLIC: targets linking dnmtools_objs can include headers from this directory
39 | )
40 | target_link_libraries(dnmtools_objs PUBLIC
41 |   ${LIBRARY_OBJECTS}
42 |   smithlab_cpp
43 |   bamxx
44 | )
45 | 


--------------------------------------------------------------------------------
/src/common/numerical_utils.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2011-2022 University of Southern California
 3 |  *                    Andrew D Smith and Qiang Song
 4 |  * Author: Qiang Song and Andrew D. Smith
 5 |  *
 6 |  * This is free software; you can redistribute it and/or modify it
 7 |  * under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation; either version 2 of the License, or
 9 |  * (at your option) any later version.
10 |  *
11 |  * This is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  */
16 | 
17 | #ifndef NUMERICAL_UTILS_HPP
18 | #define NUMERICAL_UTILS_HPP
19 | 
20 | #include <cmath>
21 | #include <cstddef>
22 | #include <vector>
23 | 
24 | // Returns log(exp(p) + exp(q)) for log-scale p and q, except that an argument equal to 0 is skipped.
25 | inline double
26 | log_sum_log(const double p, const double q) {
27 |   if (p == 0)  // 0 acts as a "no term" sentinel, so a true log-value of 0 (i.e. log 1) is never added
28 |     return q;
29 |   if (q == 0)
30 |     return p;
31 |   const double larger = (p > q) ? p : q;  // factor out the larger term so the exponent below is <= 0
32 |   const double smaller = (p > q) ? q : p;
33 |   return larger + std::log1p(std::exp(smaller - larger));
34 | }
35 | // Three-term overload: folds r into the two-term result for p and q, so the
36 | // same 0-sentinel rule applies at each of the two steps.
37 | inline double
38 | log_sum_log(const double p, const double q, const double r) {
39 |   return log_sum_log(log_sum_log(p, q), r);
40 | }
41 | 
42 | double  // NOTE(review): presumably the log-sum over the first `limit` entries of vals -- confirm in the .cpp
43 | log_sum_log_vec(const std::vector<double> &vals, const std::size_t limit);
44 | 
45 | double  // iterator-range overload; NOTE(review): presumably the log-sum over [begin, end) -- confirm in the .cpp
46 | log_sum_log(const std::vector<double>::const_iterator &begin,
47 |             const std::vector<double>::const_iterator &end);
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/src/common/dnmt_error.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2023 Andrew D. Smith
 2 |  *
 3 |  * Authors: Andrew Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or
 6 |  * modify it under the terms of the GNU General Public License as
 7 |  * published by the Free Software Foundation, either version 3 of the
 8 |  * License, or (at your option) any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful, but
11 |  * WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 |  * General Public License for more details.
14 |  */
15 | 
16 | #ifndef DNMT_ERROR_HPP
17 | #define DNMT_ERROR_HPP
18 | 
19 | #include <cerrno>   // for errno
20 | #include <cstdint>  // for int64_t
21 | #include <cstring>
22 | #include <sstream>
23 | #include <stdexcept>
24 | #include <string>
25 | struct dnmt_error : public std::exception {
26 |   int64_t err;           // error possibly from HTSlib
27 |   int the_errno;         // ERRNO at time of construction
28 |   std::string msg;       // the message
29 |   std::string the_what;  // to report
30 |   dnmt_error(const int64_t _err, const std::string &_msg) :
31 |     err{_err}, the_errno{errno}, msg{_msg} {
32 |     std::ostringstream oss;
33 |     oss << "[error: " << err << "][" << "ERRNO: " << the_errno << "]"
34 |         << "[" << strerror(the_errno) << "][" << msg << "]";
35 |     the_what = oss.str();
36 |   }
37 |   explicit dnmt_error(const std::string &_msg) : dnmt_error(0, _msg) {}
38 |   const char *
39 |   what() const noexcept override {
40 |     return the_what.c_str();
41 |   }
42 | };
43 | 
44 | #endif
45 | 


--------------------------------------------------------------------------------
/src/common/Smoothing.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Copyright (C) 2008-2022 Cold Spring Harbor Laboratory
 3 |   Authors: Andrew D. Smith
 4 | 
 5 |   This file is part of dnmtools.
 6 | 
 7 |   dnmtools is free software; you can redistribute it and/or modify
 8 |   it under the terms of the GNU General Public License as published by
 9 |   the Free Software Foundation; either version 2 of the License, or
10 |   (at your option) any later version.
11 | 
12 |   dnmtools is distributed in the hope that it will be useful,
13 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |   GNU General Public License for more details.
16 | 
17 |   You should have received a copy of the GNU General Public License
18 |   along with dnmtools; if not, write to the Free Software
19 |   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20 | */
21 | 
22 | #ifndef SMOOTHING_HPP
23 | #define SMOOTHING_HPP
24 | 
25 | #include <vector>
26 | 
27 | void  // NOTE(review): presumably smooths (x_values, y_values) and writes values for x_target into y_target -- confirm in the .cpp
28 | KernelSmoothing(const double bandwidth,
29 | 		const std::vector<double> &x_values,
30 | 		const std::vector<double> &y_values,
31 | 		const std::vector<double> &x_target,
32 | 		std::vector<double> &y_target);
33 | 
34 | void  // same parameter list as the five-argument KernelSmoothing; NOTE(review): y_target is presumably the output -- confirm
35 | LocalLinearRegression(const double bandwidth,
36 | 		      const std::vector<double> &x_values,
37 | 		      const std::vector<double> &y_values,
38 | 		      const std::vector<double> &x_target,
39 | 		      std::vector<double> &y_target);
40 | 
41 | 
42 | void  // overload without x coordinates; NOTE(review): presumably smooths y_vals by position into y_target -- confirm
43 | KernelSmoothing(const double bandwidth,
44 | 		const std::vector<double> &y_vals,
45 | 		std::vector<double> &y_target);
46 | 
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/docs/content/amrtester.md:
--------------------------------------------------------------------------------
 1 | # amrtester - resolve epi-alleles
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools amrtester [OPTIONS] <input.bed> <input.epiread>
 6 | ```
 7 | 
 8 | In addition to [amrfinder](../amrfinder), which uses a sliding
 9 | window, the `amrtester` program tests for allele-specific methylation
10 | in a given set of genomic intervals. The program can be run like this:
11 | 
12 | ```shell
13 | $ dnmtools amrtester -o output.amr -c /path/to/genome.fa intervals.bed input.epiread
14 | ```
15 | 
16 | This program works very similarly to `amrfinder`, but does not have
17 | options related to the sliding window. This program outputs a score
18 | for each input interval, and when the likelihood ratio test is used,
19 | the score is the p-value, which can easily be filtered later.
20 | 
21 | ## Options
22 | 
23 | ```txt
24 |  -o, -output
25 | ```
26 | The name of the output file. If no file name is provided, the output
27 | will be written to standard output. Due to the size of this output, a
28 | file should be specified unless the output will be piped to another
29 | command or program. The output file contains genomic intervals in BED
30 | format, with intervals corresponding to those provided as input.
31 | 
32 | ```txt
33 |  -c, -chrom
34 | ```
35 | FASTA file or directory of chromosomes containing FASTA files [required]
36 | ```txt
37 |  -i, -itr
38 | ```
39 |  max iterations
40 | ```txt
41 |  -v, -verbose
42 | ```
43 |  print more run info
44 | ```txt
45 |  -P, -progress
46 | ```
47 | print more run info to STDERR while the program is running.
48 | ```txt
49 |  -b, -bic
50 | ```
51 | use Bayesian Information Criterion (BIC) to compare models
52 | 


--------------------------------------------------------------------------------
/src/common/BetaBin.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Copyright (C) 2011-2022 University of Southern California
 3 |   Authors: Andrew D. Smith, Song Qiang
 4 | 
 5 |   This file is part of dnmtools.
 6 | 
 7 |   dnmtools is free software; you can redistribute it and/or modify
 8 |   it under the terms of the GNU General Public License as published by
 9 |   the Free Software Foundation; either version 2 of the License, or
10 |   (at your option) any later version.
11 | 
12 |   dnmtools is distributed in the hope that it will be useful,
13 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |   GNU General Public License for more details.
16 | */
17 | 
18 | #ifndef BETABIN_HPP
19 | #define BETABIN_HPP
20 | 
21 | #include <iterator>  // IWYU pragma: keep
22 | #include <string>
23 | #include <utility>
24 | #include <vector>
25 | 
26 | struct betabin {  // NOTE(review): presumably a beta-binomial model with shape parameters alpha/beta -- confirm in the .cpp
27 |   betabin();
28 |   betabin(const double a, const double b);
29 |   explicit betabin(const std::string &str);  // explicit: no implicit std::string -> betabin conversion
30 |   double
31 |   operator()(const std::pair<double, double> &val) const;
32 |   double
33 |   log_likelihood(const std::pair<double, double> &val) const;
34 |   double
35 |   sign(const double x);  // not const-qualified, unlike operator() and log_likelihood
36 |   double
37 |   invpsi(const double tolerance, const double x);  // parameter has the same name as the static member `tolerance` below
38 |   double
39 |   movement(const double curr, const double prev);  // not const-qualified
40 |   void
41 |   fit(const std::vector<double> &vals_a, const std::vector<double> &vals_b,
42 |       const std::vector<double> &p);  // non-const: NOTE(review): presumably updates alpha/beta from the inputs -- confirm
43 |   std::string
44 |   tostring() const;
45 |   double alpha{};  // value-initialized to 0.0
46 |   double beta{};  // value-initialized to 0.0
47 |   double lnbeta_helper{};  // value-initialized to 0.0; NOTE(review): looks like a cached log-beta term -- confirm
48 | 
49 |   static const double tolerance;  // declared only; the definition is not in this header
50 | };
51 | 
52 | #endif
53 | 


--------------------------------------------------------------------------------
/docs/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: DNMTools
 2 | strict: true  # treat MkDocs warnings as errors, so the build fails on them
 3 | 
 4 | docs_dir: content  # pages are read from the content/ directory
 5 | 
 6 | theme: readthedocs
 7 | nav:  # each entry is 'title': 'page file' (or a URL); nested lists are sections
 8 |    - Home: 'index.md'
 9 |    - 'Installation': 'quickstart.md'
10 |    - 'DNMTools on GitHub' : https://github.com/smithlabcode/dnmtools
11 |    - Methylome construction:
12 |      - 'abismal': 'abismal.md'
13 |      - 'format': 'format.md'
14 |      - 'uniq': 'uniq.md'
15 |      - 'bsrate' : 'bsrate.md'
16 |      - 'counts' : 'counts.md'
17 |      - 'sym': 'sym.md'
18 |      - 'levels' : 'levels.md'
19 |    - Methylome analysis:
20 |      - 'hmr' : 'hmr.md'
21 |      - 'hmr-rep' : 'hmr-rep.md'
22 |      - 'hypermr' : 'hypermr.md'
23 |      - 'entropy' : 'entropy.md'
24 |      - 'multistat' : 'multistat.md'
25 |      - 'pmd' : 'pmd.md'
26 |      - 'roi' : 'roi.md'
27 |      - 'mlml' : 'mlml.md'
28 |    - Allele-specific methylation:
29 |      - 'states' : 'states.md'
30 |      - 'allelic' : 'allelic.md'
31 |      - 'amrfinder' : 'amrfinder.md'
32 |      - 'amrtester' : 'amrtester.md'
33 |    - Differential methylation:
34 |      - 'diff' : 'diff.md'
35 |      - 'dmr' : 'dmr.md'
36 |      - 'radmeth' : 'radmeth.md'
37 |      - 'radadjust' : 'radadjust.md'
38 |      - 'radmerge' : 'radmerge.md'
39 |    - Methylation visualisation:  # NOTE(review): spelled "visualisation" here but "Visualization" in the section further down -- confirm which is intended
40 |      - 'fastlift': 'fastlift.md'
41 |      - 'liftfilter': 'liftfilter.md'
42 |    - General-purpose tools:
43 |      - 'cleanhp': 'cleanhp.md'
44 |      - 'guessprotocol': 'guessprotocol.md'
45 |      - 'merge-bsrate': 'merge-bsrate.md'
46 |      - 'merge': 'merge.md'
47 |      - 'selectsites': 'selectsites.md'
48 |    - Visualization:
49 |      - 'Visualization' : 'visualization.md'
50 |    - Other:
51 |      - 'Cytosine contexts' : 'cytosine_contexts.md'
52 | 


--------------------------------------------------------------------------------
/cmake/FindLIBDEFLATE.cmake:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: GPL-3.0-or-later; (c) 2025 Andrew D Smith (author)
 2 | #[=======================================================================[.rst:
 3 | FindLIBDEFLATE
 4 | --------------
 5 | 
 6 | Find the native libdeflate includes and library.
 7 | 
 8 | #]=======================================================================]
 9 | 
10 | # FindLIBDEFLATE.cmake
11 | # Custom CMake module to find libdeflate
12 | 
13 | # Support preference of static libs by adjusting CMAKE_FIND_LIBRARY_SUFFIXES
14 | # ADS: this is taken from the FindBoost.cmake file
15 | if(LIBDEFLATE_USE_STATIC_LIBS)
16 |   set(_libdeflate_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES
17 |     ${CMAKE_FIND_LIBRARY_SUFFIXES}
18 |   )
19 |   if(WIN32)
20 |     list(INSERT CMAKE_FIND_LIBRARY_SUFFIXES 0 .lib .a)  # try .lib/.a first, keep the other suffixes
21 |   else()
22 |     set(CMAKE_FIND_LIBRARY_SUFFIXES .a)  # only static archives are considered
23 |   endif()
24 | endif()
25 | 
26 | find_path(LIBDEFLATE_INCLUDE_DIR NAMES libdeflate.h)  # directory that contains libdeflate.h
27 | find_library(LIBDEFLATE_LIBRARY NAMES deflate libdeflate)  # full path of the library file
28 | 
29 | include(FindPackageHandleStandardArgs)
30 | find_package_handle_standard_args(LIBDEFLATE
31 |   REQUIRED_VARS LIBDEFLATE_LIBRARY LIBDEFLATE_INCLUDE_DIR
32 |   VERSION_VAR LIBDEFLATE_VERSION  # NOTE(review): LIBDEFLATE_VERSION is not set anywhere in this module
33 | )
34 | 
35 | if(LIBDEFLATE_FOUND AND NOT TARGET LIBDEFLATE::LIBDEFLATE)  # define the imported target only once
36 |   add_library(LIBDEFLATE::LIBDEFLATE UNKNOWN IMPORTED)
37 |   set_target_properties(LIBDEFLATE::LIBDEFLATE PROPERTIES
38 |     INTERFACE_INCLUDE_DIRECTORIES "${LIBDEFLATE_INCLUDE_DIR}"
39 |     IMPORTED_LOCATION "${LIBDEFLATE_LIBRARY}"
40 |   )
41 | endif()
42 | 
43 | # Restore the original find library ordering
44 | # ADS: this is taken from the FindBoost.cmake file
45 | if(LIBDEFLATE_USE_STATIC_LIBS)
46 |   set(CMAKE_FIND_LIBRARY_SUFFIXES
47 |     ${_libdeflate_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}
48 |   )
49 | endif()
50 | 


--------------------------------------------------------------------------------
/src/common/xcounts_utils.hpp:
--------------------------------------------------------------------------------
 1 | /* xcounts_utils: code for doing things with xcounts format and some
 2 |  * for counts format that is common to several tools.
 3 |  *
 4 |  * Copyright (C) 2023-2024 Andrew D. Smith
 5 |  *
 6 |  * Authors: Andrew D. Smith
 7 |  *
 8 |  * This program is free software: you can redistribute it and/or
 9 |  * modify it under the terms of the GNU General Public License as
10 |  * published by the Free Software Foundation, either version 3 of the
11 |  * License, or (at your option) any later version.
12 |  *
13 |  * This program is distributed in the hope that it will be useful, but
14 |  * WITHOUT ANY WARRANTY; without even the implied warranty of
15 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 |  * General Public License for more details.
17 |  */
18 | 
19 | #ifndef XCOUNTS_UTILS_HPP
20 | #define XCOUNTS_UTILS_HPP
21 | 
22 | #include <cstdint>
23 | #include <ostream>
24 | #include <string>
25 | #include <unordered_map>
26 | #include <vector>
27 | 
28 | struct xcounts_entry {  // one record: a position plus two 32-bit counts
29 |   std::uint64_t pos{};  // absolute position
30 |   std::uint32_t n_meth{};  // NOTE(review): presumably the count of methylated observations -- confirm
31 |   std::uint32_t n_unmeth{};  // NOTE(review): presumably the count of unmethylated observations -- confirm
32 | 
33 |   [[nodiscard]] std::uint32_t
34 |   n_reads() const {
35 |     return n_meth + n_unmeth;  // returned as uint32_t, so the sum is reduced modulo 2^32
36 |   }
37 | 
38 |   [[nodiscard]] double
39 |   frac() const {
40 |     return static_cast<double>(n_meth) / n_reads();  // no zero-read guard: 0.0 / 0 when both counts are zero (NaN under IEEE-754)
41 |   }
42 | };
43 | 
44 | inline std::ostream &
45 | operator<<(std::ostream &o, const xcounts_entry &e) {  // writes pos, n_meth, n_unmeth separated by tabs
46 |   return o << e.pos << '\t' << e.n_meth << '\t' << e.n_unmeth;  // no trailing newline is written
47 | }
48 | 
49 | std::unordered_map<std::string, std::vector<xcounts_entry>>
50 | read_xcounts_by_chrom(const std::int32_t n_threads,
51 |                       const std::string &xcounts_file);  // defined elsewhere; NOTE(review): map key is presumably the chromosome name -- confirm
52 | 
53 | bool
54 | get_is_xcounts_file(const std::string &filename);  // defined elsewhere; the detection rule is not visible from this header
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/docs/content/radmerge.md:
--------------------------------------------------------------------------------
 1 | # radmerge - Merge CpGs to differentially methylated regions
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools radmerge [OPTIONS] <radmeth-input.bed>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | After running [radmeth](../radmeth) followed by
11 | [radadjust](../radadjust), it is possible to further join individually
12 | differentially methylated CpGs into differentially methylated
13 | regions. This can be achieved with the command
14 | 
15 | ```shell
16 | $ dnmtools radmerge -p 0.01 radmeth-input.bed > output-dmrs.bed
17 | ```
18 | 
19 | The current algorithm is conservative: it joins neighboring
20 | differentially methylated sites with p-value below 0.01 (set by the -p
21 | parameter). The output format is
22 | 
23 | ```txt
24 | chrom    start    end   dmr    num-sites   meth-diff
25 | ```
26 | 
27 | Above, `num-sites` and `meth-diff` are the number of significantly
28 | differentially methylated CpGs in the DMR and the estimated
29 | methylation difference, respectively. Example output might look like
30 | this:
31 | 
32 | ```txt
33 | chr1     57315   57721  dmr     10      -0.498148
34 | chr1     58263   59009  dmr     27      -0.521182
35 | chr1    138522  139012  dmr     13      -0.443182
36 | chr1    149284  149444  dmr      7      -0.430453
37 | chr1    274339  275254  dmr     18      -0.520114
38 | ```
39 | 
40 | Note that in addition to being conservative, the work done by
41 | `radmerge` is very simple, and does not consider genomic distance
42 | between neighboring sites. It will merge consecutive significant sites
43 | into one interval no matter how distant those sites are on a
44 | chromosome.
45 | 
46 | ## Options
47 | 
48 | ```txt
49 |  -o, -output
50 | ```
51 | Output file (default: stdout).
52 | 
53 | ```txt
54 |  -p, -cutoff
55 | ```
56 | P-value cutoff (default: 0.01).
57 | 


--------------------------------------------------------------------------------
/src/mlml/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2025 Andrew D Smith
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify it
 4 | # under the terms of the GNU General Public License as published by the Free
 5 | # Software Foundation, either version 3 of the License, or (at your option)
 6 | # any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful, but WITHOUT
 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 | # more details.
12 | #
13 | # You should have received a copy of the GNU General Public License along with
14 | # this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | file(GLOB cpp_files "*.cpp")  # all .cpp files in this directory, collected at configure time
17 | 
18 | # Gather all the object files that will be put in the static library
19 | # and prepare to compile them.
20 | set(LIBRARY_OBJECTS "")
21 | foreach(cpp_file ${cpp_files})
22 |   get_filename_component(BASE_NAME ${cpp_file} NAME_WE)  # file name without directory or extension
23 |   add_library(${BASE_NAME} OBJECT ${cpp_file})  # one OBJECT library per source file, named after the file
24 |   target_link_libraries(${BASE_NAME} PUBLIC
25 |     dnmtools_objs
26 |     smithlab_cpp
27 |     HTSLIB::HTSLIB
28 |   )
29 |   ## Below is to make sure 'config.h' is visible for includes and any
30 |   ## of the headers for configured libraries
31 |   target_include_directories(${BASE_NAME} PUBLIC
32 |     ${PROJECT_BINARY_DIR}
33 |   )
34 |   list(APPEND LIBRARY_OBJECTS ${BASE_NAME})  # remember the target so it can be linked below
35 | endforeach()
36 | 
37 | # Create static library linking the individual objects
38 | add_library(dnmtools_mlml_objs STATIC)  # no sources of its own; content comes from the targets linked below
39 | target_include_directories(dnmtools_mlml_objs PUBLIC
40 |   ${CMAKE_BINARY_DIR}
41 |   ${CMAKE_CURRENT_SOURCE_DIR}
42 | )
43 | target_link_libraries(dnmtools_mlml_objs PUBLIC
44 |   ${LIBRARY_OBJECTS}
45 |   smithlab_cpp
46 | )
47 | 


--------------------------------------------------------------------------------
/src/utils/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2025 Andrew D Smith
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify it
 4 | # under the terms of the GNU General Public License as published by the Free
 5 | # Software Foundation, either version 3 of the License, or (at your option)
 6 | # any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful, but WITHOUT
 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 | # more details.
12 | #
13 | # You should have received a copy of the GNU General Public License along with
14 | # this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | file(GLOB cpp_files "*.cpp")  # all .cpp files in this directory, collected at configure time
17 | 
18 | # Gather all the object files that will be put in the static library
19 | # and prepare to compile them.
20 | set(LIBRARY_OBJECTS "")
21 | foreach(cpp_file ${cpp_files})
22 |   get_filename_component(BASE_NAME ${cpp_file} NAME_WE)  # file name without directory or extension
23 |   add_library(${BASE_NAME} OBJECT ${cpp_file})  # one OBJECT library per source file, named after the file
24 |   target_link_libraries(${BASE_NAME} PUBLIC
25 |     dnmtools_objs
26 |     bamxx
27 |     smithlab_cpp
28 |     HTSLIB::HTSLIB
29 |   )
30 |   ## Below is to make sure 'config.h' is visible for includes and any
31 |   ## of the headers for configured libraries
32 |   target_include_directories(${BASE_NAME} PUBLIC
33 |     ${PROJECT_BINARY_DIR}
34 |   )
35 |   list(APPEND LIBRARY_OBJECTS ${BASE_NAME})  # remember the target so it can be linked below
36 | endforeach()
37 | 
38 | # Create static library linking the individual objects
39 | add_library(dnmtools_utils_objs STATIC)  # no sources of its own; content comes from the targets linked below
40 | target_include_directories(dnmtools_utils_objs PUBLIC
41 |   ${CMAKE_BINARY_DIR}
42 |   ${CMAKE_CURRENT_SOURCE_DIR}
43 | )
44 | target_link_libraries(dnmtools_utils_objs PUBLIC
45 |   ${LIBRARY_OBJECTS}
46 |   smithlab_cpp
47 |   bamxx
48 | )
49 | 


--------------------------------------------------------------------------------
/src/analysis/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2025 Andrew D Smith
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify it
 4 | # under the terms of the GNU General Public License as published by the Free
 5 | # Software Foundation, either version 3 of the License, or (at your option)
 6 | # any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful, but WITHOUT
 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 | # more details.
12 | #
13 | # You should have received a copy of the GNU General Public License along with
14 | # this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | file(GLOB cpp_files "*.cpp")  # all .cpp files in this directory, collected at configure time
17 | 
18 | # Gather all the object files that will be put in the static library
19 | # and prepare to compile them.
20 | set(LIBRARY_OBJECTS "")
21 | foreach(cpp_file ${cpp_files})
22 |   get_filename_component(BASE_NAME ${cpp_file} NAME_WE)  # file name without directory or extension
23 |   add_library(${BASE_NAME} OBJECT ${cpp_file})  # one OBJECT library per source file, named after the file
24 |   target_link_libraries(${BASE_NAME} PUBLIC
25 |     dnmtools_objs
26 |     bamxx
27 |     smithlab_cpp
28 |     HTSLIB::HTSLIB
29 |   )
30 |   ## Below is to make sure 'config.h' is visible for includes and any
31 |   ## of the headers for configured libraries
32 |   target_include_directories(${BASE_NAME} PUBLIC
33 |     ${PROJECT_BINARY_DIR}
34 |   )
35 |   list(APPEND LIBRARY_OBJECTS ${BASE_NAME})  # remember the target so it can be linked below
36 | endforeach()
37 | 
38 | # Create static library linking the individual objects
39 | add_library(dnmtools_analysis_objs STATIC)  # no sources of its own; content comes from the targets linked below
40 | target_include_directories(dnmtools_analysis_objs PUBLIC
41 |   ${CMAKE_BINARY_DIR}
42 |   ${CMAKE_CURRENT_SOURCE_DIR}
43 | )
44 | target_link_libraries(dnmtools_analysis_objs PUBLIC
45 |   ${LIBRARY_OBJECTS}
46 |   smithlab_cpp
47 |   bamxx
48 | )
49 | 


--------------------------------------------------------------------------------
/src/radmeth/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2025 Andrew D Smith
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify it
 4 | # under the terms of the GNU General Public License as published by the Free
 5 | # Software Foundation, either version 3 of the License, or (at your option)
 6 | # any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful, but WITHOUT
 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 | # more details.
12 | #
13 | # You should have received a copy of the GNU General Public License along with
14 | # this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | file(GLOB cpp_files "*.cpp")  # all .cpp files in this directory, collected at configure time
17 | 
18 | # Gather all the object files that will be put in the static library
19 | # and prepare to compile them.
20 | set(LIBRARY_OBJECTS "")
21 | foreach(cpp_file ${cpp_files})
22 |   get_filename_component(BASE_NAME ${cpp_file} NAME_WE)  # file name without directory or extension
23 |   add_library(${BASE_NAME} OBJECT ${cpp_file})  # one OBJECT library per source file, named after the file
24 |   target_link_libraries(${BASE_NAME} PUBLIC
25 |     dnmtools_objs
26 |     bamxx
27 |     smithlab_cpp
28 |     HTSLIB::HTSLIB
29 |   )
30 |   ## Below is to make sure 'config.h' is visible for includes and any
31 |   ## of the headers for configured libraries
32 |   target_include_directories(${BASE_NAME} PUBLIC
33 |     ${PROJECT_BINARY_DIR}
34 |   )
35 |   list(APPEND LIBRARY_OBJECTS ${BASE_NAME})  # remember the target so it can be linked below
36 | endforeach()
37 | 
38 | # Create static library linking the individual objects
39 | add_library(dnmtools_radmeth_objs STATIC)  # no sources of its own; content comes from the targets linked below
40 | target_include_directories(dnmtools_radmeth_objs PUBLIC
41 |   ${CMAKE_BINARY_DIR}
42 |   ${CMAKE_CURRENT_SOURCE_DIR}
43 | )
44 | target_link_libraries(dnmtools_radmeth_objs PUBLIC
45 |   ${LIBRARY_OBJECTS}
46 |   smithlab_cpp
47 |   bamxx
48 |   Threads::Threads  # unlike the mlml/utils/analysis libraries, this one links the threads target
49 | )
50 | 


--------------------------------------------------------------------------------
/src/amrfinder/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2025 Andrew D Smith
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify it
 4 | # under the terms of the GNU General Public License as published by the Free
 5 | # Software Foundation, either version 3 of the License, or (at your option)
 6 | # any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful, but WITHOUT
 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 | # more details.
12 | #
13 | # You should have received a copy of the GNU General Public License along with
14 | # this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | file(GLOB cpp_files "*.cpp")  # all .cpp files in this directory, collected at configure time
17 | 
18 | # Gather all the object files that will be put in the static library
19 | # and prepare to compile them.
20 | set(LIBRARY_OBJECTS "")
21 | foreach(cpp_file ${cpp_files})
22 |   get_filename_component(BASE_NAME ${cpp_file} NAME_WE)  # file name without directory or extension
23 |   add_library(${BASE_NAME} OBJECT ${cpp_file})  # one OBJECT library per source file, named after the file
24 |   target_link_libraries(${BASE_NAME} PUBLIC
25 |     dnmtools_objs
26 |     bamxx
27 |     smithlab_cpp
28 |     HTSLIB::HTSLIB
29 |   )
30 |   ## Below is to make sure 'config.h' is visible for includes and any
31 |   ## of the headers for configured libraries
32 |   target_include_directories(${BASE_NAME} PUBLIC
33 |     ${PROJECT_BINARY_DIR}
34 |   )
35 |   list(APPEND LIBRARY_OBJECTS ${BASE_NAME})  # remember the target so it can be linked below
36 | endforeach()
37 | 
38 | # Create static library linking the individual objects
39 | add_library(dnmtools_amrfinder_objs STATIC)  # no sources of its own; content comes from the targets linked below
40 | target_include_directories(dnmtools_amrfinder_objs PUBLIC
41 |   ${CMAKE_BINARY_DIR}
42 |   ${CMAKE_CURRENT_SOURCE_DIR}
43 | )
44 | target_link_libraries(dnmtools_amrfinder_objs PUBLIC
45 |   ${LIBRARY_OBJECTS}
46 |   smithlab_cpp
47 |   bamxx
48 |   Threads::Threads  # unlike the mlml/utils/analysis libraries, this one links the threads target
49 | )
50 | 


--------------------------------------------------------------------------------
/src/common/numerical_utils.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2011-2022 University of Southern California
 3 |  *                    Andrew D Smith and Qiang Song
 4 |  * Author: Qiang Song and Andrew D. Smith
 5 |  *
 6 |  * This is free software; you can redistribute it and/or modify it
 7 |  * under the terms of the GNU General Public License as published by
 8 |  * the Free Software Foundation; either version 2 of the License, or
 9 |  * (at your option) any later version.
10 |  *
11 |  * This is distributed in the hope that it will be useful,
12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  * GNU General Public License for more details.
15 |  */
16 | 
17 | #include "numerical_utils.hpp"
18 | 
19 | #include <algorithm>
20 | #include <cmath>
21 | #include <iterator>  // IWYU pragma: keep
22 | #include <vector>
23 | 
24 | double
25 | log_sum_log_vec(const std::vector<double> &vals, const size_t limit) {  // returns log(sum of exp(vals[i])) over i in [0, limit)
26 |   const auto x = std::max_element(
27 |     std::cbegin(vals), std::cbegin(vals) + static_cast<std::ptrdiff_t>(limit));
28 |   const double max_val = *x;  // NOTE(review): limit is unchecked; with limit == 0 max_element returns its end argument and this reads outside the requested range
29 |   const std::size_t max_idx = std::distance(std::cbegin(vals), x);
30 |   double sum = 1.0;  // the maximum term itself contributes exp(0) == 1
31 |   for (std::size_t i = 0; i < limit; ++i)
32 |     if (i != max_idx)  // skips only the first maximum; NOTE(review): a repeated infinite maximum makes the difference below inf - inf (NaN)
33 |       sum += std::exp(vals[i] - max_val);  // cppcheck-suppress useStlAlgorithm
34 |   return max_val + std::log(sum);  // the maximum was factored out, so no exp() argument above is positive
35 | }
36 | 
37 | double
38 | log_sum_log(const std::vector<double>::const_iterator &begin,
39 |             const std::vector<double>::const_iterator &end) {  // returns log(sum of exp(v)) for v in [begin, end)
40 |   const auto max_itr = std::max_element(begin, end);
41 |   const double max_val = *max_itr;  // NOTE(review): for an empty range max_element returns end, so this dereferences end
42 |   double sum = 1.0;  // the maximum term itself contributes exp(0) == 1
43 |   for (auto itr = begin; itr < end; ++itr)
44 |     if (itr != max_itr)  // skips only the first maximum; NOTE(review): a repeated infinite maximum makes the difference below inf - inf (NaN)
45 |       sum += std::exp(*itr - max_val);  // cppcheck-suppress useStlAlgorithm
46 |   return max_val + std::log(sum);  // the maximum was factored out, so no exp() argument above is positive
47 | }
48 | 


--------------------------------------------------------------------------------
/src/radmeth/README.md:
--------------------------------------------------------------------------------
 1 | RADMeth: Regression Analysis of Differential Methylation
 2 | ========================================================
 3 | 
 4 | RADMeth (Regression Analysis of Differential Methylation) is software for
 5 | computing individual differentially methylated sites and genomic regions in 
 6 | whole genome bisulfite sequencing (WGBS) data.
 7 | 
 8 | Contact Information
 9 | -------------------
10 | 
11 | Egor Dolzhenko
12 | dolzhenk@usc.edu
13 | http://smithlabresearch.org/
14 | 
15 | Installation
16 | ------------
17 | *Before attempting to compile RADMeth please make sure that GNU Scientific 
18 | Library (http://www.gnu.org/software/gsl/) is installed on your system*
19 | Alternatively, you can download pre-compiled binaries for either Linux or Mac 
20 | from http://smithlabresearch.org/software/radmeth/
21 | 
22 | To compile RADMeth, enter the program's root directory (e.g. radmeth/) and  
23 | execute
24 | 
25 | > make
26 | 
27 | After the compilation, the binaries can be found in radmeth/bin/
28 | 
29 | Usage
30 | -----
31 | 
32 | Please see the manual, which can be obtained at 
33 | http://smithlabresearch.org/software/radmeth/
34 | 
35 | License
36 | -------
37 | Copyright (C) 2013 University of Southern California and
38 |                Egor Dolzhenko
39 |                Andrew D Smith
40 | 
41 |     Authors: Andrew D. Smith and Egor Dolzhenko
42 | 
43 |     This program is free software: you can redistribute it and/or modify
44 |     it under the terms of the GNU General Public License as published by
45 |     the Free Software Foundation, either version 3 of the License, or
46 |     (at your option) any later version.
47 | 
48 |     This program is distributed in the hope that it will be useful,
49 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
50 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
51 |     GNU General Public License for more details.
52 | 


--------------------------------------------------------------------------------
/docs/content/selectsites.md:
--------------------------------------------------------------------------------
 1 | # selectsites - get subsets of cytosines from counts files
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools selectsites [OPTIONS] <regions.bed> <input.counts>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | In many cases, we may be interested in analyzing only a subset of
11 | cytosines or CpGs in a sample. Some instances of these cases include
12 | calculating average methylation levels in (1) annotated regions, such
13 | as promoter regions or repeats or (2) regions defined by the data
14 | itself, such as HMRs or PMDs.
15 | 
16 | A possible solution to subset these regions is to convert the counts file to
17 | BED format, intersect it with a BED file of the regions of interest (using
18 | [bedtools](https://bedtools.readthedocs.io)), then convert it back to
19 | counts. The program selectsites simplifies these operations. It takes a
20 | [counts](../counts) format file and a set of intervals in a BED file and
21 | produces a subset of the entries in the counts file included in the BED
22 | regions. We can select entries in `input.counts` contained in any interval in
23 | `regions.bed` using the following command.
24 | 
25 | ```shell
26 | $ dnmtools selectsites -o output.counts regions.bed input.counts
27 | ```
28 | 
29 | ## Options
30 | 
31 | ```txt
32 |  -o, -output
33 | ```
34 | Name of output file (default: STDOUT)
35 | 
36 | ```txt
37 |  -p, -preload
38 | ```
39 | Preload sites (use for large target intervals).
40 | 
41 | ```txt
42 |  -v, -verbose
43 | ```
44 | Print more run info to STDERR while the program is running.
45 | 
46 | ```txt
47 |  -d, -disk
48 | ```
49 | Process sites on disk (fast if target intervals are few).
50 | 
51 | ```txt
52 |  -S, -summary
53 | ```
54 | Write summary to this file.
55 | 
56 | ```txt
57 |  -z, -zip
58 | ```
59 | The output file will be in gzip compressed format.
60 | 
61 | ```txt
62 |  -relaxed
63 | ```
64 | Allow additional columns in the input file.
65 | 


--------------------------------------------------------------------------------
/src/common/Epiread.hpp:
--------------------------------------------------------------------------------
 1 | /*    Copyright (C) 2011-2022 University of Southern California and
 2 |  *                       Andrew D. Smith and Fang Fang
 3 |  *
 4 |  *    Authors: Fang Fang and Andrew D. Smith
 5 |  *
 6 |  *    This program is free software: you can redistribute it and/or modify
 7 |  *    it under the terms of the GNU General Public License as published by
 8 |  *    the Free Software Foundation, either version 3 of the License, or
 9 |  *    (at your option) any later version.
10 |  *
11 |  *    This program is distributed in the hope that it will be useful,
12 |  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
13 |  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 |  *    GNU General Public License for more details.
15 |  */
16 | 
17 | #ifndef EPIREAD
18 | #define EPIREAD
19 | 
20 | #include <cstddef>
21 | #include <iosfwd>
22 | #include <string>
23 | #include <vector>
24 | 
25 | struct epiread {  // a (chr, pos, seq) record, ordered by chr and then pos
26 |   std::string chr{};  // NOTE(review): presumably the chromosome name -- confirm
27 |   size_t pos{};  // start offset; value-initialized to 0
28 |   std::string seq{};  // NOTE(review): presumably one character per CpG state -- confirm
29 |   epiread() = default;
30 |   explicit epiread(const std::string &line);  // defined elsewhere; the expected line format is not visible here
31 |   epiread(const size_t p, const std::string &s) : pos(p), seq(s) {}  // chr is left empty
32 |   epiread(const std::string &c, const size_t p, const std::string &s) :
33 |     chr(c), pos(p), seq(s) {}
34 | 
35 |   bool
36 |   operator<(const epiread &other) const {
37 |     return (chr < other.chr || (chr == other.chr && pos < other.pos));  // std::string order on chr, ties broken by pos; seq is not compared
38 |   }
39 |   size_t
40 |   end() const {
41 |     return pos + seq.length();  // pos plus the length of seq
42 |   }
43 |   size_t
44 |   length() const {
45 |     return seq.length();  // number of characters in seq
46 |   }
47 | };
48 | 
49 | std::istream &
50 | operator>>(std::istream &in, epiread &er);  // stream extraction into er; defined elsewhere
51 | std::ostream &
52 | operator<<(std::ostream &out, const epiread &er);  // stream insertion of er; defined elsewhere
53 | 
54 | size_t
55 | adjust_read_offsets(std::vector<epiread> &reads);  // takes reads by non-const reference; NOTE(review): presumably rewrites each pos -- confirm
56 | 
57 | size_t
58 | get_n_cpgs(const std::vector<epiread> &reads);  // defined elsewhere; reads is not modified (const reference)
59 | 
60 | bool
61 | validate_epiread_file(const std::string &filename);  // defined elsewhere; the validation rule is not visible from this header
62 | 
63 | #endif
64 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (C) 2025 Andrew D Smith
 2 | #
 3 | # This program is free software: you can redistribute it and/or modify it
 4 | # under the terms of the GNU General Public License as published by the Free
 5 | # Software Foundation, either version 3 of the License, or (at your option)
 6 | # any later version.
 7 | #
 8 | # This program is distributed in the hope that it will be useful, but WITHOUT
 9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 | # more details.
12 | #
13 | # You should have received a copy of the GNU General Public License along with
14 | # this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | # Packages
17 | if(USE_LIBDEFLATE)
18 |   find_package(LIBDEFLATE REQUIRED)  # only searched for when USE_LIBDEFLATE is set
19 | endif()
20 | if(BUILD_NANOPORE)
21 |   find_package(HTSLIB 1.20 REQUIRED)  # the nanopore build requests HTSlib version 1.20
22 |   add_compile_definitions(BUILD_NANOPORE)  # makes BUILD_NANOPORE visible to the preprocessor
23 | else()
24 |   find_package(HTSLIB REQUIRED)  # no version is requested in this branch
25 | endif()
26 | find_package(Threads REQUIRED)
27 | find_package(ZLIB REQUIRED)
28 | 
29 | # Subdirs
30 | if(NOT TARGET smithlab_cpp)  # skip if the target was already defined by an enclosing build
31 |   add_subdirectory(smithlab_cpp)
32 | endif()
33 | if(NOT TARGET bamxx)  # skip if the target was already defined by an enclosing build
34 |   add_subdirectory(bamxx)
35 | endif()
36 | add_subdirectory(common)
37 | add_subdirectory(radmeth)
38 | add_subdirectory(utils)
39 | add_subdirectory(analysis)
40 | add_subdirectory(amrfinder)
41 | add_subdirectory(abismal)
42 | add_subdirectory(mlml)
43 | 
44 | add_executable(dnmtools dnmtools.cpp)
45 | # ADS: below, for config.h
46 | target_include_directories(dnmtools PUBLIC ${CMAKE_BINARY_DIR})
47 | target_link_libraries(dnmtools PUBLIC
48 |   dnmtools_objs
49 |   abismal_objs
50 |   dnmtools_analysis_objs
51 |   dnmtools_utils_objs
52 |   dnmtools_radmeth_objs
53 |   dnmtools_mlml_objs
54 |   dnmtools_amrfinder_objs
55 |   bamxx
56 |   HTSLIB::HTSLIB
57 |   ZLIB::ZLIB
58 |   Threads::Threads
59 | )
60 | if(USE_LIBDEFLATE)
61 |   target_link_libraries(dnmtools PUBLIC
62 |     LIBDEFLATE::LIBDEFLATE  # linked under the same USE_LIBDEFLATE condition as the find_package call above
63 |   )
64 | endif()
65 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This file is part of dnmtools
 2 | #
 3 | # Copyright (C) 2025 Andrew D. Smith
 4 | #
 5 | # Authors: Andrew D. Smith
 6 | #
 7 | # This is free software: you can redistribute it and/or modify it under the
 8 | # terms of the GNU General Public License as published by the Free Software
 9 | # Foundation, either version 3 of the License, or (at your option) any later
10 | # version.
11 | #
12 | # This software is distributed in the hope that it will be useful, but WITHOUT
13 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 | # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15 | # more details.
16 | 
17 | # to find the version of cmake do
18 | # $ cmake --version
19 | cmake_minimum_required(VERSION 3.28)
20 | project(
21 |   dnmtools
22 |   VERSION 1.5.1
23 |   DESCRIPTION
24 |   "Tools for analyzing DNA methylation data"
25 |   HOMEPAGE_URL https://github.com/smithlabcode/dnmtools
26 |   LANGUAGES CXX)
27 | 
28 | # Set language version used
29 | set(CMAKE_CXX_STANDARD 17)
30 | set(CMAKE_CXX_STANDARD_REQUIRED on)
31 | set(CMAKE_CXX_EXTENSIONS off)  # prevents std=gnu++17
32 | set(CMAKE_EXPORT_COMPILE_COMMANDS on)  # writes compile_commands.json into the build directory
33 | 
34 | include(CheckIncludeFileCXX)
35 | include(CheckFunctionExists)
36 | include(CheckCXXCompilerFlag)
37 | 
38 | include(GNUInstallDirs)
39 | 
40 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")  # lets find_package() see the Find*.cmake modules in cmake/
41 | 
42 | configure_file(data/config.h.in config.h)  # generates config.h in the build directory from the template
43 | 
44 | if(ENABLE_LTO)
45 |   # Turn on LTO if we are building for distribution
46 |   include(CheckIPOSupported)
47 |   check_ipo_supported(RESULT result OUTPUT output)
48 |   if(result)
49 |     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
50 |   else()
51 |     message(FATAL_ERROR "IPO is not supported: ${output}")  # LTO was requested explicitly, so configuration stops here
52 |   endif()
53 | endif()
54 | 
55 | if(STATIC_ANALYSIS)
56 |   include(cmake/static_analysis.cmake)
57 | endif()
58 | 
59 | # ADS: set the most stringent warnings we can
60 | # NOTE(review): these flags use GCC/Clang spelling and are added for every compiler -- confirm no other compiler is supported
61 |   -Wall
62 |   -Wextra
63 |   -Wpedantic
64 |   -Werror
65 |   -Wfatal-errors
66 | )
67 | 
68 | add_subdirectory(src)


--------------------------------------------------------------------------------
/.github/workflows/dnmtools_build_ubuntu.yml:
--------------------------------------------------------------------------------
 1 | name: DNMTools build (Ubuntu)
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     branches: [ "master" ]
 7 |   pull_request:
 8 |     branches: [ "master" ]
 9 | 
10 | jobs:
11 |   build-with-gcc:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |     - uses: actions/checkout@v4
15 |       with:
16 |         submodules: recursive
17 |     - name: Install dependencies
18 |       run: |
19 |         sudo apt-get update
20 |         sudo apt-get install -y \
21 |         libgsl-dev \
22 |         libcurl4-gnutls-dev \
23 |         libdeflate-dev \
24 |         liblzma-dev \
25 |         zlib1g-dev \
26 |         libbz2-dev
27 |     - name: Build and install htslib (for recent version)
28 |       run: |
29 |         git clone --recursive https://github.com/samtools/htslib.git
30 |         cd htslib
31 |         make -j4
32 |         sudo make install prefix=/usr
33 |     - name: Generate configure script
34 |       run: ./autogen.sh
35 |     - name: Configure for GCC
36 |       run: ./configure CXX="g++"
37 |     - name: Build with g++
38 |       run: make -j4
39 |     - name: Test the g++ build
40 |       run: make -j4 check
41 |   build-with-clang:
42 |     runs-on: ubuntu-latest
43 |     steps:
44 |     - uses: actions/checkout@v4
45 |       with:
46 |         submodules: recursive
47 |     - name: Install dependencies
48 |       run: |
49 |         sudo apt-get update
50 |         sudo apt-get install -y \
51 |         libgsl-dev \
52 |         libcurl4-gnutls-dev \
53 |         libdeflate-dev \
54 |         liblzma-dev \
55 |         zlib1g-dev \
56 |         libbz2-dev
57 |     - name: Build and install htslib (for recent version)
58 |       run: |
59 |         git clone --recursive https://github.com/samtools/htslib.git
60 |         cd htslib
61 |         make -j4
62 |         sudo make install prefix=/usr
63 |     - name: Generate configure script
64 |       run: ./autogen.sh
65 |     - name: Configure for Clang
66 |       run: ./configure CXX="clang++"
67 |     - name: Build with clang++
68 |       run: make -j4
69 |     - name: Test the clang++ build
70 |       run: make -j4 check
71 | 


--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # Run 'autoreconf -i' to generate 'configure', 'Makefile.in', etc.,
 4 | # including in the subdirectories of dnmtools src (for the git
 5 | # submodules).
 6 | #
 7 | # The first time this is run on a new cloned git repo the configure
 8 | # script will not be present, only the configure.ac and
 9 | # Makefile.am. The rest must be generated by `autoreconf -i` and this
10 | # must happen in the `src/smithlab_cpp`,
11 | # `src/abismal/src/smithlab_cpp` and `src/abismal` subdirs. Running
12 | # `autoreconf -i` in some of these directories will move recursively
13 | # into others, but this is not guaranteed. This script will do each
14 | # separately.
15 | #
16 | # If you are working with a distribution (file ending with ".tar.gz"
17 | # or similar) then this script should not be needed, and should not be
18 | # present, as all the files should already exist. You should only run
19 | # this script if you know what you are doing with autoreconf.
20 | #
21 | # This script only runs autoreconf when invoked from a directory named
22 | # "dnmtools" that contains ".git"; otherwise it prints help and exits 1.
23 | 
24 | runautoreconf() {
25 |     # stop at the first failing autoreconf so its status is returned
26 |     autoreconf -i src/abismal/src/smithlab_cpp &&
27 |         autoreconf -i src/abismal &&
28 |         autoreconf -i src/smithlab_cpp && autoreconf -i
29 | }
30 | 
31 | if test -d .git && test "$(basename "${PWD}")" = "dnmtools"
32 | then
33 |     runautoreconf
34 |     exit $?  # report autoreconf's status instead of always succeeding
35 | else
36 |     echo "  It seems you are either attempting to run this script       "
37 |     echo "  from the wrong directory, or in a source tree that was      "
38 |     echo "  not obtained by cloning the dnmtools git repo.              "
39 |     echo "                                                              "
40 |     echo "  ./autogen.sh generates the configure script in the          "
41 |     echo "  relevant subdirectories. Only run this if you know          "
42 |     echo "  what you are doing with autoreconf and are simply           "
43 |     echo "  avoiding doing that. If you just want to use the            "
44 |     echo "  software, download a release and this script will           "
45 |     echo "  not be needed.                                              "
46 |     exit 1
47 | fi
48 | 


--------------------------------------------------------------------------------
/.github/workflows/dnmtools_build_macos.yml:
--------------------------------------------------------------------------------
 1 | name: DNMTools build (macOS)
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     branches: [ "master" ]
 7 |   pull_request:
 8 |     branches: [ "master" ]
 9 | 
10 | jobs:
11 |   ## ADS: removing this because of 'brownout' on github runners
12 |   # build-with-gcc-on-x86:
13 |   #   runs-on: macos-13
14 |   #   steps:
15 |   #   - uses: actions/checkout@v4
16 |   #     with:
17 |   #       submodules: recursive
18 |   #   - name: Update Homebrew
19 |   #     run: brew update
20 |   #   - name: Install autotools
21 |   #     run: brew install automake
22 |   #   - name: Install dependencies
23 |   #     run: brew install htslib gsl
24 |   #   - name: Generate configure script
25 |   #     run: ./autogen.sh
26 |   #   - name: configure with g++-14
27 |   #     run: ./configure CXX="g++-14" CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib"
28 |   #   - name: make
29 |   #     run: make -j4
30 |   build-with-gcc-on-arm64:
31 |     runs-on: macos-15
32 |     steps:
33 |     - uses: actions/checkout@v4
34 |       with:
35 |         submodules: recursive
36 |     - name: Update Homebrew
37 |       run: brew update
38 |     - name: Install autotools
39 |       run: brew install automake
40 |     - name: Install dependencies
41 |       run: brew install htslib gsl
42 |     - name: Generate configure script
43 |       run: ./autogen.sh
44 |     - name: configure with g++-14
45 |       run: ./configure CXX="g++-14" CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib"
46 |     - name: make
47 |       run: make -j4
48 |   build-with-clang-on-arm64:
49 |     runs-on: macos-15
50 |     steps:
51 |     - uses: actions/checkout@v4
52 |       with:
53 |         submodules: recursive
54 |     - name: Update Homebrew
55 |       run: brew update
56 |     - name: Install autotools
57 |       run: brew install automake
58 |     - name: Install dependencies
59 |       run: brew install htslib gsl
60 |     - name: Generate configure script
61 |       run: ./autogen.sh
62 |     - name: configure with clang++
63 |       run: ./configure CXX="clang++" CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib"
64 |     - name: make
65 |       run: make -j4
66 | 


--------------------------------------------------------------------------------
/docs/content/entropy.md:
--------------------------------------------------------------------------------
 1 | # entropy - Computing methylation entropy
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools entropy [OPTIONS] <genome.fa> <input.epiread>
 6 | ```
 7 | ## Description
 8 | The concept of methylation entropy was introduced into the study of
 9 | epigenetics to characterize the randomness of methylation patterns over
10 | several consecutive CpG sites (Xie et al., 2011). The `entropy`
11 | command processes epireads and calculates the methylation entropy
12 | value in sliding windows of a specified number of CpGs. Two input files
13 | are required.
14 | 
15 |  * (1) either a genome in FASTA format or a directory containing FASTA
16 |    chromosome files
17 | 
18 |  * (2) an epiread file as produced by
19 |    [states](../states) program. The input epiread file
20 |    needs to be sorted, first by chromosome, then by position. This can
21 |    be done with the following command:
22 | 
23 | ```shell
24 | $ LC_ALL=C sort -k1,1 -k2,2g input.epiread -o input-sorted.epiread
25 | ```
26 | 
27 | Use the `-w` option to specify the desired number of CpGs in the
28 | sliding window; if unspecified, the default value is 4. In cases where
29 | symmetric patterns are considered the same, specify the `-F` option; this
30 | will cause the majority state in each epiread to be forced into
31 | "methylated", and the minority into "unmethylated". The processed
32 | epireads will then be used for entropy calculation. To run the
33 | program, type the command:
34 | ```shell
35 | $ dnmtools entropy -w 5 -v -o output.meth /path/to/genome.fa input-sorted.epiread
36 | ```
37 | 
38 | The output format is the same as [counts](../counts)
39 | output. The first 3 columns indicate the genomic location of the
40 | center CpG in each sliding window, the 5th column contains the entropy
41 | values, and the 6th column shows the number of reads used for each
42 | sliding window.  Below is an output example.
43 | 
44 | ```txt
45 | chr1    483     +       CpG     2.33914 27
46 | chr1    488     +       CpG     2.05298 23
47 | chr1    492     +       CpG     1.4622  24
48 | chr1    496     +       CpG     1.8784  35
49 | ```
50 | 
51 | ## Options
52 | ```txt
53 |  -w, -window
54 | ```
55 | number of CpGs in sliding window (default: 4)
56 | ```txt
57 |  -F, -flip
58 | ```
59 | flip read majority state to meth
60 | ```txt
61 |  -o, -output
62 | ```
63 | Name of output file (default: STDOUT)
64 | ```txt
65 |  -v, -verbose
66 | ```
67 | print more run info to STDERR while the program is running.
68 | 
69 | 


--------------------------------------------------------------------------------
/src/common/counts_header.hpp:
--------------------------------------------------------------------------------
 1 | /* xcounts_utils: code for doing things with xcounts format and some
 2 |  * for counts format that is common to several tools.
 3 |  *
 4 |  * Copyright (C) 2023 Andrew D. Smith
 5 |  *
 6 |  * Authors: Andrew D. Smith
 7 |  *
 8 |  * This program is free software: you can redistribute it and/or
 9 |  * modify it under the terms of the GNU General Public License as
10 |  * published by the Free Software Foundation, either version 3 of the
11 |  * License, or (at your option) any later version.
12 |  *
13 |  * This program is distributed in the hope that it will be useful, but
14 |  * WITHOUT ANY WARRANTY; without even the implied warranty of
15 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 |  * General Public License for more details.
17 |  */
18 | 
19 | #ifndef COUNTS_HEADER_HPP
20 | #define COUNTS_HEADER_HPP
21 | 
22 | #include <cstdint>
23 | #include <string>
24 | #include <unordered_map>
25 | #include <vector>
26 | namespace bamxx {
27 | struct bam_header;
28 | }
29 | namespace bamxx {
30 | struct bgzf_file;
31 | }
32 | 
33 | std::unordered_map<std::string, std::uint32_t>
34 | write_counts_header_from_chrom_sizes(
35 |   const std::vector<std::string> &chrom_names,
36 |   const std::vector<uint64_t> &chrom_sizes, bamxx::bgzf_file &out);
37 | 
38 | std::unordered_map<std::string, std::uint32_t>
39 | write_counts_header_from_file(const std::string &header_file,
40 |                               bamxx::bgzf_file &out);
41 | 
42 | // returns -1 on failure, 0 on success
43 | int
44 | get_chrom_sizes_for_counts_header(const std::int32_t n_threads,
45 |                                   const std::string &filename,
46 |                                   std::vector<std::string> &chrom_names,
47 |                                   std::vector<uint64_t> &chrom_sizes);
48 | 
49 | void
50 | write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
51 |                                     bamxx::bgzf_file &out);
52 | 
53 | bool
54 | write_counts_header_line(std::string line, bamxx::bgzf_file &out);
55 | 
56 | bamxx::bgzf_file &
57 | skip_counts_header(bamxx::bgzf_file &in);
58 | 
59 | bool
60 | get_has_counts_header(const std::string &filename);
61 | 
62 | inline bool
63 | is_counts_header_version_line(const std::string &line) {
64 |   static constexpr char version_tag[] = "#DNMTOOLS";  // required line prefix
65 |   return line.compare(0, sizeof(version_tag) - 1, version_tag) == 0;
66 | }
67 | 
68 | template <typename T>
69 | inline bool
70 | is_counts_header_line(T &line) {
71 |   return line[0] == '#';  // a header line is one whose first element is '#'
72 | }
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/src/radmeth/radmeth_design.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * Author: Andrew D Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify it
 6 |  * under the terms of the GNU General Public License as published by the Free
 7 |  * Software Foundation, either version 3 of the License, or (at your option)
 8 |  * any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful, but WITHOUT
11 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13 |  * more details.
14 |  */
15 | 
16 | #ifndef RADMETH_DESIGN_HPP
17 | #define RADMETH_DESIGN_HPP
18 | 
19 | #include <cstddef>
20 | #include <cstdint>
21 | #include <istream>
22 | #include <iterator>
23 | #include <string>
24 | #include <vector>
25 | 
26 | // Experimental design: factors, samples, and each sample's factor levels.
27 | struct Design {
28 |   std::vector<std::string> factor_names;
29 |   std::vector<std::string> sample_names;
30 |   std::vector<std::vector<std::uint8_t>> matrix;   // samples=rows, factors=cols
31 |   std::vector<std::vector<std::uint8_t>> tmatrix;  // factors=rows, samples=cols
32 |   std::vector<std::vector<std::uint8_t>> groups;   // combs of fact levels
33 |   std::vector<std::uint32_t> group_id;             // assign group to sample
34 | 
35 |   [[nodiscard]] static Design
36 |   read_design(const std::string &design_filename);
37 | 
38 |   // Sizes are taken from the name and group vectors, not from the matrices.
39 |   [[nodiscard]] std::size_t
40 |   n_factors() const {
41 |     return factor_names.size();
42 |   }
43 |   [[nodiscard]] std::size_t
44 |   n_groups() const {
45 |     return groups.size();
46 |   }
47 |   [[nodiscard]] std::size_t
48 |   n_samples() const {
49 |     return sample_names.size();
50 |   }
51 | 
52 |   [[nodiscard]] Design
53 |   drop_factor(const std::uint32_t factor_idx);
54 | 
55 |   void
56 |   order_samples(const std::vector<std::string> &ordered_names);
57 | 
58 |   [[nodiscard]] std::uint32_t
59 |   get_test_factor_idx(const std::string &test_factor) const;
60 | 
61 |   [[nodiscard]] bool
62 |   has_two_values(const std::size_t test_factor) const;
63 | };
64 | 
65 | std::istream &
66 | operator>>(std::istream &is, Design &design);
67 | 
68 | std::ostream &
69 | operator<<(std::ostream &os, const Design &design);
70 | 
71 | void
72 | ensure_sample_order(const std::string &table_filename, Design &design);
73 | 
74 | [[nodiscard]] std::vector<std::string>
75 | get_sample_names_from_header(const std::string &header);
76 | 
77 | [[nodiscard]] bool
78 | consistent_sample_names(const Design &design, const std::string &header);
79 | 
80 | #endif  // RADMETH_DESIGN_HPP
81 | 


--------------------------------------------------------------------------------
/src/common/Interval.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * This is free software; you can redistribute it and/or modify it under the
 4 |  * terms of the GNU General Public License as published by the Free Software
 5 |  * Foundation; either version 2 of the License, or (at your option) any later
 6 |  * version.
 7 |  *
 8 |  * This is distributed in the hope that it will be useful, but WITHOUT ANY
 9 |  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10 |  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
11 |  * details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License along
14 |  * with this software; if not, write to the Free Software Foundation, Inc., 51
15 |  * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 |  */
17 | 
18 | #include "Interval.hpp"
19 | 
20 | #include <algorithm>
21 | #include <charconv>
22 | #include <cstdint>
23 | #include <fstream>
24 | #include <string>
25 | #include <system_error>
26 | #include <vector>
27 | 
28 | // Parse "chrom start stop" (space/tab separated) from [c, c_end) into this
29 | // Interval; returns false, without reading past c_end, on malformed input.
30 | auto
31 | Interval::initialize(const char *c, const char *c_end) -> bool {
32 |   constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; };
33 |   constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; };
34 | 
35 |   if (c == c_end)  // empty range: c + 1 below would pass c_end
36 |     return false;
37 | 
38 |   // NOLINTBEGIN(*-pointer-arithmetic)
39 |   // chrom: still assigned when the rest of the line is malformed
40 |   auto field_s = c;
41 |   auto field_e = std::find_if(field_s + 1, c_end, is_sep);
42 |   {
43 |     const std::uint32_t d = std::distance(field_s, field_e);
44 |     chrom = std::string{field_s, d};
45 |   }
46 |   if (field_e == c_end)  // no separator: nothing follows chrom
47 |     return false;
48 | 
49 |   // start: must be followed by a separator and then the stop field
50 |   field_s = std::find_if(field_e + 1, c_end, not_sep);
51 |   if (field_s == c_end)
52 |     return false;
53 |   field_e = std::find_if(field_s + 1, c_end, is_sep);
54 |   if (field_e == c_end ||
55 |       std::from_chars(field_s, field_e, start).ec != std::errc{})
56 |     return false;
57 | 
58 |   // stop: may run to the end of the line
59 |   field_s = std::find_if(field_e + 1, c_end, not_sep);
60 |   if (field_s == c_end)
61 |     return false;
62 |   field_e = std::find_if(field_s + 1, c_end, is_sep);
63 |   // NOLINTEND(*-pointer-arithmetic)
64 | 
65 |   return std::from_chars(field_s, field_e, stop).ec == std::errc{};
66 | }
67 | 
68 | // Build one Interval from each line of intervals_file; throws if it won't open.
69 | [[nodiscard]] auto
70 | read_intervals(const std::string &intervals_file) -> std::vector<Interval> {
71 |   std::ifstream in(intervals_file);
72 |   if (!in)
73 |     throw std::runtime_error("failed to open file: " + intervals_file);
74 |   std::vector<Interval> intervals;
75 |   for (std::string line; getline(in, line);)
76 |     intervals.emplace_back(line);
77 |   return intervals;
78 | }
79 | 


--------------------------------------------------------------------------------
/docs/content/radadjust.md:
--------------------------------------------------------------------------------
 1 | # radadjust - Correct p-values of individual CpGs
 2 | 
 3 | ## Synopsis
 4 | ```console
 5 | $ dnmtools radadjust [OPTIONS] <regression-output.bed>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | This program adjusts the p-value of individual CpGs in the output of
11 | [radmeth](../radmeth). A typical application
12 | that takes the regression output as input and combines the p-values of
13 | 200 neighboring CpGs is done as follows.
14 | ```console
15 | $ dnmtools radadjust -bins 1:200:1 input.bed >output-adjusted.bed
16 | ```
17 | 
18 | Here, the only required parameter, besides the input file, is `-bins`
19 | whose value is set to `1:200:1` (which is also the default value). This
20 | means that for each `n = 1, 2, ..., 199`, `radmeth-adjust` computes the
21 | correlation between p-values of CpGs located at distance `n` from each
22 | other. These correlations are used during the significance combination
23 | step. In addition, bin sizes determine the window for combining
24 | significance. In contrast, if `-bins` is set to `1:15:5`, then the
25 | correlation is computed separately for p-values corresponding to CpGs
26 | at distances `[1, 5)`, `[5, 10)`, and `[10, 15)` from one another.  The
27 | first five columns and the last four columns of `radmeth-adjust` have
28 | the same meaning as those output by radmeth regression. The 6th column
29 | gives the modified p-value based on the original p-value of the site
30 | and the p-values of its neighbors. The 7th column gives the
31 | FDR-corrected p-value. Then the last four columns correspond to the
32 | total read counts and methylated read counts of the case group and
33 | control group, respectively.  Here is what the `output-adjusted.bed`
34 | file looks like for our example dataset:
35 | 
36 | ```txt
37 | chr1  108   +     CpG   0.157971    0.099290    0.353466    18     4    20    15
38 | chr1  114   +     CpG   0.559191    0.099290    0.353466    21     3    41    10
39 | chr1  160   +     CpG   0.095112    0.099290    0.353466    32    24    39    17
40 | chr1  309   +     CpG   0.239772    0.122248    0.368902    33    17    19    13
41 | chr1  499   +     CpG   0.770140    0.204467    0.419872    43    22    29    15
42 | ```
43 | 
44 | After completing the previous steps, individual differentially
45 | methylated sites can be obtained with `awk`. To get all CpGs with an
46 | FDR-corrected p-value of at most 0.01, run
47 | 
48 | ```console
49 | $ awk '$7 <= 0.01' output-adjusted.bed >output-significant.bed
50 | ```
51 | 
52 | ## Options
53 | 
54 | ```txt
55 |  -o, -out
56 | ```
57 | Name of the output file (default: stdout).
58 | 
59 | ```txt
60 |  -b, -bins
61 | ```
62 | Correlation bin specification string (default is 1:200:1).
63 | 
64 | ```txt
65 |  -v, -verbose
66 | ```
67 | Print more information while the command is running.
68 | 


--------------------------------------------------------------------------------
/docs/content/states.md:
--------------------------------------------------------------------------------
 1 | # states - Allele-specific methylation file format
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools states [OPTIONS] <input.sam>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | All programs that calculate statistics related to ASM must take the
11 | linked states of CpG sites within reads into account. Using full read
12 | sequences for this purpose is inefficient, so we defined an
13 | intermediate format, "epiread," for this purpose. The `states` command
14 | will convert a BAM or SAM file of mapped reads into a "states" file in
15 | the format used by `amrfinder` and `amrtester`.
16 | 
17 | The epiread format consists of three columns. The first column is the
18 | chromosome name for the mapped read, the second is the "index" of the
19 | first CpG in the read. The index `x` indicates that the first CpG site
20 | in the read corresponds to the `x`'th (starting from 0) CpG site in
21 | the chromosome.  Therefore, these are not nucleotide positions in the
22 | genome. The final column in the epiread format is the sequence of
23 | methylation states within the read. This sequence of states is
24 | composed of 3 possible letters: C if the corresponding letter at that
25 | CpG site in the mapped read is a C, and similarly for T. Within this
26 | state sequence, letters in mapped reads at positions corresponding to
27 | CpG sites that are neither C nor T are encoded as N. Aside from the
28 | "N" this is effectively a binary encoding of methylation states.
29 | 
30 | Here is an example showing how some lines of an epiread format file might
31 | look:
32 | ```txt
33 | chr1    1460    CCCCCCCC
34 | chr1    1460    CCC
35 | chr1    1461    TCTTNNNNTTCT
36 | chr1    1468    CCCC
37 | chr1    1469    CCC
38 | chr1    1469    CCCT
39 | chr1    1469    CCC
40 | chr1    1469    CCCCCCT
41 | chr1    1469    CCC
42 | chr1    1470    CCCC
43 | chr1    1471    CCCNNNNNNTCCC
44 | chr1    1472    CCC
45 | ```
46 | Those epireads with the "N" in the middle correspond to paired-end
47 | reads with ends that are joined. It is important to use these as one
48 | fragment because linking methylation states within a fragment, over as
49 | large a distance as possible, helps the inference methods within both
50 | `amrfinder` and `amrtester`.
51 | 
52 | The following is an example of how to run the `states` command:
53 | ```shell
54 | $ dnmtools states -c /path/to/genome.fa -o output.epiread input.sam
55 | ```
56 | 
57 | ## Options
58 | 
59 | ```txt
60 |  -o, -output
61 | ```
62 | The name of the output file.
63 | 
64 | ```txt
65 |  -c, -chrom
66 | ```
67 | FASTA file, or directory of chromosomes containing FASTA files [required].
68 | 
69 | ```txt
70 |  -v, -verbose
71 | ```
72 | Print information to the terminal while the program runs.
73 | 
74 | ```txt
75 |  -z, -zip
76 | ```
77 | Write output in gzip compressed format.
78 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Define a base to inherit from so ARGs can be collected here
 2 | FROM alpine:latest as base
 3 | ARG NUM_JOBS=16
 4 | ARG HTSLIB_VERSION=1.21
 5 | ARG SAMTOOLS_VERSION=1.21
 6 | 
 7 | # All builds from source are installed to their own directory so the
 8 | # essential file from them can be retrieved separately as needed.
 9 | 
10 | # Build htslib and samtools from source
11 | FROM base as build_htslib
12 | RUN apk update
13 | WORKDIR /build
14 | 
15 | RUN apk add --no-cache \
16 |     build-base \
17 |     autoconf \
18 |     automake \
19 |     gsl-dev \
20 |     zlib-dev \
21 |     bzip2-dev \
22 |     xz-dev \
23 |     ncurses-dev \
24 |     wget
25 | ENV HTSLIB=htslib-${HTSLIB_VERSION}
26 | ENV SAMTOOLS=samtools-${SAMTOOLS_VERSION}
27 | RUN cd /build \
28 |     && wget -nv https://github.com/samtools/htslib/releases/download/${HTSLIB_VERSION}/${HTSLIB}.tar.bz2 \
29 |     && tar -xf ${HTSLIB}.tar.bz2 \
30 |     && cd ${HTSLIB} \
31 |     && autoreconf -i \
32 |     && ./configure --prefix=$(pwd) \
33 |     && make -j${NUM_JOBS} \
34 |     && make install
35 | RUN cd /build \
36 |     && wget -nv https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/${SAMTOOLS}.tar.bz2 \
37 |     && tar -xf ${SAMTOOLS}.tar.bz2 \
38 |     && cd ${SAMTOOLS} \
39 |     && ./configure --prefix=$(pwd) \
40 |     && make -j${NUM_JOBS} \
41 |     && make install
42 | 
43 | # Build dnmtools
44 | FROM base as build_dnmtools
45 | RUN apk update
46 | WORKDIR /build
47 | ENV HTSLIB=htslib-${HTSLIB_VERSION}
48 | ENV SAMTOOLS=samtools-${SAMTOOLS_VERSION}
49 | RUN apk add --no-cache \
50 |     build-base \
51 |     autoconf \
52 |     automake \
53 |     gsl-dev \
54 |     zlib-dev \
55 |     bzip2-dev \
56 |     xz-dev \
57 |     wget \
58 |     gzip \
59 |     bash
60 | 
61 | # Copying the install from within the build tree helps keep things
62 | # smaller than installing earlier and copying the /usr or /usr/local
63 | COPY --from=build_htslib /build/${HTSLIB}/lib /usr/lib
64 | COPY --from=build_htslib /build/${HTSLIB}/include /usr/include
65 | COPY --from=build_htslib /build/${SAMTOOLS}/bin /usr/bin
66 | 
67 | RUN mkdir /build/dnmtools
68 | COPY . /build/dnmtools
69 | RUN cd /build/dnmtools \
70 |     && ./autogen.sh \
71 |     && ./configure --prefix=$(pwd) \
72 |     && make -j${NUM_JOBS} \
73 |     && make -j${NUM_JOBS} check \
74 |     && make -j${NUM_JOBS} distcheck \
75 |     && make install
76 | 
77 | # Build a light-weight image just with binaries
78 | FROM base
79 | ENV HTSLIB=htslib-${HTSLIB_VERSION}
80 | # No 'apk update': --no-cache fetches the index and keeps it out of the layer
81 | WORKDIR /build
82 | RUN apk add --no-cache \
83 |     gsl-dev \
84 |     zlib-dev \
85 |     bzip2-dev \
86 |     xz-dev \
87 |     libstdc++ \
88 |     libgomp
89 | COPY --from=build_htslib /build/${HTSLIB}/lib /usr/lib
90 | COPY --from=build_dnmtools /build/dnmtools/dnmtools /usr/bin
91 | 
92 | ENTRYPOINT ["dnmtools"]
93 | 


--------------------------------------------------------------------------------
/src/common/EmissionDistribution.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Copyright (C) 2022 Andrew D Smith
 3 | 
 4 |   Authors: Andrew D. Smith and Benjamin E. Decato
 5 | 
 6 |   This file is part of dnmtools.
 7 | 
 8 |   dnmtools is free software; you can redistribute it and/or modify
 9 |   it under the terms of the GNU General Public License as published by
10 |   the Free Software Foundation; either version 2 of the License, or
11 |   (at your option) any later version.
12 | 
13 |   dnmtools is distributed in the hope that it will be useful,
14 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
15 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 |   GNU General Public License for more details.
17 | */
18 | 
19 | #ifndef EM_DTN
20 | #define EM_DTN
21 | 
22 | #include <iterator>  // IWYU pragma: keep
23 | #include <string>
24 | #include <utility>
25 | #include <vector>
26 | 
27 | /** Emission distributions for methylation should be modeled either as
28 |  * Beta or Beta Binomial. Since they will be used simultaneously, it is
29 |  * helpful to have an abstraction so that we can put them in the same
30 |  * container.
31 |  */
32 | class EmissionDistribution {
33 | public:
34 |   EmissionDistribution();
35 |   virtual ~EmissionDistribution();
36 |   EmissionDistribution(const double a, const double b);
37 |   explicit EmissionDistribution(const std::string &str);  // as in subclasses
38 |   virtual double
39 |   operator()(const std::pair<double, double> &val) const = 0;
40 |   virtual double
41 |   log_likelihood(const std::pair<double, double> &val) const = 0;
42 |   std::string
43 |   tostring() const;
44 |   double
45 |   getalpha() const {  // const: usable through const references
46 |     return alpha;
47 |   }
48 |   double
49 |   getbeta() const {  // const: usable through const references
50 |     return beta;
51 |   }
52 |   void
53 |   fit(const std::vector<double> &vals_a, const std::vector<double> &vals_b,
54 |       const std::vector<double> &p);
55 | 
56 | protected:
57 |   double
58 |   sign(const double x);
59 |   double
60 |   invpsi(const double tolerance, const double x);
61 |   double
62 |   movement(const double curr, const double prev);
63 |   double alpha{};
64 |   double beta{};
65 |   double lnbeta_helper{};
66 | 
67 |   static constexpr double tolerance = 1e-10;
68 | };
69 | 
70 | class Beta : public EmissionDistribution {  // overrides both pure virtuals
71 | public:
72 |   Beta();
73 |   Beta(const double a, const double b);
74 |   explicit Beta(const std::string &str);
75 |   double
76 |   operator()(const std::pair<double, double> &val) const override;
77 |   double
78 |   log_likelihood(const std::pair<double, double> &val) const override;
79 | };
80 | 
81 | class BetaBinomial : public EmissionDistribution {  // overrides both virtuals
82 | public:
83 |   BetaBinomial();
84 |   BetaBinomial(const double a, const double b);
85 |   explicit BetaBinomial(const std::string &str);
86 |   double
87 |   operator()(const std::pair<double, double> &val) const override;
88 |   double
89 |   log_likelihood(const std::pair<double, double> &val) const override;
90 | };
91 | 
92 | #endif
93 | 


--------------------------------------------------------------------------------
/src/common/bsutils.cpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2018-2025 Andrew D. Smith
 2 |  *
 3 |  * Author: Andrew D. Smith
 4 |  *
 5 |  * This program is free software: you can redistribute it and/or modify it
 6 |  * under the terms of the GNU General Public License as published by the Free
 7 |  * Software Foundation, either version 3 of the License, or (at your option)
 8 |  * any later version.
 9 |  *
10 |  * This program is distributed in the hope that it will be useful, but WITHOUT
11 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13 |  * more details.
14 |  */
15 | 
16 | #include "bsutils.hpp"
17 | #include "dnmtools_gaussinv.hpp"
18 | 
19 | #include "Interval6.hpp"
20 | 
21 | #include <algorithm>
22 | #include <cassert>
23 | #include <cmath>
24 | #include <cstddef>
25 | #include <iterator>
26 | #include <stdexcept>
27 | #include <string>
28 | #include <unordered_map>
29 | #include <utility>
30 | #include <vector>
31 | 
32 | void
33 | wilson_ci_for_binomial(const double alpha, const double n, const double p_hat,
34 |                        double &lower, double &upper) {
35 |   if (n <= 0.0) {  // protection
36 |     lower = 0.0;
37 |     upper = 1.0;
38 |     return;
39 |   }
40 |   const double z = dnmt_gsl_cdf_ugaussian_Pinv(1 - alpha / 2);
41 |   const double denom = 1 + z * z / n;
42 |   const double first_term = p_hat + z * z / (2 * n);
43 |   const double discriminant =
44 |     std::max(0.0, p_hat * (1 - p_hat) / n + z * z / (4 * n * n));
45 |   lower = std::max(0.0, (first_term - z * std::sqrt(discriminant)) / denom);
46 |   upper = std::min(1.0, (first_term + z * std::sqrt(discriminant)) / denom);
47 | }
48 | 
49 | void
50 | adjust_region_ends(const std::vector<std::vector<Interval6>> &clusters,
51 |                    std::vector<Interval6> &regions) {
52 |   assert(std::size(clusters) == std::size(regions));
53 |   for (std::size_t i = 0; i < std::size(regions); ++i) {
54 |     auto max_pos = regions[i].stop;
55 |     auto min_pos = regions[i].start;
56 |     for (std::size_t j = 0; j < std::size(clusters[i]); ++j) {
57 |       max_pos = std::max(clusters[i][j].stop, max_pos);
58 |       min_pos = std::min(clusters[i][j].start, min_pos);
59 |     }
60 |     regions[i].stop = max_pos;
61 |     regions[i].start = min_pos;
62 |   }
63 | }
64 | 
65 | void
66 | relative_sort(const std::vector<Interval6> &mapped_locations,
67 |               const std::vector<std::string> &names,
68 |               std::vector<std::size_t> &lookup) {
69 |   std::unordered_map<std::string, std::size_t> names_map;
70 |   for (std::size_t i = 0; i < std::size(names); ++i)
71 |     names_map[names[i]] = i;
72 |   for (std::size_t i = 0; i < std::size(mapped_locations); ++i) {
73 |     const auto j = names_map.find(mapped_locations[i].name);
74 |     if (j == std::cend(names_map))
75 |       throw std::runtime_error("read sequence not found for: " + names[i]);
76 |     lookup.push_back(j->second);
77 |   }
78 | }
79 | 


--------------------------------------------------------------------------------
/src/common/Interval.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * This is free software; you can redistribute it and/or modify it under the
 4 |  * terms of the GNU General Public License as published by the Free Software
 5 |  * Foundation; either version 2 of the License, or (at your option) any later
 6 |  * version.
 7 |  *
 8 |  * This is distributed in the hope that it will be useful, but WITHOUT ANY
 9 |  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10 |  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
11 |  * details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License along
14 |  * with this software; if not, write to the Free Software Foundation, Inc., 51
15 |  * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 |  */
17 | 
18 | #ifndef INTERVAL_HPP_
19 | #define INTERVAL_HPP_
20 | 
21 | #include <cstdint>
22 | //  #include <format> // ADS: needs c++20
23 | #include <fstream>
24 | #include <iterator>  // std::size
25 | #include <stdexcept>
26 | #include <string>
27 | #include <vector>
28 | 
// A genomic interval: chromosome name plus start and stop positions.
struct Interval {
  std::string chrom;
  std::uint32_t start{};
  std::uint32_t stop{};

  Interval() = default;
  Interval(const std::string &chrom, const std::uint32_t start,
           const std::uint32_t stop) : chrom{chrom}, start{start}, stop{stop} {}

  // Parse an interval from one line of text; throws std::runtime_error if
  // initialize() rejects the line.
  explicit Interval(const std::string &line) {
    if (!initialize(line.data(), line.data() + std::size(line)))
      throw std::runtime_error("bad interval line: " + line);
  }

  // Fill this object from the character range [first, last).  Returns false
  // when the text cannot be parsed.  Defined out of line.
  auto
  initialize(const char *, const char *) -> bool;

  // Lexicographic order on (chrom, start, stop).
  [[nodiscard]] auto
  operator<(const Interval &rhs) const {
    return (chrom < rhs.chrom ||
            (chrom == rhs.chrom &&
             (start < rhs.start || (start == rhs.start && stop < rhs.stop))));
  }

  // Two intervals are equal when all three fields match.  The stop field
  // must be compared with ==; using < made identical intervals unequal.
  [[nodiscard]] auto
  operator==(const Interval &rhs) const {
    return chrom == rhs.chrom && start == rhs.start && stop == rhs.stop;
  }

  // auto
  // operator<=>(const Interval &) const = default;
};
60 | 
61 | inline auto
62 | operator<<(std::ostream &os, const Interval &x) -> std::ostream & {
63 |   return os << x.chrom << "\t" << x.start << "\t" << x.stop;
64 | }
65 | 
66 | [[nodiscard]] inline auto
67 | to_string(const Interval &x) -> std::string {
68 |   return x.chrom + "\t" + std::to_string(x.start) + "\t" +
69 |          std::to_string(x.stop);
70 | }
71 | 
72 | // ADS: need to bump to c++20 for this
73 | //
74 | // template <> struct std::formatter<Interval> : std::formatter<std::string> {
75 | //   auto
76 | //   format(const Interval &i, format_context &ctx) const {
77 | //     static constexpr auto fmt = "{}\t{}\t{}";
78 | //     return std::formatter<std::string>::format(
79 | //       std::format(fmt, i.chrom, i.start, i.stop), ctx);
80 | //   }
81 | // };
82 | 
83 | [[nodiscard]] inline auto
84 | size(const Interval &x) {
85 |   return x.stop > x.start ? x.stop - x.start : 0ul;
86 | }
87 | 
88 | [[nodiscard]] auto
89 | read_intervals(const std::string &intervals_file) -> std::vector<Interval>;
90 | 
91 | #endif  // INTERVAL_HPP_
92 | 


--------------------------------------------------------------------------------
/docs/content/uniq.md:
--------------------------------------------------------------------------------
 1 | # uniq - ensure reads are not duplicates
 2 | 
 3 | ## Synopsis
 4 | ```shell
 5 | $ dnmtools uniq [OPTIONS] <input-sorted.sam> [out-sorted.sam]
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | The `uniq` command removes PCR duplicates. Before calculating
11 | methylation level, you should now remove duplicate reads, which in
12 | wgbs data are typically identified by their mapping to identical
13 | genomic locations. These reads are most likely PCR clones rather than
14 | representations of distinct DNA molecules. The command `uniq` removes
15 | such duplicates. It collects duplicate reads and/or fragments that
16 | have identical sequences and are mapped to the same genomic location
17 | (same chromosome, same start and end positions, and same strand), and
18 | chooses a random one to be the representative of the original DNA
19 | sequence.
20 | 
21 | *Note* As of dnmtools v1.2.5, the option to use the sequence of reads
22 | when deciding if two reads are duplicates has been removed. In the
23 | context of analyzing bisulfite sequencing reads, this has the danger
24 | of introducing bias in downstream analyses. Also, in the same version
25 | the test for sorted order of reads cannot be disabled. Empirical tests
26 | showed very little improvement to speed when disabling this test.
27 | 
28 | The `uniq` command can take reads sorted by (chrom, start, end,
29 | strand). If the reads in the input file are not sorted, run the
30 | following sort command using [samtools](https://samtools.github.io):
31 | 
32 | ```shell
33 | $ samtools sort -o reads_sorted.bam reads.bam
34 | ```
35 | 
36 | Next, execute the following command to remove duplicate reads:
37 | 
38 | ```shell
39 | $ dnmtools uniq -S duplicate-removal-stats.txt reads_sorted.bam reads_uniq.bam
40 | ```
41 | 
42 | ## Options
43 | 
44 | ```txt
45 |  -t, -threads
46 | ```
47 | The number of threads to use. These threads are used for I/O, and are
48 | most helpful when the input and output are both BAM, where the threads
49 | can really speed things up.
50 | 
51 | ```txt
52 |  -S, -summary
53 | ```
54 | Save statistics on duplication rates to this file. The statistics are not
55 | reported unless a file is specified here. This option is correct as of v1.4.0.
56 | 
57 | ```txt
58 |  -hist
59 | ```
60 | Output a histogram of duplication frequencies into the specified file
61 | for library complexity analysis.
62 | 
63 | ```txt
64 |  -B, -bam
65 | ```
66 | The output is in BAM format. This is an option to help prevent
67 | accidentally writing BAM format to the terminal or through a pipe that
68 | expects plain text, e.g., SAM.
69 | 
70 | ```txt
71 |  -stdout
72 | ```
73 | Write the output to standard out. This is not done by default even
74 | without an output file given, because of the danger of writing BAM to
75 | the terminal or through a pipe unexpectedly. It is possible to write
76 | BAM redirected or through a pipe, but the `-stdout` argument is
77 | required.
78 | 
79 | ```txt
80 |  -s, -seed
81 | ```
82 | Random number seed. Affects which read is kept among duplicates. The
83 | default seed is 408. This option is typically only used for testing.
84 | 
85 | ```txt
86 |  -v, -verbose
87 | ```
88 | Report more information while the program is running.
89 | 


--------------------------------------------------------------------------------
/.github/workflows/dnmtools_release_linux.yml:
--------------------------------------------------------------------------------
 1 | name: DNMTools release (Linux)
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | env:
 7 |   CONTAINER: andrewdavidsmith/transferase-build
 8 | 
 9 | jobs:
10 |   linux-releases:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Start docker container
14 |         # Pull the container, run it in detached mode, mount the workspace
15 |         run: |
16 |           docker pull $CONTAINER
17 |           docker run --name build-container \
18 |           -d -v ${{ github.workspace }}:/workspace $CONTAINER tail -f /dev/null
19 |       - uses: actions/checkout@v4
20 |         with:
21 |           submodules: recursive
22 |       - name: Get version number
23 |         id: get-vn
24 |         run: |
25 |           awk '/AC_INIT/ {print "vn="$2}' configure.ac | sed "s/\[//; s/\]//; s/,//" >> "$GITHUB_OUTPUT"
26 |         env:
27 |           GH_TOKEN: ${{ github.token }}
28 |       - name: Configure and build
29 |         env:
30 |           SCRIPT: |
31 |             export DEBIAN_FRONTEND=noninteractive && \
32 |             apt-get update && apt-get install --no-install-recommends -y automake libgsl-dev && \
33 |             find /usr -name libz.so -exec rm {} \; && \
34 |             find /usr -name libgsl\*.so -exec rm {} \; && \
35 |             git clone https://github.com/ebiggers/libdeflate.git && \
36 |             cd libdeflate && \
37 |             cmake -B build \
38 |                 -DLIBDEFLATE_BUILD_GZIP=off \
39 |                 -DLIBDEFLATE_BUILD_TESTS=off \
40 |                 -DLIBDEFLATE_BUILD_SHARED_LIB=off \
41 |                 -DCMAKE_VERBOSE_MAKEFILE=on \
42 |                 -DCMAKE_BUILD_TYPE=Release && \
43 |             cmake --build build -j4 && \
44 |             cmake --install build --prefix=/usr/local && \
45 |             cd .. && \
46 |             git clone --recursive https://github.com/samtools/htslib.git && \
47 |             cd htslib && \
48 |             autoreconf -i && \
49 |             mkdir build && cd build && \
50 |             ../configure \
51 |                 --disable-bz2 \
52 |                 --disable-libcurl \
53 |                 --disable-lzma \
54 |                 --disable-ref-cache \
55 |                 --with-libdeflate && \
56 |             make -j4 CFLAGS="-Wall -O2 -fvisibility=hidden" libhts.a && \
57 |             cp libhts.a /usr/local/lib/ && \
58 |             cp -r ../htslib /usr/local/include/ && \
59 |             cd /workspace && \
60 |             autoreconf -i && \
61 |             mkdir build && cd build && \
62 |             ../configure --with-libdeflate && \
63 |             ../data/make_full_license_info_header.sh ../data/LICENSE > license.h && \
64 |             echo "#define INCLUDE_FULL_LICENSE_INFO 1" >> config.h && \
65 |             make -j4 LDFLAGS="-static-libgcc -static-libstdc++ -s" && \
66 |             tar -cf dnmtools-${{ steps.get-vn.outputs.vn }}-Linux.tar.gz dnmtools
67 |         run: |
68 |           docker exec build-container bash -c "$SCRIPT"
69 |       - name: Upload the binary
70 |         uses: actions/upload-artifact@v4
71 |         with:
72 |           name: dnmtools-${{ steps.get-vn.outputs.vn }}-Linux.tar.gz
73 |           path: build/dnmtools-${{ steps.get-vn.outputs.vn }}-Linux.tar.gz
74 | 


--------------------------------------------------------------------------------
/.clang-tidy:
--------------------------------------------------------------------------------
 1 | Checks: 'cert-*,cppcoreguidelines-*,performance-*,clang-diagnostic-*,clang-analyzer-*,-clang-diagnostic-unqualified-std-cast-call,-clang-diagnostic-unknown-warning-option,-clang-analyzer-unix.BlockInCriticalSection,-cppcoreguidelines-pro-type-vararg'
 2 | WarningsAsErrors: '*'
 3 | HeaderFileExtensions:
 4 |   - ''
 5 |   - h
 6 |   - hh
 7 |   - hpp
 8 |   - hxx
 9 | ImplementationFileExtensions:
10 |   - c
11 |   - cc
12 |   - cpp
13 |   - cxx
14 | HeaderFilterRegex: ''
15 | ExcludeHeaderFilterRegex: 'OptionParser.hpp'
16 | FormatStyle: none
17 | CheckOptions:
18 |   cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU'
19 |   cert-err33-c.AllowCastToVoid: 'true'
20 |   cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;'
21 |   cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false'
22 |   cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false'
23 |   cppcoreguidelines-non-private-member-variables-in-classes.IgnorePublicMemberVariables: 'true'
24 |   google-readability-braces-around-statements.ShortStatementLines: '1'
25 |   google-readability-function-size.StatementThreshold: '800'
26 |   google-readability-namespace-comments.ShortNamespaceLines: '10'
27 |   google-readability-namespace-comments.SpacesBeforeComments: '2'
28 |   llvm-else-after-return.WarnOnConditionVariables: 'false'
29 |   llvm-else-after-return.WarnOnUnfixable: 'false'
30 |   llvm-qualified-auto.AddConstToQualified: 'false'
31 | SystemHeaders: 'false'
32 | 


--------------------------------------------------------------------------------
/docs/content/diff.md:
--------------------------------------------------------------------------------
 1 | # diff - compute methylation difference probabilities
 2 | 
 3 | ## Synopsis
 4 | ```console
 5 | $ dnmtools diff [OPTIONS] <input-a.meth> <input-b.meth>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | Suppose that we want to compare two methylomes: `input-a.meth` and
11 | `input-b.meth`. Both these files would have been produced by the
12 | [counts](../counts) command. We start by calculating the differential
13 | methylation score (probability) for each CpG site using the `diff`
14 | command:
15 | 
16 | ```console
17 | $ dnmtools diff -o output.diff input-a.meth input-b.meth
18 | ```
19 | 
20 | Here are the first few lines of the output:
21 | 
22 | ```txt
23 | chr1    3000826   +     CpG     0.609908        16       7      21      11
24 | chr1    3001006   +     CpG     0.874119        21      18      15      22
25 | chr1    3001017   +     CpG     0.888384        20      19      15      25
26 | chr1    3001276   +     CpG     0.010825         3      20      12      16
27 | ```
28 | 
29 | The first four columns are the same as the counts input files. The 5th
30 | column gives the probability that the methylation level at each given
31 | site is lower in `input-a.meth` than `input-b.meth`. (For the other
32 | direction, you can either swap the order of the two input files or
33 | just subtract the probability from 1.0.) The method used to calculate
34 | this probability is explained by Altham (see reference below), and is
35 | simply a one-directional version of Fisher's exact test. The remaining
36 | columns in the output give the number of methylated reads of each CpG
37 | in `input-a.meth`, number of unmethylated reads in `input-a.meth`,
38 | number of methylated reads in `input-b.meth`, and number of
39 | unmethylated reads in `input-b.meth`, respectively.
40 | 
41 | The two input files must have all sites within each chromosome
42 | consecutive, have the same chromosome order, and have sites sorted in
43 | increasing order within each chromosome. The order of chromosomes does
44 | not matter (e.g., chr10 may precede chr2, or chr2 may precede chr10).
45 | 
46 | **Warning** the order of the samples/methylomes given as input, the
47 | "a" and "b", matters. It is probably a good idea to include this order
48 | in the output file name, for example as `output_a_lt_b.diff`.
49 | 
50 | The output from the `diff` command is used as input for the
51 | [dmr](../dmr) program, but may also form the basis of visualization if
52 | you want to plot differential methylation probabilities, for example
53 | along the genome in a genome browser.
54 | 
55 | Reference:
56 | ```txt
57 | Patricia M. E. Altham (1969)
58 | Exact bayesian analysis of a 2x2 contingency table, and Fisher's "exact" significance test
59 | Journal of the Royal Statistical Society, Series B (Methodological)
60 | 31(2):261-269
61 | ```
62 | 
63 | ## Options
64 | 
65 | ```txt
66 | -p, -pseudo
67 | ```
68 | The pseudocount to use (default: 1).
69 | 
70 | ```txt
71 | -A, -nonzero-only
72 | ```
73 | Process only sites with coverage in both samples.
74 | 
75 | ```txt
76 | -o, -out
77 | ```
78 | The name of the output file. If no file name is provided, the output
79 | will be written to standard output. Due to the size of this output, a
80 | file name should be specified unless the output will be piped to
81 | another command or program.
82 | 
83 | ```txt
84 | -v, -verbose
85 | ```
86 | Print more information while the command is running.
87 | 


--------------------------------------------------------------------------------
/src/common/bsutils.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *    Copyright (C) 2009-2022 University of Southern California and
 3 |  *                       Andrew D. Smith
 4 |  *
 5 |  *    Authors: Andrew D. Smith
 6 |  *
 7 |  *    This program is free software: you can redistribute it and/or modify
 8 |  *    it under the terms of the GNU General Public License as published by
 9 |  *    the Free Software Foundation, either version 3 of the License, or
10 |  *    (at your option) any later version.
11 |  *
12 |  *    This program is distributed in the hope that it will be useful,
13 |  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |  *    GNU General Public License for more details.
16 |  */
17 | 
18 | #ifndef BSUTILS_HPP
19 | #define BSUTILS_HPP
20 | 
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>
// Forward declaration only.  Interval6 is defined with the `struct` key in
// Interval6.hpp; the same key is used here so the declarations agree.
struct Interval6;
25 | 
// True when c is 'c' or 'C'.
inline bool
is_cytosine(char c) {
  switch (c) {
  case 'c':
  case 'C':
    return true;
  default:
    return false;
  }
}
30 | 
// True when c is 'g' or 'G'.
inline bool
is_guanine(char c) {
  switch (c) {
  case 'g':
  case 'G':
    return true;
  default:
    return false;
  }
}
35 | 
// True when c is 't' or 'T'.
inline bool
is_thymine(char c) {
  switch (c) {
  case 't':
  case 'T':
    return true;
  default:
    return false;
  }
}
40 | 
// True when c is 'a' or 'A'.
inline bool
is_adenine(char c) {
  switch (c) {
  case 'a':
  case 'A':
    return true;
  default:
    return false;
  }
}
45 | 
46 | //// CONFIDENCE INTERVALS //**************////////////////////////
47 | void
48 | wilson_ci_for_binomial(const double alpha, const double n, const double p_hat,
49 |                        double &lower, double &upper);
50 | 
51 | inline bool
52 | is_cpg(const std::string &s, size_t i) {
53 |   return (i < (s.length() - 1)) && is_cytosine(s[i]) && is_guanine(s[i + 1]);
54 | }
55 | 
56 | void
57 | adjust_region_ends(const std::vector<std::vector<Interval6>> &clusters,
58 |                    std::vector<Interval6> &regions);
59 | 
60 | void
61 | relative_sort(const std::vector<Interval6> &mapped_locations,
62 |               const std::vector<std::string> &names,
63 |               std::vector<size_t> &lookup);
64 | 
// Distribute `regions` (and the parallel `seqs`) into one bucket per entry
// of `big_regions`.
//
// Both inputs are scanned once, left to right, so the grouping is only
// meaningful when they are ordered consistently by chromosome and position.
// For each big region, entries of `regions` that lie on a smaller chromosome
// name, or on the same chromosome but end at or before the big region's
// start, are skipped.  Then entries on the same chromosome that start before
// the big region's end are appended to sep_regions[i] and sep_seqs[i].
//
// T and U must provide get_chrom(), get_start() and get_end().  seqs must
// have at least as many elements as regions (checked by assert only).
// The outputs are resized to big_regions.size(); entries are appended to
// whatever the buckets already hold.
template <class T, class U, class V>
static void
separate_regions(const std::vector<T> &big_regions,
                 const std::vector<U> &regions, const std::vector<V> &seqs,
                 std::vector<std::vector<U>> &sep_regions,
                 std::vector<std::vector<V>> &sep_seqs) {
  const size_t n_regions = regions.size();
  assert(n_regions <= seqs.size());

  const size_t n_big_regions = big_regions.size();
  sep_regions.resize(n_big_regions);
  sep_seqs.resize(n_big_regions);

  size_t cursor = 0;  // next unconsumed entry of `regions`
  for (size_t big_idx = 0; big_idx != n_big_regions; ++big_idx) {
    const std::string big_chrom(big_regions[big_idx].get_chrom());
    const size_t big_start = big_regions[big_idx].get_start();
    const size_t big_end = big_regions[big_idx].get_end();

    // skip everything that lies entirely before this big region
    for (; cursor < n_regions; ++cursor) {
      const bool lies_before =
        regions[cursor].get_chrom() < big_chrom ||
        (regions[cursor].get_chrom() == big_chrom &&
         regions[cursor].get_end() <= big_start);
      if (!lies_before)
        break;
    }

    // collect everything that starts inside this big region
    for (; cursor < n_regions; ++cursor) {
      const bool starts_inside = regions[cursor].get_chrom() == big_chrom &&
                                 regions[cursor].get_start() < big_end;
      if (!starts_inside)
        break;
      sep_regions[big_idx].push_back(regions[cursor]);
      sep_seqs[big_idx].push_back(seqs[cursor]);
    }
  }
}
94 | 
95 | #endif
96 | 


--------------------------------------------------------------------------------
/MAINTAINERS.md:
--------------------------------------------------------------------------------
 1 | ## Docker images
 2 | 
 3 | The docker images for `dnmtools` are hosted in GitHub Container registry.  The
 4 | process of building and pushing the image to the registry is handled by the
 5 | workflow specified in
 6 | [docker-build.yml](https://github.com/smithlabcode/dnmtools/blob/master/.github/workflows/docker-build.yml).
 7 | The build instruction is in
 8 | [Dockerfile](https://github.com/smithlabcode/dnmtools/blob/master/Dockerfile).
 9 | You can see the published images
10 | [here](https://github.com/smithlabcode/dnmtools/pkgs/container/dnmtools).
11 | 
12 | The workflow is triggered either manually or automatically by a tag event of
13 | type `v*.*.*`, which is intended for new releases. Currently, publishing the
14 | images can happen only to commits tagged by a version number. This is intended
15 | to associate every docker image with a version number. This means that there is
16 | no option to push the image for the latest commit if it is not tagged by
17 | a version number.
18 | 
19 | ### Automatic build and publish in a tag event
20 | 
21 | In a tag event of type `v*.*.*`, such as new release or retagging of version
22 | number, this workflow is triggered to build and publish the image for the
23 | tagged version number. The published image is tagged with SHA hash and the
24 | version number.  It is also tagged with `latest` if the version number is the
25 | latest.
26 | 
27 | ### Manual build (and publish)
28 | 
29 | Manual trigger is intended to test the image build processes as well as publish
30 | an image for an existing version.  In
31 | [Actions](https://github.com/smithlabcode/dnmtools/actions), go to `Docker image
32 | build` under `All workflows` and click `Run workflow` and choose from the
33 | following options:
34 | 
35 | 1. `Build latest commit`: for testing for the latest commit
36 | 2. `Build existing version`: for testing a particular version
37 | 3. `Build + push existing version`: for publishing a particular version  
38 | 
39 | For options 2 and 3, specify the version number in the form `v*.*.*`. If not
40 | specified, the workflow will assume the latest version.
41 | 
42 | ### Use scenarios 
43 | 
44 | **Before a new release**: It is a good idea to test image building before a new
45 | release. Manually trigger the workflow with option 1. If it builds with no
46 | issues, make a new release and the image will automatically be built and
47 | published. 
48 | 
49 | **Publish an existing version**: It is possible to publish a docker image for an
50 | existing version by option 3 in the manual trigger. First, test build using
51 | option 2, and then publish using option 3.  The published image is tagged with
52 | SHA hash and the version number.  It is also tagged with `latest` if the version
53 | number is the latest. If option 3 is deployed with a version number for which
54 | a docker image already exists, it will simply rebuild and update the existing
55 | image.
56 | 
57 | **Deleting an image**: If you have owner access to `smithlabcode`, you can
58 | delete an image by going
59 | [here](https://github.com/smithlabcode/dnmtools/pkgs/container/dnmtools/versions)
60 | and manually delete a version.
61 | 
62 | 
63 | 
64 | ## Installation
65 | The image can be pulled by one of the following commands.
66 | 
67 | ```bash
68 | docker pull ghcr.io/smithlabcode/dnmtools:latest
69 | docker pull ghcr.io/smithlabcode/dnmtools:[7-DIGIT SHA]
70 | docker pull ghcr.io/smithlabcode/dnmtools:v[VERSION NUMBER] #(e.g. v1.4.2)
71 | ```
72 | 
73 | 


--------------------------------------------------------------------------------
/src/common/Interval6.hpp:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2025 Andrew D Smith
 2 |  *
 3 |  * This is free software; you can redistribute it and/or modify it under the
 4 |  * terms of the GNU General Public License as published by the Free Software
 5 |  * Foundation; either version 2 of the License, or (at your option) any later
 6 |  * version.
 7 |  *
 8 |  * This is distributed in the hope that it will be useful, but WITHOUT ANY
 9 |  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10 |  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
11 |  * details.
12 |  *
13 |  * You should have received a copy of the GNU General Public License along
14 |  * with this software; if not, write to the Free Software Foundation, Inc., 51
15 |  * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 |  */
17 | 
18 | #ifndef INTERVAL6_HPP_
19 | #define INTERVAL6_HPP_
20 | 
21 | #include <cstdint>
22 | //  #include <format> // ADS: needs c++20
23 | #include <iterator>  // std::size
24 | #include <sstream>
25 | #include <stdexcept>
26 | #include <string>
27 | #include <vector>
28 | 
// A six-field genomic interval: chromosome, start, stop, name, score and
// strand.
struct Interval6 {
  std::string chrom;
  std::uint32_t start{};
  std::uint32_t stop{};
  std::string name;
  double score{};
  char strand{};

  Interval6() = default;

  Interval6(const std::string &chrom, const std::uint32_t start,
            const std::uint32_t stop, const std::string &name,
            const double score, const char strand) :
    chrom{chrom},
    start{start},
    stop{stop},
    name{name},
    score{score},
    strand{strand} {}

  // Parse an interval from one line of text; throws std::runtime_error if
  // initialize() rejects the line.
  explicit Interval6(const std::string &line) {
    const char *const first = line.data();
    const char *const last = first + std::size(line);
    if (!initialize(first, last))
      throw std::runtime_error("bad interval6 line: " + line);
  }

  // Fill this object from the character range [first, last).  Returns false
  // when the text cannot be parsed.  Defined out of line.
  auto
  initialize(const char *, const char *) -> bool;

  // Lexicographic order on (chrom, start, stop); name, score and strand do
  // not take part in the comparison.
  auto
  operator<(const Interval6 &rhs) const {
    if (chrom != rhs.chrom)
      return chrom < rhs.chrom;
    if (start != rhs.start)
      return start < rhs.start;
    return stop < rhs.stop;
  }

  // auto
  // operator<=>(const Interval6 &) const = default;
};
61 | 
62 | inline auto
63 | operator<<(std::ostream &os, const Interval6 &x) -> std::ostream & {
64 |   return os << x.chrom << "\t" << x.start << "\t" << x.stop << "\t" << x.name
65 |             << "\t" << x.score << "\t" << x.strand;
66 | }
67 | 
68 | [[nodiscard]] inline auto
69 | to_string(const Interval6 &x) -> std::string {
70 |   std::ostringstream oss;
71 |   oss << x;
72 |   return oss.str();
73 |   // return x.chrom + "\t" + std::to_string(x.start) + "\t" +
74 |   //        std::to_string(x.stop) + "\t" + x.name + "\t" +
75 |   //        std::to_string(x.score) + "\t" + std::string(1, x.strand);
76 | }
77 | 
78 | // ADS: need to bump to c++20 for this
79 | //
80 | // template <> struct std::formatter<Interval6> : std::formatter<std::string> {
81 | //   auto
82 | //   format(const Interval6 &i, format_context &ctx) const {
83 | //     static constexpr auto fmt = "{}\t{}\t{}\t{}\t{:.6g}\t{}";
84 | //     return std::formatter<std::string>::format(
85 | //       std::format(fmt, i.chrom, i.start, i.stop, i.name, i.score, i.strand),
86 | //       ctx);
87 | //   }
88 | // };
89 | 
90 | [[nodiscard]] inline auto
91 | size(const Interval6 &x) {
92 |   return x.stop > x.start ? x.stop - x.start : 0ul;
93 | }
94 | 
95 | [[nodiscard]] auto
96 | read_intervals6(const std::string &intervals_file) -> std::vector<Interval6>;
97 | 
98 | #endif  // INTERVAL6_HPP_
99 | 


--------------------------------------------------------------------------------
/data/methylome_a.counts.sym:
--------------------------------------------------------------------------------
  1 | chr1	163	+	CpG	0.885371	2495
  2 | chr1	206	+	CpG	0.900059	3362
  3 | chr1	232	+	CpG	0.891898	4283
  4 | chr1	278	+	CpG	0.895936	4872
  5 | chr1	296	+	CpG	0.904536	5070
  6 | chr1	310	+	CpG	0.900655	5194
  7 | chr1	322	+	CpG	0.102944	5333
  8 | chr1	324	+	CpG	0.0979768	5338
  9 | chr1	350	+	CpG	0.0992214	5523
 10 | chr1	356	+	CpG	0.0980427	5569
 11 | chr1	358	+	CpG	0.0944375	5591
 12 | chr1	367	+	CpG	0.0983925	5661
 13 | chr1	388	+	CpG	0.100379	5808
 14 | chr1	402	+	CpG	0.894009	5859
 15 | chr1	404	+	CpG	0.898489	5891
 16 | chr1	422	+	CpG	0.890427	5996
 17 | chr1	434	+	CpG	0.891272	6061
 18 | chr1	442	+	CpG	0.890658	6091
 19 | chr1	448	+	CpG	0.896047	6147
 20 | chr1	461	+	CpG	0.893198	6189
 21 | chr1	467	+	CpG	0.895397	6214
 22 | chr1	473	+	CpG	0.890295	6244
 23 | chr1	485	+	CpG	0.896256	6304
 24 | chr1	488	+	CpG	0.897663	6332
 25 | chr1	496	+	CpG	0.896302	6355
 26 | chr1	502	+	CpG	0.895712	6367
 27 | chr1	514	+	CpG	0.896622	6365
 28 | chr1	517	+	CpG	0.895009	6372
 29 | chr1	520	+	CpG	0.892313	6361
 30 | chr1	522	+	CpG	0.894836	6352
 31 | chr1	535	+	CpG	0.893348	6404
 32 | chr1	537	+	CpG	0.900701	6415
 33 | chr1	540	+	CpG	0.898191	6414
 34 | chr1	564	+	CpG	0.893356	6367
 35 | chr1	569	+	CpG	0.89719	6371
 36 | chr1	572	+	CpG	0.89482	6332
 37 | chr1	577	+	CpG	0.892193	6289
 38 | chr1	583	+	CpG	0.894065	6268
 39 | chr1	585	+	CpG	0.894627	6254
 40 | chr1	588	+	CpG	0.896248	6236
 41 | chr1	594	+	CpG	0.896346	6213
 42 | chr1	602	+	CpG	0.893856	6152
 43 | chr1	606	+	CpG	0.900572	6115
 44 | chr1	609	+	CpG	0.889762	6105
 45 | chr1	612	+	CpG	0.90954	6069
 46 | chr1	617	+	CpG	0.89103	6020
 47 | chr1	620	+	CpG	0.897577	5985
 48 | chr1	631	+	CpG	0.896323	5874
 49 | chr1	633	+	CpG	0.895214	5850
 50 | chr1	642	+	CpG	0.900296	5737
 51 | chr1	650	+	CpG	0.902435	5709
 52 | chr1	654	+	CpG	0.896709	5683
 53 | chr1	660	+	CpG	0.897639	5676
 54 | chr1	665	+	CpG	0.886054	5643
 55 | chr1	673	+	CpG	0.900411	5593
 56 | chr1	679	+	CpG	0.892864	5535
 57 | chr1	681	+	CpG	0.895913	5505
 58 | chr1	684	+	CpG	0.906811	5462
 59 | chr1	702	+	CpG	0.893238	5339
 60 | chr1	705	+	CpG	0.893273	5322
 61 | chr1	708	+	CpG	0.89059	5292
 62 | chr1	710	+	CpG	0.895027	5268
 63 | chr1	713	+	CpG	0.896526	5267
 64 | chr1	729	+	CpG	0.891296	5170
 65 | chr1	731	+	CpG	0.894326	5129
 66 | chr1	737	+	CpG	0.101157	5101
 67 | chr1	745	+	CpG	0.098996	4980
 68 | chr1	755	+	CpG	0.103188	4768
 69 | chr1	757	+	CpG	0.0993447	4731
 70 | chr1	760	+	CpG	0.0984832	4681
 71 | chr1	766	+	CpG	0.100824	4612
 72 | chr1	779	+	CpG	0.097355	4499
 73 | chr1	785	+	CpG	0.104054	4440
 74 | chr1	787	+	CpG	0.0980481	4406
 75 | chr1	792	+	CpG	0.104547	4333
 76 | chr1	799	+	CpG	0.0990355	4251
 77 | chr1	801	+	CpG	0.0969194	4220
 78 | chr1	804	+	CpG	0.884496	4199
 79 | chr1	816	+	CpG	0.89358	4003
 80 | chr1	824	+	CpG	0.893299	3880
 81 | chr1	828	+	CpG	0.892152	3848
 82 | chr1	831	+	CpG	0.890568	3838
 83 | chr1	834	+	CpG	0.891522	3798
 84 | chr1	839	+	CpG	0.897553	3719
 85 | chr1	845	+	CpG	0.899183	3670
 86 | chr1	853	+	CpG	0.898612	3531
 87 | chr1	857	+	CpG	0.900296	3380
 88 | chr1	860	+	CpG	0.896175	3294
 89 | chr1	863	+	CpG	0.892756	3189
 90 | chr1	868	+	CpG	0.891703	3001
 91 | chr1	874	+	CpG	0.886834	2757
 92 | chr1	882	+	CpG	0.907975	2445
 93 | chr1	886	+	CpG	0.880694	2305
 94 | chr1	889	+	CpG	0.882969	2196
 95 | chr1	892	+	CpG	0.896952	2067
 96 | chr1	894	+	CpG	0.889332	2006
 97 | chr1	897	+	CpG	0.886603	1896
 98 | chr1	903	+	CpG	0.896429	1680
 99 | chr1	911	+	CpG	0.881223	1406
100 | chr1	915	+	CpG	0.868526	1255
101 | chr1	918	+	CpG	0.887457	1164
102 | chr1	921	+	CpG	0.887417	1057
103 | chr1	923	+	CpG	0.872802	967
104 | chr1	926	+	CpG	0.875887	846
105 | chr1	932	+	CpG	0.88853	619
106 | chr1	940	+	CpG	0.865714	350
107 | chr1	944	+	CpG	0.884058	207
108 | chr1	947	+	CpG	0.708738	103
109 | 


--------------------------------------------------------------------------------
/data/methylome_b.counts.sym:
--------------------------------------------------------------------------------
  1 | chr1	163	+	CpG	0.896375	2538
  2 | chr1	206	+	CpG	0.897481	3414
  3 | chr1	232	+	CpG	0.888131	4398
  4 | chr1	278	+	CpG	0.894575	4866
  5 | chr1	296	+	CpG	0.892725	5127
  6 | chr1	310	+	CpG	0.893081	5275
  7 | chr1	322	+	CpG	0.899683	5363
  8 | chr1	324	+	CpG	0.892228	5391
  9 | chr1	350	+	CpG	0.898259	5573
 10 | chr1	356	+	CpG	0.884881	5655
 11 | chr1	358	+	CpG	0.899364	5664
 12 | chr1	367	+	CpG	0.887803	5731
 13 | chr1	388	+	CpG	0.8979	5906
 14 | chr1	402	+	CpG	0.887257	6058
 15 | chr1	404	+	CpG	0.889255	6077
 16 | chr1	422	+	CpG	0.892903	6200
 17 | chr1	434	+	CpG	0.898734	6320
 18 | chr1	442	+	CpG	0.896819	6319
 19 | chr1	448	+	CpG	0.89785	6373
 20 | chr1	461	+	CpG	0.105519	6378
 21 | chr1	467	+	CpG	0.0939797	6395
 22 | chr1	473	+	CpG	0.0959203	6422
 23 | chr1	485	+	CpG	0.0894118	6375
 24 | chr1	488	+	CpG	0.101708	6381
 25 | chr1	496	+	CpG	0.0996085	6385
 26 | chr1	502	+	CpG	0.0979121	6322
 27 | chr1	514	+	CpG	0.100904	6303
 28 | chr1	517	+	CpG	0.0972134	6316
 29 | chr1	520	+	CpG	0.0994906	6282
 30 | chr1	522	+	CpG	0.0937997	6290
 31 | chr1	535	+	CpG	0.100511	6258
 32 | chr1	537	+	CpG	0.090749	6248
 33 | chr1	540	+	CpG	0.103028	6241
 34 | chr1	564	+	CpG	0.0994068	6237
 35 | chr1	569	+	CpG	0.100256	6244
 36 | chr1	572	+	CpG	0.0916251	6221
 37 | chr1	577	+	CpG	0.0930195	6203
 38 | chr1	583	+	CpG	0.101648	6188
 39 | chr1	585	+	CpG	0.0988014	6174
 40 | chr1	588	+	CpG	0.888853	6154
 41 | chr1	594	+	CpG	0.895356	6116
 42 | chr1	602	+	CpG	0.892945	6109
 43 | chr1	606	+	CpG	0.894495	6085
 44 | chr1	609	+	CpG	0.899095	6075
 45 | chr1	612	+	CpG	0.906008	6075
 46 | chr1	617	+	CpG	0.890079	6068
 47 | chr1	620	+	CpG	0.893045	6068
 48 | chr1	631	+	CpG	0.896924	6015
 49 | chr1	633	+	CpG	0.906344	5990
 50 | chr1	642	+	CpG	0.891856	5992
 51 | chr1	650	+	CpG	0.895422	5919
 52 | chr1	654	+	CpG	0.892596	5875
 53 | chr1	660	+	CpG	0.901209	5790
 54 | chr1	665	+	CpG	0.892863	5731
 55 | chr1	673	+	CpG	0.910198	5668
 56 | chr1	679	+	CpG	0.897883	5621
 57 | chr1	681	+	CpG	0.889067	5607
 58 | chr1	684	+	CpG	0.89372	5589
 59 | chr1	702	+	CpG	0.895077	5423
 60 | chr1	705	+	CpG	0.892007	5380
 61 | chr1	708	+	CpG	0.895394	5363
 62 | chr1	710	+	CpG	0.894124	5327
 63 | chr1	713	+	CpG	0.891525	5310
 64 | chr1	729	+	CpG	0.892229	5057
 65 | chr1	731	+	CpG	0.901375	5019
 66 | chr1	737	+	CpG	0.889204	4937
 67 | chr1	745	+	CpG	0.892381	4804
 68 | chr1	755	+	CpG	0.898156	4664
 69 | chr1	757	+	CpG	0.888985	4648
 70 | chr1	760	+	CpG	0.893792	4623
 71 | chr1	766	+	CpG	0.900198	4539
 72 | chr1	779	+	CpG	0.897518	4352
 73 | chr1	785	+	CpG	0.89578	4289
 74 | chr1	787	+	CpG	0.895231	4257
 75 | chr1	792	+	CpG	0.897337	4169
 76 | chr1	799	+	CpG	0.898918	4066
 77 | chr1	801	+	CpG	0.899803	4052
 78 | chr1	804	+	CpG	0.897532	4011
 79 | chr1	816	+	CpG	0.895012	3829
 80 | chr1	824	+	CpG	0.903985	3739
 81 | chr1	828	+	CpG	0.898031	3707
 82 | chr1	831	+	CpG	0.892002	3676
 83 | chr1	834	+	CpG	0.905847	3643
 84 | chr1	839	+	CpG	0.889659	3607
 85 | chr1	845	+	CpG	0.893179	3548
 86 | chr1	853	+	CpG	0.889873	3387
 87 | chr1	857	+	CpG	0.887658	3249
 88 | chr1	860	+	CpG	0.89147	3142
 89 | chr1	863	+	CpG	0.889328	3036
 90 | chr1	868	+	CpG	0.89277	2863
 91 | chr1	874	+	CpG	0.892424	2640
 92 | chr1	882	+	CpG	0.899573	2340
 93 | chr1	886	+	CpG	0.890511	2192
 94 | chr1	889	+	CpG	0.892601	2095
 95 | chr1	892	+	CpG	0.898949	1999
 96 | chr1	894	+	CpG	0.891316	1923
 97 | chr1	897	+	CpG	0.873894	1808
 98 | chr1	903	+	CpG	0.893949	1603
 99 | chr1	911	+	CpG	0.883738	1359
100 | chr1	915	+	CpG	0.870968	1209
101 | chr1	918	+	CpG	0.889488	1113
102 | chr1	921	+	CpG	0.892644	1006
103 | chr1	923	+	CpG	0.884289	942
104 | chr1	926	+	CpG	0.890361	830
105 | chr1	932	+	CpG	0.893142	627
106 | chr1	940	+	CpG	0.830357	336
107 | chr1	944	+	CpG	0.0909091	187
108 | chr1	947	+	CpG	0.120482	83
109 | 


--------------------------------------------------------------------------------
/src/common/EpireadStats.hpp:
--------------------------------------------------------------------------------
  1 | /* Copyright (C) 2011-2022 University of Southern California and
  2 |  *                    Andrew D. Smith and Fang Fang
  3 |  *
  4 |  * Authors: Fang Fang and Andrew D. Smith
  5 |  *
  6 |  * This program is free software: you can redistribute it and/or
  7 |  * modify it under the terms of the GNU General Public License as
  8 |  * published by the Free Software Foundation, either version 3 of the
  9 |  * License, or (at your option) any later version.
 10 |  *
 11 |  * This program is distributed in the hope that it will be useful, but
 12 |  * WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 |  * General Public License for more details.
 15 |  */
 16 | 
 17 | #ifndef EPIREAD_STATS
 18 | #define EPIREAD_STATS
 19 | 
 20 | #include <cstddef>
 21 | #include <cstdint>
 22 | #include <iterator>
 23 | #include <string>
 24 | #include <vector>
 25 | 
 26 | struct small_epiread {  // a start index `pos` together with a string `seq`
 27 |   std::uint32_t pos{};
 28 |   std::string seq{};
 29 | 
 30 |   small_epiread(const std::uint32_t p, const std::string &s) : pos{p}, seq{s} {}
 31 | 
 32 |   std::size_t
 33 |   size() const {  // number of characters in seq
 34 |     return seq.size();
 35 |   }
 36 | 
 37 |   std::uint32_t
 38 |   length() const {  // size(), narrowed to the type of pos
 39 |     return static_cast<std::uint32_t>(size());
 40 |   }
 41 | 
 42 |   std::uint32_t
 43 |   end() const {  // pos + length(): one past the last index covered
 44 |     return pos + length();
 45 |   }
 46 | };
 47 | 
 48 | double
 49 | log_likelihood(const small_epiread &r, const std::vector<double> &a);
 50 | 
 51 | void
 52 | fit_epiallele(const std::vector<small_epiread> &reads,
 53 |               const std::vector<double> &indicators, std::vector<double> &a);
 54 | double
 55 | fit_single_epiallele(const std::vector<small_epiread> &reads,
 56 |                      std::vector<double> &a);
 57 | 
 58 | double
 59 | log_likelihood(const small_epiread &r, const double z,
 60 |                const std::vector<double> &a1, const std::vector<double> &a2);
 61 | double
 62 | log_likelihood(const small_epiread &r, const std::vector<double> &a1,
 63 |                const std::vector<double> &a2);
 64 | double
 65 | log_likelihood(const std::vector<small_epiread> &reads,
 66 |                const std::vector<double> &indicators,
 67 |                const std::vector<double> &a1, const std::vector<double> &a2);
 68 | 
 69 | double
 70 | resolve_epialleles(const size_t max_itr,
 71 |                    const std::vector<small_epiread> &reads,
 72 |                    std::vector<double> &indicators, std::vector<double> &a1,
 73 |                    std::vector<double> &a2);
 74 | 
 75 | double
 76 | test_asm_lrt(const size_t max_itr, const bool crct_for_read_count,
 77 |              const double low_prob, const double high_prob,
 78 |              std::vector<small_epiread> &reads);
 79 | 
 80 | double
 81 | test_asm_bic(const size_t max_itr, const bool crct_for_read_count,
 82 |              const double low_prob, const double high_prob,
 83 |              std::vector<small_epiread> &reads);
 84 | 
 85 | struct EpireadStats {
 86 |   // Score reads with the BIC or LRT test (per use_bic) and flag significance.
 87 |   double
 88 |   test_asm(std::vector<small_epiread> &reads, bool &is_significant) const {
 89 |     const auto run_test = use_bic ? test_asm_bic : test_asm_lrt;
 90 |     const double score =
 91 |       run_test(max_itr, crct_for_read_count, low_prob, high_prob, reads);
 92 |     is_significant = score < (use_bic ? 0.0 : critical_value);
 93 |     return score;
 94 |   }
 95 | 
 96 |   double low_prob{0.25};
 97 |   double high_prob{0.75};
 98 |   double critical_value{0.01};
 99 |   size_t max_itr{10};
100 |   bool use_bic{false};
101 |   bool crct_for_read_count{true};
102 | };
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/src/common/Interval6.cpp:
--------------------------------------------------------------------------------
  1 | /* Copyright (C) 2025 Andrew D Smith
  2 |  *
  3 |  * This is free software; you can redistribute it and/or modify it under the
  4 |  * terms of the GNU General Public License as published by the Free Software
  5 |  * Foundation; either version 2 of the License, or (at your option) any later
  6 |  * version.
  7 |  *
  8 |  * This is distributed in the hope that it will be useful, but WITHOUT ANY
  9 |  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 10 |  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 11 |  * details.
 12 |  *
 13 |  * You should have received a copy of the GNU General Public License along
 14 |  * with this software; if not, write to the Free Software Foundation, Inc., 51
 15 |  * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 16 |  */
 17 | 
 18 | #include "Interval6.hpp"
 19 | 
 20 | #include <algorithm>
 21 | #include <charconv>
 22 | #include <cstdint>
 23 | #include <fstream>
 24 | #include <stdexcept>
 25 | #include <string>
 26 | #include <system_error>
 27 | #include <vector>
 28 | 
 29 | auto
 30 | Interval6::initialize(const char *c, const char *c_end) -> bool {
 31 |   // Parse one six-column BED line held in [c, c_end): chrom, start, stop,
 32 |   // name, score, strand, separated by runs of spaces and/or tabs. Returns
 33 |   // true on success. Returns false at the first missing or malformed field;
 34 |   // members for later fields are then left as they were.
 35 |   constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; };
 36 |   constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; };
 37 | 
 38 |   // An empty range has no chrom; stop before forming c + 1 past c_end.
 39 |   if (c == c_end)
 40 |     return false;
 41 | 
 42 |   // NOLINTBEGIN(*-pointer-arithmetic)
 43 |   auto field_s = c;
 44 |   auto field_e = std::find_if(field_s + 1, c_end, is_sep);
 45 | 
 46 |   // Move [field_s, field_e) to the next field, or report that none is left.
 47 |   // The search starts at field_e, not field_e + 1: field_e is a separator
 48 |   // or c_end, so the same characters are skipped and c_end is never passed.
 49 |   const auto next_field = [&] {
 50 |     field_s = std::find_if(field_e, c_end, not_sep);
 51 |     field_e = std::find_if(field_s, c_end, is_sep);
 52 |     return field_s != c_end;
 53 |   };
 54 | 
 55 |   // chrom (keeps the first character even when it is a separator)
 56 |   chrom.assign(field_s, field_e);
 57 | 
 58 |   // start: ec also catches values too large for the member's type
 59 |   if (!next_field())
 60 |     return false;
 61 |   if (std::from_chars(field_s, field_e, start).ec != std::errc{})
 62 |     return false;
 63 | 
 64 |   // stop
 65 |   if (!next_field())
 66 |     return false;
 67 |   if (std::from_chars(field_s, field_e, stop).ec != std::errc{})
 68 |     return false;
 69 | 
 70 |   // name
 71 |   if (!next_field())
 72 |     return false;
 73 |   name.assign(field_s, field_e);
 74 | 
 75 |   // score
 76 |   if (!next_field())
 77 |     return false;
 78 |   // NOTE(review): sscanf needs NUL-terminated input -- confirm for callers
 79 | #ifdef __APPLE__
 80 |   if (std::sscanf(field_s, "%lf", &score) < 1)
 81 |     return false;
 82 | #else
 83 |   if (std::from_chars(field_s, field_e, score).ec != std::errc{})
 84 |     return false;
 85 | #endif
 86 | 
 87 |   // strand (no stop; just one char and maybe end of line)
 88 |   if (!next_field())
 89 |     return false;
 90 |   strand = *field_s;
 91 |   // NOLINTEND(*-pointer-arithmetic)
 92 | 
 93 |   return strand == '-' || strand == '+';
 94 | }
 95 | 
 96 | // Read the file line by line, building one Interval6 from each line.
 97 | [[nodiscard]] auto
 98 | read_intervals6(const std::string &intervals_file) -> std::vector<Interval6> {
 99 |   std::ifstream bed_in(intervals_file);
100 |   if (!bed_in)
101 |     throw std::runtime_error("failed to open file: " + intervals_file);
102 |   std::vector<Interval6> parsed;
103 |   for (std::string line; std::getline(bed_in, line);)
104 |     parsed.emplace_back(line);
105 |   return parsed;
106 | }
107 | 


--------------------------------------------------------------------------------
/src/common/ThreeStateHMM.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Copyright (C) 2011-2023 University of Southern California
  3 |                           Andrew D. Smith and Song Qiang
  4 | 
  5 |   Authors: Andrew D. Smith, Song Qiang
  6 | 
  7 |   This file is part of dnmtools.
  8 | 
  9 |   dnmtools is free software; you can redistribute it and/or modify
 10 |   it under the terms of the GNU General Public License as published by
 11 |   the Free Software Foundation; either version 2 of the License, or
 12 |   (at your option) any later version.
 13 | 
 14 |   dnmtools is distributed in the hope that it will be useful,
 15 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17 |   GNU General Public License for more details.
 18 | */
 19 | 
 20 | #ifndef THREE_STATE_HMM_HPP
 21 | #define THREE_STATE_HMM_HPP
 22 | 
 23 | #include "BetaBin.hpp"
 24 | 
 25 | #include <cstddef>
 26 | #include <iterator>
 27 | #include <iterator>  // IWYU pragma: keep
 28 | #include <utility>
 29 | #include <vector>
 30 | 
 31 | enum STATE_LABELS { hypo, HYPER, HYPO };  // values 0, 1, 2 in this order
 32 | 
 33 | struct Triplet {  // three doubles, named like the STATE_LABELS enumerators
 34 |   double hypo, HYPER, HYPO;
 35 | };
 36 | 
 37 | class ThreeStateHMM {  // declarations only; definitions are not in this header
 38 | public:
 39 |   ThreeStateHMM(std::vector<std::pair<double, double>> &obs,
 40 |                 const std::vector<size_t> &res, const double tol,
 41 |                 const size_t max_itr, const bool v);
 42 | 
 43 |   void
 44 |   set_parameters(const betabin &hypo_em, const betabin &HYPER_em,
 45 |                  const betabin &HYPO_em,
 46 |                  const std::vector<std::vector<double>> &tr);
 47 | 
 48 |   void
 49 |   get_parameters(betabin &hypo_em, betabin &HYPER_em, betabin &HYPO_em,
 50 |                  std::vector<std::vector<double>> &tr) const;
 51 | 
 52 |   double
 53 |   BaumWelchTraining();
 54 | 
 55 |   double
 56 |   PosteriorDecoding();
 57 | 
 58 |   double
 59 |   ViterbiDecoding();
 60 | 
 61 |   void
 62 |   get_state_posteriors(std::vector<Triplet> &scores) const;
 63 | 
 64 |   void
 65 |   get_classes(std::vector<STATE_LABELS> &classes) const;
 66 | 
 67 |   // private:  -- left commented out, so everything below is public as well
 68 | 
 69 |   //////////// methods ////////////
 70 |   double
 71 |   single_iteration();
 72 |   double
 73 |   forward_algorithm(const size_t start, const size_t end);
 74 |   double
 75 |   backward_algorithm(const size_t start, const size_t end);
 76 |   double
 77 |   hypo_segment_log_likelihood(const size_t start, const size_t end);
 78 |   double
 79 |   HYPER_segment_log_likelihood(const size_t start, const size_t end);
 80 |   double
 81 |   HYPO_segment_log_likelihood(const size_t start, const size_t end);
 82 | 
 83 |   void
 84 |   estimate_state_posterior(const size_t start, const size_t end);
 85 |   void
 86 |   estimate_posterior_trans_prob(const size_t start, const size_t end);
 87 |   void
 88 |   estimate_parameters();
 89 |   void
 90 |   update_observation_likelihood();
 91 | 
 92 |   double  // range overload of the argument-free ViterbiDecoding() above
 93 |   ViterbiDecoding(const size_t start, const size_t end);
 94 | 
 95 |   std::vector<std::pair<double, double>> observations;
 96 |   std::vector<size_t> reset_points;
 97 |   std::vector<double> meth_lp, unmeth_lp;
 98 |   std::vector<double> hypo_log_likelihood, HYPER_log_likelihood,
 99 |     HYPO_log_likelihood;
100 | 
101 |   //  HMM internal data: one betabin emission object per state name
102 |   betabin hypo_emission, HYPER_emission, HYPO_emission;
103 | 
104 |   Triplet lp_start{};
105 |   Triplet lp_end{};
106 |   std::vector<std::vector<double>> trans;
107 | 
108 |   std::vector<Triplet> forward;
109 |   std::vector<Triplet> backward;
110 |   std::vector<double> hypo_posteriors, HYPER_posteriors, HYPO_posteriors;
111 |   std::vector<double> hypo_hypo, hypo_HYPER, HYPER_hypo, HYPER_HYPER,
112 |     HYPER_HYPO, HYPO_HYPER, HYPO_HYPO;
113 | 
114 |   // result (presumably what get_classes/get_state_posteriors copy out)
115 |   std::vector<STATE_LABELS> classes;
116 |   std::vector<Triplet> state_posteriors;
117 | 
118 |   // parameters (presumably the constructor's tol, max_itr and v -- confirm)
119 |   double tolerance{};
120 |   size_t max_iterations{};
121 |   bool VERBOSE{};
122 | };
123 | 
124 | #endif
125 | 


--------------------------------------------------------------------------------
/src/utils/lift-filter.cpp:
--------------------------------------------------------------------------------
  1 | /* lift-filter: process lift results
  2 |  *
  3 |  * Copyright (C) 2014-2022 University of Southern California and
  4 |  *                    Andrew D. Smith
  5 |  *
  6 |  * Authors: Jenny Qu
  7 |  *
  8 |  * This program is free software: you can redistribute it and/or modify
  9 |  * it under the terms of the GNU General Public License as published by
 10 |  * the Free Software Foundation, either version 3 of the License, or
 11 |  * (at your option) any later version.
 12 |  *
 13 |  * This program is distributed in the hope that it will be useful,
 14 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 15 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 16 |  * GNU General Public License for more details.
 17 |  */
 18 | 
 19 | #include "MSite.hpp"
 20 | #include "OptionParser.hpp"
 21 | 
 22 | #include <cstdlib>
 23 | #include <fstream>
 24 | #include <iostream>
 25 | #include <new>
 26 | #include <stdexcept>
 27 | #include <string>
 28 | #include <vector>
 29 | 
 30 | using std::cerr;
 31 | using std::endl;
 32 | using std::runtime_error;
 33 | using std::string;
 34 | using std::vector;
 35 | 
 36 | static bool  // true only when pos, chrom and strand all match
 37 | same_chrom_pos_strand(const MSite &a, const MSite &b) {
 38 |   return !(a.pos != b.pos || a.chrom != b.chrom || a.strand != b.strand);
 39 | }
 40 | 
 41 | int  // entry point of the lift-filter command; returns a process exit status
 42 | main_lift_filter(int argc, char *argv[]) {  // NOLINT(*-avoid-c-arrays)
 43 |   try {
 44 |     string pfile;
 45 |     bool VERBOSE = false;
 46 |     bool UNIQUE = false;
 47 | 
 48 |     /****************** COMMAND LINE OPTIONS ********************/
 49 |     OptionParser opt_parse(argv[0],  // NOLINT(*-pointer-arithmetic)
 50 |                            "Process duplicated sites from fast-liftover output",
 51 |                            "<methcount file>");
 52 |     opt_parse.add_opt("output", 'o', "Output processed methcount", true, pfile);
 53 |     opt_parse.add_opt("unique", 'u', "keep unique sites", false, UNIQUE);
 54 |     opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE);
 55 | 
 56 |     vector<string> leftover_args;
 57 |     opt_parse.parse(argc, argv, leftover_args);
 58 |     if (argc == 1 || opt_parse.help_requested()) {
 59 |       cerr << opt_parse.help_message() << '\n';
 60 |       return EXIT_SUCCESS;
 61 |     }
 62 |     if (opt_parse.about_requested()) {
 63 |       cerr << opt_parse.about_message() << '\n';
 64 |       return EXIT_SUCCESS;
 65 |     }
 66 |     if (opt_parse.option_missing()) {
 67 |       cerr << opt_parse.option_missing_message() << '\n';
 68 |       return EXIT_FAILURE;  // a required option was not given
 69 |     }
 70 |     if (leftover_args.empty()) {
 71 |       cerr << opt_parse.help_message() << '\n';
 72 |       return EXIT_FAILURE;  // no input file was named
 73 |     }
 74 |     const string mfile(leftover_args.front());
 75 |     /****************** END COMMAND LINE OPTIONS *****************/
 76 | 
 77 |     std::ifstream in(mfile);
 78 |     if (!in)
 79 |       throw runtime_error("cannot open input file: " + mfile);
 80 | 
 81 |     std::ofstream out(pfile);
 82 |     // fail loudly instead of silently discarding every record
 83 |     if (!out)
 84 |       throw runtime_error("cannot open output file: " + pfile);
 85 |     // Consecutive records sharing chrom, pos and strand are merged with
 86 |     // MSite::add; with -u, any site that occurred more than once is dropped.
 87 |     MSite curr_site;
 88 |     if (!(in >> curr_site))
 89 |       throw runtime_error("failed reading: " + mfile);
 90 | 
 91 |     MSite next_site;
 92 |     bool site_is_unique = true;
 93 |     while (in >> next_site) {
 94 |       if (same_chrom_pos_strand(curr_site, next_site)) {
 95 |         site_is_unique = false;
 96 |         curr_site.add(next_site);
 97 |       }
 98 |       else {
 99 |         if (!UNIQUE || site_is_unique)
100 |           out << curr_site << '\n';
101 |         site_is_unique = true;
102 |         curr_site = next_site;
103 |       }
104 |     }
105 |     if (!UNIQUE || site_is_unique)
106 |       out << curr_site << '\n';
107 |   }
108 |   catch (const std::exception &e) {
109 |     std::cerr << e.what() << '\n';
110 |     return EXIT_FAILURE;
111 |   }
112 |   return EXIT_SUCCESS;
113 | }
114 | 


--------------------------------------------------------------------------------
/docs/content/sym.md:
--------------------------------------------------------------------------------
 1 | # sym - collapse counts for symmetric CpG sites
 2 | 
 3 | ## Synopsis
 4 | ```console
 5 | $ dnmtools sym [OPTIONS] <input.meth>
 6 | ```
 7 | 
 8 | ## Description
 9 | 
10 | Many of our tools were designed for data from vertebrate species. In these
11 | species, the methylation levels at CpG sites tend to be symmetric,
12 | the same on each strand. Of course there are exceptions. But in many
13 | analysis settings, combining data from both strands for the same CpG
14 | site is a good idea. Assume you have output from
15 | [counts](../counts). The `sym` command will merge data on both strands
16 | for each CpG site. It takes files having the same format as output by
17 | counts with either all cytosines or CpGs only (generated with `-n`
18 | option when running counts).
19 | ```console
20 | $ dnmtools sym -o human_esc_CpG.meth human_esc.meth
21 | ```
22 | The above command will merge all CpG pairs while also discarding sites
23 | with an indication that the CpG has mutated. Note that as long as one
24 | site of the pair is mutated, the pair is discarded. This is the
25 | default mode. If you want to keep those mutated sites, run the
26 | following:
27 | ```console
28 | $ dnmtools sym -m -o human_esc_CpG.meth human_esc.meth
29 | ```
30 | 
31 | Here is an example to show what `sym` actually does with the data.
32 | First, the following is several lines of output generated by
33 | [counts](../counts). This partial output includes sites in multiple
34 | contexts, and among them 4 are CpG sites:
35 | 
36 | ```txt
37 | chr10        11473        +        CHH        0          3
38 | chr10        11474        +        CXG        0          13
39 | chr10        11476        -        CXG        0          22
40 | chr10        11477        +        CpG        0.181818   11
41 | chr10        11478        -        CpG        0.391304   23
42 | chr10        11479        -        CCG        0          22
43 | chr10        11481        -        CHH        0          23
44 | chr10        11483        +        CCG        0          11
45 | chr10        11484        +        CpG        0.909091   11
46 | chr10        11485        -        CpG        0.913043   23
47 | chr10        11486        -        CCG        0          19
48 | chr10        11487        -        CHH        0          20
49 | chr10        11489        -        CHH        0.105263   19
50 | ```
51 | 
52 | The first CpG site above is at position 11477 on chr10, and there is
53 | another one immediately following it on the opposite strand. These are
54 | the two C in the same CpG site. The first one is covered by 11 reads,
55 | and among those 2 indicate methylation (a C in the reads). This is
56 | obtained by 0.181818 x 11. The next CpG has a "-" for the strand, so
57 | it refers to the G on the positive reference strand, which is the same
58 | as the C on the opposite strand for that site. This one is covered by
59 | 23 reads, 9 of which indicate methylation (0.391304 x 23). For this
60 | one CpG dinucleotide, the total methylation observations are 2 + 9 =
61 | 11, and the total reads are 11 + 23 = 34. Therefore, the methylation
62 | level for the dinucleotide is 11/34 = 0.3235294. The `sym` command
63 | would produce the following:
64 | 
65 | ```txt
66 | chr10   11477   +   CpG 0.323529    34
67 | chr10   11484   +   CpG 0.911765    34
68 | ```
69 | 
70 | By chance, the other CpG site in the partial output above had the same
71 | number, 34, of reads covering the site when counting both
72 | strands. Notice that non-CpG sites are removed. Your input/output
73 | might look slightly different in your terminal, as the format involves
74 | tabs and not spaces.
75 | 
76 | ## Options
77 | 
78 | ```txt
79 | -o, -output
80 | ```
81 | The name of the output file (default: stdout). The format is
82 | the same as output by [counts](../counts).
83 | 
84 | ```txt
85 | -m, -muts
86 | ```
87 | Include mutated CpG sites among the output, i.e. entries with an "x"
88 | terminating the fourth column of each line of input.
89 | 
90 | ```txt
91 | -v, -verbose
92 | ```
93 | Report more information while the program is running.
94 | 


--------------------------------------------------------------------------------
/docs/content/fastlift.md:
--------------------------------------------------------------------------------
  1 | # fastlift - Mapping methylomes between species
  2 | 
  3 | ## Synopsis
  4 | ```shell
  5 | $ dnmtools fastlift -i <input.index> -f <input.from> -t <output.to>
  6 | ```
  7 | 
  8 | ## Description
  9 | 
 10 | Mapping methylomes between species builds on the
 11 | [liftOver tool](http://genome.ucsc.edu/cgi-bin/hgLiftOver) provided by
 12 | [UCSC Genome Browser](https://genome.ucsc.edu). However it is time
 13 | consuming to convert each methcounts output file from one assembly
 14 | to another using the UCSC liftOver tool, given that they all should
 15 | have the same locations but different read counts. Therefore, we use
 16 | liftOver to generate an index file between two assemblies, and provide
 17 | the `fast-liftover` tool.  Suppose we have downloaded the `liftOver` tool
 18 | and the chain file `mm9ToHg19.over.chain.gz` from the UCSC Genome
 19 | Browser website, and that we have a methcounts file `mm9.meth` of
 20 | CpG sites or all cytosines in mm9.  Entries in `mm9.meth`
 21 | look like
 22 | 
 23 | ```txt
 24 | chr1  3005765  +  CpG  0.166667   6
 25 | chr1  3005846  +  CpG  0.5        10
 26 | chr1  3005927  +  CpG  0          9
 27 | ```
 28 | 
 29 | We would like to lift it over to the human genome hg19, and generate
 30 | an index file `mm9-hg19.index` to facilitate later lift-over
 31 | operations from mm9 to hg19, and keep a record of unlifted mm9
 32 | cytosine positions in the file `mm9-hg19.unlifted`. First, convert the
 33 | [counts](../counts) file `mm9.meth` to the
 34 | BED file `mm9-cpg.bed` file for liftOver using the following command.
 35 | 
 36 | ```shell
 37 | $ awk '{print $1"\t"$2"\t"$2+1"\t",$1":"$2":"$2+1":+\t0\t+"}' mm9.meth >mm9-cpg.bed
 38 | ```
 39 | 
 40 | The output file `mm9-cpg.bed` should look like this:
 41 | 
 42 | ```txt
 43 | chr1  3005765   3005766  chr1:3005765:3005766:+  0  +
 44 | chr1  3005846   3005847  chr1:3005846:3005847:+  0  +
 45 | chr1  3005927   3005928  chr1:3005927:3005928:+  0  +
 46 | ```
 47 | 
 48 | Note that the fourth column is the genomic location data linked with
 49 | colons.
 50 | 
 51 | Then, run UCSC Genome Browser tool `liftOver` as follows:
 52 | 
 53 | ```shell
 54 | $ liftOver mm9-cpg.bed mm9ToHg19.over.chain.gz mm9-hg19.index mm9-hg19.unlifted
 55 | ```
 56 | 
 57 | The generated index file `mm9-hg19.index` will be a BED format file in
 58 | hg19 coordinates, with entries like
 59 | 
 60 | ```txt
 61 | chr8    56539820        56539821  chr1:3005765:3005766:+        0       -
 62 | chr8    56539547        56539548  chr1:3005846:3005847:+        0       -
 63 | chr8    56539209        56539210  chr1:3005927:3005928:+        0       -
 64 | ```
 65 | 
 66 | where the 4th column contains the genomic position of the cytosine
 67 | site in mm9 coordinates.
 68 | 
 69 | Next, convert the file `mm9-hg19.index` to a tab-separated input to be
 70 | passed onto the fast-liftover tool as follows.
 71 | 
 72 | ```shell
 73 | $ tr ':' '\t' <mm9-hg19.index | awk '{print $4"\t"$5"\t"$1"\t"$2"\t"$9}' >mm9-hg19-fastliftover.index
 74 | ```
 75 | 
 76 | After the index file is converted, we can use the `fast-liftover`
 77 | program on any mm9 methcounts file to lift it to hg19:
 78 | 
 79 | ```shell
 80 | $ dnmtools fastlift -i mm9-hg19-fastliftover.index -f mm9.meth -t hg19-lift.meth
 81 | ```
 82 | 
 83 | The `-p` option should be specified to report positions on the
 84 | positive strand of the target assembly.  Before using the lifted
 85 | methcounts file, make sure it is sorted properly.
 86 | 
 87 | ```shell
 88 | $ LC_ALL=C sort -k1,1 -k2,2g -k3,3 hg19-lift.meth -o hg19-lift-sorted.meth
 89 | ```
 90 | 
 91 | ## Options
 92 | ```txt
 93 |  -i, -indexfile
 94 | ```
 95 | index file [required]
 96 | ```txt
 97 |  -f, -from
 98 | ```
 99 | Original file [required]
100 | ```txt
101 |  -t, -to
102 | ```
103 |  Output file liftovered [required]
104 | ```txt
105 |  -u, -unlifted
106 | ```
107 | (optional) File for unlifted sites
108 | ```txt
109 |  -p, -plus-strand
110 | ```
111 |  (optional) Report sites on + strand
112 | ```txt
113 |  -v, -verbose
114 | ```
115 | print more run info to STDERR as the program runs
116 | 
117 | 


--------------------------------------------------------------------------------
/docs/content/counts-nano.md:
--------------------------------------------------------------------------------
  1 | # counts-nano - compute single-site methylation from nanopore data
  2 | 
  3 | ## Synopsis
  4 | ```console
  5 | $ dnmtools counts-nano [OPTIONS] -c <chroms> <input.bam>
  6 | ```
  7 | 
  8 | ## Description
  9 | 
 10 | The `counts-nano` command introduced in v1.5.0 is designed specifically to
 11 | generate DNMTools [counts](../counts) format files from nanopore data called
 12 | for the `5mCG_5hmCG` modification. Currently this is only supported for
 13 | methylation and hydroxymethylation called at CpG sites.
 14 | 
 15 | More documentation will come as this tool evolves, but for now:
 16 | 
 17 | - Most behavior is very similar to what you will find from [counts](../counts).
 18 | - Mutation information is not estimated by `counts-nano`.
 19 | - Currently this only works for CpG sites and when the only modified sites are
 20 |   marked as `C+m?` or `C+h?` in the `MM` field of each BAM/SAM read record.
 21 | - The first 6 columns of the output are the same as explained in the
 22 |   [counts](../counts) format, except the fraction for the 5th column is both
 23 |   5mC and 5hmC. The 7th column is for 5hmC alone and the 8th is for 5mC alone.
 24 | - The methylation levels will not result in integer values when multiplied by
 25 |   the number of reads because probabilities on modifications are used, so
 26 |   methylation levels for each site are expected values (the best estimates we
 27 |   can make), and do not use arbitrary cutoffs.
 28 | - Several other commands in DNMTools have been modified to use this form of
 29 |   expected methylation level, and behave as previously for bisulfite
 30 |   sequencing data, but have updated behavior when the data is from
 31 |   nanopore. The user does not need to specify the technology used.
 32 | - Some commands need to use a `-relaxed` flag to work with the additional
 33 |   columns in the output from `counts-nano` compared with `counts`. For
 34 |   commands without this option, simply do `cut -f1-6` on the output of
 35 |   `counts-nano` to remove those.
 36 | 
 37 | ## Options
 38 | 
 39 | ```txt
 40 | -o, -output
 41 | ```
 42 | Output file name. The default is to write output to the terminal,
 43 | which is not useful unless commands are piped.
 44 | 
 45 | ```txt
 46 | -c, -chrom
 47 | ```
 48 | Reference genome file, which must be in FASTA format. This is
 49 | required.
 50 | 
 51 | ```txt
 52 | -t, -threads
 53 | ```
 54 | 
 55 | The number of threads to use. This is only really helpful if the input is BAM
 56 | (not helpful for SAM), and the output is to be zipped (see `-z` below). These
 57 | threads are used to decompress BAM input and compress gzip output. If only one
 58 | of these conditions holds, using more threads can still help. Because most
 59 | computation in `counts-nano` is processing reads sequentially, using too many
 60 | threads will have decreasing returns.
 61 | 
 62 | ```txt
 63 | -z, -zip
 64 | ```
 65 | 
 66 | The output should be zipped (in gzip format). This is not deduced by the
 67 | filename, but specifying this argument should be accompanied by using a `.gz`
 68 | filename suffix for the output.
 69 | 
 70 | ```txt
 71 | -n, -cpg-only
 72 | ```
 73 | 
 74 | Print only CpG context cytosines. This significantly reduces the output size
 75 | in most genomes. Note that using this option does not merge data as symmetric
 76 | CpGs.
 77 | 
 78 | ```txt
 79 | -sym
 80 | ```
 81 | 
 82 | This will turn on `-n, -cpg-only` automatically and will output symmetric CpG
 83 | sites, with each level including all counts and methylation levels as a
 84 | (weighted) average of both strands.
 85 | 
 86 | ```txt
 87 | -H, -header
 88 | ```
 89 | 
 90 | Add a header to the output file to identify the reference genome. This will be
 91 | in the form of "comment" lines beginning with `#`. This is not required for most
 92 | downstream processing, but is used by commands that check for consistency with
 93 | a reference genome.
 94 | 
 95 | ```txt
 96 | -v, -verbose
 97 | ```
 98 | 
 99 | Report more information while the program is running.
100 | 
101 | ```txt
102 | -progress
103 | ```
104 | Show progress while the program is running.
105 | 


--------------------------------------------------------------------------------
/docs/content/hypermr.md:
--------------------------------------------------------------------------------
  1 | # hypermr - Detecting hypermethylated regions
  2 | 
  3 | ## Synopsis
  4 | ```shell
  5 | $ dnmtools hypermr [OPTIONS] <input.meth>
  6 | ```
  7 | 
  8 | ## Description
  9 | 
 10 | The plant genomes, exemplified by *A. thaliana*, are devoid of DNA
 11 | methylation by default, with genic regions and transposons being
 12 | hyper-methylated, which we termed HyperMRs to stress their difference
 13 | from hypo-methylated regions in mammalian methylomes. DNA methylation
 14 | in plants has been associated with expression regulation and
 15 | transposon repression, and therefore characterizing HyperMRs is of
 16 | much biological relevance. In addition to plants, hydroxymethylation
 17 | tends to appear in a small fraction of the mammalian genome, and
 18 | therefore it makes sense to identify hyper-hydroxymethylated regions.
 19 | 
 20 | The first kind of HyperMR analysis involves finding continuous blocks
 21 | of hyper-methylated CpGs with the hmr program. Since the
 22 | [hmr](../hmr) program is designed to find hypo-methylated
 23 | regions, one can use it to identify HyperMRs by inverting the
 24 | methylation levels in the methcounts output file as follows:
 25 | 
 26 | ```shell
 27 | $ awk '{$5=1-$5; print $0}' input.meth > input_inverted.meth
 28 | ```
 29 | 
 30 | Next one may use the hmr program to find "valleys" in the inverted
 31 | Arabidopsis methylome, which are the hyper-methylated regions in the
 32 | original methylome. The command is invoked as below
 33 | 
 34 | ```shell
 35 | $ dnmtools hmr -o output.hmr input_inverted.meth
 36 | ```
 37 | 
 38 | This kind of HyperMR analysis produces continuous blocks of
 39 | hyper-methylated CpGs. However in some regions, intragenic regions in
 40 | particular, such continuous blocks of hyper-methylated CpGs are
 41 | separated by a few unmethylated CpGs, which have distinct sequence
 42 | preference when compared to those CpGs in the majority of unmethylated
 43 | genome. The blocks of hyper-methylated CpGs and gap CpGs together form
 44 | composite HyperMRs. The hypermr program, which implements a
 45 | three-state HMM, is used to identify such HyperMRs. Suppose the
 46 | [counts](../counts) output file is `input.meth`. To
 47 | find HyperMRs in this dataset, run
 48 | 
 49 | ```shell
 50 | $ dnmtools hypermr -o output.hypermr input.meth
 51 | ```
 52 | 
 53 | The output file is a 6-column
 54 | [BED](https://en.wikipedia.org/wiki/BED_(file_format))  file. The
 55 | first three columns give the chromosome, starting position and ending
 56 | position of that HyperMR.  The fourth column starts with `hyper:`,
 57 | followed by the number of CpGs within this HyperMR. The fifth column
 58 | is the cumulative methylation level of all CpGs. The last column
 59 | indicates the strand, which is always +.
 60 | 
 61 | Lastly, it is worth noting that plants exhibit significantly more
 62 | methylation in the non-CpG context, and therefore inclusion of non-CpG
 63 | methylation in the calling of hyper-methylated regions could possibly
 64 | be informative. We suggest separating each cytosine context from the
 65 | methcounts output file as illustrated in the previous section (via
 66 | grep) and calling HyperMRs separately for each context.
 67 | 
 68 | ## Options
 69 | 
 70 | ```txt
 71 |  -o, -out
 72 | ```
 73 | output BED file (default: STDOUT)
 74 | 
 75 | ```txt
 76 |  -s, -scores
 77 | ```
 78 | output file for posterior scores
 79 | 
 80 | ```txt
 81 |  -t, -tolerance
 82 | ```
 83 | tolerance (default: 0)
 84 | 
 85 | ```txt
 86 |  -d, -desert
 87 | ```
 88 | maximum distance between covered CpGs in HyperMR (default: 1000)
 89 | 
 90 | ```txt
 91 |  -i, -itr
 92 | ```
 93 | max number of iterations (default: 100)
 94 | 
 95 | ```txt
 96 |  -V, -viterbi
 97 | ```
 98 | Use Viterbi decoding
 99 | 
100 | ```txt
101 |  -M, -min-meth
102 | ```
103 | min cumulative methylation level in HyperMR (default: 4)
104 | ```txt
105 |  -v, -verbose
106 | ```
107 | print more run info to STDERR while the program is running.
108 | ```txt
109 |  -P, -params-in
110 | ```
111 | HMM parameters input file
112 | ```txt
113 |  -p, -params-out
114 | ```
115 | HMM parameters output file
116 | 
117 | 


--------------------------------------------------------------------------------
/src/common/Epiread.cpp:
--------------------------------------------------------------------------------
  1 | /*    Copyright (C) 2011-2022 University of Southern California and
  2 |  *                       Andrew D. Smith and Fang Fang
  3 |  *
  4 |  *    Authors: Fang Fang and Andrew D. Smith
  5 |  *
  6 |  *    This program is free software: you can redistribute it and/or modify
  7 |  *    it under the terms of the GNU General Public License as published by
  8 |  *    the Free Software Foundation, either version 3 of the License, or
  9 |  *    (at your option) any later version.
 10 |  *
 11 |  *    This program is distributed in the hope that it will be useful,
 12 |  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  *    GNU General Public License for more details.
 15 |  */
 16 | 
 17 | #include "Epiread.hpp"
 18 | 
 19 | #include <algorithm>
 20 | #include <charconv>
 21 | #include <cstdint>
 22 | #include <fstream>
 23 | #include <iterator>
 24 | #include <limits>
 25 | #include <sstream>
 26 | #include <stdexcept>
 27 | #include <string>
 28 | 
 29 | size_t
 30 | adjust_read_offsets(std::vector<epiread> &reads) {
 31 |   size_t first_read_offset = std::numeric_limits<size_t>::max();
 32 |   for (size_t i = 0; i < reads.size(); ++i)
 33 |     first_read_offset = std::min(reads[i].pos, first_read_offset);
 34 |   for (size_t i = 0; i < reads.size(); ++i)
 35 |     reads[i].pos -= first_read_offset;
 36 |   return first_read_offset;
 37 | }
 38 | 
 39 | size_t
 40 | get_n_cpgs(const std::vector<epiread> &reads) {
 41 |   size_t n_cpgs = 0;
 42 |   for (size_t i = 0; i < reads.size(); ++i)
 43 |     n_cpgs = std::max(n_cpgs, reads[i].end());
 44 |   return n_cpgs;
 45 | }
 46 | 
 47 | std::istream &
 48 | operator>>(std::istream &in, epiread &er) {
 49 |   std::string buffer;
 50 |   if (getline(in, buffer)) {
 51 |     std::istringstream is(buffer);
 52 |     if (!(is >> er.chr >> er.pos >> er.seq))
 53 |       throw std::runtime_error("malformed epiread line:\n" + buffer);
 54 |   }
 55 |   return in;
 56 | }
 57 | 
 58 | std::ostream &
 59 | operator<<(std::ostream &out, const epiread &er) {
 60 |   return out << er.chr << '\t' << er.pos << '\t' << er.seq;
 61 | }
 62 | 
 63 | bool
 64 | validate_epiread_file(const std::string &filename) {
 65 |   const size_t max_lines_to_validate = 10000;
 66 |   std::ifstream in(filename);
 67 |   if (!in)
 68 |     throw std::runtime_error("failed to open file: " + filename);
 69 | 
 70 |   std::string c, s, other;
 71 |   size_t p = 0;
 72 | 
 73 |   size_t n_lines = 0;
 74 |   std::string line;
 75 |   while (getline(in, line) && n_lines++ < max_lines_to_validate) {
 76 |     std::istringstream iss(line);
 77 |     if (!(iss >> c >> p >> s) || iss >> other)
 78 |       return false;
 79 |   }
 80 |   return true;
 81 | }
 82 | 
 83 | epiread::epiread(const std::string &line) {
 84 |   constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; };
 85 |   constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; };
 86 | 
 87 |   using std::distance;
 88 |   using std::find_if;
 89 |   using std::from_chars;
 90 | 
 91 |   bool failed = false;
 92 | 
 93 |   // NOLINTBEGIN(*-pointer-arithmetic)
 94 | 
 95 |   const auto c = line.data();
 96 |   const auto c_end = c + line.size();
 97 | 
 98 |   auto field_s = c;
 99 |   auto field_e = find_if(field_s + 1, c_end, is_sep);
100 |   if (field_e == c_end)
101 |     failed = true;
102 | 
103 |   chr = std::string{field_s, static_cast<uint32_t>(distance(field_s, field_e))};
104 | 
105 |   field_s = find_if(field_e + 1, c_end, not_sep);
106 |   field_e = find_if(field_s + 1, c_end, is_sep);
107 |   failed = failed || (field_e == c_end);
108 | 
109 |   const auto [ptr, ec] = from_chars(field_s, field_e, pos);
110 |   failed = failed || (ptr == field_s);
111 | 
112 |   field_s = find_if(field_e + 1, c_end, not_sep);
113 |   field_e = find_if(field_s + 1, c_end, is_sep);
114 |   failed = failed || (field_e != c_end);
115 | 
116 |   seq = std::string{field_s, static_cast<uint32_t>(distance(field_s, field_e))};
117 | 
118 |   if (failed) {
119 |     throw std::runtime_error("bad epiread line: " + line);
120 |     // ADS: the value below would work for a flag
121 |     // pos = std::numeric_limits<decltype(pos)>::max();
122 |   }
123 | 
124 |   // NOLINTEND(*-pointer-arithmetic)
125 | }
126 | 


--------------------------------------------------------------------------------
/cmake/static_analysis.cmake:
--------------------------------------------------------------------------------
  1 | # Copyright (C) 2025 Andrew D Smith
  2 | #
  3 | # This program is free software: you can redistribute it and/or modify it
  4 | # under the terms of the GNU General Public License as published by the Free
  5 | # Software Foundation, either version 3 of the License, or (at your option)
  6 | # any later version.
  7 | #
  8 | # This program is distributed in the hope that it will be useful, but WITHOUT
  9 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 10 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 11 | # more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License along with
 14 | # this program. If not, see <https://www.gnu.org/licenses/>.
 15 | 
 16 | # StaticAnalysis
 17 | message(STATUS "Enabling static analysis")
 18 | # If no specific static analysis is requested, do them all
 19 | if(NOT RUN_CPPCHECK AND NOT RUN_IWYU AND
 20 |     NOT RUN_CPPLINT  AND NOT RUN_CLANG_TIDY)
 21 |   set(RUN_CPPCHECK on)
 22 |   set(RUN_IWYU on)
 23 |   set(RUN_CPPLINT on)
 24 |   set(RUN_CLANG_TIDY on)
 25 | endif()
 26 | 
 27 | set(STATIC_ANALYSIS_CHECKS "")
 28 | if(RUN_CPPCHECK)
 29 |   list(APPEND STATIC_ANALYSIS_CHECKS "cppcheck")
 30 | endif()
 31 | if(RUN_CPPLINT)
 32 |   list(APPEND STATIC_ANALYSIS_CHECKS "cpplint")
 33 | endif()
 34 | if(RUN_IWYU)
 35 |   list(APPEND STATIC_ANALYSIS_CHECKS "iwyu")
 36 | endif()
 37 | if(RUN_CLANG_TIDY)
 38 |   list(APPEND STATIC_ANALYSIS_CHECKS "clang-tidy")
 39 | endif()
 40 | 
 41 | message(STATUS "Requested static analysis: ${STATIC_ANALYSIS_CHECKS}")
 42 | 
 43 | # cpplint: all options are in the config file
 44 | if ("cpplint" IN_LIST STATIC_ANALYSIS_CHECKS)
 45 |   find_program(FOUND_CPPLINT cpplint)
 46 |   if(FOUND_CPPLINT)
 47 |     message(STATUS "Enabling cpplint analysis")
 48 |     set(CMAKE_CXX_CPPLINT cpplint --quiet)
 49 |   else()
 50 |     message(STATUS "Could not find cpplint; disabling cpplint")
 51 |   endif()
 52 | endif()
 53 | 
 54 | # include-what-you-use: config is a mappings file
 55 | if ("iwyu" IN_LIST STATIC_ANALYSIS_CHECKS)
 56 |   find_program(FOUND_IWYU include-what-you-use)
 57 |   if(FOUND_IWYU)
 58 |     message(STATUS "Enabling include-what-you-use analysis")
 59 |     set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE
 60 |       include-what-you-use
 61 |       -Xiwyu
 62 |       --comment_style=none
 63 |       -Xiwyu
 64 |       --quoted_includes_first
 65 |       -Xiwyu
 66 |       --mapping_file=${PROJECT_SOURCE_DIR}/iwyu.json
 67 |     )
 68 |   else()
 69 |     message(STATUS "Could not find iwyu; disabling iwyu")
 70 |   endif()
 71 | endif()
 72 | 
 73 | # cppcheck: options on the command line as there is no config file
 74 | if ("cppcheck" IN_LIST STATIC_ANALYSIS_CHECKS)
 75 |   find_program(FOUND_CPPCHECK cppcheck)
 76 |   if(FOUND_CPPCHECK)
 77 |     message(STATUS "Enabling cppcheck analysis")
 78 |     set(CMAKE_CXX_CPPCHECK
 79 |       cppcheck
 80 |       --quiet
 81 |       --enable=all
 82 |       --inline-suppr
 83 |       --max-configs=1
 84 |       --suppressions-list=${PROJECT_SOURCE_DIR}/.cppcheck_suppress
 85 |     )
 86 |   else()
 87 |     message(STATUS "Could not find cppcheck; disabling cppcheck")
 88 |   endif()
 89 | endif()
 90 | 
 91 | # clang-tidy: need to make sure version is at least 20
 92 | if ("clang-tidy" IN_LIST STATIC_ANALYSIS_CHECKS)
 93 |   find_program(CLANG_TIDY_EXECUTABLE NAMES clang-tidy)
 94 |   # Minimum required version
 95 |   set(MIN_CLANG_TIDY_VERSION "20.0.0")
 96 |   if(CLANG_TIDY_EXECUTABLE)
 97 |     execute_process(
 98 |       COMMAND
 99 |       bash -c
100 |       "${CLANG_TIDY_EXECUTABLE} --version | grep version | tr -cd '0-9.\n'"
101 |       OUTPUT_VARIABLE CLANG_TIDY_VERSION
102 |       OUTPUT_STRIP_TRAILING_WHITESPACE
103 |     )
104 |     # Compare the version numbers
105 |     if(CLANG_TIDY_VERSION VERSION_GREATER_EQUAL MIN_CLANG_TIDY_VERSION)
106 |       message(STATUS "Enabling clang-tidy (version: ${CLANG_TIDY_VERSION})")
107 |       set(CMAKE_CXX_CLANG_TIDY
108 |         clang-tidy
109 |         --quiet
110 |         --allow-no-checks
111 |         -p ${PROJECT_BINARY_DIR}
112 |       )
113 |     else()
114 |       message(STATUS "Not enabling clang-tidy (min version not found")
115 |     endif()
116 |   else()
117 |     message(STATUS "Could not find clang-tidy; disabling clang-tidy")
118 |   endif()
119 | endif()
120 | 


--------------------------------------------------------------------------------
/docs/content/multistat.md:
--------------------------------------------------------------------------------
  1 | # multistat
  2 | 
  3 | ## Synopsis
  4 | ```shell
  5 | $ dnmtools multistat [OPTIONS] <intervals.bed> <input-tabular.tsv>
  6 | ```
  7 | 
  8 | ## Description
  9 | 
 10 | The `multistat` program is similar to [roi](../roi), but instead of
 11 | creating a BED file with average methylation levels from a single
 12 | counts file, it takes as an input the output of [merge](../merge) with
 13 | tabular format (i.e. using the `-tabular` flag to make a data frame)
 14 | and using the `-radmeth` flag to remove suffixes that are not used in
 15 | this program.  In other words, `multistat` takes a data frame as input
 16 | and produces a data frame as output.
 17 | 
 18 | The input of `multistat` starts with a line with `2n` column names, with each
 19 | column name appearing sequentially twice. The file is then followed by a set of
 20 | lines containing `2n+1` elements. Each sample contains two columns. The first
 21 | column is the number of reads that cover the CpG in the sample, and the second
 22 | column is the number of CpGs that are methylated among the reads.
 23 | 
 24 | Here is a visual example of a file called `input-tabular.tsv` with four samples
 25 | (D083a, D083b, D091a and D091b):
 26 | 
 27 | ```txt
 28 |   D083a       D083a       D083b       D083b       D091a       D091a       D091b       D091b
 29 | chr1:10468:+:CpG        3       0       2       0       2       0       1       0
 30 | chr1:10470:+:CpG        6       0       3       0       4       0       3       0
 31 | chr1:10483:+:CpG        7       0       3       0       5       0       3       1
 32 | chr1:10488:+:CpG        7       0       3       0       5       0       3       0
 33 | chr1:10492:+:CpG        7       0       2       0       5       0       3       0
 34 | chr1:10496:+:CpG        6       0       4       0       5       0       4       0
 35 | chr1:10524:+:CpG        6       2       4       0       7       0       5       1
 36 | chr1:10541:+:CpG        4       0       4       0       7       2       5       0
 37 | chr1:10562:+:CpG        3       0       3       0       6       0       4       0
 38 | chr1:10570:+:CpG        2       0       3       0       6       0       4       0
 39 | chr1:10576:+:CpG        2       0       3       0       6       0       4       0
 40 | ```
 41 | 
 42 | Note that, if you do not add the `-radmeth` flag when running `merge`,
 43 | the tabular output may contain suffixes `_R` and `_M` on the column
 44 | names (e.g. `D083a_R` and `D083a_M` corresponding to the "Reads" and
 45 | "Methylated" columns). You can remove these to make the input proper
 46 | by running
 47 | 
 48 | ```shell
 49 |  $ sed -i '1s/_[MR]//g' input-tabular.tsv
 50 | ```
 51 | 
 52 | `multistat` also requires an input BED file representing the genomic
 53 | intervals of interest. The regions must be sorted by chromosome, position and
 54 | strand. If they are not, you can add the `-s` flag to sort the file prior to
 55 | running the program. Note that for very large BED files, this may take a long
 56 | time. Given an input file `regions.bed`, you can sort it in one of the two
 57 | following ways:
 58 | 
 59 | ```shell
 60 |  $ LC_ALL=C sort -k1,1 -k2,2n -k3,3n -k6,6 -o regions.bed regions.bed
 61 | ```
 62 | 
 63 | ```shell
 64 |  $ bedtools sort -i regions.bed
 65 | ```
 66 | 
 67 | Finally, to create a file `data-frame.tsv` with methylation levels (which can be
 68 | [weighted, unweighted or fractional](../levels) methylation), run
 69 | 
 70 | ```shell
 71 |  $ dnmtools multistat -o data-frame.tsv regions.bed input-tabular.tsv
 72 | ```
 73 | 
 74 | ## Options
 75 | 
 76 | ```txt
 77 |  -o, -output
 78 | ```
 79 | 
 80 | Name of output file (default: STDOUT)
 81 | 
 82 | ```txt
 83 |  -N, -numeric
 84 | ```
 85 | 
 86 | print numeric values only (not NAs), guaranteeing that the output
 87 | contains as many rows as there are regions in the BED input.
 88 | 
 89 | ```txt
 90 |  -L, -preload
 91 | ```
 92 | 
 93 | Load all CpG sites
 94 | 
 95 | ```txt
 96 |  -s, -sort
 97 | ```
 98 | 
 99 | sort data if needed
100 | 
101 | 
102 | ```txt
103 |  -l, -level
104 | ```
105 | 
106 | the level to report as score column in bed format output (w, u or f),
107 | corresponding to weighted, unweighted or fractional methylation (default: w)
108 | 
109 | ```txt
110 |  -M, -more-levels
111 | ```
112 | 
113 | report more methylation information
114 | 
115 | ```txt
116 |  -v, -verbose
117 | ```
118 | 
119 | print more run info to STDERR
120 | 


--------------------------------------------------------------------------------
/docs/content/visualization.md:
--------------------------------------------------------------------------------
 1 | # Visualizing methylome data
 2 | 
 3 | Here we explain how to visualize data using the UCSC Genome
 4 | Browser. When we refer to the genome browser below, we mean the UCSC
 5 | kind.
 6 | 
 7 | ## Single-site methylation levels
 8 | 
 9 | Here we are concerned with individual sites. These need not be CpG
10 | sites -- they could be any/all cytosines, but we will assume they are
11 | CpGs through our explanation.
12 | 
13 | To view the methylation level at individual CpG sites in a genome
14 | browser, the data should be converted into bigWig format. The starting
15 | point should be a "counts" file, as output from the
16 | [counts](../counts) command. The bigWig format is intended for the
17 | "wiggle" tracks, which shows information associated with individual
18 | genomic positions, but in the bigWig format this information is
19 | encoded concisely and is not for direct human viewing. The same
20 | approach is used to build files that show the coverage at individual
21 | CpG sites.
22 | 
23 | To create methylation level tracks or read coverage tracks, follow
24 | these steps:
25 | 
26 | * Download the `wigToBigWig` program from the UCSC Genome Browser
27 |   directory of [binaries](http://hgdownload.cse.ucsc.edu/admin/exe/).
28 | 
29 | * Use the `fetchChromSizes` script, from the same directory, to get
30 |   the `.chrom.sizes` file for the database (reference genome) you are
31 |   working with (e.g., hg38). Note: this is the file mentioned below as
32 |   `hg19.chrom.sizes` for the hg19 reference genome.
33 | 
34 | * To create a bigWig format track for methylation levels at CpG
35 |   sites, convert the symmetric methylation file ([counts](../counts)
36 |   format) as follows:
37 | ```console
38 | $ awk '{print $1,$2,$2+1,$5}' sample.meth | wigToBigWig /dev/stdin hg19.chrom.sizes sample.meth.bw
39 | ```
40 |   In the command above, the first part selects the appropriate columns
41 |   to generate bedgraph format, and then the second part converts this
42 |   directly into a bigWig format file, which is not human-readable.
43 | 
44 | * To create a bigWig format track for read coverage at CpG sites, use the
45 |   following command, which is very similar to the previous one above:
46 | ```console
47 | $ awk '{print $1,$2,$2+1,$6}' sample.meth | wigToBigWig /dev/stdin hg19.chrom.sizes sample.reads.bw
48 | ```
49 | 
50 | If the `wigToBigWig` or `fetchChromSizes` programs are not
51 | executable when downloaded, try the following:
52 | ```console
53 | $ chmod +x wigToBigWig
54 | $ chmod +x fetchChromSizes
55 | ```
56 | 
57 | ## The identified features
58 | 
59 | This refers to the HMRs, the AMRs, the PMDs, and possibly the
60 | HyperMRs. These are contiguous genomic intervals. It happens that for
61 | an individual set of these features, as identified using dnmtools, no
62 | two features will overlap. This fact isn't relevant here, though.
63 | 
64 | We will assume you want to make browser tracks for HMRs. The same
65 | procedure also works for [AMRs](../amrfinder), [PMDs](../pmd), or
66 | [DMRs](../dmr). To do so, follow these steps:
67 | 
68 | * Download the `bedToBigBed` program from the UCSC Genome Browser
69 |   directory of [binaries](http://hgdownload.cse.ucsc.edu/admin/exe/).
70 | 
71 | * Use the `fetchChromSizes` script, from the same directory, to get
72 |   the `.chrom.sizes` file for the database (reference genome) you are
73 |   working with (e.g., hg38). Note: this is the file mentioned below as
74 |   `hg19.chrom.sizes` for the hg19 reference genome.
75 | 
76 | * Modify and use the following commands: PMDs, HMRs and AMRs may have
77 |   a score greater than 1000 in the 5th column, in which case
78 |   `bedToBigBed` will output an error. Also, HMR file `sample.bed` may
79 |   have a non-integer score in the 5th column. The following script
80 |   rounds the 5th column and prints 1000 if the score is greater than
81 |   1000:
82 | ```console
83 | $ awk -v OFS="\t" '{if ($5>1000) print $1,$2,$3,$4,"1000"; \
84 |                     else print $1,$2,$3,$4,int($5)}' sample.bed > sample.for_bigbed
85 | ```
86 |   In the above command, since the HMRs are not stranded, we do not keep
87 |   the 6th column. Keeping the 6th column would make all the HMRs appear
88 |   as though they have a direction and they would all appear to be on the +
89 |   strand. This would be visually misleading (and somewhat annoying).
90 | 
91 | * Generate the `.bb` track file using the command below:
92 | ```console
93 | $ bedToBigBed sample.for_bigbed hg19.chrom.sizes output.bb
94 | ```
95 | 


--------------------------------------------------------------------------------
/.github/workflows/dnmtools_release_macos.yml:
--------------------------------------------------------------------------------
  1 | name: DNMTools release (macOS)
  2 | 
  3 | on:
  4 |   workflow_dispatch:
  5 | 
  6 | jobs:
  7 |   build-macos-binaries:
  8 |     strategy:
  9 |       matrix:
 10 |         os: [macos-13, macos-14]
 11 |     runs-on: ${{ matrix.os }}
 12 |     steps:
 13 |       - uses: actions/checkout@v4
 14 |         with:
 15 |           submodules: recursive
 16 |       - name: Make dnmtools dependency directories
 17 |         run: sudo mkdir -p /opt/dnmtools/lib /opt/dnmtools/include
 18 |       - name: Install dependency headers and static libs
 19 |         run: |
 20 |           brew install zlib gsl automake
 21 |           sudo cp $(brew --prefix zlib)/lib/*.a /opt/dnmtools/lib
 22 |           sudo cp $(brew --prefix gsl)/lib/*.a /opt/dnmtools/lib
 23 |           sudo cp -r $(brew --prefix zlib)/include/* /opt/dnmtools/include
 24 |           sudo cp -r $(brew --prefix gsl)/include/* /opt/dnmtools/include
 25 |       - name: Build and install libdeflate
 26 |         run: |
 27 |           git clone https://github.com/ebiggers/libdeflate.git && \
 28 |           cd libdeflate && \
 29 |           cmake -B build \
 30 |               -DLIBDEFLATE_BUILD_GZIP=off \
 31 |               -DLIBDEFLATE_BUILD_TESTS=off \
 32 |               -DLIBDEFLATE_BUILD_SHARED_LIB=off \
 33 |               -DCMAKE_VERBOSE_MAKEFILE=on \
 34 |               -DCMAKE_BUILD_TYPE=Release && \
 35 |           cmake --build build -j4 && \
 36 |           sudo cmake --install build --prefix=/opt/dnmtools
 37 |       - name: Build and install HTSlib
 38 |         run: |
 39 |           git clone --recursive https://github.com/samtools/htslib.git
 40 |           cd htslib
 41 |           sudo cp -r htslib /opt/dnmtools/include
 42 |           autoreconf -i
 43 |           mkdir build && cd build
 44 |           ../configure \
 45 |             --disable-bz2 \
 46 |             --disable-libcurl \
 47 |             --disable-lzma \
 48 |             --disable-ref-cache \
 49 |             --with-libdeflate \
 50 |             LDFLAGS="-L/opt/dnmtools/lib" CPPFLAGS="-I/opt/dnmtools/include"
 51 |           make -j4 CFLAGS="-Wall -O2 -fvisibility=hidden" libhts.a
 52 |           sudo cp libhts.a /opt/dnmtools/lib
 53 |       - name: Build dnmtools
 54 |         run: |
 55 |           ./autogen.sh
 56 |           mkdir build && cd build
 57 |           ../configure --with-libdeflate \
 58 |             CXX=g++-14 \
 59 |             LDFLAGS="-L/opt/dnmtools/lib -static-libgcc -static-libstdc++ -Wl,-dead_strip" \
 60 |             CPPFLAGS="-I/opt/dnmtools/include"
 61 |           ../data/make_full_license_info_header.sh ../data/LICENSE > license.h
 62 |           echo "#define INCLUDE_FULL_LICENSE_INFO 1" >> config.h
 63 |           make -j4
 64 |       - name: Rename the binary
 65 |         run: mv build/dnmtools dnmtools_$(uname -m)
 66 |       - name: Get version number
 67 |         id: vars
 68 |         run: |
 69 |           awk '/AC_INIT/ {print "vn="$2}' configure.ac | \
 70 |             sed "s/\[//; s/\]//; s/,//" >> "$GITHUB_OUTPUT"
 71 |           uname -m | awk '{print "arch="$0}' >> "$GITHUB_OUTPUT"
 72 |         env:
 73 |           GH_TOKEN: ${{ github.token }}
 74 |       - name: Upload the binary
 75 |         uses: actions/upload-artifact@v4
 76 |         with:
 77 |           name: dnmtools-${{ steps.vars.outputs.arch }}
 78 |           path: |
 79 |             dnmtools_${{ steps.vars.outputs.arch }}
 80 |   make-lipo:  # combines the per-architecture binaries into one universal binary
 81 |     needs: build-macos-binaries
 82 |     runs-on: macos-15
 83 |     steps:
 84 |       - uses: actions/checkout@v4  # configure.ac is needed to read the version
 85 |       - name: Get version number
 86 |         id: vn
 87 |         run: awk '/AC_INIT/ {print "vn="$2}' configure.ac | sed "s/\[//; s/\]//; s/,//" >> "$GITHUB_OUTPUT"
 88 |         env:
 89 |           GH_TOKEN: ${{ github.token }}
 90 |       - name: Download artifacts
 91 |         uses: actions/download-artifact@v4
 92 |         with:
 93 |           path: binaries
 94 |           pattern: dnmtools-*
 95 |           merge-multiple: false  # the lipo glob below expects one directory per artifact
 96 |       - name: Create universal binary
 97 |         run: |
 98 |           lipo -create \
 99 |           binaries/dnmtools-*/dnmtools_* \
100 |           -output dnmtools
101 |           chmod +x dnmtools
102 |           tar -czf dnmtools-${{ steps.vn.outputs.vn }}-macOS.tar.gz dnmtools  # -z: gzip, to match the .tar.gz name
103 |       - name: Upload the lipo binary
104 |         uses: actions/upload-artifact@v4
105 |         with:
106 |           name: dnmtools-${{ steps.vn.outputs.vn }}-macOS.tar.gz
107 |           path: dnmtools-${{ steps.vn.outputs.vn }}-macOS.tar.gz
108 | 


--------------------------------------------------------------------------------
/docs/content/guessprotocol.md:
--------------------------------------------------------------------------------
  1 | # guessprotocol - Identify bisulfite sequencing protocol
  2 | 
  3 | ## Synopsis
  4 | ```shell
  5 | $ dnmtools guessprotocol [OPTIONS] <file-1.fastq> [<file-2.fastq>]
  6 | ```
  7 | 
  8 | ## Description
  9 | 
 10 | Mapping a WGBS dataset requires knowledge of the sequencing protocol
 11 | used to generate the data. This may not be properly documented
 12 | where the data was obtained, so we created this command to guess the
 13 | protocol based on the nucleotide content in the input FASTQ file (or
 14 | files, for paired-end).
 15 | 
 16 | The `guessprotocol` tool uses two models of nucleotide content
 17 | following bisulfite conversion and applies both models to each
 18 | read. One model is for WGBS, and the other is for PBAT. For each read,
 19 | both models are applied, and the result is a probability for whether
 20 | the read (or read pair) was generated using WGBS or PBAT. Once the
 21 | requested number of reads is processed, the aggregate results for all
 22 | reads are used to guess whether the protocol used to generate the data
 23 | was WGBS, PBAT or rPBAT. The criteria are roughly as follows: if most
 24 | of the reads look like they are from WGBS, then we conclude WGBS.  If
 25 | most of the reads look like they are from PBAT, then we conclude
 26 | PBAT. If the result is more towards the middle, then we conclude
 27 | rPBAT.
 28 | 
 29 | More details: the number of As, Cs, Gs and Ts differs depending on
 30 | WGBS (traditional WGBS or MethylC-seq), PBAT -- post bisulfite adaptor
 31 | tagging, or rPBAT (random PBAT).
 32 | 
 33 | * For WGBS, a single-end sequenced read should be T-rich, and if the
 34 |   data is paired-end, read1 is T-rich and read2 is A-rich.
 35 | * For PBAT, a single-end sequenced read should be A-rich, and if the
 36 |   data is paired-end, read1 is A-rich and read2 is T-rich.
 37 | * For rPBAT, we have a random mix of the above situations. However, in
 38 |   practice it seems almost never to be 50% each.
 39 | 
 40 | In most cases, when the data is WGBS or PBAT, it is very obvious which
 41 | is the protocol used.
 42 | 
 43 | As of dnmtools v1.4.1, `guessprotocol` will always make a conclusion,
 44 | but includes a confidence level.
 45 | 
 46 | The output of `guessprotocol` is useful prior to mapping. For example,
 47 | it can be used to decide whether or not to map with the `-R` flag (for
 48 | "random PBAT") when using
 49 | [abismal](https://github.com/smithlabcode/abismal).
 50 | 
 51 | For paired-end data, `guessprotocol` ensures reads are mates by
 52 | finding identical read names. Some datasets finish the read name with
 53 | identifiers like ".1" on end 1 and ".2" on end 2, thus making the read
 54 | names technically different at the last two characters. You can tell
 55 | the program to ignore a certain suffix size (like size 2 in this
 56 | example) when matching read names using the `-i` flag.
 57 | 
 58 | The output includes the following values in a YAML format:
 59 | * `protocol`: this is the guessed protocol (wgbs, pbat or rpbat) based
 60 |   on the content of the reads.
 61 | * `confidence`: indicates the level of confidence in the guess for the
 62 |   protocol (values: low or high).
 63 | * `layout`: indicates whether the supplied reads were paired or
 64 |   single-ended.
 65 | * `n_reads_wgbs`: the average number of reads (for single-ended reads)
 66 |   or read pairs (for paired reads) where read1 is determined by the
 67 |   model to be T-rich.
 68 | * `n_reads`: the number of evaluated reads or read pairs.
 69 | * `wgbs_fraction`: the probability that a read (for single-ended
 70 |   reads) or the read1 of a read pair (for paired reads) is T-rich.
 71 | 
 72 | ## Options
 73 | ```
 74 | -n, -nreads
 75 | ```
 76 | Number of reads to check. The program stops after collecting
 77 | statistics for the first `n` reads (default: 1,000,000). Fewer than
 78 | the default are usually sufficient, but increase this value if you
 79 | suspect reads at the start of the file might be problematic.
 80 | 
 81 | ```txt
 82 | -i, -ignore
 83 | ```
 84 | Length of the read name suffix to ignore when matching read names to
 85 | ensure mates are correctly synchronized when the data is paired-end.
 86 | 
 87 | ```
 88 | -b, -bisulfite
 89 | ```
 90 | Assumed bisulfite conversion rate for the models (default: 0.98).
 91 | 
 92 | ```
 93 | -H, -human
 94 | ```
 95 | Use human genome nucleotide frequencies. A good assumption for samples
 96 | from a mammal.
 97 | 
 98 | ```
 99 | -o, -output
100 | ```
101 | The output file name.
102 | 
103 | ```
104 | -v, -verbose
105 | ```
106 | Report available information during the run.
107 | 


--------------------------------------------------------------------------------
/src/common/BetaBin.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Copyright (C) 2011-2022 University of Southern California
  3 |   Authors: Andrew D. Smith, Song Qiang
  4 | 
  5 |   This file is part of dnmtools.
  6 | 
  7 |   dnmtools is free software; you can redistribute it and/or modify
  8 |   it under the terms of the GNU General Public License as published by
  9 |   the Free Software Foundation; either version 2 of the License, or
 10 |   (at your option) any later version.
 11 | 
 12 |   dnmtools is distributed in the hope that it will be useful,
 13 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 |   GNU General Public License for more details.
 16 | */
 17 | 
 18 | #include "BetaBin.hpp"
 19 | 
 20 | #include <algorithm>
 21 | #include <cmath>
 22 | #include <cstdint>
 23 | #include <iomanip>
 24 | #include <iostream>
 25 | #include <numeric>
 26 | #include <sstream>
 27 | #include <utility>
 28 | 
 29 | #include <gsl/gsl_sf_gamma.h>
 30 | #include <gsl/gsl_sf_psi.h>
 31 | 
 32 | using std::cerr;
 33 | using std::max;
 34 | using std::min;
 35 | using std::pair;
 36 | using std::setprecision;
 37 | using std::setw;
 38 | using std::string;
 39 | using std::vector;
 40 | 
 41 | //////////////////////////////////////////////
 42 | //////       struct betabin             //////
 43 | //////////////////////////////////////////////
 44 | 
 45 | const double betabin::tolerance = 1e-10;  // threshold fit() compares movement() against and passes to invpsi()
 46 | 
 47 | betabin::betabin() : betabin(1, 1) {}  // default: alpha = beta = 1
 48 | 
 49 | betabin::betabin(const double a, const double b) :  // caches ln B(a, b)
 50 |   alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {}
 51 | 
 52 | betabin::betabin(const string &str) {  // parses "betabin <alpha> <beta>"
 53 |   std::istringstream iss(str, std::istringstream::in);
 54 |   string name;
 55 |   // Test the stream first: after a failed read beta is never assigned.
 56 |   const bool parsed = static_cast<bool>(iss >> name >> alpha >> beta);
 57 |   if (!parsed || name != "betabin" || alpha < 0 || beta < 0) {
 58 |     cerr << "betabin::betabin: bad string representation of betabin "
 59 |          << "distribution: " << str << '\n';
 60 |     throw "bad string representation of betabin distribution";
 61 |   }
 62 |   lnbeta_helper = gsl_sf_lnbeta(alpha, beta);
 63 | }
 64 | 
 65 | string
 66 | betabin::tostring() const {
 67 |   // Text form is "betabin <alpha> <beta>", numbers written at precision 4.
 68 |   std::ostringstream oss;
 69 |   oss << setprecision(4) << "betabin " << alpha << ' ' << beta;
 70 |   return oss.str();
 71 | }
 72 | 
 73 | double
 74 | betabin::operator()(const pair<double, double> &val) const {
 75 |   // Function-call syntax is a second spelling of log_likelihood(): it
 76 |   // yields ln C(n, x) + ln B(alpha + x, beta + val.second) - ln B(alpha,
 77 |   // beta), with x = val.first and n = val.first + val.second as integers.
 78 |   return log_likelihood(val);
 79 | }
 80 | 
 81 | double
 82 | betabin::log_likelihood(const pair<double, double> &val) const {
 83 |   const auto k = static_cast<std::uint32_t>(val.first);  // cast truncates
 84 |   const auto n = static_cast<std::uint32_t>(val.first + val.second);
 85 |   const double ln_post = gsl_sf_lnbeta(alpha + k, beta + val.second);
 86 |   return gsl_sf_lnchoose(n, k) + ln_post - lnbeta_helper;
 87 | }
 88 | 
 89 | double
 90 | betabin::sign(const double x) {
 91 |   return (x >= 0) ? 1.0 : -1.0;  // zero yields +1.0; the result is never 0
 92 | }
 93 | 
 94 | double
 95 | betabin::invpsi(const double tolerance, const double x) {
 96 |   // Begin at exp(x); each round moves the estimate by the current step (up
 97 |   // when psi(estimate) <= x, down otherwise) and then halves the step.
 98 |   double estimate = std::exp(x);
 99 |   for (double step = 1.0; step > tolerance; step /= 2.0) {  // NOLINT(*-avoid-magic-numbers)
100 |     estimate += step * sign(x - gsl_sf_psi(estimate));
101 |   }
102 |   return estimate;
103 | }
104 | 
105 | double
106 | betabin::movement(const double curr, const double prev) {  // |curr - prev| over the larger magnitude
107 |   return std::abs(curr - prev) / std::max(std::fabs(curr), std::fabs(prev));
108 | }
109 | 
110 | void
111 | betabin::fit(const vector<double> &vals_a, const vector<double> &vals_b,
112 |              const vector<double> &p) {
113 |   static constexpr auto initial_param_vals = 0.01;
114 |   const double p_total = std::accumulate(p.begin(), p.end(), 0.0);
115 |   // p-weighted average of a value vector: sum(v[i] * p[i]) / sum(p)
116 |   const auto weighted_avg = [&](const vector<double> &v) {
117 |     return std::inner_product(v.cbegin(), v.cend(), p.cbegin(), 0.0) / p_total;
118 |   };
119 |   const double alpha_rhs = weighted_avg(vals_a);
120 |   const double beta_rhs = weighted_avg(vals_b);
121 |   double prev_alpha = 0.0, prev_beta = 0.0;
122 |   alpha = beta = initial_param_vals;
123 |   // NOTE(review): the two tests are joined by &&, so iteration ends as soon
124 |   // as EITHER parameter stops moving -- confirm this is intended.
125 |   while (movement(alpha, prev_alpha) > tolerance &&
126 |          movement(beta, prev_beta) > tolerance) {
127 |     prev_alpha = alpha;
128 |     prev_beta = beta;
129 |     const double psi_sum = gsl_sf_psi(prev_alpha + prev_beta);
130 |     alpha = invpsi(tolerance, psi_sum + alpha_rhs);
131 |     beta = invpsi(tolerance, psi_sum + beta_rhs);
132 |   }
133 |   lnbeta_helper = gsl_sf_lnbeta(alpha, beta);
134 | }
135 | 


--------------------------------------------------------------------------------
/docs/content/levels.md:
--------------------------------------------------------------------------------
  1 | # levels - global methylation summary statistics
  2 | 
  3 | ## Synopsis
  4 | 
  5 | ```console
  6 | $ dnmtools levels [OPTIONS] <input.meth>
  7 | ```
  8 | 
  9 | ## Description
 10 | 
 11 | The `levels` command computes global summary statistics for the output
 12 | of [counts](../counts). Example output is below. It computes multiple
 13 | summary statistics related to the quantity of data (e.g., coverage of
 14 | sites) and methylation (e.g., global average methylation). These
 15 | summary statistics are also provided by context. The contexts are
 16 | explained [here](../cytosine_contexts). These are not exclusive
 17 | categories, and include:
 18 | 
 19 | * cytosines, all of them, on either strand
 20 | * cpg sites, on either strand
 21 | * symmetric cpg sites (strands combined)
 22 | * the CHH context
 23 | * the CCG context
 24 | * the CXG context (we "invented" this one)
 25 | 
 26 | The summary statistics computed include:
 27 | 
 28 | * `total_sites` the total number of sites counted for this context
 29 | * `sites_covered` among the total above, those with at least one read
 30 | * `total_c` among the observations in reads, how many are C
 31 | * `total_t` among the observations in reads, how many are T
 32 | * `max_depth` the most coverage of any site for this context
 33 | * `mutations` number of sites for this context marked as mutated
 34 | * `called_meth` number of sites "called" methylated
 35 | * `called_unmeth` number of sites "called" unmethylated
 36 | * `mean_agg` the sum of methylation levels for all sites
 37 | * `coverage` total data informing on sites for this context
 38 | * `sites_covered_fraction` fraction of sites covered
 39 | * `mean_depth` among all sites, the mean coverage by reads
 40 | * `mean_depth_covered` among all covered sites, the mean coverage
 41 | * `mean_meth` the mean of the methylation levels for covered sites
 42 | * `mean_meth_weighted` the mean weighted by coverage
 43 | * `fractional_meth` the fraction of "called" sites "called" methylated
 44 | 
 45 | (If you want more information on these, please ask.)
 46 | 
 47 | Among the above, many are included because they are needed for
 48 | calculating the "derived" statistics. For example, the `mean_agg`
 49 | is used as the numerator for `mean_meth`, where the denominator is
 50 | the number of covered sites. Why keep those raw statistics? Because
 51 | it's essential if two different `levels` output files are combined.
 52 | 
 53 | The final three values are the "levels" and are described in Schultz
 54 | et al. (2012):
 55 | ```txt
 56 | "Leveling" the playing field for analyses of single-base resolution DNA methylomes
 57 | Schultz, Schmitz & Ecker (TIG 2012)
 58 | ```
 59 | 
 60 | Note: the `fractional_meth` level we calculate is inspired by, but
 61 | different from, the paper. What we do is use a binomial test to
 62 | determine significantly hyper/hypomethylated sites, and only use the
 63 | subset of significant sites to calculate `fractional_meth` level.
 64 | 
 65 | This command should provide flexibility to compare methylation data
 66 | with publications that calculate averages in different ways. The sample
 67 | output below only shows the results for cytosines and CpGs in the
 68 | sample, but similar output is generated for symmetric CpGs and
 69 | cytosines in the CHH, CCG, and CXG contexts.
 70 | 
 71 | ```yaml
 72 | cytosines:
 73 |   total_sites: 1200559022
 74 |   sites_covered: 797100353
 75 |   total_c: 417377038
 76 |   total_t: 4048558428
 77 |   max_depth: 30662
 78 |   mutations: 3505469
 79 |   called_meth: 44229556
 80 |   called_unmeth: 750163257
 81 |   mean_agg: 4.40429e+07
 82 |   coverage: 4465935466
 83 |   sites_covered_fraction: 0.663941
 84 |   mean_depth: 3.71988
 85 |   mean_depth_covered: 5.60273
 86 |   mean_meth: 0.055254
 87 |   mean_meth_weighted: 0.093458
 88 |   fractional_meth: 0.055677
 89 | cpg:
 90 |   total_sites: 58803590
 91 |   sites_covered: 47880982
 92 |   total_c: 261807401
 93 |   total_t: 84403225
 94 |   max_depth: 30080
 95 |   mutations: 381675
 96 |   called_meth: 38861909
 97 |   called_unmeth: 7152004
 98 |   mean_agg: 3.69282e+07
 99 |   coverage: 346210626
100 |   sites_covered_fraction: 0.814253
101 |   mean_depth: 5.88758
102 |   mean_depth_covered: 7.23065
103 |   mean_meth: 0.771250
104 |   mean_meth_weighted: 0.756208
105 | ```
106 | 
107 | You can run the `levels` command as follows:
108 | ```console
109 | $ dnmtools levels -o output.levels input.meth
110 | ```
111 | 
112 | ## Options
113 | 
114 | ```console
115 | -o, -output
116 | ```
117 | Output file in YAML format (default: stdout).
118 | 
119 | ```console
120 | -a, -alpha
121 | ```
122 | Alpha for confidence interval (default: 0.95).
123 | 
124 | ```console
125 | -v, -verbose
126 | ```
127 | Report more information while the program is running.
128 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![GitHub Downloads](https://img.shields.io/github/downloads/smithlabcode/dnmtools/total?style=social)](https://github.com/smithlabcode/dnmtools/releases)
  2 | [![Install with Conda](https://anaconda.org/bioconda/dnmtools/badges/version.svg)](https://anaconda.org/bioconda/dnmtools)
  3 | [![Install with Conda](https://anaconda.org/bioconda/dnmtools/badges/platforms.svg)](https://anaconda.org/bioconda/dnmtools)
  4 | [![Install with Conda](https://anaconda.org/bioconda/dnmtools/badges/downloads.svg)](https://anaconda.org/bioconda/dnmtools)
  5 | [![Documentation Status](https://readthedocs.org/projects/dnmtools/badge/?version=latest)](https://dnmtools.readthedocs.io/en/latest/?badge=latest)
  6 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
  7 | 
  8 | DNMTools is a set of tools for analyzing DNA methylation data from
  9 | high-throughput sequencing experiments, especially whole genome bisulfite
 10 | sequencing (WGBS), but also reduced representation bisulfite sequencing
 11 | (RRBS). These tools focus on overcoming the computing challenges imposed by
 12 | the scale of genome-wide DNA methylation data, which is usually the early
 13 | parts of data analysis.
 14 | 
 15 | **Nanopore** As of v1.5.0, DNMTools has functionality to start analysis with a
 16 | BAM file from Nanopore sequencing with 5mC and 5hmC calls at CpG sites.
 17 | 
 18 | ## Usage
 19 | 
 20 | The documentation for DNMTools can be found
 21 | [here](https://dnmtools.readthedocs.io).
 22 | 
 23 | ## Installation
 24 | 
 25 | - **Linux**
 26 |   [binary](https://github.com/smithlabcode/dnmtools/releases/download/v1.5.1/dnmtools-1.5.1-Linux.tar.gz).
 27 |   Should work on any Linux distribution since roughly 2017.
 28 | 
 29 | - **Mac**
 30 |   [binary](https://github.com/smithlabcode/dnmtools/releases/download/v1.5.1/dnmtools-1.5.1-macOS.tar.gz).
 31 |   Should work on any Mac hardware and macOS-13 (Ventura) or newer.
 32 | 
 33 | - **Conda**
 34 |   ```console
 35 |   conda install -c bioconda dnmtools
 36 |   ```
 37 | 
 38 | - **Source**
 39 |   [dnmtools-1.5.1.tar.gz](https://github.com/smithlabcode/dnmtools/releases/download/v1.5.1/dnmtools-1.5.1.tar.gz). Dependencies:
 40 |   [GSL](http://www.gnu.org/software/gsl),
 41 |   [HTSlib](https://github.com/samtools/htslib),
 42 |   [libdeflate](https://github.com/ebiggers/libdeflate) and
 43 |   [ZLib](https://github.com/madler/zlib). Installing HTSlib as a package
 44 |   should also give you ZLib and libdeflate.  System-specific details below.
 45 | 
 46 |   Build DNMTools like this:
 47 |   ```console
 48 |   tar -xf dnmtools-1.5.1.tar.gz
 49 |   cd dnmtools-1.5.1
 50 |   ./configure --prefix=$HOME
 51 |   make
 52 |   make install
 53 |   ```
 54 | 
 55 |   To get dependencies and a compiler on each system (these might change with OS/package updates):
 56 | 
 57 |   Ubuntu/Debian
 58 |   ```console
 59 |   apt-get install build-essential htslib-dev libgsl-dev
 60 |   ```
 61 | 
 62 |   RedHat/Fedora
 63 |   ```console
 64 |   dnf install @c-development @development-tools htslib-devel gsl-devel awk
 65 |   ```
 66 | 
 67 |   Homebrew (see notes below)
 68 |   ```console
 69 |   brew install gcc htslib gsl
 70 |   ```
 71 | 
 72 |   Conda (see notes below)
 73 |   ```console
 74 |   conda create -n build-env -c conda-forge -c bioconda \
 75 |       gcc gxx make autoconf automake htslib gsl zlib binutils && \
 76 |   conda activate build-env
 77 |   ```
 78 | 
 79 |   Notes: If you use only Homebrew or only Conda to setup your environment, you
 80 |   could need additional dependencies, and some of what I listed you might
 81 |   already have. You might need to set additional environment variables or run
 82 |   configure differently. For example with Homebrew:
 83 |   ```console
 84 |   ./configure CPPFLAGS="-I$(brew --prefix)/include" LDFLAGS="-L$(brew --prefix)/lib"
 85 |   ```
 86 | 
 87 | ## Contact
 88 | 
 89 | Andrew D. Smith
 90 | andrewds@usc.edu
 91 | 
 92 | ## Copyright and License Information
 93 | 
 94 | Copyright (C) 2022-2025
 95 | 
 96 | Andrew D. Smith and Guilherme de Sena Brandine
 97 | 
 98 | Authors of DNMTools: Andrew D. Smith and Guilherme de Sena Brandine
 99 | 
100 | Essential contributors: Ben Decato, Meng Zhou, Liz Ji, Terence Li, Jenny Qu,
101 | Qiang Song, Fang Fang and Masaru Nakajima
102 | 
103 | This is free software: you can redistribute it and/or modify it under the
104 | terms of the GNU General Public License as published by the Free Software
105 | Foundation, either version 3 of the License, or (at your option) any later
106 | version.
107 | 
108 | This software is distributed in the hope that it will be useful, but WITHOUT
109 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
110 | FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
111 | details.
112 | 


--------------------------------------------------------------------------------
/docs/content/amrfinder.md:
--------------------------------------------------------------------------------
  1 | # amrfinder - Compute allelically methylated regions (AMRs)
  2 | 
  3 | ## Synopsis
  4 | ```shell
  5 | $ dnmtools amrfinder [OPTIONS] <input.epiread>
  6 | ```
  7 | 
  8 | ## Description
  9 | 
 10 | The program `amrfinder` scans the genome using a sliding window to
 11 | identify AMRs. For a genomic interval, two statistical models are
 12 | fitted to the reads mapped, respectively. One model (single-allele
 13 | model) assumes the two alleles have the same methylation state, and
 14 | the other (two-allele model) represents different methylation states
 15 | for the two alleles.  Comparing the likelihood of the two models, the
 16 | interrogated genomic interval may be classified as an AMR.  The
 17 | following command shows an example to run the program amrfinder and
 18 | takes as input an epireads file generated from
 19 | [states](../states).
 20 | 
 21 | ```shell
 22 | $ dnmtools amrfinder -c /path/to/genome.fa -o output.amr input.epiread
 23 | ```
 24 | 
 25 | There are several options for running amrfinder.
 26 | 
 27 |  * The `-b` switches from
 28 | using a likelihood ratio test to BIC as the criterion for calling an
 29 | AMR.
 30 | 
 31 |  * The `-i` option changes the number of iterations used in the EM
 32 | procedure when fitting the models.
 33 | 
 34 |  * The `-w` option changes the size of
 35 | the sliding window, which is in terms of CpGs. The default of 10 CpGs
 36 | per window has worked well for us.
 37 | 
 38 |  * The `-m` indicates the minimum
 39 | coverage per CpG site required for a window to be tested as an AMR.
 40 | The default requires 4 reads on average, and any lower will probably
 41 | lead to unreliable results.
 42 | 
 43 |  * The `-g` parameter is used to indicate the minimum gap allowed between
 44 |    any two identified AMRs; AMRs are often fragmented, as coverage
 45 | fluctuates, and spacing between CpGs means their linkage cannot be
 46 | captured by the model.  If two are any closer than this value, they
 47 | are merged. The default is 1000, and it seems to work well in
 48 | practice, not joining things that appear as though they should be
 49 | distinct. In the current version of the program, at the end of the
 50 | procedure, any AMRs whose size in terms of base-pairs is less than
 51 | half the "gap" size are eliminated. This is a hack that has produced
 52 | excellent results, but will eventually be eliminated (hopefully soon).
 53 | 
 54 |  * The `-C` parameter specifies the critical value for keeping windows
 55 |    as AMRs, and is only useful when the likelihood ratio test is
 56 | used; for BIC, windows are retained if the BIC for the two-allele model
 57 | is less than that for the single-allele model.  amrfinder calculates a
 58 | false discovery rate to correct for multiple testing, and therefore
 59 | most p-values that pass the test will be significantly below the
 60 | critical value.
 61 | 
 62 |  * The `-h` option produces FDR-adjusted p-values according to a
 63 |    step-up procedure and then compares them directly to the given
 64 | critical value, which allows further use of the p-values without
 65 | multiple testing correction.
 66 | 
 67 |  * The `-f` omits multiple testing correction entirely by not applying
 68 |    a correction to the p-values or using a false discovery rate cutoff
 69 | to select AMRs.
 70 | 
 71 | ## Options
 72 | 
 73 | ```txt
 74 |  -o, -output
 75 | ```
 76 | The name of the output file. If no file name is provided, the output
 77 | will be written to standard output. Due to the size of this output, a
 78 | file should be specified unless the output will be piped to another
 79 | command or program. The output file contains genomic intervals in BED
 80 | format.
 81 | 
 82 | ```txt
 83 |  -c, -chrom
 84 | ```
 85 | FASTA file or directory of chromosomes containing FASTA files. This
 86 | parameter is required.
 87 | 
 88 | ```txt
 89 |  -i, -itr
 90 | ```
 91 | The maximum number of iterations when training (default: 10).
 92 | 
 93 | ```txt
 94 |  -w, -window
 95 | ```
 96 | Size of sliding window (default: 10 CpG sites).
 97 | 
 98 | ```txt
 99 |  -m, -min-cov
100 | ```
101 | Minimum coverage per CpG to test in each window (default: 4).
102 | 
103 | ```txt
104 |  -g, -gap
105 | ```
106 | Minimum allowed gap, in bp, between AMRs (default: 1000).
107 | 
108 | ```txt
109 |  -C, -crit
110 | ```
111 | Critical p-value cutoff (default: 0.01).
112 | 
113 | ```txt
114 |  -f, -nofdr
115 | ```
116 | Omits the FDR multiple testing correction.
117 | 
118 | ```txt
119 |  -h, -pvals
120 | ```
121 | Adjusts p-values using Hochberg step-up.
122 | 
123 | ```txt
124 |  -b, -bic
125 | ```
126 | Use Bayesian Information Criterion (BIC) to compare models.
127 | 
128 | ```txt
129 |  -v, -verbose
130 | ```
131 | Print more information while the command is running.
132 | 
133 | ```txt
134 |  -P, -progress
135 | ```
136 | Print progress info while the command is running.
137 | 


--------------------------------------------------------------------------------