├── .gitignore
├── .gitmodules
├── Makefile
├── Makefile.am
├── README.md
├── configure.ac
├── docs
├── figs
│ ├── A-rich-base-composition.eps
│ ├── A-rich-base-composition.pdf
│ ├── Methpipe_work_flow.eps
│ ├── Methpipe_work_flow.pdf
│ ├── Methpipe_work_flow.tif
│ ├── T-rich-base-composition.eps
│ └── T-rich-base-composition.pdf
├── methpipe-manual.bib
├── methpipe-manual.pdf
└── methpipe-manual.tex
├── m4
├── ax_cxx_check_lib.m4
├── ax_cxx_compile_stdcxx.m4
├── ax_cxx_compile_stdcxx_11.m4
└── ax_subdirs_configure.m4
└── src
├── Makefile
├── amrfinder
├── Makefile
├── allelicmeth.cpp
├── amrfinder.cpp
└── amrtester.cpp
├── analysis
├── Makefile
├── bsrate.cpp
├── bsrate_sam.cpp
├── hmr.cpp
├── hmr_rep.cpp
├── hypermr.cpp
├── levels.cpp
├── methcounts.cpp
├── methentropy.cpp
├── methstates.cpp
├── multimethstat.cpp
├── pmd.cpp
└── roimethstat.cpp
├── common-experimental
├── Makefile
├── ModelParams.cxx
├── ModelParams.hpp
├── ThreeStateHDHMM.cpp
├── ThreeStateHDHMM.hpp
├── TwoStateCTHMM.cpp
├── TwoStateCTHMM.hpp
├── contingency-table.cpp
├── contingency-table.hpp
├── false_discovery_rate.cpp
├── false_discovery_rate.hpp
├── nonparametric-test.cpp
└── nonparametric-test.hpp
├── common
├── BetaBin.cpp
├── BetaBin.hpp
├── Distro.cpp
├── Distro.hpp
├── EmissionDistribution.cpp
├── EmissionDistribution.hpp
├── Epiread.cpp
├── Epiread.hpp
├── EpireadStats.cpp
├── EpireadStats.hpp
├── LevelsCounter.cpp
├── LevelsCounter.hpp
├── Makefile
├── MethpipeFiles.cpp
├── MethpipeFiles.hpp
├── MethpipeSite.cpp
├── MethpipeSite.hpp
├── Smoothing.cpp
├── Smoothing.hpp
├── ThreeStateHMM.cpp
├── ThreeStateHMM.hpp
├── TwoStateHMM.cpp
├── TwoStateHMM.hpp
├── TwoStateHMM_PMD.cpp
├── TwoStateHMM_PMD.hpp
├── bsutils.cpp
├── bsutils.hpp
├── numerical_utils.cpp
└── numerical_utils.hpp
├── experimental
├── Makefile
└── dmr-hdhmm.cpp
├── mlml
├── Makefile
└── mlml.cpp
├── radmeth
├── LICENSE
├── Makefile
├── README.md
├── dmr.cpp
├── methdiff.cpp
├── radmeth-adjust.cpp
├── radmeth-merge.cpp
└── radmeth.cpp
└── utils
├── Makefile
├── bigWig_to_methcounts.py
├── clean-hairpins.cpp
├── duplicate-remover.cpp
├── duplicate-remover_sam.cpp
├── fast-liftover.cpp
├── format_reads.cpp
├── guessprotocol.cpp
├── lc_approx.cpp
├── lift-filter.cpp
├── merge-bsrate.cpp
├── merge-methcounts.cpp
├── selectsites.cpp
└── symmetric-cpgs.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | # no .svn directories
2 | .svn/
3 | bin/
4 |
5 | # no compiled object file
6 | *.[oa]
7 | *.so
8 |
9 | # no temporary file
10 | *~
11 | *.out
12 | *.aux
13 | *.log
14 | *.eps
15 | *.tif
16 | *.dvi
17 |
18 | # ignore binary file
19 | src/amrfinder/allelicmeth
20 | src/amrfinder/amrfinder
21 | src/amrfinder/amrtester
22 | src/analysis/amr
23 | src/analysis/bsrate
24 | src/analysis/checkoverlap
25 | src/analysis/clipmates
26 | src/analysis/hmr
27 | src/analysis/hmr_plant
28 | src/analysis/hmr_posterior
29 | src/analysis/hmr_rep
30 | src/analysis/hypermr
31 | src/analysis/levels
32 | src/analysis/mappedstat
33 | src/analysis/merge-bsrate
34 | src/analysis/merge-counts
35 | src/analysis/mergelanes
36 | src/analysis/merge-methcounts
37 | src/analysis/methcounts
38 | src/analysis/methentropy
39 | src/analysis/methstates
40 | src/analysis/mlml
41 | src/analysis/pairedend_stat
42 | src/analysis/pmd
43 | src/analysis/roimethstat
44 | src/analysis/sortreads
45 | src/cytosines/cytosines
46 | src/experimental/dmr-hdhmm
47 | src/mlml/mlml
48 | src/pipeline/build_methylome.py
49 | src/pipeline/run_clipmates.py
50 | src/postmapping/clipmates
51 | src/postmapping/duplicate-remover
52 | src/postmapping/frag2mr
53 | src/postmapping/mask-overlap
54 | src/postmapping/merge
55 | src/postmapping/reorder
56 | src/postmapping/revcomp
57 | src/postmapping/sort
58 | src/postmapping/unique
59 | src/premapping/read-quality-prof
60 | src/premapping/trim-adapter
61 | src/premapping/visireads
62 | src/radmeth/dmr
63 | src/radmeth/make_table
64 | src/radmeth/methdiff
65 | src/radmeth/radmeth
66 | src/rmapbs/rmapbs
67 | src/smithlab_cpp/libsmithlab_cpp.so
68 | src/utils/clean-hairpins
69 | src/utils/duplicate-remover
70 | src/utils/fast-liftover
71 | src/utils/fastLiftOver
72 | src/utils/fastLiftOver2
73 | src/utils/lc_approx
74 | src/utils/lift-filter
75 | src/utils/merge-bsrate
76 | src/utils/merge-methcounts
77 | src/utils/selectsites
78 | src/utils/symmetric-cpgs
79 | src/utils/to-mr
80 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/smithlab_cpp"]
2 | path = src/smithlab_cpp
3 | url = ../smithlab_cpp.git
4 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # This file is part of the methpipe system
2 | #
3 | # Copyright (C) 2010-2014 University of Southern California and
4 | # Andrew D. Smith
5 | #
6 | # Authors: Andrew D. Smith
7 | #
8 | # This program is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # This program is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU General Public License
19 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
20 | #
21 |
22 | # Directory make was started in (expected to be the repository root);
23 | # handed to the sub-make as METHPIPE_ROOT.
24 | METHPIPE_ROOT = $(shell pwd)
25 |
26 | # None of these targets name a file they create.
27 | .PHONY: all install clean distclean
28 |
29 | # all/install/clean delegate to src/Makefile. $(MAKE) is used instead of a
30 | # literal `make` so the sub-make is the same make program, shares the -j
31 | # job server, and honours -n/-t/-q.
32 | all:
33 | 	@$(MAKE) -C src METHPIPE_ROOT=$(METHPIPE_ROOT)
34 |
35 | install:
36 | 	@$(MAKE) -C src METHPIPE_ROOT=$(METHPIPE_ROOT) install
37 |
38 | clean:
39 | 	@$(MAKE) -C src METHPIPE_ROOT=$(METHPIPE_ROOT) clean
40 |
41 | # distclean additionally removes the bin directory under METHPIPE_ROOT.
42 | distclean: clean
43 | 	@rm -rf $(METHPIPE_ROOT)/bin
44 |
--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | # This file is part of methpipe
2 | #
3 | # Copyright (C) 2010-2019: Andrew D. Smith
4 | #
5 | # Authors: Andrew D. Smith
6 | #
7 | # This is free software: you can redistribute it and/or modify it
8 | # under the terms of the GNU General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This software is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 | # General Public License for more details.
16 |
17 | EXTRA_DIST = README.md
18 | ACLOCAL_AMFLAGS = -I m4
19 | # smithlab_cpp is the only SUBDIRS entry; the target-specific assignment filters it out again for install/installdirs.
20 | SUBDIRS := src/smithlab_cpp
21 | install installdirs: SUBDIRS := $(filter-out src/smithlab_cpp, $(SUBDIRS))
22 | AM_CPPFLAGS = -I $(top_srcdir)/src/common -I $(top_srcdir)/src/smithlab_cpp
23 |
24 | CXXFLAGS = -O3 # default has optimization on
25 |
26 | noinst_LIBRARIES = libmethpipe.a
27 | libmethpipe_a_SOURCES = \
28 | src/common/BetaBin.cpp \
29 | src/common/Distro.cpp \
30 | src/common/EmissionDistribution.cpp \
31 | src/common/Epiread.cpp \
32 | src/common/EpireadStats.cpp \
33 | src/common/LevelsCounter.cpp \
34 | src/common/MethpipeSite.cpp \
35 | src/common/Smoothing.cpp \
36 | src/common/ThreeStateHMM.cpp \
37 | src/common/TwoStateHMM.cpp \
38 | src/common/TwoStateHMM_PMD.cpp \
39 | src/common/bsutils.cpp \
40 | src/common/numerical_utils.cpp
41 |
42 | libmethpipe_a_SOURCES += \
43 | src/common/BetaBin.hpp \
44 | src/common/Distro.hpp \
45 | src/common/EmissionDistribution.hpp \
46 | src/common/Epiread.hpp \
47 | src/common/EpireadStats.hpp \
48 | src/common/LevelsCounter.hpp \
49 | src/common/MethpipeFiles.hpp \
50 | src/common/MethpipeSite.hpp \
51 | src/common/Smoothing.hpp \
52 | src/common/ThreeStateHMM.hpp \
53 | src/common/TwoStateHMM.hpp \
54 | src/common/TwoStateHMM_PMD.hpp \
55 | src/common/bsutils.hpp \
56 | src/common/numerical_utils.hpp
57 |
58 | LDADD = libmethpipe.a src/smithlab_cpp/libsmithlab_cpp.a
59 |
60 | ### ANALYSIS SUBDIR
61 | bin_PROGRAMS = pmd methcounts bsrate hmr hypermr levels roimethstat \
62 | methstates methentropy hmr_rep multimethstat
63 |
64 | pmd_SOURCES = src/analysis/pmd.cpp
65 | methstates_SOURCES = src/analysis/methstates.cpp
66 | bsrate_SOURCES = src/analysis/bsrate.cpp
67 | methentropy_SOURCES = src/analysis/methentropy.cpp
68 | methcounts_SOURCES = src/analysis/methcounts.cpp
69 | roimethstat_SOURCES = src/analysis/roimethstat.cpp
70 | multimethstat_SOURCES = src/analysis/multimethstat.cpp
71 | hmr_SOURCES = src/analysis/hmr.cpp
72 | hmr_rep_SOURCES = src/analysis/hmr_rep.cpp
73 | levels_SOURCES = src/analysis/levels.cpp
74 | hypermr_SOURCES = src/analysis/hypermr.cpp
75 |
76 | ### UTILS SUBDIR
77 | bin_PROGRAMS += lc_approx fast-liftover lift-filter merge-bsrate \
78 | merge-methcounts duplicate-remover symmetric-cpgs \
79 | clean-hairpins selectsites guessprotocol format_reads
80 |
81 | clean_hairpins_SOURCES = src/utils/clean-hairpins.cpp
82 | guessprotocol_SOURCES = src/utils/guessprotocol.cpp
83 | duplicate_remover_SOURCES = src/utils/duplicate-remover.cpp
84 | merge_bsrate_SOURCES = src/utils/merge-bsrate.cpp
85 | format_reads_SOURCES = src/utils/format_reads.cpp
86 | lc_approx_SOURCES = src/utils/lc_approx.cpp
87 | selectsites_SOURCES = src/utils/selectsites.cpp
88 | symmetric_cpgs_SOURCES = src/utils/symmetric-cpgs.cpp
89 | merge_methcounts_SOURCES = src/utils/merge-methcounts.cpp
90 | lift_filter_SOURCES = src/utils/lift-filter.cpp
91 | fast_liftover_SOURCES = src/utils/fast-liftover.cpp
92 |
93 | ### AMRFINDER SUBDIR
94 | bin_PROGRAMS += allelicmeth amrfinder amrtester
95 |
96 | allelicmeth_SOURCES = src/amrfinder/allelicmeth.cpp
97 | amrfinder_SOURCES = src/amrfinder/amrfinder.cpp
98 | amrtester_SOURCES = src/amrfinder/amrtester.cpp
99 |
100 | ### RADMETH SUBDIR
101 | bin_PROGRAMS += radmeth radmeth-adjust radmeth-merge methdiff dmr
102 |
103 | dmr_SOURCES = src/radmeth/dmr.cpp
104 | methdiff_SOURCES = src/radmeth/methdiff.cpp
105 | radmeth_SOURCES = src/radmeth/radmeth.cpp
106 | radmeth_adjust_SOURCES = src/radmeth/radmeth-adjust.cpp
107 | radmeth_merge_SOURCES = src/radmeth/radmeth-merge.cpp
108 |
109 | ### MLML SUBDIR
110 | bin_PROGRAMS += mlml
111 |
112 | mlml_SOURCES = src/mlml/mlml.cpp
113 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | NOTICE: MethPipe is no longer maintained in this repository
2 | ============================================================
3 |
4 | MethPipe is now [DNMTools](https://github.com/smithlabcode/dnmtools). The
5 | MethPipe repository will remain open for issues and discussion, but further releases,
6 | updates and fixes will no longer be maintained in this page. We strongly
7 | recommend that users replace their existing release of MethPipe with the
8 | most recent version of DNMTools, which contains all existing MethPipe programs
9 | along with various fixes, improvements and novel programs for bisulfite sequencing
10 | data analysis.
11 |
12 | [Visit the DNMTools repository](https://github.com/smithlabcode/dnmtools)
13 |
14 | ------------------------------------------------------------------
15 |
16 | The MethPipe software package is a computational pipeline for
17 | analyzing bisulfite sequencing data (WGBS and RRBS). MethPipe provides
18 | tools for methylation-specific technical evaluation of sequencing data,
19 | and for estimating methylation levels at individual cytosines.
20 | Additionally, MethPipe includes tools for identifying higher-level
21 | methylation features, such as hypo-methylated regions (HMR), partially
22 | methylated domains (PMD), hyper-methylated regions (HyperMR), and
23 | allele-specific methylated regions (AMR).
24 |
25 | Release 5.0.1
26 | ===================
27 |
28 | This new release no longer supports `mr` files, which means that the
29 | `to-mr` program has been eliminated and replaced by a program called
30 | `format_reads`, which merges mates in paired-end SAM files, also
31 | converting them to a standardized SAM format depending on the mapper
32 | it originates from. Additionally, the `htslib` library is now
33 | required, and instructions to install it in different environments are
34 | discussed below.
35 |
36 | If working with MR files is necessary for your analysis, we refer
37 | users to methpipe version [5.0.1](https://github.com/smithlabcode/methpipe/releases/tag/v5.0.1),
38 | which is the last release that contains programs that take MR as
39 | input.
40 |
41 | ## Installing release 5.0.1
42 |
43 | ### Required libraries
44 |
45 | * A recent compiler: most users will be building and installing this
46 | software with GCC. We require a compiler that fully supports C++11,
47 | so we recommend using at least GCC 5.8. There are still many systems
48 | that install a very old version of GCC by default, so if you have
49 | problems with building this software, that might be the first thing
50 | to check.
51 | * The GNU Scientific Library: this has always been required. It can be
52 | installed using `apt` on Linux, using `brew` on macOS, or from
53 | source available [here](http://www.gnu.org/software/gsl).
54 | * The Zlib compression library. Most likely you already have this
55 | installed on your system. If not, it can be installed using `apt`
56 | on Linux through the package `zlib1g-dev`. On macOS, Zlib can be
57 | installed with `brew`.
58 | * The HTSlib library, which can be installed through `brew`
59 | on macOS, through `apt` on Linux, or from source downloadable
60 | [here](https://github.com/samtools/htslib).
61 |
62 | ### Configuration
63 |
64 | 1. Download methpipe-5.0.1.tar.gz [here](https://github.com/smithlabcode/methpipe/releases/download/v5.0.1/methpipe-5.0.1.tar.gz).
65 | 2. Unpack the archive:
66 | ```
67 | $ tar -zxvf methpipe-5.0.1.tar.gz
68 | ```
69 | 3. Move into the methpipe directory and create a build directory:
70 | ```
71 | $ cd methpipe-5.0.1
72 | $ mkdir build && cd build
73 | ```
74 | 4. Run the configuration script:
75 | ```
76 | $ ../configure
77 | ```
78 | If you do not want to install methpipe system-wide, or if you do
79 | not have admin privileges, specify a prefix directory:
80 | ```
81 | $ ../configure --prefix=/some/reasonable/place
82 | ```
83 | If you installed HTSlib yourself in some non-standard directory,
84 | you must specify the location like this:
85 | ```
86 | $ ../configure CPPFLAGS='-I /path/to/htslib/headers' \
87 | LDFLAGS='-L/path/to/htslib/lib'
88 | ```
89 |
90 | ### Building and installing the tools
91 |
92 | If you are still in the `build` directory, run `make` to compile the
93 | tools, and then `make install` to install them. If your HTSlib is not
94 | installed system-wide, then you might need to update your library
95 | path:
96 | ```
97 | $ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/path/to/htslib/lib
98 | ```
99 |
100 | ### Building and installing from source
101 |
102 | We strongly recommend using methpipe through the latest stable release
103 | under the releases section on GitHub. However, developers who wish to
104 | work on the latest commits, which are potentially unstable, can
105 | compile the cloned repository using the `Makefile` available in the
106 | repository. If HTSLib is available system-wide, compile by running
107 | ```
108 | make
109 | ```
110 |
111 | Usage
112 | =====
113 |
114 | Read methpipe-manual.pdf in the docs directory.
115 |
116 | Contacts and bug reports
117 | ========================
118 |
119 | Andrew D. Smith
120 | andrewds@usc.edu
121 |
122 | Ben Decato
123 | decato@usc.edu
124 |
125 | Meng Zhou
126 | mengzhou@usc.edu
127 |
128 | MethPipe and MethBase Users' Mailinglist
129 | methpipe@googlegroups.com
130 | http://groups.google.com/group/methpipe
131 |
132 | Copyright and License Information
133 | =================================
134 |
135 | Copyright (C) 2018-2021
136 | University of Southern California,
137 | Andrew D. Smith
138 |
139 | Current Authors: Andrew D. Smith, Ben Decato, Meng Zhou, Liz Ji,
140 | Terence Li, Guilherme de Sena Brandine
141 |
142 | This is free software: you can redistribute it and/or modify it under
143 | the terms of the GNU General Public License as published by the Free
144 | Software Foundation, either version 3 of the License, or (at your
145 | option) any later version.
146 |
147 | This software is distributed in the hope that it will be useful, but
148 | WITHOUT ANY WARRANTY; without even the implied warranty of
149 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
150 | General Public License for more details.
151 |
--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
1 | dnl This file is part of methpipe
2 | dnl
3 | dnl Copyright (C) 2019: Andrew D. Smith
4 | dnl
5 | dnl Authors: Andrew D. Smith
6 | dnl
7 | dnl This is free software: you can redistribute it and/or modify it
8 | dnl under the terms of the GNU General Public License as published by
9 | dnl the Free Software Foundation, either version 3 of the License, or
10 | dnl (at your option) any later version.
11 | dnl
12 | dnl This software is distributed in the hope that it will be useful,
13 | dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 | dnl General Public License for more details.
16 |
17 | dnl AC_INIT comes first: Autoconf expects it before any other macro that produces output.
18 | AC_INIT([methpipe], [5.0.1], [andrewds@usc.edu],
19 |         [methpipe], [https://github.com/smithlabcode/methpipe])
20 | AC_CONFIG_SUBDIRS([src/smithlab_cpp])
21 | dnl the config.h is not currently #included in the source, and only
22 | dnl used to keep command lines short.
23 | AC_CONFIG_HEADERS([config.h])
24 | AM_INIT_AUTOMAKE([subdir-objects foreign])
25 |
26 | AC_CONFIG_MACRO_DIR([m4])
27 | AC_LANG(C++)
28 | AC_PROG_CXX
29 | AX_CXX_COMPILE_STDCXX_11([noext], [mandatory])
30 | AC_PROG_RANLIB
31 |
32 | dnl recursively configure smithlab_cpp
33 | AX_SUBDIRS_CONFIGURE([src/smithlab_cpp], [--enable-hts])
34 |
35 | dnl failure messages reported when a required library cannot be found
36 | hts_fail_msg="
37 | Failed to locate HTSLib on your system. Please use the LDFLAGS and
38 | CPPFLAGS variables to specify the directories where the HTSLib library
39 | and headers can be found.
40 | "
41 |
42 | gsl_fail_msg="
43 | Failed to locate the GNU Scientific Library (GSL) on your system. Please use
44 | the LDFLAGS and CPPFLAGS variables to specify the directories where the GSL
45 | library and headers can be found.
46 | "
47 |
48 | zlib_fail_msg="
49 | Failed to locate the ZLib on your system. Please use the LDFLAGS and CPPFLAGS
50 | variables to specify the directories where the ZLib library and headers can be
51 | found.
52 | "
53 |
54 | dnl check for required libraries
55 | AC_SEARCH_LIBS([hts_version], [hts], [], [AC_MSG_FAILURE([$hts_fail_msg])])
56 | AC_SEARCH_LIBS([zlibVersion], [z], [], [AC_MSG_FAILURE([$zlib_fail_msg])])
57 | AC_SEARCH_LIBS([cblas_dgemm], [gslcblas], [], [AC_MSG_FAILURE([$gsl_fail_msg])])
58 | AC_SEARCH_LIBS([gsl_blas_dgemm], [gsl], [], [AC_MSG_FAILURE([$gsl_fail_msg])])
59 |
60 | AC_CONFIG_FILES([
61 | Makefile
62 | ])
63 | AC_OUTPUT
64 |
--------------------------------------------------------------------------------
/docs/figs/A-rich-base-composition.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/A-rich-base-composition.pdf
--------------------------------------------------------------------------------
/docs/figs/Methpipe_work_flow.eps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/Methpipe_work_flow.eps
--------------------------------------------------------------------------------
/docs/figs/Methpipe_work_flow.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/Methpipe_work_flow.pdf
--------------------------------------------------------------------------------
/docs/figs/Methpipe_work_flow.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/Methpipe_work_flow.tif
--------------------------------------------------------------------------------
/docs/figs/T-rich-base-composition.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/T-rich-base-composition.pdf
--------------------------------------------------------------------------------
/docs/methpipe-manual.bib:
--------------------------------------------------------------------------------
1 |
2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 | % Internal citations
4 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
5 |
6 | @article{song2013reference,
7 | title={A reference methylome database and analysis pipeline to facilitate integrative and comparative epigenomics},
8 | author={Song, Qiang and Decato, Benjamin and Hong, Elizabeth E and Zhou, Meng and Fang, Fang and Qu, Jianghan and Garvin, Tyler and Kessler, Michael and Zhou, Jun and Smith, Andrew D},
9 | journal={PloS one},
10 | volume={8},
11 | number={12},
12 | pages={e81148},
13 | year={2013},
14 | publisher={Public Library of Science}
15 | }
16 |
17 | @article{fang2012genomic,
18 | title={Genomic landscape of human allele-specific DNA methylation},
19 | author={Fang, Fang and Hodges, Emily and Molaro, Antoine and Dean, Matthew and Hannon, Gregory J and Smith, Andrew D},
20 | journal={Proceedings of the National Academy of Sciences},
21 | volume={109},
22 | number={19},
23 | pages={7332--7337},
24 | year={2012},
25 | publisher={National Acad Sciences}
26 | }
27 |
28 | @article{qu2013mlml,
29 | title={MLML: Consistent simultaneous estimates of DNA methylation and hydroxymethylation},
30 | author={Qu, Jianghan and Zhou, Meng and Song, Qiang and Hong, Elizabeth E and Smith, Andrew D},
31 | journal={Bioinformatics},
32 | volume={29},
33 | number={20},
34 | pages={2645--2646},
35 | year={2013}
36 | }
37 |
38 | @article{dolzhenko2014using,
39 | title={Using beta-binomial regression for high-precision differential methylation analysis in multifactor whole-genome bisulfite sequencing experiments},
40 | author={Dolzhenko, Egor and Smith, Andrew D},
41 | journal={BMC bioinformatics},
42 | volume={15},
43 | number={1},
44 | pages={1--8},
45 | year={2014},
46 | publisher={BioMed Central}
47 | }
48 |
49 | @article{decato2020characterization,
50 | title={Characterization of universal features of partially methylated domains across tissues and species},
51 | author={Decato, Benjamin E and Qu, Jianghan and Ji, Xiaojing and Wagenblast, Elvin and Knott, Simon RV and Hannon, Gregory J and Smith, Andrew D},
52 | journal={Epigenetics \& Chromatin},
53 | year={2020},
54 | publisher={BioMed Central}
55 | }
56 |
57 |
58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
59 | % External citations
60 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
61 |
62 | @article{altham1969exact,
63 | title={Exact Bayesian analysis of a 2$\times$ 2 contingency table, and Fisher's" exact" significance test},
64 | author={Altham, Patricia ME},
65 | journal={Journal of the Royal Statistical Society. Series B (Methodological)},
66 | pages={261--269},
67 | year={1969},
68 | publisher={JSTOR}
69 | }
70 |
71 |
72 | @article{xie2011,
73 | author={ Xie, Hehuang and Wang, Min and Andrade, Alexandre de and Bonaldo, Maria de F. and Galat, Vasil and Arndt, Kelly and Rajaram, Veena and Goldman, Stewart and Tomita, Tadanori and Soares,Marcelo B.},
74 | title = {Genome-wide quantitative assessment of variation in DNA methylation patterns},
75 | journal = {Nucl. Acids Res.},
76 | volume = {39},
77 | number ={10},
78 | pages ={4099--4108},
79 | year= {2011}
80 | }
81 |
--------------------------------------------------------------------------------
/docs/methpipe-manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/methpipe-manual.pdf
--------------------------------------------------------------------------------
/m4/ax_cxx_check_lib.m4:
--------------------------------------------------------------------------------
1 | dnl @synopsis AX_CXX_CHECK_LIB(libname, functionname, action-if, action-if-not)
2 | dnl
3 | dnl The standard AC_CHECK_LIB can not test functions in namespaces.
4 | dnl Therefore AC_CHECK_LIB(cgicc, cgicc::Cgicc::getVersion) will always
5 | dnl fail. We need to decompose the functionname into a series of namespaces
6 | dnl where it gets declared so that it can be used for a link test.
7 | dnl
8 | dnl In the first version I did allow namespace::functionname to be a
9 | dnl reference to a void-argument global functionname (just wrapped in a
10 | dnl namespace) like its C counterparts would be - but in reality such
11 | dnl thing does not exist. The only global / static functions are always
12 | dnl made const-functions which is an attribute mangled along into the
13 | dnl library function export name.
14 | dnl
15 | dnl The normal usage will ask for a test of a class-member function which
16 | dnl should be presented with a full function spec with arguments given in
17 | dnl parentheses following the function name - if the function to test for
18 | dnl does expect arguments then you should add default initial values in the
19 | dnl prototype (even if they do not exist originally, these are used only
20 | dnl locally to build a correct function call in the configure test script).
21 | dnl
22 | dnl In the current version if you do omit the parenthesis from the macro
23 | dnl argument then the macro will assume that you want to check for the
24 | dnl class name - which is really to check for default constructor being
25 | dnl exported from the given library name.
26 | dnl
27 | dnl EXAMPLE:
28 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::HTTPCookie])
29 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::Cgicc::getVersion () const])
30 | dnl AX_CXX_CHECK_LIB(boost_regex, [boost::RegEx::Position (int i = 0) const])
31 | dnl
32 | dnl Result:
33 | dnl Just as the usual AC_CHECK_LIB - defines HAVE_LIBCGICC
34 | dnl and adds the libraries to the default library path (and
35 | dnl uses internally the normal ac_check_lib cache symbol
36 | dnl like ac_cv_lib_cgicc_cgicc__Cgicc)
37 | dnl
38 | dnl Footnote: The C++ language is not good at creating stable library
39 | dnl interfaces at the binary level - a lot of functionality is usually being
40 | dnl given as inline functions plus there is hardly a chance to create opaque
41 | dnl types. Therefore most C++ library tests will only do compile tests using
42 | dnl the header files. Doing a check_lib is however good to check the link
43 | dnl dependency before hitting it as an error in the build later.
44 | dnl
45 | dnl @category C++
46 | dnl @author Guido U. Draheim
47 | dnl @version 2006-12-18
48 | dnl Args: libname, functionname, [action-if-found], [action-if-not-found], [other-libs (added to LIBS only for the link test)]
49 | AC_DEFUN([AX_CXX_CHECK_LIB],
50 | [m4_ifval([$3], , [AH_CHECK_LIB([$1])])dnl
51 | AS_LITERAL_IF([$1],
52 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1_$2])],
53 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1''_$2])])dnl
54 | AC_CACHE_CHECK([for $2 in -l$1], ac_Lib,
55 | [ac_check_lib_save_LIBS=$LIBS
56 | LIBS="-l$1 $5 $LIBS"
57 | case "$2"
58 | in *::*::*\(*)
59 | AC_LINK_IFELSE([AC_LANG_PROGRAM([
60 | namespace `echo "$2" | sed -e "s/::.*//"`
61 | { class `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/" -e "s/(.*//"`
62 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`;
63 | };
64 | }
65 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])],
66 | [AS_VAR_SET(ac_Lib, yes)],
67 | [AS_VAR_SET(ac_Lib, no)])
68 | ;; *::*::*)
69 | AC_LINK_IFELSE([AC_LANG_PROGRAM([
70 | namespace `echo "$2" | sed -e "s/::.*//"`
71 | { namespace `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/"`
72 | { class `echo "$2" | sed -e "s/.*:://"`
73 | { public: `echo "$2" | sed -e "s/.*:://"` ();
74 | };
75 | }
76 | }
77 | ],[new $2()])],
78 | [AS_VAR_SET(ac_Lib, yes)],
79 | [AS_VAR_SET(ac_Lib, no)])
80 | ;; *::*\(*)
81 | AC_LINK_IFELSE([AC_LANG_PROGRAM([
82 | class `echo "$2" | sed -e "s/\\(.*\\)::.*/\\1/" -e "s/(.*//"`
83 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`;
84 | };
85 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])],
86 | [AS_VAR_SET(ac_Lib, yes)],
87 | [AS_VAR_SET(ac_Lib, no)])
88 | ;; *::*)
89 | AC_LINK_IFELSE([AC_LANG_PROGRAM([
90 | namespace `echo "$2" | sed -e "s/::.*//"`
91 | { class `echo "$2" | sed -e "s/.*:://"`
92 | { public: `echo "$2" | sed -e "s/.*:://"` ();
93 | };
94 | }
95 | ],[new $2()])],
96 | [AS_VAR_SET(ac_Lib, yes)],
97 | [AS_VAR_SET(ac_Lib, no)])
98 | ;; *)
99 | AC_LINK_IFELSE([AC_LANG_CALL([], [$2])],
100 | [AS_VAR_SET(ac_Lib, yes)],
101 | [AS_VAR_SET(ac_Lib, no)])
102 | ;; esac
103 | LIBS=$ac_check_lib_save_LIBS])
104 | AS_IF([test AS_VAR_GET(ac_Lib) = yes],
105 | [m4_default([$3], [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_LIB$1))
106 | LIBS="-l$1 $LIBS"
107 | ])],
108 | [$4])dnl
109 | AS_VAR_POPDEF([ac_Lib])dnl
110 | ])# AX_CXX_CHECK_LIB
111 |
--------------------------------------------------------------------------------
/m4/ax_cxx_compile_stdcxx_11.m4:
--------------------------------------------------------------------------------
1 | # =============================================================================
2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
3 | # =============================================================================
4 | #
5 | # SYNOPSIS
6 | #
7 | # AX_CXX_COMPILE_STDCXX_11([ext|noext], [mandatory|optional])
8 | #
9 | # DESCRIPTION
10 | #
11 | # Check for baseline language coverage in the compiler for the C++11
12 | # standard; if necessary, add switches to CXX and CXXCPP to enable
13 | # support.
14 | #
15 | # This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX
16 | # macro with the version set to C++11. The two optional arguments are
17 | # forwarded literally as the second and third argument respectively.
18 | # Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for
19 | # more information. If you want to use this macro, you also need to
20 | # download the ax_cxx_compile_stdcxx.m4 file.
21 | #
22 | # LICENSE
23 | #
24 | # Copyright (c) 2008 Benjamin Kosnik
25 | # Copyright (c) 2012 Zack Weinberg
26 | # Copyright (c) 2013 Roy Stogner
27 | # Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov
28 | # Copyright (c) 2015 Paul Norman
29 | # Copyright (c) 2015 Moritz Klammler
30 | #
31 | # Copying and distribution of this file, with or without modification, are
32 | # permitted in any medium without royalty provided the copyright notice
33 | # and this notice are preserved. This file is offered as-is, without any
34 | # warranty.
35 |
36 | #serial 18
37 |
38 | AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX])
39 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [AX_CXX_COMPILE_STDCXX([11], [$1], [$2])])
40 |
--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile from methpipe software
2 | #
3 | # Copyright (C) 2010-2014 University of Southern California and
4 | # Andrew D. Smith
5 | #
6 | # Authors: Andrew D. Smith
7 | #
8 | # This program is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # This program is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU General Public License
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 | #
21 |
# Driver for everything under src/.
#
# Targets:
#   all      build the smithlab_cpp library, then each program directory
#   install  run `install` in each program directory
#   clean    run `clean` in every directory, smithlab_cpp included
#
# Variables handed to every sub-make:
#   SMITHLAB_CPP  absolute path of the smithlab_cpp checkout
#   SRC_ROOT      root of the source tree (the value of METHPIPE_ROOT)

# check if recursive clone was done
SMITHLAB_CPP=$(abspath $(dir $(MAKEFILE_LIST)))/smithlab_cpp

ifeq (,$(wildcard $(SMITHLAB_CPP)/Makefile))
$(error src/smithlab_cpp does not have a Makefile. \
	Did you use --recursive when running git clone?)
endif

# A caller may supply METHPIPE_ROOT. When it does not, fall back to the
# parent of this directory; otherwise SRC_ROOT would be passed down
# empty and `install` would expand its destination to /bin.
METHPIPE_ROOT ?= $(abspath $(dir $(firstword $(MAKEFILE_LIST)))/..)

all_subdirs=common utils analysis amrfinder mlml radmeth
lib_subdirs=common
app_subdirs=analysis utils amrfinder mlml radmeth

all_subdirs += $(SMITHLAB_CPP)
lib_subdirs += $(SMITHLAB_CPP)

# $(MAKE) rather than a literal `make` so that -j, -n and command-line
# variables reach the sub-makes. `|| exit 1` stops the shell loop at the
# first failing directory instead of reporting only the last status.
all:
	$(MAKE) -C $(SMITHLAB_CPP) HAVE_HTSLIB=1 all
	@for i in $(app_subdirs); do \
		$(MAKE) -C $${i} \
			SMITHLAB_CPP=$(SMITHLAB_CPP) \
			SRC_ROOT=$(METHPIPE_ROOT) || exit 1; \
	done

install:
	@for i in $(app_subdirs); do \
		$(MAKE) -C $${i} \
			SMITHLAB_CPP=$(SMITHLAB_CPP) \
			SRC_ROOT=$(METHPIPE_ROOT) install || exit 1; \
	done

# all_subdirs already ends with $(SMITHLAB_CPP), so the library is
# cleaned by the loop and needs no separate command.
clean:
	@for i in $(all_subdirs); do \
		$(MAKE) -C $${i} \
			SMITHLAB_CPP=$(SMITHLAB_CPP) \
			SRC_ROOT=$(METHPIPE_ROOT) clean || exit 1; \
	done

.PHONY: all install clean
59 |
--------------------------------------------------------------------------------
/src/amrfinder/Makefile:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2014 University of Southern California
2 | # and Andrew D. Smith and Benjamin E. Decato
3 | #
4 | # Authors: Andrew D. Smith and Benjamin E. Decato
5 | #
6 | # This is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This software is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this software; if not, write to the Free Software
18 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
19 | # 02110-1301 USA
20 | #
21 |
# Builds the programs allelicmeth, amrfinder and amrtester.
#
# Required variable:
#   SMITHLAB_CPP  directory holding the smithlab_cpp headers and
#                 libsmithlab_cpp.a
# Optional variables:
#   SRC_ROOT      install root; programs are copied to $(SRC_ROOT)/bin
#   DEBUG         when set, compile with -g instead of -O3

ifndef SMITHLAB_CPP
$(error SMITHLAB_CPP variable undefined)
endif

# Same default as src/analysis/Makefile. Without it an unset SRC_ROOT
# makes the install destination expand to /bin.
ifndef SRC_ROOT
SRC_ROOT=../..
endif

PROGS = allelicmeth amrfinder amrtester

CXX = g++
CXXFLAGS = -Wall -std=c++11
OPTFLAGS = -O3
DEBUGFLAGS = -g

ifdef DEBUG
CXXFLAGS += $(DEBUGFLAGS)
else
CXXFLAGS += $(OPTFLAGS)
endif

COMMON_DIR = ../common
INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR)
INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS))
override CPPFLAGS += $(INCLUDEARGS)

LDLIBS = -lgsl -lgslcblas -lz

all: $(PROGS)

install: $(PROGS)
	@mkdir -p $(SRC_ROOT)/bin
	@install -m 755 $(PROGS) $(SRC_ROOT)/bin

# Every program links against the smithlab_cpp static library.
$(PROGS): $(addprefix $(SMITHLAB_CPP)/, libsmithlab_cpp.a)

# Extra object files each program needs from ../common.
amrfinder: $(addprefix $(COMMON_DIR)/, EpireadStats.o Epiread.o)

amrtester: $(addprefix $(COMMON_DIR)/, EpireadStats.o Epiread.o)

allelicmeth: $(addprefix $(COMMON_DIR)/, Epiread.o)

%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS)

# $^ carries the .cpp file plus every prerequisite listed above.
%: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS) $(LDFLAGS)

clean:
	@-rm -f $(PROGS) *.o *.so *.a *~

# all and install are not files either; declare them phony so a stray
# file with one of those names cannot mask the target.
.PHONY: all install clean
70 |
--------------------------------------------------------------------------------
/src/analysis/Makefile:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2014 University of Southern California
2 | # and Andrew D. Smith and Benjamin E. Decato
3 | #
4 | # Authors: Andrew D. Smith and Benjamin E. Decato
5 | #
6 | # This is free software; you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation; either version 2 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This software is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this software; if not, write to the Free Software
18 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
19 | # 02110-1301 USA
20 |
# Builds the analysis programs listed in PROGS.
#
# Required variable:
#   SMITHLAB_CPP  directory holding the smithlab_cpp headers and
#                 libsmithlab_cpp.a
# Optional variables:
#   SRC_ROOT      install root (default ../..); programs are copied to
#                 $(SRC_ROOT)/bin
#   DEBUG         when set, compile with -g instead of -O3

PROGS = pmd methcounts bsrate hmr hypermr levels roimethstat \
	methstates methentropy hmr_rep multimethstat

CXX = g++
CXXFLAGS = -Wall -std=c++11
OPTFLAGS = -O3
DEBUGFLAGS = -g

ifndef SRC_ROOT
SRC_ROOT=../..
endif

ifndef SMITHLAB_CPP
$(error SMITHLAB_CPP variable undefined)
endif

ifdef DEBUG
CXXFLAGS += $(DEBUGFLAGS)
else
CXXFLAGS += $(OPTFLAGS)
endif

COMMON_DIR = ../common
INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR)
INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS))
override CPPFLAGS += $(INCLUDEARGS)

LDLIBS = -lgsl -lgslcblas -lz -lhts

all: $(PROGS)

install: $(PROGS)
	@mkdir -p $(SRC_ROOT)/bin
	@install -m 755 $(PROGS) $(SRC_ROOT)/bin

# Every program links against the smithlab_cpp static library.
$(PROGS): $(addprefix $(SMITHLAB_CPP)/, libsmithlab_cpp.a)

# Extra object files each program needs from ../common. methcounts is
# listed once here; a second, identical methcounts rule was dropped.
levels hmr_rep hmr methcounts roimethstat hypermr pmd: \
	$(addprefix $(COMMON_DIR)/, MethpipeSite.o)

hmr hmr_rep: $(addprefix $(COMMON_DIR)/, TwoStateHMM.o)

pmd: $(addprefix $(COMMON_DIR)/, bsutils.o \
	TwoStateHMM_PMD.o EmissionDistribution.o)

hypermr: $(addprefix $(COMMON_DIR)/, ThreeStateHMM.o Smoothing.o \
	Distro.o BetaBin.o numerical_utils.o)

levels: $(addprefix $(COMMON_DIR)/, LevelsCounter.o)

roimethstat levels: $(addprefix $(COMMON_DIR)/, bsutils.o)

%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS)

# $^ carries the .cpp file plus every prerequisite listed above.
%: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS) $(LDFLAGS)

clean:
	@-rm -f $(PROGS) *.o *.so *.a *~

# all and install are not files either; declare them phony so a stray
# file with one of those names cannot mask the target.
.PHONY: all install clean
85 |
--------------------------------------------------------------------------------
/src/analysis/levels.cpp:
--------------------------------------------------------------------------------
1 | /* levels: a program to compute coverage statistics, mutation rates,
2 | * and three different formulas for methylation levels described in
3 | * the paper:
4 | *
5 | * 'Leveling' the playing field for analyses of single-base
6 | * resolution DNA methylomes
7 | * Schultz, Schmitz & Ecker (TIG 2012)
8 | *
9 | * Note: the fractional methylation level calculated in this program
10 | * is inspired but different from the paper. What we are doing here is
11 | * using binomial test to determine significantly hyper/hypomethylated
12 | * sites, and only use these subset of sites to calculate methylation
13 | * level.
14 | *
15 | * Copyright (C) 2014-2015 University of Southern California and
16 | * Andrew D. Smith and Benjamin E Decato
17 | *
18 | * Authors: Andrew D. Smith and Benjamin E Decato
19 | *
20 | * This program is free software: you can redistribute it and/or modify
21 | * it under the terms of the GNU General Public License as published by
22 | * the Free Software Foundation, either version 3 of the License, or
23 | * (at your option) any later version.
24 | *
25 | * This program is distributed in the hope that it will be useful,
26 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 | * GNU General Public License for more details.
29 | */
30 |
31 | #include
32 | #include
33 | #include
34 | #include
35 | #include
36 | #include
37 | #include
38 |
39 | #include "OptionParser.hpp"
40 | #include "smithlab_utils.hpp"
41 | #include "smithlab_os.hpp"
42 | #include "MethpipeSite.hpp"
43 | #include "LevelsCounter.hpp"
44 | #include "zlib_wrapper.hpp"
45 | #include "bsutils.hpp"
46 |
47 | using std::string;
48 | using std::vector;
49 | using std::cout;
50 | using std::cerr;
51 | using std::endl;
52 | using std::to_string;
53 | using std::runtime_error;
54 |
55 |
56 | int
57 | main(int argc, const char **argv) {
58 |
59 | try {
60 |
61 | bool VERBOSE = false;
62 | string outfile;
63 |
64 | /****************** COMMAND LINE OPTIONS ********************/
65 | OptionParser opt_parse(strip_path(argv[0]), "compute methylation levels",
66 | "");
67 | opt_parse.add_opt("output", 'o', "output file (default: stdout)",
68 | false, outfile);
69 | opt_parse.add_opt("alpha", 'a', "alpha for confidence interval",
70 | false, LevelsCounter::alpha);
71 | opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
72 | vector leftover_args;
73 | opt_parse.parse(argc, argv, leftover_args);
74 | if (opt_parse.help_requested()) {
75 | cerr << opt_parse.help_message() << endl
76 | << opt_parse.about_message() << endl;
77 | return EXIT_SUCCESS;
78 | }
79 | if (opt_parse.about_requested()) {
80 | cerr << opt_parse.about_message() << endl;
81 | return EXIT_SUCCESS;
82 | }
83 | if (opt_parse.option_missing()) {
84 | cerr << opt_parse.option_missing_message() << endl;
85 | return EXIT_SUCCESS;
86 | }
87 | if (leftover_args.size() != 1) {
88 | cerr << opt_parse.help_message() << endl;
89 | return EXIT_SUCCESS;
90 | }
91 | const string meth_file = leftover_args.front();
92 | /****************** END COMMAND LINE OPTIONS *****************/
93 |
94 | igzfstream in(meth_file);
95 | if (!in)
96 | throw std::runtime_error("bad input file: " + meth_file);
97 |
98 | LevelsCounter cpg("cpg");
99 | LevelsCounter cpg_symmetric("cpg_symmetric");
100 | LevelsCounter chh("chh");
101 | LevelsCounter cxg("cxg");
102 | LevelsCounter ccg("ccg");
103 | LevelsCounter cytosines("cytosines");
104 |
105 | MSite site, prev_site;
106 | size_t chrom_count = 0;
107 |
108 | while (in >> site) {
109 |
110 | if (site.chrom != prev_site.chrom) {
111 | ++chrom_count;
112 | if (VERBOSE)
113 | cerr << "PROCESSING:\t" << site.chrom << "\n";
114 | }
115 |
116 | if (site.is_cpg()) {
117 | cpg.update(site);
118 | if (site.is_mate_of(prev_site)) {
119 | site.add(prev_site);
120 | cpg_symmetric.update(site);
121 | }
122 | }
123 | else if (site.is_chh()) chh.update(site);
124 | else if (site.is_ccg()) ccg.update(site);
125 | else if (site.is_cxg()) cxg.update(site);
126 | else throw runtime_error("bad site context: " + site.context);
127 |
128 | cytosines.update(site);
129 |
130 | prev_site = site;
131 | }
132 |
133 | std::ofstream of;
134 | if (!outfile.empty()) of.open(outfile.c_str());
135 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
136 |
137 | out << cytosines << endl
138 | << cpg << endl
139 | << cpg_symmetric << endl
140 | << chh << endl
141 | << ccg << endl
142 | << cxg << endl;
143 | }
144 | catch (const std::exception &e) {
145 | cerr << e.what() << endl;
146 | return EXIT_FAILURE;
147 | }
148 | return EXIT_SUCCESS;
149 | }
150 |
--------------------------------------------------------------------------------
/src/common-experimental/Makefile:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2011 University of Southern California and
2 | # Andrew D. Smith
3 | #
4 | # Authors: Andrew D. Smith
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | #
19 |
# Compiles the experimental common sources into object files.
# SMITHLAB_CPP must name the smithlab_cpp directory. Setting DEBUG
# selects -g in place of -O3.

ifndef SMITHLAB_CPP
$(error SMITHLAB_CPP variable undefined)
endif

CXX = g++
OPTFLAGS = -O3
DEBUGFLAGS = -g
CXXFLAGS = -std=c++11 -Wall

# header search path: smithlab_cpp first, then ../common
INCLUDEDIRS = $(SMITHLAB_CPP)/ ../common/
INCLUDEARGS = $(INCLUDEDIRS:%=-I%)

ifndef DEBUG
CXXFLAGS += $(OPTFLAGS)
else
CXXFLAGS += $(DEBUGFLAGS)
endif

# an object is rebuilt when either its source or its header changes
%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(INCLUDEARGS)

clean:
	@-rm -f *.o *~
.PHONY: clean
44 |
--------------------------------------------------------------------------------
/src/common-experimental/ModelParams.cxx:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 University of Southern California
3 | * Andrew D Smith and Qiang Song
4 | * Author: Qiang Song
5 | *
6 | * This is free software; you can redistribute it and/or modify it
7 | * under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation; either version 2 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this software; if not, write to the Free Software
18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
19 | * 02110-1301 USA
20 | */
21 |
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 | #include
28 | #include
29 |
30 | #include "smithlab_utils.hpp"
31 |
32 | using std::vector;
33 | using std::string;
34 | using std::endl;
35 | using std::cerr;
36 | using std::getline;
37 |
38 | static void
39 | convert_to_stringstream(const string &infile, std::stringstream &ss)
40 | {
41 | std::ifstream in(infile.c_str());
42 |
43 | while (!in.eof())
44 | {
45 | string str;
46 | getline(in, str);
47 | const size_t comment_start = str.find("#");
48 | if (comment_start != string::npos)
49 | str.erase(comment_start);
50 | str = smithlab::strip(str);
51 | if (!str.empty())
52 | ss << str << endl;
53 | }
54 | in.close();
55 | }
56 |
57 | template void
58 | read_param_file(const std::string &infile, size_t &n,
59 | std::vector > &trans,
60 | std::vector &emissions,
61 | std::vector &durations);
62 | {
63 | std::stringstream ss(std::stringstream::in | std::stringstream::out);
64 | convert_to_stringstream(infile, ss);
65 |
66 | ss >> n;
67 | string tmp_str;
68 | getline(ss, tmp_str);
69 |
70 | emissions.clear();
71 | for (size_t i = 0; i < n; ++i)
72 | {
73 | string tmp_str;
74 | getline(ss, tmp_str);
75 | emissions.push_back(Distro_Type(tmp_str));
76 | }
77 |
78 | durations.clear();
79 | for (size_t i = 0; i < n; ++i)
80 | {
81 | string tmp_str;
82 | getline(ss, tmp_str);
83 | durations.push_back(Distro_Type(tmp_str));
84 | }
85 |
86 | trans.resize(n, vector(n));
87 | for (size_t i = 0; i < n; ++i)
88 | for (size_t j = 0; j < n; ++j)
89 | ss >> trans[i][j];
90 | }
91 |
/* Write model parameters to OUTFILE.
 *
 * Layout: a "# ..." header line precedes each section. The sections
 * are the state count N, the elements of EMISSIONS one per line, the
 * elements of DURATIONS one per line, and the rows of TRANS with each
 * value followed by a tab.
 *
 * Distro_Type must be writable with operator<<. At most N rows of
 * TRANS are written, and never more rows than TRANS holds.
 */
template <class Distro_Type> void
write_param_file(const std::string &outfile, const size_t &n,
                 const std::vector<std::vector<double> > &trans,
                 const std::vector<Distro_Type> &emissions,
                 const std::vector<Distro_Type> &durations)
{
  std::ofstream out(outfile.c_str());

  out << "# number of states" << std::endl;
  out << n << std::endl;

  out << "\n# emmission distributions" << std::endl;
  std::copy(emissions.begin(), emissions.end(),
            std::ostream_iterator<Distro_Type>(out, "\n"));

  out << "\n# duration distributions" << std::endl;
  std::copy(durations.begin(), durations.end(),
            std::ostream_iterator<Distro_Type>(out, "\n"));

  out << "\n# state transition probabilities" << std::endl;
  // also bounded by trans.size() so a mismatched N cannot read past
  // the last row
  for (size_t i = 0; i < n && i < trans.size(); ++i)
    {
      std::copy(trans[i].begin(), trans[i].end(),
                std::ostream_iterator<double>(out, "\t"));
      out << std::endl;
    }
}
119 |
120 |
--------------------------------------------------------------------------------
/src/common-experimental/ModelParams.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 University of Southern California
3 | * Andrew D Smith and Qiang Song
4 | * Author: Qiang Song
5 | *
6 | * This is free software; you can redistribute it and/or modify it
7 | * under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation; either version 2 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this software; if not, write to the Free Software
18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
19 | * 02110-1301 USA
20 | */
21 |
22 | #ifndef MODEL_PARAMS_HPP
23 | #define MODEL_PARAMS_HPP
24 |
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 | #include
31 |
32 | #include "smithlab_utils.hpp"
33 |
// Forward declarations of the two function templates defined further
// down in this header. Distro_Type is the distribution class used for
// both emissions and durations.
template <class Distro_Type> void
read_param_file(const std::string &infile, size_t &n,
                std::vector<std::vector<double> > &trans,
                std::vector<Distro_Type> &emissions,
                std::vector<Distro_Type> &durations);

template <class Distro_Type> void
write_param_file(const std::string &outfile, const size_t &n,
                 const std::vector<std::vector<double> > &trans,
                 const std::vector<Distro_Type> &emissions,
                 const std::vector<Distro_Type> &durations);
45 |
46 | static void
47 | convert_to_stringstream(const std::string &infile, std::stringstream &ss)
48 | {
49 | std::ifstream in(infile.c_str());
50 |
51 | while (!in.eof())
52 | {
53 | std::string str;
54 | std::getline(in, str);
55 | const size_t comment_start = str.find("#");
56 | if (comment_start != std::string::npos)
57 | str.erase(comment_start);
58 | str = smithlab::strip(str);
59 | if (!str.empty())
60 | ss << str << std::endl;
61 | }
62 | in.close();
63 | }
64 |
65 | template void
66 | read_param_file(const std::string &infile, size_t &n,
67 | std::vector > &trans,
68 | std::vector &emissions,
69 | std::vector &durations)
70 | {
71 | std::stringstream ss(std::stringstream::in | std::stringstream::out);
72 | convert_to_stringstream(infile, ss);
73 |
74 | ss >> n;
75 | std::string tmp_str;
76 | getline(ss, tmp_str);
77 |
78 | emissions.clear();
79 | for (size_t i = 0; i < n; ++i)
80 | {
81 | std::string tmp_str;
82 | std::getline(ss, tmp_str);
83 | emissions.push_back(Distro_Type(tmp_str));
84 | }
85 |
86 | durations.clear();
87 | for (size_t i = 0; i < n; ++i)
88 | {
89 | std::string tmp_str;
90 | std::getline(ss, tmp_str);
91 | durations.push_back(Distro_Type(tmp_str));
92 | }
93 |
94 | trans.resize(n, std::vector(n));
95 | for (size_t i = 0; i < n; ++i)
96 | for (size_t j = 0; j < n; ++j)
97 | ss >> trans[i][j];
98 | }
99 |
/* Write model parameters to OUTFILE.
 *
 * Layout: a "# ..." header line precedes each section. The sections
 * are the state count N, the elements of EMISSIONS one per line, the
 * elements of DURATIONS one per line, and the rows of TRANS with each
 * value followed by a tab.
 *
 * Distro_Type must be writable with operator<<. At most N rows of
 * TRANS are written, and never more rows than TRANS holds.
 */
template <class Distro_Type> void
write_param_file(const std::string &outfile, const size_t &n,
                 const std::vector<std::vector<double> > &trans,
                 const std::vector<Distro_Type> &emissions,
                 const std::vector<Distro_Type> &durations)
{
  std::ofstream out(outfile.c_str());

  out << "# number of states" << std::endl;
  out << n << std::endl;

  out << "\n# emmission distributions" << std::endl;
  std::copy(emissions.begin(), emissions.end(),
            std::ostream_iterator<Distro_Type>(out, "\n"));

  out << "\n# duration distributions" << std::endl;
  std::copy(durations.begin(), durations.end(),
            std::ostream_iterator<Distro_Type>(out, "\n"));

  out << "\n# state transition probabilities" << std::endl;
  // also bounded by trans.size() so a mismatched N cannot read past
  // the last row
  for (size_t i = 0; i < n && i < trans.size(); ++i)
    {
      std::copy(trans[i].begin(), trans[i].end(),
                std::ostream_iterator<double>(out, "\t"));
      out << std::endl;
    }
}
127 | #endif
128 |
129 |
--------------------------------------------------------------------------------
/src/common-experimental/ThreeStateHDHMM.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2011 University of Southern California
3 | Authors: Andrew D. Smith, Song Qiang
4 |
5 | This file is part of rmap.
6 |
7 | rmap is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation; either version 2 of the License, or
10 | (at your option) any later version.
11 |
12 | rmap is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with rmap; if not, write to the Free Software
19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 | */
21 |
22 | #ifndef THREE_STATE_HDHMM_HPP
23 | #define THREE_STATE_HDHMM_HPP
24 |
25 | #include
26 | #include
27 | #include
28 | #include
29 |
30 | #include "smithlab_utils.hpp"
31 | #include "Distro.hpp"
32 |
// Labels for the three states. No values are assigned explicitly, so
// GAIN is 0, SAME is 1 and LOSS is 2.
enum STATE_LABELS {
  GAIN,
  SAME,
  LOSS
};

// One double for each of the three states.
struct Triplet {
  double gain;
  double same;
  double loss;
};
35 |
// Interface of a model with three states (gain, same, loss), each with
// its own emission Distro and duration Distro. This header only
// declares the class; no member function is defined here.
// NOTE(review): the element types of the std::vector parameters and
// members are missing in this listing -- confirm them against the
// implementation before relying on this declaration.
36 | class ThreeStateHDHMM {
37 | public:
// Takes the observations and reset points together with a tolerance,
// an iteration cap, a verbosity flag and a maximum length.
// NOTE(review): how tol, max_itr and _MAX_LEN are used is not visible
// here; presumably they fill tolerance, max_iterations and MAX_LEN.
38 | ThreeStateHDHMM(
39 | const std::vector &_observations,
40 | const std::vector &_reset_points,
41 | const double tol,
42 | const size_t max_itr, const bool v,
43 | const size_t _MAX_LEN);
44 |
// Accepts one emission and one duration Distro per state plus _trans.
45 | void
46 | set_parameters(const Distro & _gain_emission,
47 | const Distro & _same_emission,
48 | const Distro & _loss_emission,
49 | const Distro & _gain_duration,
50 | const Distro & _same_duration,
51 | const Distro & _loss_duration,
52 | const std::vector > & _trans);
// Counterpart of set_parameters with non-const output references; the
// function itself is const.
53 | void
54 | get_parameters(Distro & _gain_emission,
55 | Distro & _same_emission,
56 | Distro & _loss_emission,
57 | Distro & _gain_duration,
58 | Distro & _same_duration,
59 | Distro & _loss_duration,
60 | std::vector > & _trans) const;
61 |
// Training and decoding entry points; both return a double.
// NOTE(review): the meaning of the returned value is not visible here.
62 | double
63 | BaumWelchTraining();
64 |
65 | double
66 | PosteriorDecoding();
67 |
// Fills the two output vectors passed by reference.
68 | void
69 | get_posterior_scores(std::vector &scores,
70 | std::vector &classes);
71 |
72 | private:
73 |
74 | //////////// methods ////////////
75 | double
76 | single_iteration();
// forward/backward passes over the index range given by start and end
// NOTE(review): whether `end` is inclusive cannot be told from here.
77 | double
78 | forward_algorithm(const size_t start, const size_t end);
79 | double
80 | backward_algorithm(const size_t start, const size_t end);
81 |
// one segment log-likelihood function per state, same signature each
82 | double
83 | gain_segment_log_likelihood(const size_t start, const size_t end);
84 |
85 | double
86 | same_segment_log_likelihood(const size_t start, const size_t end);
87 |
88 | double
89 | loss_segment_log_likelihood(const size_t start, const size_t end);
90 |
91 | void
92 | estimate_state_posterior(const size_t start, const size_t end);
93 |
94 | void estimate_parameters();
95 |
96 | void update_observation_likelihood();
97 |
98 | //////// data ////////
99 | std::vector observations;
100 | std::vector reset_points;
101 | std::vector meth_lp, unmeth_lp;
102 | std::vector gain_log_likelihood, same_log_likelihood, loss_log_likelihood;
103 |
104 | // HMM internal data
105 | Distro gain_emission, same_emission, loss_emission;
106 | Distro gain_duration, same_duration, loss_duration;
107 |
// one value per state (see Triplet) for the start and for the end
108 | Triplet lp_start, lp_end;
109 | std::vector > trans;
110 |
111 | std::vector forward;
112 | std::vector backward;
113 | std::vector gain_posteriors, same_posteriors, loss_posteriors;
114 |
115 | // parameters
116 | // double MIN_PROB;
117 | double tolerance;
118 | size_t max_iterations;
119 | bool VERBOSE;
120 | size_t MAX_LEN;
121 | };
122 | // }
123 |
124 | #endif
125 |
--------------------------------------------------------------------------------
/src/common-experimental/TwoStateCTHMM.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2019 Andrew D. Smith
3 | Author: Andrew D. Smith
4 |
5 | This is free software; you can redistribute it and/or modify it
6 | under the terms of the GNU General Public License as published by
7 | the Free Software Foundation; either version 2 of the License, or
8 | (at your option) any later version.
9 |
10 | This software is distributed in the hope that it will be useful, but
11 | WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | General Public License for more details.
14 | */
15 |
16 | #ifndef TWO_STATE_CTHMM_HPP
17 | #define TWO_STATE_CTHMM_HPP
18 |
19 | #include "smithlab_utils.hpp"
20 | #include
21 |
// Forward declarations only; neither type is defined in this header.
// prob_mat is not referenced anywhere else in this header.
22 | struct betabin;
23 | struct prob_mat;
24 |
// Interface of the two-state (foreground/background) model. Every
// member function declared below is const-qualified, and the model
// parameters are passed in and out through the argument lists.
// NOTE(review): the element types of the std::vector parameters are
// missing in this listing -- confirm them against the implementation.
25 | class TwoStateCTHMM {
26 | public:
27 |
// Stores the run settings. `ds` arrives as a double but initialises
// the uint32_t member desert_size, so it is converted on the way in.
// The debug flag `d` defaults to false.
28 | TwoStateCTHMM(const double ds, const double mp, const double tol,
29 | const size_t max_itr, const bool v, bool d = false) :
30 | desert_size(ds), MIN_PROB(mp), tolerance(tol), max_iterations(max_itr),
31 | VERBOSE(v), DEBUG(d) {}
32 |
// All inputs are const; the only output argument is ml_classes.
33 | double
34 | ViterbiDecoding(const std::vector &pos,
35 | const std::vector > &values,
36 | const std::vector &reset_points,
37 | const std::vector &start_trans,
38 | const std::vector > &trans,
39 | const std::vector &end_trans,
40 | const double fg_alpha, const double fg_beta,
41 | const double bg_alpha, const double bg_beta,
42 | std::vector &ml_classes) const;
43 |
44 |
// The start/end transitions, mu0/mu1 and the four alpha/beta values are
// non-const references here, i.e. this overload can modify them.
45 | double
46 | BaumWelchTraining(const std::vector &pos,
47 | const std::vector > &values,
48 | const std::vector &reset_points,
49 | std::vector &start_trans,
50 | double &mu0, double &mu1,
51 | std::vector &end_trans,
52 | double &fg_alpha, double &fg_beta,
53 | double &bg_alpha, double &bg_beta) const;
54 |
// Same parameter set as above but taken as const; the outputs are
// classes and llr_scores.
55 | double
56 | PosteriorDecoding(const std::vector &pos,
57 | const std::vector > &values,
58 | const std::vector &reset_points,
59 | const std::vector &start_trans,
60 | const double mu0, const double mu1,
61 | const std::vector &end_trans,
62 | const double fg_alpha, const double fg_beta,
63 | const double bg_alpha, const double bg_beta,
64 | std::vector &classes,
65 | std::vector &llr_scores) const;
66 |
67 | std::string
68 | error_log() const;
69 |
// NOTE(review): where these two constants are consumed is not visible
// in this header.
70 | static const size_t FG_TO_BG_TRANSITION = 1;
71 | static const size_t BG_TO_FG_TRANSITION = 2;
72 |
73 | private:
74 |
// Private overloads of the public functions: the start/end values come
// as individual doubles (p_sf, p_sb, p_ft, p_bt) and the emission
// distributions as betabin objects instead of alpha/beta pairs.
75 | double
76 | BaumWelchTraining(const std::vector &pos,
77 | const std::vector > &values,
78 | const std::vector &reset_points,
79 | double &p_sf, double &p_sb,
80 | double &mu0, double &mu1,
81 | double &p_ft, double &p_bt,
82 | betabin &fg_distro, betabin &bg_distro) const;
83 |
84 | double
85 | PosteriorDecoding(const std::vector &pos,
86 | const std::vector > &values,
87 | const std::vector &reset_points,
88 | const double p_sf, const double p_sb,
89 | const double mu0, const double mu1,
90 | const double p_ft, const double p_bt,
91 | const betabin &fg_distro,
92 | const betabin &bg_distro,
93 | std::vector &classes,
94 | std::vector &llr_scores) const;
95 |
// Takes the forward and backward tables plus every model parameter by
// non-const reference.
96 | double
97 | single_iteration(const std::vector &pos,
98 | const std::vector > &values,
99 | const std::vector &vals_a,
100 | const std::vector &vals_b,
101 | const std::vector &reset_points,
102 | std::vector > &forward,
103 | std::vector > &backward,
104 | double &p_sf, double &p_sb,
105 | double &mu0, double &mu1,
106 | double &p_ft, double &p_bt,
107 | betabin &fg_distro, betabin &bg_distro) const;
108 |
// forward/backward passes over the range given by start and end; the
// table to fill is the last argument (f or b).
// NOTE(review): the lp_ prefix presumably marks log probabilities.
109 | double
110 | forward_algorithm(const std::vector &pos,
111 | const std::vector > &vals,
112 | const size_t start, const size_t end,
113 | const double lp_sf, const double lp_sb,
114 | const std::vector &lm,
115 | const double lp_ft, const double lp_bt,
116 | const betabin &fg_distro,
117 | const betabin &bg_distro,
118 | std::vector > &f) const;
119 | double
120 | backward_algorithm(const std::vector &pos,
121 | const std::vector > &vals,
122 | const size_t start, const size_t end,
123 | const double lp_sf, const double lp_sb,
124 | const std::vector &lm,
125 | const double lp_ft, const double lp_bt,
126 | const betabin &fg_distro,
127 | const betabin &bg_distro,
128 | std::vector > &b) const;
129 |
130 | double
131 | log_sum_log_vec(const std::vector &vals, size_t limit) const;
132 |
// Reads the forward (f) and backward (b) tables; writes fg_probs and
// bg_probs.
133 | void
134 | estimate_emissions(const std::vector > &f,
135 | const std::vector > &b,
136 | std::vector &fg_probs,
137 | std::vector &bg_probs) const;
138 |
// The four trailing vectors (ff, fb, bf, bb) are the outputs.
139 | void
140 | estimate_transitions(const std::vector &pos,
141 | const std::vector > &vals,
142 | const size_t start, const size_t end,
143 | const std::vector > &f,
144 | const std::vector > &b,
145 | const double total,
146 | const betabin &fg_distro,
147 | const betabin &bg_distro,
148 | const std::vector &lm,
149 | std::vector &ff_vals,
150 | std::vector &fb_vals,
151 | std::vector &bf_vals,
152 | std::vector &bb_vals) const;
153 |
154 | double
155 | log_sum_log(const double p, const double q) const;
156 |
// run settings; all six are set by the constructor
157 | uint32_t desert_size;
158 | double MIN_PROB;
159 | double tolerance;
160 | size_t max_iterations;
161 | bool VERBOSE;
162 | bool DEBUG;
163 | };
164 |
165 | #endif
166 |
--------------------------------------------------------------------------------
/src/common-experimental/contingency-table.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2011 University of Southern California and
3 | * Andrew D. Smith, Song Qiang
4 | *
5 | * Authors: Andrew D. Smith, Song Qiang
6 | *
7 | * This program is free software: you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation, either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | #include
22 |
23 | #include
24 |
25 | #include "contingency-table.hpp"
26 | #include "numerical_utils.hpp"
27 |
28 | using std::min;
29 |
30 | static inline double
31 | log_prob_hypergeo(const size_t meth_a, const size_t unmeth_a,
32 | const size_t meth_b, const size_t unmeth_b,
33 | const size_t k)
34 | {
35 | return gsl_sf_lnchoose(meth_b + unmeth_b - 1, k) +
36 | gsl_sf_lnchoose(meth_a + unmeth_a - 1, meth_a + meth_b - 1 - k) -
37 | gsl_sf_lnchoose(meth_a + unmeth_a + meth_b + unmeth_b - 2,
38 | meth_a + meth_b - 1);
39 | }
40 |
41 | double
42 | ContingencyTable::beta_population_greater(
43 | const size_t meth_a, const size_t unmeth_a,
44 | const size_t meth_b, const size_t unmeth_b)
45 | {
46 | double p = 0;
47 |
48 | for (size_t k = meth_b > unmeth_a ? meth_b - unmeth_a : 0;
49 | k < meth_b; ++k)
50 | p = log_sum_log(p, log_prob_hypergeo(
51 | meth_a, unmeth_a, meth_b, unmeth_b, k));
52 | return exp(p);
53 | }
54 |
55 |
--------------------------------------------------------------------------------
/src/common-experimental/contingency-table.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2011 University of Southern California and
3 | * Andrew D. Smith, Song Qiang
4 | *
5 | * Authors: Andrew D. Smith, Song Qiang
6 | *
7 | * This program is free software: you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation, either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | #ifndef CONTINGENCY_TABLE_HPP
22 | #define CONTINGENCY_TABLE_HPP
23 |
// Tests on 2x2 tables of methylated/unmethylated counts for two samples
// (a and b).
namespace ContingencyTable
{
  // Returns exp of the log-sum of hypergeometric-style terms over k in
  // [max(0, meth_b - unmeth_a), meth_b); see contingency-table.cpp.
  double
  beta_population_greater(const size_t meth_a, const size_t unmeth_a,
                          const size_t meth_b, const size_t unmeth_b);
}
30 |
31 | #endif
32 |
--------------------------------------------------------------------------------
/src/common-experimental/false_discovery_rate.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2011 University of Southern California and
3 | * Andrew D. Smith, Song Qiang
4 | *
5 | * Authors: Andrew D. Smith, Song Qiang
6 | *
7 | * This program is free software: you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation, either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | #include
22 |
23 | #include
24 | #include
25 | #include
26 |
27 | #include "false_discovery_rate.hpp"
28 |
29 | using std::vector;
30 | using std::upper_bound;
31 |
32 | double
33 | FDR::get_empirical_p_value(const vector &random_scores,
34 | const double &observed_score)
35 | {
36 | return random_scores.size() == 0 ? 0 :
37 | (random_scores.end() -
38 | upper_bound(random_scores.begin(), random_scores.end(), observed_score))
39 | / static_cast(random_scores.size());
40 | }
41 |
42 |
43 | void
44 | FDR::assign_empirical_p_values(
45 | const vector &random_scores,
46 | const vector &observed_scores,
47 | vector &p_values)
48 | {
49 | // make sure random_scores are sorted
50 | assert(std::adjacent_find(random_scores.begin(), random_scores.end(),
51 | std::greater())
52 | == random_scores.end());
53 |
54 | // get p_values
55 | p_values.resize(observed_scores.size());
56 | for (size_t i = 0; i < observed_scores.size(); ++i)
57 | p_values[i] = get_empirical_p_value(random_scores, observed_scores[i]);
58 |
59 | // std::transform(observed_scores.begin(), observed_scores.end(),
60 | // p_values.begin(),
61 | // std::bind1st(std::ptr_fun(get_empirical_p_value),
62 | // random_scores));
63 | }
64 |
65 | double
66 | FDR::get_fdr_cutoff(const vector &p_values, const double fdr)
67 | {
68 | if (fdr < 0) return 0;
69 | else if (fdr > 1) return 1;
70 |
71 | vector local(p_values);
72 | std::sort(local.begin(), local.end());
73 | assert(local.size() > 0);
74 | size_t i = 0;
75 | for (; i < local.size() - 1 &&
76 | local[i+1] < fdr*static_cast(i+1)/local.size(); ++i);
77 | assert(i < local.size());
78 | return local[i];
79 | }
80 |
81 |
--------------------------------------------------------------------------------
/src/common-experimental/false_discovery_rate.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2011 University of Southern California and
3 | * Andrew D. Smith, Song Qiang
4 | *
5 | * Authors: Andrew D. Smith, Song Qiang
6 | *
7 | * This program is free software: you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation, either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | #ifndef FALSE_DISCOVERY_RATE
22 | #define FALSE_DISCOVERY_RATE
23 |
24 | #include
25 |
// Empirical p-values and false-discovery-rate cutoffs.
namespace FDR
{
  // Fraction of random_scores strictly greater than observed_score;
  // random_scores must be sorted ascending.
  double
  get_empirical_p_value(const std::vector<double> &random_scores,
                        const double &observed_score);

  // Resizes p_values to observed_scores.size() and fills it with the
  // empirical p-value of each observed score.
  void
  assign_empirical_p_values(const std::vector<double> &random_scores,
                            const std::vector<double> &observed_scores,
                            std::vector<double> &p_values);

  // p-value cutoff for the given false discovery rate.
  double
  get_fdr_cutoff(const std::vector<double> &p_values, const double fdr);
}
38 |
39 | #endif
40 |
--------------------------------------------------------------------------------
/src/common-experimental/nonparametric-test.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2011 University of Southern California and
3 | * Song Qiang
4 | *
5 | * Authors: Song Qiang
6 | *
7 | * This program is free software: you can redistribute it and/or modify
8 | * it under the terms of the GNU General Public License as published by
9 | * the Free Software Foundation, either version 3 of the License, or
10 | * (at your option) any later version.
11 | *
12 | * This program is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | * GNU General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU General Public License
18 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | #ifndef NON_PARAMETRIC_TEST_HPP
22 | #define NON_PARAMETRIC_TEST_HPP
23 |
24 | #include
25 |
// Paired non-parametric tests on two samples x and y.
// NOTE(review): the element type of the vectors was lost from this listing;
// double is assumed -- confirm against nonparametric-test.cpp.
namespace NonParametricTest
{
  // Sign test on x and y; `alternative` defaults to false.
  double
  sign_test(const std::vector<double> &x,
            const std::vector<double> &y,
            const bool alternative = false);

  // Wilcoxon test on x and y; `alternative` defaults to false.
  double
  wilcoxon_test(const std::vector<double> &x,
                const std::vector<double> &y,
                const bool alternative = false);
}
38 |
39 | #endif
40 |
--------------------------------------------------------------------------------
/src/common/BetaBin.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2011 University of Southern California
3 | Authors: Andrew D. Smith, Song Qiang
4 |
5 | This file is part of rmap.
6 |
7 | rmap is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation; either version 2 of the License, or
10 | (at your option) any later version.
11 |
12 | rmap is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with rmap; if not, write to the Free Software
19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 | */
21 |
22 | #include "BetaBin.hpp"
23 |
24 | #include
25 |
26 | #include
27 | #include
28 | #include
29 | #include
30 |
31 | #include
32 | #include
33 |
34 |
35 | using std::vector;
36 | using std::pair;
37 | using std::setw;
38 | using std::max;
39 | using std::min;
40 | using std::cerr;
41 | using std::endl;
42 | using std::string;
43 | using std::setprecision;
44 |
45 | //////////////////////////////////////////////
46 | ////// struct betabin //////
47 | //////////////////////////////////////////////
48 |
49 | const double betabin::tolerance = 1e-10;
50 |
51 | betabin::betabin() :
52 | alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {}
53 |
54 | betabin::betabin(const double a, const double b) :
55 | alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {}
56 |
57 | betabin::betabin(const string &str)
58 | {
59 | std::istringstream iss(str, std::istringstream::in);
60 | string name;
61 | iss >> name >> alpha >> beta;
62 | if (name != "betabin" || alpha < 0 || beta < 0)
63 | {
64 | cerr << "betabin::betabin: "
65 | << "bad string representation of betabin distribution: "
66 | << str << endl;
67 | throw "bad string representation of betabin distribution";
68 | }
69 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta);
70 | }
71 |
72 |
73 | string
74 | betabin::tostring() const
75 | {
76 | std::ostringstream os;
77 | os << "betabin " << setprecision(4) << alpha << " "
78 | << setprecision(4) << beta;
79 | return os.str();
80 | }
81 |
82 |
83 | double
84 | betabin::operator()(const pair &val) const
85 | {
86 | const size_t x = static_cast(val.first);
87 | const size_t n = static_cast(x + val.second);
88 | return gsl_sf_lnchoose(n, x) +
89 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper;
90 | }
91 |
92 | double
93 | betabin::log_likelihood(const pair &val) const
94 | {
95 | const size_t x = static_cast(val.first);
96 | const size_t n = static_cast(x + val.second);
97 | return gsl_sf_lnchoose(n, x) +
98 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper;
99 | }
100 |
101 | double
102 | betabin::sign(const double x)
103 | {
104 | return (x >= 0) ? 1.0 : -1.0;
105 | }
106 |
107 | double
108 | betabin::invpsi(const double tolerance, const double x)
109 | {
110 | double L = 1.0, Y = std::exp(x);
111 | while (L > tolerance)
112 | {
113 | Y += L*sign(x - gsl_sf_psi(Y));
114 | L /= 2.0;
115 | }
116 | return Y;
117 | }
118 |
119 | double
120 | betabin::movement(const double curr, const double prev)
121 | {
122 | return std::abs(curr - prev)/std::max(std::fabs(curr), std::fabs(prev));
123 | }
124 |
125 | void
126 | betabin::fit(const vector &vals_a, const vector &vals_b,
127 | const vector &p)
128 | {
129 | const double p_total = std::accumulate(p.begin(), p.end(), 0.0);
130 | const double alpha_rhs = inner_product(vals_a.begin(), vals_a.end(),
131 | p.begin(), 0.0)/p_total;
132 | const double beta_rhs = inner_product(vals_b.begin(), vals_b.end(),
133 | p.begin(), 0.0)/p_total;
134 | double prev_alpha = 0.0, prev_beta = 0.0;
135 | alpha = beta = 0.01;
136 | while (movement(alpha, prev_alpha) > tolerance &&
137 | movement(beta, prev_beta) > tolerance)
138 | {
139 | prev_alpha = alpha;
140 | prev_beta = beta;
141 | alpha = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + alpha_rhs);
142 | beta = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + beta_rhs);
143 | }
144 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta);
145 | }
146 |
147 |
--------------------------------------------------------------------------------
/src/common/BetaBin.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2011 University of Southern California
3 | Authors: Andrew D. Smith, Song Qiang
4 |
5 | This file is part of rmap.
6 |
7 | rmap is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation; either version 2 of the License, or
10 | (at your option) any later version.
11 |
12 | rmap is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with rmap; if not, write to the Free Software
19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 | */
21 |
22 | #ifndef BETABIN_HPP
23 | #define BETABIN_HPP
24 |
25 | #include
26 | #include
27 | #include
28 |
29 | // struct betabin;
// Beta-binomial distribution with shape parameters alpha and beta.
struct betabin
{
  // alpha = beta = 1
  betabin();
  betabin(const double a, const double b);
  // parses "betabin <alpha> <beta>"; throws const char * on bad input
  betabin(const std::string &str);

  // log-probability of val = (first, second); both members compute the
  // same quantity
  double operator()(const std::pair<double, double> &val) const;
  double log_likelihood(const std::pair<double, double> &val) const;

  // helpers used by fit()
  double sign(const double x);
  double invpsi(const double tolerance, const double x);
  double movement(const double curr, const double prev);

  // re-estimates alpha and beta from values vals_a/vals_b weighted by p
  void fit(const std::vector<double> &vals_a,
           const std::vector<double> &vals_b,
           const std::vector<double> &p);

  // "betabin <alpha> <beta>"
  std::string tostring() const;

  double alpha;
  double beta;
  double lnbeta_helper;  // cached gsl_sf_lnbeta(alpha, beta)

  static const double tolerance;
};
50 |
51 | #endif
52 |
53 |
--------------------------------------------------------------------------------
/src/common/EmissionDistribution.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2017 University of Southern California
3 | Authors: Andrew D. Smith and Benjamin E. Decato
4 |
5 | This file is part of methpipe.
6 |
7 | methpipe is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation; either version 2 of the License, or
10 | (at your option) any later version.
11 |
12 | methpipe is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with rmap; if not, write to the Free Software
19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 | */
21 |
22 | #include "EmissionDistribution.hpp"
23 |
24 | using std::vector;
25 | using std::pair;
26 | using std::setw;
27 | using std::max;
28 | using std::min;
29 | using std::cerr;
30 | using std::endl;
31 | using std::string;
32 | using std::setprecision;
33 |
34 | EmissionDistribution::EmissionDistribution() :
35 | alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {}
36 |
37 | EmissionDistribution::EmissionDistribution(const double a, const double b) :
38 | alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {}
39 |
40 | EmissionDistribution::EmissionDistribution(const string &str) {
41 | std::istringstream iss(str, std::istringstream::in);
42 | string name;
43 | iss >> name >> alpha >> beta;
44 | if (name != "edtn" || alpha < 0 || beta < 0)
45 | {
46 | cerr << "EmissionDistribution::EmissionDistribution: "
47 | << "bad string representation of emission distribution: "
48 | << str << endl;
49 | throw "bad string representation of emission distribution";
50 | }
51 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta);
52 | }
53 |
54 | EmissionDistribution::~EmissionDistribution() {}
55 |
56 | string
57 | EmissionDistribution::tostring() const {
58 | std::ostringstream os;
59 | os << "Emission dtn params: " << setprecision(4) << alpha << " "
60 | << setprecision(4) << beta;
61 | return os.str();
62 | }
63 |
64 |
65 | double
66 | EmissionDistribution::sign(const double x) {
67 | return (x >= 0) ? 1.0 : -1.0;
68 | }
69 |
70 |
71 | double
72 | EmissionDistribution::invpsi(const double tolerance, const double x) {
73 | double L = 1.0, Y = std::exp(x);
74 | while (L > tolerance)
75 | {
76 | Y += L*sign(x - gsl_sf_psi(Y));
77 | L /= 2.0;
78 | }
79 | return Y;
80 | }
81 |
82 |
83 | double
84 | EmissionDistribution::movement(const double curr, const double prev) {
85 | return std::abs(curr - prev)/std::max(std::fabs(curr), std::fabs(prev));
86 | }
87 |
88 |
89 | void
90 | EmissionDistribution::fit(const vector &vals_a,
91 | const vector &vals_b, const vector &p) {
92 | const double p_total = std::accumulate(p.begin(), p.end(), 0.0);
93 | const double alpha_rhs = inner_product(vals_a.begin(), vals_a.end(),
94 | p.begin(), 0.0)/p_total;
95 | const double beta_rhs = inner_product(vals_b.begin(), vals_b.end(),
96 | p.begin(), 0.0)/p_total;
97 |
98 | double prev_alpha = 0.0, prev_beta = 0.0;
99 | alpha = beta = 0.01;
100 | while (movement(alpha, prev_alpha) > tolerance &&
101 | movement(beta, prev_beta) > tolerance)
102 | {
103 | prev_alpha = alpha;
104 | prev_beta = beta;
105 | alpha = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + alpha_rhs);
106 | beta = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + beta_rhs);
107 | }
108 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta);
109 | }
110 |
111 | Beta::Beta() : EmissionDistribution() {}
112 | Beta::Beta(const double a, const double b) : EmissionDistribution(a,b) {}
113 | Beta::Beta(const std::string &str) : EmissionDistribution(str) {}
114 |
115 | double
116 | Beta::operator()(const pair &val) const {
117 | const double p = val.first/val.second;
118 | return (alpha-1.0)*log(p) + (beta-1.0)*log(1.0-p) - gsl_sf_lnbeta(alpha, beta);
119 | }
120 |
121 | double
122 | Beta::log_likelihood(const pair &val) const {
123 | const double p = val.first/val.second;
124 | return (alpha-1.0)*log(p) + (beta-1.0)*log(1.0-p) - gsl_sf_lnbeta(alpha, beta);
125 | }
126 |
127 | BetaBinomial::BetaBinomial() : EmissionDistribution() {}
128 | BetaBinomial::BetaBinomial(const double a, const double b)
129 | : EmissionDistribution(a,b) {}
130 | BetaBinomial::BetaBinomial(const std::string &str)
131 | : EmissionDistribution(str) {}
132 |
133 | double
134 | BetaBinomial::operator()(const pair &val) const {
135 | const size_t x = static_cast(val.first);
136 | const size_t n = static_cast(x + val.second);
137 | return gsl_sf_lnchoose(n, x) +
138 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper;
139 | }
140 |
141 | double
142 | BetaBinomial::log_likelihood(const pair &val) const {
143 | const size_t x = static_cast(val.first);
144 | const size_t n = static_cast(x + val.second);
145 | return gsl_sf_lnchoose(n, x) +
146 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper;
147 | }
148 |
--------------------------------------------------------------------------------
/src/common/EmissionDistribution.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) 2017 University of Southern California
3 | Authors: Andrew D. Smith and Benjamin E. Decato
4 |
5 | This file is part of methpipe.
6 |
7 | methpipe is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation; either version 2 of the License, or
10 | (at your option) any later version.
11 |
12 | methpipe is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with rmap; if not, write to the Free Software
19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 | */
21 |
22 | #ifndef EM_DTN
23 | #define EM_DTN
24 |
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 | #include
31 | #include
32 | #include
33 | #include
34 | #include
35 |
36 | /** Emission distributions for methylation should be modeled either as
37 | * Beta or Beta Binomial. Since they will be used simultaneously, it is
38 | * helpful to have an abstraction so that we can put them in the same
39 | * container.
40 | */
// Abstract base for two-parameter (alpha, beta) emission distributions.
// Derived classes supply the log-probability of an observation; parameter
// storage, parsing, printing and fitting are shared here.
class EmissionDistribution
{
public:
  // alpha = beta = 1
  EmissionDistribution();
  virtual ~EmissionDistribution();
  EmissionDistribution(const double a, const double b);
  // parses "edtn <alpha> <beta>"; throws const char * on bad input
  EmissionDistribution(const std::string &str);

  // log-probability of val = (first, second); both members are expected to
  // compute the same quantity
  virtual double operator()(const std::pair<double, double> &val) const = 0;
  virtual double log_likelihood(const std::pair<double, double> &val) const = 0;

  std::string tostring() const;
  double getalpha() const { return alpha; }
  double getbeta() const { return beta; }

  // re-estimates alpha and beta from values vals_a/vals_b weighted by p
  void fit(const std::vector<double> &vals_a,
           const std::vector<double> &vals_b,
           const std::vector<double> &p);

protected:
  // helpers used by fit()
  double sign(const double x);
  double invpsi(const double tolerance, const double x);
  double movement(const double curr, const double prev);

  double alpha;
  double beta;
  double lnbeta_helper;  // cached gsl_sf_lnbeta(alpha, beta)

  // convergence threshold used by fit()
  const double tolerance = 1e-10;
};
67 |
68 | class Beta : public EmissionDistribution
69 | {
70 | public:
71 | Beta();
72 | Beta(const double a, const double b);
73 | Beta(const std::string &str);
74 | double operator()(const std::pair &val) const;
75 | double log_likelihood(const std::pair &val) const;
76 | };
77 |
78 | class BetaBinomial : public EmissionDistribution
79 | {
80 | public:
81 | BetaBinomial();
82 | BetaBinomial(const double a, const double b);
83 | BetaBinomial(const std::string &str);
84 | double operator()(const std::pair &val) const;
85 | double log_likelihood(const std::pair &val) const;
86 | };
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/src/common/Epiread.cpp:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2011 University of Southern California and
2 | * Andrew D. Smith and Fang Fang
3 | *
4 | * Authors: Fang Fang and Andrew D. Smith
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | #include
21 | #include
22 | #include
23 |
24 | #include "Epiread.hpp"
25 |
26 | using std::vector;
27 | using std::string;
28 |
29 | size_t
30 | adjust_read_offsets(vector &reads) {
31 | size_t first_read_offset = std::numeric_limits::max();
32 | for (size_t i = 0; i < reads.size(); ++i)
33 | first_read_offset = std::min(reads[i].pos, first_read_offset);
34 | for (size_t i = 0; i < reads.size(); ++i)
35 | reads[i].pos -= first_read_offset;
36 | return first_read_offset;
37 | }
38 |
39 |
40 | size_t
41 | get_n_cpgs(const vector &reads) {
42 | size_t n_cpgs = 0;
43 | for (size_t i = 0; i < reads.size(); ++i)
44 | n_cpgs = std::max(n_cpgs, reads[i].end());
45 | return n_cpgs;
46 | }
47 |
48 | std::istream&
49 | operator>>(std::istream &in, epiread &er) {
50 | string buffer;
51 | if (getline(in, buffer)) {
52 | std::istringstream is(buffer);
53 | if (!(is >> er.chr >> er.pos >> er.seq))
54 | throw std::runtime_error("malformed epiread line:\n" + buffer);
55 | }
56 | return in;
57 | }
58 |
59 |
60 | std::ostream&
61 | operator<<(std::ostream &out, const epiread &er) {
62 | return out << er.chr << '\t' << er.pos << '\t' << er.seq;
63 | }
64 |
--------------------------------------------------------------------------------
/src/common/Epiread.hpp:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2011 University of Southern California and
2 | * Andrew D. Smith and Fang Fang
3 | *
4 | * Authors: Fang Fang and Andrew D. Smith
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | #ifndef EPIREAD
21 | #define EPIREAD
22 |
23 | #include
24 | #include
25 | #include "smithlab_utils.hpp"
26 |
// One read: a chromosome name, a starting position and a sequence string
// whose characters cover positions pos, pos + 1, ...
struct epiread {
  std::string chr;
  size_t pos;
  std::string seq;

  // pos is zero-initialized so a default-constructed read never carries an
  // indeterminate position
  epiread() : pos(0) {}
  epiread(const size_t p, const std::string &s) : pos(p), seq(s) {}
  epiread(const std::string &c, const size_t p, const std::string &s)
    : chr(c), pos(p), seq(s) {}

  // orders by chromosome name, then by position
  bool operator<(const epiread &other) const {
    return (chr < other.chr || (chr == other.chr && pos < other.pos));
  }
  // one past the last position covered by the sequence
  size_t end() const {return pos + seq.length();}
  size_t length() const {return seq.length();}
};
42 |
43 | std::istream& operator>>(std::istream &in, epiread &er);
44 | std::ostream& operator<<(std::ostream &out, const epiread &er);
45 |
46 | size_t
47 | adjust_read_offsets(std::vector &reads);
48 |
49 | size_t
50 | get_n_cpgs(const std::vector &reads);
51 |
52 | #endif
53 |
--------------------------------------------------------------------------------
/src/common/EpireadStats.cpp:
--------------------------------------------------------------------------------
1 | /* Copyright (C) 2014 University of Southern California and
2 | * Andrew D. Smith and Fang Fang and Benjamin Decato
3 | *
4 | * Authors: Fang Fang and Benjamin Decato and Andrew D. Smith
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | */
19 |
20 | #include "EpireadStats.hpp"
21 |
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 |
31 | #include
32 | #include
33 |
34 | using std::string;
35 | using std::vector;
36 | using std::isfinite;
37 |
// Base pseudocount; maximization_step passes half of it to fit_epiallele
// for each of the two epialleles.
static const double PSEUDOCOUNT = 1.0e-10;
39 |
40 | inline bool
41 | is_meth(const epiread &r, const size_t pos) {return (r.seq[pos] == 'C');}
42 |
43 | inline bool
44 | un_meth(const epiread &r, const size_t pos) {return (r.seq[pos] == 'T');}
45 |
46 | double
47 | log_likelihood(const epiread &r, const vector &a) {
48 | double ll = 0.0;
49 | for (size_t i = 0; i < r.seq.length(); ++i)
50 | if (is_meth(r, i) || un_meth(r, i)) {
51 | const double val = (is_meth(r, i) ? a[r.pos + i] : (1.0 - a[r.pos + i]));
52 | assert(isfinite(log(val)));
53 | ll += log(val);
54 | }
55 | return ll;
56 | }
57 |
58 |
59 | double
60 | log_likelihood(const epiread &r, const double mixing,
61 | const vector &a1, const vector &a2) {
62 | return log(mixing*exp(log_likelihood(r, a1)) +
63 | (1.0 - mixing)*exp(log_likelihood(r, a2)));
64 | }
65 |
66 |
67 | double
68 | log_likelihood(const vector &reads, const double mixing,
69 | const vector &a1, const vector &a2) {
70 | double ll = 0.0;
71 | for (size_t i = 0; i < reads.size(); ++i)
72 | ll += log_likelihood(reads[i], mixing, a1, a2);
73 | return ll;
74 | }
75 |
76 |
77 | static double
78 | expectation_step(const vector &reads, const double mixing,
79 | const vector &a1, const vector &a2,
80 | vector &indicators) {
81 | const double log_mixing1 = log(mixing);
82 | const double log_mixing2 = log(1.0 - mixing);
83 | assert(isfinite(log_mixing1) && isfinite(log_mixing2));
84 |
85 | double score = 0;
86 | for (size_t i = 0; i < reads.size(); ++i) {
87 | const double ll1 = log_mixing1 + log_likelihood(reads[i], a1);
88 | const double ll2 = log_mixing2 + log_likelihood(reads[i], a2);
89 | assert(isfinite(ll1) && isfinite(ll2));
90 | const double log_denom = log(exp(ll1) + exp(ll2));
91 | score += log_denom;
92 | indicators[i] = exp(ll1 - log_denom);
93 | assert(isfinite(log_denom) && isfinite(indicators[i]));
94 | }
95 | return score;
96 | }
97 |
98 |
99 | void
100 | fit_epiallele(double pseudo, const vector &reads,
101 | const vector &indicators, vector &a) {
102 | const size_t n_cpgs = a.size();
103 | vector meth(n_cpgs, 0.0), total(n_cpgs, 0.0);
104 | for (size_t i = 0; i < reads.size(); ++i) {
105 | const size_t start = reads[i].pos;
106 | const double weight = indicators[i];
107 | for (size_t j = 0; j < reads[i].seq.length(); ++j)
108 | if (is_meth(reads[i], j) || un_meth(reads[i], j)) {
109 | meth[start + j] += weight*(is_meth(reads[i], j));
110 | total[start + j] += weight;
111 | }
112 | }
113 | for (size_t i = 0; i < n_cpgs; ++i)
114 | a[i] = (meth[i] + pseudo)/(total[i] + 2*pseudo);
115 | }
116 |
117 |
118 | static void
119 | maximization_step(const vector &reads, const vector &indicators,
120 | vector &a1, vector &a2) {
121 |
122 | vector inverted_indicators(indicators);
123 | for (size_t i = 0; i < inverted_indicators.size(); ++i)
124 | inverted_indicators[i] = 1.0 - inverted_indicators[i];
125 |
126 | // Fit the regular model parameters. Since the two epialleles'
127 | // likelihoods are summed, we need to make sure the pseudocount
128 | // is proportional to the pseudocount used in the single allele model.
129 | fit_epiallele(0.5*PSEUDOCOUNT, reads, indicators, a1);
130 | fit_epiallele(0.5*PSEUDOCOUNT, reads, inverted_indicators, a2);
131 | }
132 |
133 |
134 | static void
135 | rescale_indicators(const double mixing, vector