├── .gitignore ├── .gitmodules ├── Makefile ├── Makefile.am ├── README.md ├── configure.ac ├── docs ├── figs │ ├── A-rich-base-composition.eps │ ├── A-rich-base-composition.pdf │ ├── Methpipe_work_flow.eps │ ├── Methpipe_work_flow.pdf │ ├── Methpipe_work_flow.tif │ ├── T-rich-base-composition.eps │ └── T-rich-base-composition.pdf ├── methpipe-manual.bib ├── methpipe-manual.pdf └── methpipe-manual.tex ├── m4 ├── ax_cxx_check_lib.m4 ├── ax_cxx_compile_stdcxx.m4 ├── ax_cxx_compile_stdcxx_11.m4 └── ax_subdirs_configure.m4 └── src ├── Makefile ├── amrfinder ├── Makefile ├── allelicmeth.cpp ├── amrfinder.cpp └── amrtester.cpp ├── analysis ├── Makefile ├── bsrate.cpp ├── bsrate_sam.cpp ├── hmr.cpp ├── hmr_rep.cpp ├── hypermr.cpp ├── levels.cpp ├── methcounts.cpp ├── methentropy.cpp ├── methstates.cpp ├── multimethstat.cpp ├── pmd.cpp └── roimethstat.cpp ├── common-experimental ├── Makefile ├── ModelParams.cxx ├── ModelParams.hpp ├── ThreeStateHDHMM.cpp ├── ThreeStateHDHMM.hpp ├── TwoStateCTHMM.cpp ├── TwoStateCTHMM.hpp ├── contingency-table.cpp ├── contingency-table.hpp ├── false_discovery_rate.cpp ├── false_discovery_rate.hpp ├── nonparametric-test.cpp └── nonparametric-test.hpp ├── common ├── BetaBin.cpp ├── BetaBin.hpp ├── Distro.cpp ├── Distro.hpp ├── EmissionDistribution.cpp ├── EmissionDistribution.hpp ├── Epiread.cpp ├── Epiread.hpp ├── EpireadStats.cpp ├── EpireadStats.hpp ├── LevelsCounter.cpp ├── LevelsCounter.hpp ├── Makefile ├── MethpipeFiles.cpp ├── MethpipeFiles.hpp ├── MethpipeSite.cpp ├── MethpipeSite.hpp ├── Smoothing.cpp ├── Smoothing.hpp ├── ThreeStateHMM.cpp ├── ThreeStateHMM.hpp ├── TwoStateHMM.cpp ├── TwoStateHMM.hpp ├── TwoStateHMM_PMD.cpp ├── TwoStateHMM_PMD.hpp ├── bsutils.cpp ├── bsutils.hpp ├── numerical_utils.cpp └── numerical_utils.hpp ├── experimental ├── Makefile └── dmr-hdhmm.cpp ├── mlml ├── Makefile └── mlml.cpp ├── radmeth ├── LICENSE ├── Makefile ├── README.md ├── dmr.cpp ├── methdiff.cpp ├── radmeth-adjust.cpp ├── radmeth-merge.cpp └── 
radmeth.cpp └── utils ├── Makefile ├── bigWig_to_methcounts.py ├── clean-hairpins.cpp ├── duplicate-remover.cpp ├── duplicate-remover_sam.cpp ├── fast-liftover.cpp ├── format_reads.cpp ├── guessprotocol.cpp ├── lc_approx.cpp ├── lift-filter.cpp ├── merge-bsrate.cpp ├── merge-methcounts.cpp ├── selectsites.cpp └── symmetric-cpgs.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # no .svn directories 2 | .svn/ 3 | bin/ 4 | 5 | # no compiled object file 6 | *.[oa] 7 | *.so 8 | 9 | # no temporary file 10 | *~ 11 | *.out 12 | *.aux 13 | *.log 14 | *.eps 15 | *.tif 16 | *.dvi 17 | 18 | # ignore binary file 19 | src/amrfinder/allelicmeth 20 | src/amrfinder/amrfinder 21 | src/amrfinder/amrtester 22 | src/analysis/amr 23 | src/analysis/bsrate 24 | src/analysis/checkoverlap 25 | src/analysis/clipmates 26 | src/analysis/hmr 27 | src/analysis/hmr_plant 28 | src/analysis/hmr_posterior 29 | src/analysis/hmr_rep 30 | src/analysis/hypermr 31 | src/analysis/levels 32 | src/analysis/mappedstat 33 | src/analysis/merge-bsrate 34 | src/analysis/merge-counts 35 | src/analysis/mergelanes 36 | src/analysis/merge-methcounts 37 | src/analysis/methcounts 38 | src/analysis/methentropy 39 | src/analysis/methstates 40 | src/analysis/mlml 41 | src/analysis/pairedend_stat 42 | src/analysis/pmd 43 | src/analysis/roimethstat 44 | src/analysis/sortreads 45 | src/cytosines/cytosines 46 | src/experimental/dmr-hdhmm 47 | src/mlml/mlml 48 | src/pipeline/build_methylome.py 49 | src/pipeline/run_clipmates.py 50 | src/postmapping/clipmates 51 | src/postmapping/duplicate-remover 52 | src/postmapping/frag2mr 53 | src/postmapping/mask-overlap 54 | src/postmapping/merge 55 | src/postmapping/reorder 56 | src/postmapping/revcomp 57 | src/postmapping/sort 58 | src/postmapping/unique 59 | src/premapping/read-quality-prof 60 | src/premapping/trim-adapter 61 | src/premapping/visireads 62 | src/radmeth/dmr 63 | src/radmeth/make_table 64 | src/radmeth/methdiff 65 | 
src/radmeth/radmeth 66 | src/rmapbs/rmapbs 67 | src/smithlab_cpp/libsmithlab_cpp.so 68 | src/utils/clean-hairpins 69 | src/utils/duplicate-remover 70 | src/utils/fast-liftover 71 | src/utils/fastLiftOver 72 | src/utils/fastLiftOver2 73 | src/utils/lc_approx 74 | src/utils/lift-filter 75 | src/utils/merge-bsrate 76 | src/utils/merge-methcounts 77 | src/utils/selectsites 78 | src/utils/symmetric-cpgs 79 | src/utils/to-mr 80 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/smithlab_cpp"] 2 | path = src/smithlab_cpp 3 | url = ../smithlab_cpp.git 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # This file is part of the methpipe system 2 | # 3 | # Copyright (C) 2010-2014 University of Southern California and 4 | # Andrew D. Smith 5 | # 6 | # Authors: Andrew D. Smith 7 | # 8 | # This program is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation, either version 3 of the License, or 11 | # (at your option) any later version. 12 | # 13 | # This program is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 
20 | # 21 | 22 | METHPIPE_ROOT = $(shell pwd) 23 | 24 | all: 25 | @make -C src METHPIPE_ROOT=$(METHPIPE_ROOT) 26 | 27 | install: 28 | @make -C src METHPIPE_ROOT=$(METHPIPE_ROOT) install 29 | 30 | clean: 31 | @make -C src METHPIPE_ROOT=$(METHPIPE_ROOT) clean 32 | .PHONY: clean 33 | 34 | distclean: clean 35 | @rm -rf $(METHPIPE_ROOT)/bin 36 | .PHONY: distclean 37 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # This file is part of methpipe 2 | # 3 | # Copyright (C) 2010-2019: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
16 | 17 | EXTRA_DIST = README.md 18 | ACLOCAL_AMFLAGS = -I m4 19 | 20 | SUBDIRS := src/smithlab_cpp 21 | install installdirs: SUBDIRS := $(filter-out src/smithlab_cpp, $(SUBDIRS)) 22 | AM_CPPFLAGS = -I $(top_srcdir)/src/common -I $(top_srcdir)/src/smithlab_cpp 23 | 24 | CXXFLAGS = -O3 # default has optimization on 25 | 26 | noinst_LIBRARIES = libmethpipe.a 27 | libmethpipe_a_SOURCES = \ 28 | src/common/BetaBin.cpp \ 29 | src/common/Distro.cpp \ 30 | src/common/EmissionDistribution.cpp \ 31 | src/common/Epiread.cpp \ 32 | src/common/EpireadStats.cpp \ 33 | src/common/LevelsCounter.cpp \ 34 | src/common/MethpipeSite.cpp \ 35 | src/common/Smoothing.cpp \ 36 | src/common/ThreeStateHMM.cpp \ 37 | src/common/TwoStateHMM.cpp \ 38 | src/common/TwoStateHMM_PMD.cpp \ 39 | src/common/bsutils.cpp \ 40 | src/common/numerical_utils.cpp 41 | 42 | libmethpipe_a_SOURCES += \ 43 | src/common/BetaBin.hpp \ 44 | src/common/Distro.hpp \ 45 | src/common/EmissionDistribution.hpp \ 46 | src/common/Epiread.hpp \ 47 | src/common/EpireadStats.hpp \ 48 | src/common/LevelsCounter.hpp \ 49 | src/common/MethpipeFiles.hpp \ 50 | src/common/MethpipeSite.hpp \ 51 | src/common/Smoothing.hpp \ 52 | src/common/ThreeStateHMM.hpp \ 53 | src/common/TwoStateHMM.hpp \ 54 | src/common/TwoStateHMM_PMD.hpp \ 55 | src/common/bsutils.hpp \ 56 | src/common/numerical_utils.hpp 57 | 58 | LDADD = libmethpipe.a src/smithlab_cpp/libsmithlab_cpp.a 59 | 60 | ### ANALYSIS SUBDIR 61 | bin_PROGRAMS = pmd methcounts bsrate hmr hypermr levels roimethstat \ 62 | methstates methentropy hmr_rep multimethstat 63 | 64 | pmd_SOURCES = src/analysis/pmd.cpp 65 | methstates_SOURCES = src/analysis/methstates.cpp 66 | bsrate_SOURCES = src/analysis/bsrate.cpp 67 | methentropy_SOURCES = src/analysis/methentropy.cpp 68 | methcounts_SOURCES = src/analysis/methcounts.cpp 69 | roimethstat_SOURCES = src/analysis/roimethstat.cpp 70 | multimethstat_SOURCES = src/analysis/multimethstat.cpp 71 | hmr_SOURCES = src/analysis/hmr.cpp 72 | 
hmr_rep_SOURCES = src/analysis/hmr_rep.cpp 73 | levels_SOURCES = src/analysis/levels.cpp 74 | hypermr_SOURCES = src/analysis/hypermr.cpp 75 | 76 | ### UTILS SUBDIR 77 | bin_PROGRAMS += lc_approx fast-liftover lift-filter merge-bsrate \ 78 | merge-methcounts duplicate-remover symmetric-cpgs \ 79 | clean-hairpins selectsites guessprotocol format_reads 80 | 81 | clean_hairpins_SOURCES = src/utils/clean-hairpins.cpp 82 | guessprotocol_SOURCES = src/utils/guessprotocol.cpp 83 | duplicate_remover_SOURCES = src/utils/duplicate-remover.cpp 84 | merge_bsrate_SOURCES = src/utils/merge-bsrate.cpp 85 | format_reads_SOURCES = src/utils/format_reads.cpp 86 | lc_approx_SOURCES = src/utils/lc_approx.cpp 87 | selectsites_SOURCES = src/utils/selectsites.cpp 88 | symmetric_cpgs_SOURCES = src/utils/symmetric-cpgs.cpp 89 | merge_methcounts_SOURCES = src/utils/merge-methcounts.cpp 90 | lift_filter_SOURCES = src/utils/lift-filter.cpp 91 | fast_liftover_SOURCES = src/utils/fast-liftover.cpp 92 | 93 | ### AMRFINDER SUBDIR 94 | bin_PROGRAMS += allelicmeth amrfinder amrtester 95 | 96 | allelicmeth_SOURCES = src/amrfinder/allelicmeth.cpp 97 | amrfinder_SOURCES = src/amrfinder/amrfinder.cpp 98 | amrtester_SOURCES = src/amrfinder/amrtester.cpp 99 | 100 | ### RADMETH SUBDIR 101 | bin_PROGRAMS += radmeth radmeth-adjust radmeth-merge methdiff dmr 102 | 103 | dmr_SOURCES = src/radmeth/dmr.cpp 104 | methdiff_SOURCES = src/radmeth/methdiff.cpp 105 | radmeth_SOURCES = src/radmeth/radmeth.cpp 106 | radmeth_adjust_SOURCES = src/radmeth/radmeth-adjust.cpp 107 | radmeth_merge_SOURCES = src/radmeth/radmeth-merge.cpp 108 | 109 | ### MLML SUBDIR 110 | bin_PROGRAMS += mlml 111 | 112 | mlml_SOURCES = src/mlml/mlml.cpp 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | NOTICE: MethPipe is no longer maintained in this repository 2 | 
============================================================ 3 | 4 | MethPipe is now [DNMTools](https://github.com/smithlabcode/dnmtools). The 5 | MethPipe repository will remain open for issues and discussion, but further releases, 6 | updates and fixes will no longer be provided on this page. We strongly 7 | recommend that users replace their existing release of MethPipe with the 8 | most recent version of DNMTools, which contains all existing MethPipe programs 9 | along with various fixes, improvements and novel programs for bisulfite sequencing 10 | data analysis. 11 | 12 | [Visit the DNMTools repository](https://github.com/smithlabcode/dnmtools) 13 | 14 | ------------------------------------------------------------------ 15 | 16 | The MethPipe software package is a computational pipeline for 17 | analyzing bisulfite sequencing data (WGBS and RRBS). MethPipe provides 18 | tools for methylation-specific technical evaluation of sequencing data, 19 | and for estimating methylation levels at individual cytosines. 20 | Additionally, MethPipe includes tools for identifying higher-level 21 | methylation features, such as hypo-methylated regions (HMR), partially 22 | methylated domains (PMD), hyper-methylated regions (HyperMR), and 23 | allele-specific methylated regions (AMR). 24 | 25 | Release 5.0.1 26 | =================== 27 | 28 | This new release no longer supports `mr` files, which means that the 29 | `to-mr` program has been eliminated and replaced by a program called 30 | `format_reads`, which merges mates in paired-end SAM files, also 31 | converting them to a standardized SAM format depending on the mapper 32 | it originates from. Additionally, the `htslib` library is now 33 | required, and instructions to install it in different environments are 34 | discussed below. 
35 | 36 | If working with MR files is necessary for your analysis, we refer 37 | users to methpipe version [5.0.1](https://github.com/smithlabcode/methpipe/releases/tag/v5.0.1), 38 | which is the last release that contains programs that take MR as 39 | input. 40 | 41 | ## Installing release 5.0.1 42 | 43 | ### Required libraries 44 | 45 | * A recent compiler: most users will be building and installing this 46 | software with GCC. We require a compiler that fully supports C++11, 47 | so we recommend using at least GCC 5.8. There are still many systems 48 | that install a very old version of GCC by default, so if you have 49 | problems with building this software, that might be the first thing 50 | to check. 51 | * The GNU Scientific Library: this has always been required. It can be 52 | installed using `apt` on Linux, using `brew` on macOS, or from 53 | source available [here](http://www.gnu.org/software/gsl). 54 | * The Zlib compression library. Most likely you already have this 55 | installed on your system. If not, it can be installed using `apt` 56 | on Linux through the package `zlib1g-dev`. On macOS, Zlib can be 57 | installed with `brew`. 58 | * The HTSlib library, which can be installed through `brew` 59 | on macOS, through `apt` on Linux, or from source downloadable 60 | [here](https://github.com/samtools/htslib). 61 | 62 | ### Configuration 63 | 64 | 1. Download methpipe-5.0.1.tar.gz [here](https://github.com/smithlabcode/methpipe/releases/download/v5.0.1/methpipe-5.0.1.tar.gz). 65 | 2. Unpack the archive: 66 | ``` 67 | $ tar -zxvf methpipe-5.0.1.tar.gz 68 | ``` 69 | 3. Move into the methpipe directory and create a build directory: 70 | ``` 71 | $ cd methpipe-5.0.1 72 | $ mkdir build && cd build 73 | ``` 74 | 4. 
Run the configuration script: 75 | ``` 76 | $ ../configure 77 | ``` 78 | If you do not want to install methpipe system-wide, or if you do 79 | not have admin privileges, specify a prefix directory: 80 | ``` 81 | $ ../configure --prefix=/some/reasonable/place 82 | ``` 83 | If you installed HTSlib yourself in some non-standard directory, 84 | you must specify the location like this: 85 | ``` 86 | $ ../configure CPPFLAGS='-I /path/to/htslib/headers' \ 87 | LDFLAGS='-L/path/to/htslib/lib' 88 | ``` 89 | 90 | ### Building and installing the tools 91 | 92 | If you are still in the `build` directory, run `make` to compile the 93 | tools, and then `make install` to install them. If your HTSlib is not 94 | installed system-wide, then you might need to update your library 95 | path: 96 | ``` 97 | $ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/path/to/htslib/lib 98 | ``` 99 | 100 | ### Building and installing from source 101 | 102 | We strongly recommend using methpipe through the latest stable release 103 | under the releases section on GitHub. However, developers who wish to 104 | work on the latest commits, which are potentially unstable, can 105 | compile the cloned repository using the `Makefile` available in the 106 | repository. If HTSLib is available system-wide, compile by running 107 | ``` 108 | make 109 | ``` 110 | 111 | Usage 112 | ===== 113 | 114 | Read methpipe-manual.pdf in the docs directory. 115 | 116 | Contacts and bug reports 117 | ======================== 118 | 119 | Andrew D. Smith 120 | andrewds@usc.edu 121 | 122 | Ben Decato 123 | decato@usc.edu 124 | 125 | Meng Zhou 126 | mengzhou@usc.edu 127 | 128 | MethPipe and MethBase Users' Mailinglist 129 | methpipe@googlegroups.com 130 | http://groups.google.com/group/methpipe 131 | 132 | Copyright and License Information 133 | ================================= 134 | 135 | Copyright (C) 2018-2021 136 | University of Southern California, 137 | Andrew D. Smith 138 | 139 | Current Authors: Andrew D. 
Smith, Ben Decato, Meng Zhou, Liz Ji, 140 | Terence Li, Guilherme de Sena Brandine 141 | 142 | This is free software: you can redistribute it and/or modify it under 143 | the terms of the GNU General Public License as published by the Free 144 | Software Foundation, either version 3 of the License, or (at your 145 | option) any later version. 146 | 147 | This software is distributed in the hope that it will be useful, but 148 | WITHOUT ANY WARRANTY; without even the implied warranty of 149 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 150 | General Public License for more details. 151 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | dnl This file is part of methpipe 2 | dnl 3 | dnl Copyright (C) 2019: Andrew D. Smith 4 | dnl 5 | dnl Authors: Andrew D. Smith 6 | dnl 7 | dnl This is free software: you can redistribute it and/or modify it 8 | dnl under the terms of the GNU General Public License as published by 9 | dnl the Free Software Foundation, either version 3 of the License, or 10 | dnl (at your option) any later version. 11 | dnl 12 | dnl This software is distributed in the hope that it will be useful, 13 | dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | dnl General Public License for more details. 16 | 17 | AC_CONFIG_SUBDIRS([src/smithlab_cpp]) 18 | AC_INIT([methpipe], [5.0.1], [andrewds@usc.edu], 19 | [methpipe], [https://github.com/smithlabcode/methpipe]) 20 | dnl the config.h is not currently #included in the source, and only 21 | dnl used to keep command lines short. 
22 | AC_CONFIG_HEADERS([config.h]) 23 | AM_INIT_AUTOMAKE([subdir-objects foreign]) 24 | 25 | 26 | AC_CONFIG_MACRO_DIR([m4]) 27 | AC_LANG(C++) 28 | AC_PROG_CXX 29 | AX_CXX_COMPILE_STDCXX_11([noext], [mandatory]) 30 | AC_PROG_RANLIB 31 | 32 | dnl recursively configure smithlab_cpp 33 | AX_SUBDIRS_CONFIGURE([src/smithlab_cpp], [--enable-hts]) 34 | 35 | dnl check for HTSLib if requested 36 | hts_fail_msg=" 37 | Failed to locate HTSLib on your system. Please use the LDFLAGS and 38 | CPPFLAGS variables to specify the directories where the HTSLib library 39 | and headers can be found. 40 | " 41 | 42 | gsl_fail_msg=" 43 | Failed to locate the GNU Scientific Library (GSL) on your system. Please use 44 | the LDFLAGS and CPPFLAGS variables to specify the directories where the GSL 45 | library and headers can be found. 46 | " 47 | 48 | zlib_fail_msg=" 49 | Failed to locate the ZLib on your system. Please use the LDFLAGS and CPPFLAGS 50 | variables to specify the directories where the ZLib library and headers can be 51 | found. 
52 | " 53 | 54 | dnl check for required libraries 55 | AC_SEARCH_LIBS([hts_version], [hts], [], [AC_MSG_FAILURE([$hts_fail_msg])]) 56 | AC_SEARCH_LIBS([zlibVersion], [z], [], [AC_MSG_FAILURE([$zlib_fail_msg])]) 57 | AC_SEARCH_LIBS([cblas_dgemm], [gslcblas], [], [AC_MSG_FAILURE([$gsl_fail_msg])]) 58 | AC_SEARCH_LIBS([gsl_blas_dgemm], [gsl], [], [AC_MSG_FAILURE([$gsl_fail_msg])]) 59 | 60 | AC_CONFIG_FILES([ 61 | Makefile 62 | ]) 63 | AC_OUTPUT 64 | -------------------------------------------------------------------------------- /docs/figs/A-rich-base-composition.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/A-rich-base-composition.pdf -------------------------------------------------------------------------------- /docs/figs/Methpipe_work_flow.eps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/Methpipe_work_flow.eps -------------------------------------------------------------------------------- /docs/figs/Methpipe_work_flow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/Methpipe_work_flow.pdf -------------------------------------------------------------------------------- /docs/figs/Methpipe_work_flow.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/Methpipe_work_flow.tif -------------------------------------------------------------------------------- /docs/figs/T-rich-base-composition.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/figs/T-rich-base-composition.pdf -------------------------------------------------------------------------------- /docs/methpipe-manual.bib: -------------------------------------------------------------------------------- 1 | 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | % Internal citations 4 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 5 | 6 | @article{song2013reference, 7 | title={A reference methylome database and analysis pipeline to facilitate integrative and comparative epigenomics}, 8 | author={Song, Qiang and Decato, Benjamin and Hong, Elizabeth E and Zhou, Meng and Fang, Fang and Qu, Jianghan and Garvin, Tyler and Kessler, Michael and Zhou, Jun and Smith, Andrew D}, 9 | journal={PloS one}, 10 | volume={8}, 11 | number={12}, 12 | pages={e81148}, 13 | year={2013}, 14 | publisher={Public Library of Science} 15 | } 16 | 17 | @article{fang2012genomic, 18 | title={Genomic landscape of human allele-specific DNA methylation}, 19 | author={Fang, Fang and Hodges, Emily and Molaro, Antoine and Dean, Matthew and Hannon, Gregory J and Smith, Andrew D}, 20 | journal={Proceedings of the National Academy of Sciences}, 21 | volume={109}, 22 | number={19}, 23 | pages={7332--7337}, 24 | year={2012}, 25 | publisher={National Acad Sciences} 26 | } 27 | 28 | @article{qu2013mlml, 29 | title={MLML: Consistent simultaneous estimates of DNA methylation and hydroxymethylation}, 30 | author={Qu, Jianghan and Zhou, Meng and Song, Qiang and Hong, Elizabeth E and Smith, Andrew D}, 31 | journal={Bioinformatics}, 32 | volume={29}, 33 | number={20}, 34 | pages={2645--2646}, 35 | year={2013} 36 | } 37 | 38 | @article{dolzhenko2014using, 39 | title={Using beta-binomial regression for high-precision differential methylation analysis in multifactor whole-genome bisulfite sequencing experiments}, 40 | author={Dolzhenko, Egor and Smith, Andrew D}, 
41 | journal={BMC bioinformatics}, 42 | volume={15}, 43 | number={1}, 44 | pages={1--8}, 45 | year={2014}, 46 | publisher={BioMed Central} 47 | } 48 | 49 | @article{decato2020characterization, 50 | title={Characterization of universal features of partially methylated domains across tissues and species}, 51 | author={Decato, Benjamin E and Qu, Jianghan and Ji, Xiaojing and Wagenblast, Elvin and Knott, Simon RV and Hannon, Gregory J and Smith, Andrew D}, 52 | journal={Epigenetics \& Chromatin}, 53 | year={2020}, 54 | publisher={BioMed Central} 55 | } 56 | 57 | 58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | % External citations 60 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 61 | 62 | @article{altham1969exact, 63 | title={Exact Bayesian analysis of a 2$\times$ 2 contingency table, and Fisher's" exact" significance test}, 64 | author={Altham, Patricia ME}, 65 | journal={Journal of the Royal Statistical Society. Series B (Methodological)}, 66 | pages={261--269}, 67 | year={1969}, 68 | publisher={JSTOR} 69 | } 70 | 71 | 72 | @article{xie2011, 73 | author={ Xie, Hehuang and Wang, Min and Andrade, Alexandre de and Bonaldo, Maria de F. and Galat, Vasil and Arndt, Kelly and Rajaram, Veena and Goldman, Stewart and Tomita, Tadanori and Soares,Marcelo B.}, 74 | title = {Genome-wide quantitative assessment of variation in DNA methylation patterns}, 75 | journal = {Nucl. 
Acids Res.}, 76 | volume = {39}, 77 | number ={10}, 78 | pages ={4099 -- 4108}, 79 | year= {2011} 80 | } 81 | -------------------------------------------------------------------------------- /docs/methpipe-manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/methpipe/05d53ce60e7a514fd30482b25eeba041bfc1e25c/docs/methpipe-manual.pdf -------------------------------------------------------------------------------- /m4/ax_cxx_check_lib.m4: -------------------------------------------------------------------------------- 1 | dnl @synopsis AX_CXX_CHECK_LIB(libname, functionname, action-if, action-if-not) 2 | dnl 3 | dnl The standard AC_CHECK_LIB can not test functions in namespaces. 4 | dnl Therefore AC_CHECK_LIB(cgicc, cgicc::Cgicc::getVersion) will always 5 | dnl fail. We need to decompose the functionname into a series of namespaces 6 | dnl where it gets declared so that it can be used for a link test. 7 | dnl 8 | dnl In the first version I did allow namespace::functionname to be a 9 | dnl reference to a void-argument global functionname (just wrapped in a 10 | dnl namespace) like its C counterparts would be - but in reality such 11 | dnl thing does not exist. The only global / static functions are always 12 | dnl made const-functions which is an attribute mangled along into the 13 | dnl library function export name. 14 | dnl 15 | dnl The normal usage will ask for a test of a class-member function which 16 | dnl should be presented with a full function spec with arguments given in 17 | dnl parentheses following the function name - if the function to test for 18 | dnl does expect arguments then you should add default initial values in the 19 | dnl prototype (even if they do not exist originally, these are used only 20 | dnl locally to build a correct function call in the configure test script). 
21 | dnl 22 | dnl In the current version if you do omit the parenthesis from the macro 23 | dnl argument then the macro will assume that you want to check for the 24 | dnl class name - which is really to check for default constructor being 25 | dnl exported from the given library name. 26 | dnl 27 | dnl EXAMPLE: 28 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::HTTPCookie]) 29 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::Cgicc::getVersion () const]) 30 | dnl AX_CXX_CHECK_LIB(boost_regex, [boost::RegEx::Position (int i = 0) const]) 31 | dnl 32 | dnl Result: 33 | dnl Just as the usual AC_CHECK_LIB - defines HAVE_LIBCGICC 34 | dnl and adds the libraries to the default library path (and 35 | dnl uses internally the normal ac_check_lib cache symbol 36 | dnl like ac_cv_lib_cgicc_cgicc__Cgicc) 37 | dnl 38 | dnl Footnote: The C++ language is not good at creating stable library 39 | dnl interfaces at the binary level - a lot of functionality is usually being 40 | dnl given as inline functions plus there is hardly a chance to create opaque 41 | dnl types. Therefore most C++ library tests will only do compile tests using 42 | dnl the header files. Doing a check_lib is however good to check the link 43 | dnl dependency before hitting it as an error in the build later. 44 | dnl 45 | dnl @category C++ 46 | dnl @author Guido U. 
Draheim 47 | dnl @vesion 2006-12-18 48 | 49 | AC_DEFUN([AX_CXX_CHECK_LIB], 50 | [m4_ifval([$3], , [AH_CHECK_LIB([$1])])dnl 51 | AS_LITERAL_IF([$1], 52 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1_$2])], 53 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1''_$2])])dnl 54 | AC_CACHE_CHECK([for $2 in -l$1], ac_Lib, 55 | [ac_check_lib_save_LIBS=$LIBS 56 | LIBS="-l$1 $5 $LIBS" 57 | case "$2" 58 | in *::*::*\(*) 59 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 60 | namespace `echo "$2" | sed -e "s/::.*//"` 61 | { class `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/" -e "s/(.*//"` 62 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 63 | }; 64 | } 65 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 66 | [AS_VAR_SET(ac_Lib, yes)], 67 | [AS_VAR_SET(ac_Lib, no)]) 68 | ;; *::*::*) 69 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 70 | namespace `echo "$2" | sed -e "s/::.*//"` 71 | { namespace `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/"` 72 | { class `echo "$2" | sed -e "s/.*:://"` 73 | { public: `echo "$2" | sed -e "s/.*:://"` (); 74 | }; 75 | } 76 | } 77 | ],[new $2()])], 78 | [AS_VAR_SET(ac_Lib, yes)], 79 | [AS_VAR_SET(ac_Lib, no)]) 80 | ;; *::*\(*) 81 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 82 | class `echo "$2" | sed -e "s/\\(.*\\)::.*/\\1/" -e "s/(.*//"` 83 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 84 | }; 85 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 86 | [AS_VAR_SET(ac_Lib, yes)], 87 | [AS_VAR_SET(ac_Lib, no)]) 88 | ;; *::*) 89 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 90 | namespace `echo "$2" | sed -e "s/::.*//"` 91 | { class `echo "$2" | sed -e "s/.*:://"` 92 | { public: `echo "$2" | sed -e "s/.*:://"` (); 93 | }; 94 | } 95 | ],[new $2()])], 96 | [AS_VAR_SET(ac_Lib, yes)], 97 | [AS_VAR_SET(ac_Lib, no)]) 98 | ;; *) 99 | AC_LINK_IFELSE([AC_LANG_CALL([], [$2])], 100 | [AS_VAR_SET(ac_Lib, yes)], 101 | [AS_VAR_SET(ac_Lib, no)]) 102 | ;; esac 103 | LIBS=$ac_check_lib_save_LIBS]) 104 | 
AS_IF([test AS_VAR_GET(ac_Lib) = yes], 105 | [m4_default([$3], [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_LIB$1)) 106 | LIBS="-l$1 $LIBS" 107 | ])], 108 | [$4])dnl 109 | AS_VAR_POPDEF([ac_Lib])dnl 110 | ])# AC_CHECK_LIB 111 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx_11.m4: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html 3 | # ============================================================================= 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX_11([ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the C++11 12 | # standard; if necessary, add switches to CXX and CXXCPP to enable 13 | # support. 14 | # 15 | # This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX 16 | # macro with the version set to C++11. The two optional arguments are 17 | # forwarded literally as the second and third argument respectively. 18 | # Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for 19 | # more information. If you want to use this macro, you also need to 20 | # download the ax_cxx_compile_stdcxx.m4 file. 21 | # 22 | # LICENSE 23 | # 24 | # Copyright (c) 2008 Benjamin Kosnik 25 | # Copyright (c) 2012 Zack Weinberg 26 | # Copyright (c) 2013 Roy Stogner 27 | # Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov 28 | # Copyright (c) 2015 Paul Norman 29 | # Copyright (c) 2015 Moritz Klammler 30 | # 31 | # Copying and distribution of this file, with or without modification, are 32 | # permitted in any medium without royalty provided the copyright notice 33 | # and this notice are preserved. This file is offered as-is, without any 34 | # warranty. 
35 | 36 | #serial 18 37 | 38 | AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX]) 39 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [AX_CXX_COMPILE_STDCXX([11], [$1], [$2])]) 40 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile from methpipe software 2 | # 3 | # Copyright (C) 2010-2014 University of Southern California and 4 | # Andrew D. Smith 5 | # 6 | # Authors: Andrew D. Smith 7 | # 8 | # This program is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation, either version 3 of the License, or 11 | # (at your option) any later version. 12 | # 13 | # This program is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 20 | # 21 | 22 | # check if recursive clone was done 23 | SMITHLAB_CPP=$(abspath $(dir $(MAKEFILE_LIST)))/smithlab_cpp 24 | 25 | ifeq (,$(wildcard $(SMITHLAB_CPP)/Makefile)) 26 | $(error src/smithlab_cpp does not have a Makefile. \ 27 | Did you use --recursive when running git clone?) 
28 | endif 29 | 30 | all_subdirs=common utils analysis amrfinder mlml radmeth 31 | lib_subdirs=common 32 | app_subdirs=analysis utils amrfinder mlml radmeth 33 | 34 | all_subdirs += $(SMITHLAB_CPP) 35 | lib_subdirs += $(SMITHLAB_CPP) 36 | 37 | all: 38 | make -C $(SMITHLAB_CPP) HAVE_HTSLIB=1 all 39 | @for i in $(app_subdirs); do \ 40 | make -C $${i} \ 41 | SMITHLAB_CPP=$(SMITHLAB_CPP) \ 42 | SRC_ROOT=$(METHPIPE_ROOT); \ 43 | done; 44 | 45 | install: 46 | @for i in $(app_subdirs); do \ 47 | make -C $${i} \ 48 | SMITHLAB_CPP=$(SMITHLAB_CPP) \ 49 | SRC_ROOT=$(METHPIPE_ROOT) install; \ 50 | done; 51 | 52 | clean: 53 | make -C $(SMITHLAB_CPP) clean 54 | @for i in $(all_subdirs); do \ 55 | make -C $${i} \ 56 | SMITHLAB_CPP=$(SMITHLAB_CPP) \ 57 | SRC_ROOT=$(METHPIPE_ROOT) clean; \ 58 | done; 59 | -------------------------------------------------------------------------------- /src/amrfinder/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2014 University of Southern California 2 | # and Andrew D. Smith and Benjamin E. Decato 3 | # 4 | # Authors: Andrew D. Smith and Benjamin E. Decato 5 | # 6 | # This is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This software is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 
15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this software; if not, write to the Free Software 18 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 19 | # 02110-1301 USA 20 | # 21 |

ifndef SMITHLAB_CPP
$(error SMITHLAB_CPP variable undefined)
endif

# Install prefix; same default as src/analysis/Makefile so that a stand-alone
# "make install" does not expand $(SRC_ROOT)/bin to /bin.
ifndef SRC_ROOT
SRC_ROOT=../..
endif

PROGS = allelicmeth amrfinder amrtester

CXX = g++
CXXFLAGS = -Wall -std=c++11
OPTFLAGS = -O3
DEBUGFLAGS = -g

ifdef DEBUG
CXXFLAGS += $(DEBUGFLAGS)
else
CXXFLAGS += $(OPTFLAGS)
endif

COMMON_DIR = ../common
INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR)
INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS))
override CPPFLAGS += $(INCLUDEARGS)

LDLIBS = -lgsl -lgslcblas -lz

all: $(PROGS)

install: $(PROGS)
	@mkdir -p $(SRC_ROOT)/bin
	@install -m 755 $(PROGS) $(SRC_ROOT)/bin

# every program links against the smithlab_cpp static library
$(PROGS): $(addprefix $(SMITHLAB_CPP)/, libsmithlab_cpp.a)

amrfinder: $(addprefix $(COMMON_DIR)/, EpireadStats.o Epiread.o)

amrtester: $(addprefix $(COMMON_DIR)/, EpireadStats.o Epiread.o)

allelicmeth: $(addprefix $(COMMON_DIR)/, Epiread.o)

%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS)

%: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS) $(LDFLAGS)

clean:
	@-rm -f $(PROGS) *.o *.so *.a *~

# none of these targets names a file that the rules create
.PHONY: all install clean

-------------------------------------------------------------------------------- /src/analysis/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2014 University of Southern California 2 | # and Andrew D. Smith and Benjamin E. Decato 3 | # 4 | # Authors: Andrew D. Smith and Benjamin E. 
Decato 5 | # 6 | # This is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This software is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this software; if not, write to the Free Software 18 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 19 | # 02110-1301 USA 20 |

PROGS = pmd methcounts bsrate hmr hypermr levels roimethstat \
	methstates methentropy hmr_rep multimethstat

CXX = g++
CXXFLAGS = -Wall -std=c++11
OPTFLAGS = -O3
DEBUGFLAGS = -g

# install prefix used when the caller does not supply one
ifndef SRC_ROOT
SRC_ROOT=../..
endif

ifndef SMITHLAB_CPP
$(error SMITHLAB_CPP variable undefined)
endif

ifdef DEBUG
CXXFLAGS += $(DEBUGFLAGS)
else
CXXFLAGS += $(OPTFLAGS)
endif

COMMON_DIR = ../common
INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR)
INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS))
override CPPFLAGS += $(INCLUDEARGS)

LDLIBS = -lgsl -lgslcblas -lz -lhts

all: $(PROGS)

install: $(PROGS)
	@mkdir -p $(SRC_ROOT)/bin
	@install -m 755 $(PROGS) $(SRC_ROOT)/bin

# every program links against the smithlab_cpp static library
$(PROGS): $(addprefix $(SMITHLAB_CPP)/, libsmithlab_cpp.a)

# methcounts is part of this group, so it needs no separate MethpipeSite.o rule
levels hmr_rep hmr methcounts roimethstat hypermr pmd: \
	$(addprefix $(COMMON_DIR)/, MethpipeSite.o)

hmr hmr_rep: $(addprefix $(COMMON_DIR)/, TwoStateHMM.o)

pmd: $(addprefix $(COMMON_DIR)/, bsutils.o \
	TwoStateHMM_PMD.o EmissionDistribution.o)

hypermr: $(addprefix $(COMMON_DIR)/, ThreeStateHMM.o Smoothing.o \
	Distro.o BetaBin.o numerical_utils.o)

levels: $(addprefix $(COMMON_DIR)/, LevelsCounter.o)

roimethstat levels: $(addprefix $(COMMON_DIR)/, bsutils.o)

%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS)

%: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS) $(LDFLAGS)

clean:
	@-rm -f $(PROGS) *.o *.so *.a *~

# none of these targets names a file that the rules create
.PHONY: all install clean

-------------------------------------------------------------------------------- /src/analysis/levels.cpp: -------------------------------------------------------------------------------- 1 | /* levels: a program to compute coverage statistics, mutation rates, 2 | * and three different formulas for methylation levels described in 3 | * the paper: 4 | * 5 | * 'Leveling' the playing field for analyses of single-base 6 | * resolution DNA methylomes 7 | * Schultz, Schmitz & Ecker (TIG 2012) 8 | * 9 | * Note: the fractional 
methylation level calculated in this program 10 | * is inspired by, but different from, the paper. What we are doing here is 11 | * using a binomial test to determine significantly hyper/hypomethylated 12 | * sites, and only using this subset of sites to calculate the methylation 13 | * level. 14 | * 15 | * Copyright (C) 2014-2015 University of Southern California and 16 | * Andrew D. Smith and Benjamin E Decato 17 | * 18 | * Authors: Andrew D. Smith and Benjamin E Decato 19 | * 20 | * This program is free software: you can redistribute it and/or modify 21 | * it under the terms of the GNU General Public License as published by 22 | * the Free Software Foundation, either version 3 of the License, or 23 | * (at your option) any later version. 24 | * 25 | * This program is distributed in the hope that it will be useful, 26 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 27 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 28 | * GNU General Public License for more details. 
29 | */ 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "OptionParser.hpp" 40 | #include "smithlab_utils.hpp" 41 | #include "smithlab_os.hpp" 42 | #include "MethpipeSite.hpp" 43 | #include "LevelsCounter.hpp" 44 | #include "zlib_wrapper.hpp" 45 | #include "bsutils.hpp" 46 | 47 | using std::string; 48 | using std::vector; 49 | using std::cout; 50 | using std::cerr; 51 | using std::endl; 52 | using std::to_string; 53 | using std::runtime_error; 54 | 55 | 56 | int 57 | main(int argc, const char **argv) { 58 | 59 | try { 60 | 61 | bool VERBOSE = false; 62 | string outfile; 63 | 64 | /****************** COMMAND LINE OPTIONS ********************/ 65 | OptionParser opt_parse(strip_path(argv[0]), "compute methylation levels", 66 | ""); 67 | opt_parse.add_opt("output", 'o', "output file (default: stdout)", 68 | false, outfile); 69 | opt_parse.add_opt("alpha", 'a', "alpha for confidence interval", 70 | false, LevelsCounter::alpha); 71 | opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); 72 | vector leftover_args; 73 | opt_parse.parse(argc, argv, leftover_args); 74 | if (opt_parse.help_requested()) { 75 | cerr << opt_parse.help_message() << endl 76 | << opt_parse.about_message() << endl; 77 | return EXIT_SUCCESS; 78 | } 79 | if (opt_parse.about_requested()) { 80 | cerr << opt_parse.about_message() << endl; 81 | return EXIT_SUCCESS; 82 | } 83 | if (opt_parse.option_missing()) { 84 | cerr << opt_parse.option_missing_message() << endl; 85 | return EXIT_SUCCESS; 86 | } 87 | if (leftover_args.size() != 1) { 88 | cerr << opt_parse.help_message() << endl; 89 | return EXIT_SUCCESS; 90 | } 91 | const string meth_file = leftover_args.front(); 92 | /****************** END COMMAND LINE OPTIONS *****************/ 93 | 94 | igzfstream in(meth_file); 95 | if (!in) 96 | throw std::runtime_error("bad input file: " + meth_file); 97 | 98 | LevelsCounter cpg("cpg"); 99 | LevelsCounter 
cpg_symmetric("cpg_symmetric"); 100 | LevelsCounter chh("chh"); 101 | LevelsCounter cxg("cxg"); 102 | LevelsCounter ccg("ccg"); 103 | LevelsCounter cytosines("cytosines"); 104 | 105 | MSite site, prev_site; 106 | size_t chrom_count = 0; 107 | 108 | while (in >> site) { 109 | 110 | if (site.chrom != prev_site.chrom) { 111 | ++chrom_count; 112 | if (VERBOSE) 113 | cerr << "PROCESSING:\t" << site.chrom << "\n"; 114 | } 115 | 116 | if (site.is_cpg()) { 117 | cpg.update(site); 118 | if (site.is_mate_of(prev_site)) { 119 | site.add(prev_site); 120 | cpg_symmetric.update(site); 121 | } 122 | } 123 | else if (site.is_chh()) chh.update(site); 124 | else if (site.is_ccg()) ccg.update(site); 125 | else if (site.is_cxg()) cxg.update(site); 126 | else throw runtime_error("bad site context: " + site.context); 127 | 128 | cytosines.update(site); 129 | 130 | prev_site = site; 131 | } 132 | 133 | std::ofstream of; 134 | if (!outfile.empty()) of.open(outfile.c_str()); 135 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 136 | 137 | out << cytosines << endl 138 | << cpg << endl 139 | << cpg_symmetric << endl 140 | << chh << endl 141 | << ccg << endl 142 | << cxg << endl; 143 | } 144 | catch (const std::exception &e) { 145 | cerr << e.what() << endl; 146 | return EXIT_FAILURE; 147 | } 148 | return EXIT_SUCCESS; 149 | } 150 | -------------------------------------------------------------------------------- /src/common-experimental/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2011 University of Southern California and 2 | # Andrew D. Smith 3 | # 4 | # Authors: Andrew D. Smith 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 
10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | # 19 |

ifndef SMITHLAB_CPP
$(error SMITHLAB_CPP variable undefined)
endif

CXX = g++
CXXFLAGS = -std=c++11 -Wall
OPTFLAGS = -O3
DEBUGFLAGS = -g

INCLUDEDIRS = $(SMITHLAB_CPP)/ ../common/
INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS))
# Same convention as the analysis and amrfinder Makefiles: the include paths
# are appended to whatever CPPFLAGS the caller passes, instead of the
# caller's CPPFLAGS being ignored.
override CPPFLAGS += $(INCLUDEARGS)

ifdef DEBUG
CXXFLAGS += $(DEBUGFLAGS)
else
CXXFLAGS += $(OPTFLAGS)
endif

%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS)

clean:
	@-rm -f *.o *~
.PHONY: clean

-------------------------------------------------------------------------------- /src/common-experimental/ModelParams.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 University of Southern California 3 | * Andrew D Smith and Qiang Song 4 | * Author: Qiang Song 5 | * 6 | * This is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this software; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 19 | * 02110-1301 USA 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "smithlab_utils.hpp" 31 | 32 | using std::vector; 33 | using std::string; 34 | using std::endl; 35 | using std::cerr; 36 | using std::getline; 37 | 38 | static void 39 | convert_to_stringstream(const string &infile, std::stringstream &ss) 40 | { 41 | std::ifstream in(infile.c_str()); 42 | 43 | while (!in.eof()) 44 | { 45 | string str; 46 | getline(in, str); 47 | const size_t comment_start = str.find("#"); 48 | if (comment_start != string::npos) 49 | str.erase(comment_start); 50 | str = smithlab::strip(str); 51 | if (!str.empty()) 52 | ss << str << endl; 53 | } 54 | in.close(); 55 | } 56 | 57 | template void 58 | read_param_file(const std::string &infile, size_t &n, 59 | std::vector > &trans, 60 | std::vector &emissions, 61 | std::vector &durations); 62 | { 63 | std::stringstream ss(std::stringstream::in | std::stringstream::out); 64 | convert_to_stringstream(infile, ss); 65 | 66 | ss >> n; 67 | string tmp_str; 68 | getline(ss, tmp_str); 69 | 70 | emissions.clear(); 71 | for (size_t i = 0; i < n; ++i) 72 | { 73 | string tmp_str; 74 | getline(ss, tmp_str); 75 | emissions.push_back(Distro_Type(tmp_str)); 76 | } 77 | 78 | durations.clear(); 79 | for (size_t i = 0; i < n; ++i) 80 | { 81 | string tmp_str; 82 | getline(ss, tmp_str); 83 | durations.push_back(Distro_Type(tmp_str)); 84 | } 85 | 86 | trans.resize(n, vector(n)); 87 | for (size_t i = 0; i < n; ++i) 88 | for (size_t j = 0; j < n; ++j) 89 | ss >> trans[i][j]; 90 | } 91 | 92 | template void 93 | write_param_file(const std::string &outfile, const size_t &n, 94 | const std::vector > &trans, 95 | const std::vector &emissions, 96 | const std::vector 
&durations); 97 | { 98 | std::ofstream out(outfile.c_str()); 99 | 100 | out << "# number of states" << endl; 101 | out << n << endl; 102 | 103 | out << "\n# emmission distributions" << endl; 104 | std::copy(emissions.begin(), emissions.end(), 105 | std::ostream_iterator(out, "\n")); 106 | 107 | out << "\n# duration distributions" << endl; 108 | std::copy(durations.begin(), durations.end(), 109 | std::ostream_iterator(out, "\n")); 110 | 111 | out << "\n# state transition probabilities" << endl; 112 | for (size_t i = 0; i < n; ++i) 113 | { 114 | copy(trans[i].begin(), trans[i].end(), 115 | std::ostream_iterator(out, "\t")); 116 | out << endl; 117 | } 118 | } 119 | 120 | -------------------------------------------------------------------------------- /src/common-experimental/ModelParams.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 University of Southern California 3 | * Andrew D Smith and Qiang Song 4 | * Author: Qiang Song 5 | * 6 | * This is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this software; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 19 | * 02110-1301 USA 20 | */ 21 | 22 | #ifndef MODEL_PARAMS_HPP 23 | #define MODEL_PARAMS_HPP 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "smithlab_utils.hpp" 33 | 34 | template void 35 | read_param_file(const std::string &infile, size_t &n, 36 | std::vector > &trans, 37 | std::vector &emissions, 38 | std::vector &durations); 39 | 40 | template void 41 | write_param_file(const std::string &outfile, const size_t &n, 42 | const std::vector > &trans, 43 | const std::vector &emissions, 44 | const std::vector &durations); 45 | 46 | static void 47 | convert_to_stringstream(const std::string &infile, std::stringstream &ss) 48 | { 49 | std::ifstream in(infile.c_str()); 50 | 51 | while (!in.eof()) 52 | { 53 | std::string str; 54 | std::getline(in, str); 55 | const size_t comment_start = str.find("#"); 56 | if (comment_start != std::string::npos) 57 | str.erase(comment_start); 58 | str = smithlab::strip(str); 59 | if (!str.empty()) 60 | ss << str << std::endl; 61 | } 62 | in.close(); 63 | } 64 | 65 | template void 66 | read_param_file(const std::string &infile, size_t &n, 67 | std::vector > &trans, 68 | std::vector &emissions, 69 | std::vector &durations) 70 | { 71 | std::stringstream ss(std::stringstream::in | std::stringstream::out); 72 | convert_to_stringstream(infile, ss); 73 | 74 | ss >> n; 75 | std::string tmp_str; 76 | getline(ss, tmp_str); 77 | 78 | emissions.clear(); 79 | for (size_t i = 0; i < n; ++i) 80 | { 81 | std::string tmp_str; 82 | std::getline(ss, tmp_str); 83 | emissions.push_back(Distro_Type(tmp_str)); 84 | } 85 | 86 | durations.clear(); 87 | for (size_t i = 0; i < n; ++i) 88 | { 89 | std::string tmp_str; 90 | std::getline(ss, tmp_str); 91 | 
durations.push_back(Distro_Type(tmp_str)); 92 | } 93 | 94 | trans.resize(n, std::vector(n)); 95 | for (size_t i = 0; i < n; ++i) 96 | for (size_t j = 0; j < n; ++j) 97 | ss >> trans[i][j]; 98 | } 99 | 100 | template void 101 | write_param_file(const std::string &outfile, const size_t &n, 102 | const std::vector > &trans, 103 | const std::vector &emissions, 104 | const std::vector &durations) 105 | { 106 | std::ofstream out(outfile.c_str()); 107 | 108 | out << "# number of states" << std::endl; 109 | out << n << std::endl; 110 | 111 | out << "\n# emmission distributions" << std::endl; 112 | std::copy(emissions.begin(), emissions.end(), 113 | std::ostream_iterator(out, "\n")); 114 | 115 | out << "\n# duration distributions" << std::endl; 116 | std::copy(durations.begin(), durations.end(), 117 | std::ostream_iterator(out, "\n")); 118 | 119 | out << "\n# state transition probabilities" << std::endl; 120 | for (size_t i = 0; i < n; ++i) 121 | { 122 | std::copy(trans[i].begin(), trans[i].end(), 123 | std::ostream_iterator(out, "\t")); 124 | out << std::endl; 125 | } 126 | } 127 | #endif 128 | 129 | -------------------------------------------------------------------------------- /src/common-experimental/ThreeStateHDHMM.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2011 University of Southern California 3 | Authors: Andrew D. Smith, Song Qiang 4 | 5 | This file is part of rmap. 6 | 7 | rmap is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | rmap is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License 18 | along with rmap; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #ifndef THREE_STATE_HDHMM_HPP 23 | #define THREE_STATE_HDHMM_HPP 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "smithlab_utils.hpp" 31 | #include "Distro.hpp" 32 | 33 | enum STATE_LABELS {GAIN, SAME, LOSS}; 34 | struct Triplet { double gain, same, loss; }; 35 | 36 | class ThreeStateHDHMM { 37 | public: 38 | ThreeStateHDHMM( 39 | const std::vector &_observations, 40 | const std::vector &_reset_points, 41 | const double tol, 42 | const size_t max_itr, const bool v, 43 | const size_t _MAX_LEN); 44 | 45 | void 46 | set_parameters(const Distro & _gain_emission, 47 | const Distro & _same_emission, 48 | const Distro & _loss_emission, 49 | const Distro & _gain_duration, 50 | const Distro & _same_duration, 51 | const Distro & _loss_duration, 52 | const std::vector > & _trans); 53 | void 54 | get_parameters(Distro & _gain_emission, 55 | Distro & _same_emission, 56 | Distro & _loss_emission, 57 | Distro & _gain_duration, 58 | Distro & _same_duration, 59 | Distro & _loss_duration, 60 | std::vector > & _trans) const; 61 | 62 | double 63 | BaumWelchTraining(); 64 | 65 | double 66 | PosteriorDecoding(); 67 | 68 | void 69 | get_posterior_scores(std::vector &scores, 70 | std::vector &classes); 71 | 72 | private: 73 | 74 | //////////// methods //////////// 75 | double 76 | single_iteration(); 77 | double 78 | forward_algorithm(const size_t start, const size_t end); 79 | double 80 | backward_algorithm(const size_t start, const size_t end); 81 | 82 | double 83 | gain_segment_log_likelihood(const size_t start, const size_t end); 84 | 85 | double 86 | same_segment_log_likelihood(const size_t start, const size_t end); 87 | 88 | double 89 | loss_segment_log_likelihood(const size_t start, const size_t end); 90 | 91 | void 92 | 
estimate_state_posterior(const size_t start, const size_t end); 93 | 94 | void estimate_parameters(); 95 | 96 | void update_observation_likelihood(); 97 | 98 | //////// data //////// 99 | std::vector observations; 100 | std::vector reset_points; 101 | std::vector meth_lp, unmeth_lp; 102 | std::vector gain_log_likelihood, same_log_likelihood, loss_log_likelihood; 103 | 104 | // HMM internal data 105 | Distro gain_emission, same_emission, loss_emission; 106 | Distro gain_duration, same_duration, loss_duration; 107 | 108 | Triplet lp_start, lp_end; 109 | std::vector > trans; 110 | 111 | std::vector forward; 112 | std::vector backward; 113 | std::vector gain_posteriors, same_posteriors, loss_posteriors; 114 | 115 | // parameters 116 | // double MIN_PROB; 117 | double tolerance; 118 | size_t max_iterations; 119 | bool VERBOSE; 120 | size_t MAX_LEN; 121 | }; 122 | // } 123 | 124 | #endif 125 | -------------------------------------------------------------------------------- /src/common-experimental/TwoStateCTHMM.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2019 Andrew D. Smith 3 | Author: Andrew D. Smith 4 | 5 | This is free software; you can redistribute it and/or modify it 6 | under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This software is distributed in the hope that it will be useful, but 11 | WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | General Public License for more details. 
14 | */ 15 | 16 | #ifndef TWO_STATE_CTHMM_HPP 17 | #define TWO_STATE_CTHMM_HPP 18 | 19 | #include "smithlab_utils.hpp" 20 | #include 21 | 22 | struct betabin; 23 | struct prob_mat; 24 | 25 | class TwoStateCTHMM { 26 | public: 27 | 28 | TwoStateCTHMM(const double ds, const double mp, const double tol, 29 | const size_t max_itr, const bool v, bool d = false) : 30 | desert_size(ds), MIN_PROB(mp), tolerance(tol), max_iterations(max_itr), 31 | VERBOSE(v), DEBUG(d) {} 32 | 33 | double 34 | ViterbiDecoding(const std::vector &pos, 35 | const std::vector > &values, 36 | const std::vector &reset_points, 37 | const std::vector &start_trans, 38 | const std::vector > &trans, 39 | const std::vector &end_trans, 40 | const double fg_alpha, const double fg_beta, 41 | const double bg_alpha, const double bg_beta, 42 | std::vector &ml_classes) const; 43 | 44 | 45 | double 46 | BaumWelchTraining(const std::vector &pos, 47 | const std::vector > &values, 48 | const std::vector &reset_points, 49 | std::vector &start_trans, 50 | double &mu0, double &mu1, 51 | std::vector &end_trans, 52 | double &fg_alpha, double &fg_beta, 53 | double &bg_alpha, double &bg_beta) const; 54 | 55 | double 56 | PosteriorDecoding(const std::vector &pos, 57 | const std::vector > &values, 58 | const std::vector &reset_points, 59 | const std::vector &start_trans, 60 | const double mu0, const double mu1, 61 | const std::vector &end_trans, 62 | const double fg_alpha, const double fg_beta, 63 | const double bg_alpha, const double bg_beta, 64 | std::vector &classes, 65 | std::vector &llr_scores) const; 66 | 67 | std::string 68 | error_log() const; 69 | 70 | static const size_t FG_TO_BG_TRANSITION = 1; 71 | static const size_t BG_TO_FG_TRANSITION = 2; 72 | 73 | private: 74 | 75 | double 76 | BaumWelchTraining(const std::vector &pos, 77 | const std::vector > &values, 78 | const std::vector &reset_points, 79 | double &p_sf, double &p_sb, 80 | double &mu0, double &mu1, 81 | double &p_ft, double &p_bt, 82 | betabin 
&fg_distro, betabin &bg_distro) const; 83 | 84 | double 85 | PosteriorDecoding(const std::vector &pos, 86 | const std::vector > &values, 87 | const std::vector &reset_points, 88 | const double p_sf, const double p_sb, 89 | const double mu0, const double mu1, 90 | const double p_ft, const double p_bt, 91 | const betabin &fg_distro, 92 | const betabin &bg_distro, 93 | std::vector &classes, 94 | std::vector &llr_scores) const; 95 | 96 | double 97 | single_iteration(const std::vector &pos, 98 | const std::vector > &values, 99 | const std::vector &vals_a, 100 | const std::vector &vals_b, 101 | const std::vector &reset_points, 102 | std::vector > &forward, 103 | std::vector > &backward, 104 | double &p_sf, double &p_sb, 105 | double &mu0, double &mu1, 106 | double &p_ft, double &p_bt, 107 | betabin &fg_distro, betabin &bg_distro) const; 108 | 109 | double 110 | forward_algorithm(const std::vector &pos, 111 | const std::vector > &vals, 112 | const size_t start, const size_t end, 113 | const double lp_sf, const double lp_sb, 114 | const std::vector &lm, 115 | const double lp_ft, const double lp_bt, 116 | const betabin &fg_distro, 117 | const betabin &bg_distro, 118 | std::vector > &f) const; 119 | double 120 | backward_algorithm(const std::vector &pos, 121 | const std::vector > &vals, 122 | const size_t start, const size_t end, 123 | const double lp_sf, const double lp_sb, 124 | const std::vector &lm, 125 | const double lp_ft, const double lp_bt, 126 | const betabin &fg_distro, 127 | const betabin &bg_distro, 128 | std::vector > &b) const; 129 | 130 | double 131 | log_sum_log_vec(const std::vector &vals, size_t limit) const; 132 | 133 | void 134 | estimate_emissions(const std::vector > &f, 135 | const std::vector > &b, 136 | std::vector &fg_probs, 137 | std::vector &bg_probs) const; 138 | 139 | void 140 | estimate_transitions(const std::vector &pos, 141 | const std::vector > &vals, 142 | const size_t start, const size_t end, 143 | const std::vector > &f, 144 | const 
std::vector > &b, 145 | const double total, 146 | const betabin &fg_distro, 147 | const betabin &bg_distro, 148 | const std::vector &lm, 149 | std::vector &ff_vals, 150 | std::vector &fb_vals, 151 | std::vector &bf_vals, 152 | std::vector &bb_vals) const; 153 | 154 | double 155 | log_sum_log(const double p, const double q) const; 156 | 157 | uint32_t desert_size; 158 | double MIN_PROB; 159 | double tolerance; 160 | size_t max_iterations; 161 | bool VERBOSE; 162 | bool DEBUG; 163 | }; 164 | 165 | #endif 166 | -------------------------------------------------------------------------------- /src/common-experimental/contingency-table.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 University of Southern California and 3 | * Andrew D. Smith, Song Qiang 4 | * 5 | * Authors: Andrew D. Smith, Song Qiang 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see . 
19 | */ 20 | 21 | #include 22 | 23 | #include 24 | 25 | #include "contingency-table.hpp" 26 | #include "numerical_utils.hpp" 27 | 28 | using std::min; 29 | 30 | static inline double 31 | log_prob_hypergeo(const size_t meth_a, const size_t unmeth_a, 32 | const size_t meth_b, const size_t unmeth_b, 33 | const size_t k) 34 | { 35 | return gsl_sf_lnchoose(meth_b + unmeth_b - 1, k) + 36 | gsl_sf_lnchoose(meth_a + unmeth_a - 1, meth_a + meth_b - 1 - k) - 37 | gsl_sf_lnchoose(meth_a + unmeth_a + meth_b + unmeth_b - 2, 38 | meth_a + meth_b - 1); 39 | } 40 | 41 | double 42 | ContingencyTable::beta_population_greater( 43 | const size_t meth_a, const size_t unmeth_a, 44 | const size_t meth_b, const size_t unmeth_b) 45 | { 46 | double p = 0; 47 | 48 | for (size_t k = meth_b > unmeth_a ? meth_b - unmeth_a : 0; 49 | k < meth_b; ++k) 50 | p = log_sum_log(p, log_prob_hypergeo( 51 | meth_a, unmeth_a, meth_b, unmeth_b, k)); 52 | return exp(p); 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/common-experimental/contingency-table.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 University of Southern California and 3 | * Andrew D. Smith, Song Qiang 4 | * 5 | * Authors: Andrew D. Smith, Song Qiang 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see . 
19 | */ 20 | 21 | #ifndef CONTINGENCY_TABLE_HPP 22 | #define CONTINGENCY_TABLE_HPP 23 | 24 | namespace ContingencyTable 25 | { 26 | double 27 | beta_population_greater(const size_t meth_a, const size_t unmeth_a, 28 | const size_t meth_b, const size_t unmeth_b); 29 | }; 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/common-experimental/false_discovery_rate.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 University of Southern California and 3 | * Andrew D. Smith, Song Qiang 4 | * 5 | * Authors: Andrew D. Smith, Song Qiang 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see . 19 | */ 20 | 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include "false_discovery_rate.hpp" 28 | 29 | using std::vector; 30 | using std::upper_bound; 31 | 32 | double 33 | FDR::get_empirical_p_value(const vector &random_scores, 34 | const double &observed_score) 35 | { 36 | return random_scores.size() == 0 ? 
0 : 37 | (random_scores.end() - 38 | upper_bound(random_scores.begin(), random_scores.end(), observed_score)) 39 | / static_cast(random_scores.size()); 40 | } 41 | 42 | 43 | void 44 | FDR::assign_empirical_p_values( 45 | const vector &random_scores, 46 | const vector &observed_scores, 47 | vector &p_values) 48 | { 49 | // make sure random_scores are sorted 50 | assert(std::adjacent_find(random_scores.begin(), random_scores.end(), 51 | std::greater()) 52 | == random_scores.end()); 53 | 54 | // get p_values 55 | p_values.resize(observed_scores.size()); 56 | for (size_t i = 0; i < observed_scores.size(); ++i) 57 | p_values[i] = get_empirical_p_value(random_scores, observed_scores[i]); 58 | 59 | // std::transform(observed_scores.begin(), observed_scores.end(), 60 | // p_values.begin(), 61 | // std::bind1st(std::ptr_fun(get_empirical_p_value), 62 | // random_scores)); 63 | } 64 | 65 | double 66 | FDR::get_fdr_cutoff(const vector &p_values, const double fdr) 67 | { 68 | if (fdr < 0) return 0; 69 | else if (fdr > 1) return 1; 70 | 71 | vector local(p_values); 72 | std::sort(local.begin(), local.end()); 73 | assert(local.size() > 0); 74 | size_t i = 0; 75 | for (; i < local.size() - 1 && 76 | local[i+1] < fdr*static_cast(i+1)/local.size(); ++i); 77 | assert(i < local.size()); 78 | return local[i]; 79 | } 80 | 81 | -------------------------------------------------------------------------------- /src/common-experimental/false_discovery_rate.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 University of Southern California and 3 | * Andrew D. Smith, Song Qiang 4 | * 5 | * Authors: Andrew D. Smith, Song Qiang 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 
11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see . 19 | */ 20 | 21 | #ifndef FALSE_DISCOVERY_RATE 22 | #define FALSE_DISCOVERY_RATE 23 | 24 | #include 25 | 26 | namespace FDR 27 | { 28 | double 29 | get_empirical_p_value(const std::vector &random_scores, 30 | const double &observed_score); 31 | void 32 | assign_empirical_p_values(const std::vector &random_scores, 33 | const std::vector &observed_scores, 34 | std::vector &p_values); 35 | double 36 | get_fdr_cutoff(const std::vector &p_values, const double fdr); 37 | }; 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /src/common-experimental/nonparametric-test.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 University of Southern California and 3 | * Song Qiang 4 | * 5 | * Authors: Song Qiang 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see . 
*/

#ifndef NON_PARAMETRIC_TEST_HPP
#define NON_PARAMETRIC_TEST_HPP

// NOTE(review): the header name after this #include was lost when the
// file was extracted (presumably <vector>, given the std::vector
// parameters below) -- restore from the original source.
#include

// Declarations of two non-parametric tests on a pair of samples x and
// y.  The definitions are not part of this listing.
//
// NOTE(review): the template argument of every std::vector below was
// also lost in extraction; the element type cannot be determined from
// this header alone -- restore from the original source.
namespace NonParametricTest
{
  // `alternative` defaults to false; its meaning is defined by the
  // implementation, which is not visible here.
  double
  sign_test(const std::vector &x,
            const std::vector &y,
            const bool alternative = false);

  // Same parameter list as sign_test.
  double
  wilcoxon_test(const std::vector &x,
                const std::vector &y,
                const bool alternative = false);
};

#endif

--------------------------------------------------------------------------------
/src/common/BetaBin.cpp:
--------------------------------------------------------------------------------
/*
Copyright (C) 2011 University of Southern California
Authors: Andrew D. Smith, Song Qiang

This file is part of rmap.

rmap is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

rmap is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
16 | 17 | You should have received a copy of the GNU General Public License 18 | along with rmap; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #include "BetaBin.hpp" 23 | 24 | #include 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include 32 | #include 33 | 34 | 35 | using std::vector; 36 | using std::pair; 37 | using std::setw; 38 | using std::max; 39 | using std::min; 40 | using std::cerr; 41 | using std::endl; 42 | using std::string; 43 | using std::setprecision; 44 | 45 | ////////////////////////////////////////////// 46 | ////// struct betabin ////// 47 | ////////////////////////////////////////////// 48 | 49 | const double betabin::tolerance = 1e-10; 50 | 51 | betabin::betabin() : 52 | alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {} 53 | 54 | betabin::betabin(const double a, const double b) : 55 | alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {} 56 | 57 | betabin::betabin(const string &str) 58 | { 59 | std::istringstream iss(str, std::istringstream::in); 60 | string name; 61 | iss >> name >> alpha >> beta; 62 | if (name != "betabin" || alpha < 0 || beta < 0) 63 | { 64 | cerr << "betabin::betabin: " 65 | << "bad string representation of betabin distribution: " 66 | << str << endl; 67 | throw "bad string representation of betabin distribution"; 68 | } 69 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta); 70 | } 71 | 72 | 73 | string 74 | betabin::tostring() const 75 | { 76 | std::ostringstream os; 77 | os << "betabin " << setprecision(4) << alpha << " " 78 | << setprecision(4) << beta; 79 | return os.str(); 80 | } 81 | 82 | 83 | double 84 | betabin::operator()(const pair &val) const 85 | { 86 | const size_t x = static_cast(val.first); 87 | const size_t n = static_cast(x + val.second); 88 | return gsl_sf_lnchoose(n, x) + 89 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; 90 | } 91 | 92 | double 93 | 
betabin::log_likelihood(const pair &val) const 94 | { 95 | const size_t x = static_cast(val.first); 96 | const size_t n = static_cast(x + val.second); 97 | return gsl_sf_lnchoose(n, x) + 98 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; 99 | } 100 | 101 | double 102 | betabin::sign(const double x) 103 | { 104 | return (x >= 0) ? 1.0 : -1.0; 105 | } 106 | 107 | double 108 | betabin::invpsi(const double tolerance, const double x) 109 | { 110 | double L = 1.0, Y = std::exp(x); 111 | while (L > tolerance) 112 | { 113 | Y += L*sign(x - gsl_sf_psi(Y)); 114 | L /= 2.0; 115 | } 116 | return Y; 117 | } 118 | 119 | double 120 | betabin::movement(const double curr, const double prev) 121 | { 122 | return std::abs(curr - prev)/std::max(std::fabs(curr), std::fabs(prev)); 123 | } 124 | 125 | void 126 | betabin::fit(const vector &vals_a, const vector &vals_b, 127 | const vector &p) 128 | { 129 | const double p_total = std::accumulate(p.begin(), p.end(), 0.0); 130 | const double alpha_rhs = inner_product(vals_a.begin(), vals_a.end(), 131 | p.begin(), 0.0)/p_total; 132 | const double beta_rhs = inner_product(vals_b.begin(), vals_b.end(), 133 | p.begin(), 0.0)/p_total; 134 | double prev_alpha = 0.0, prev_beta = 0.0; 135 | alpha = beta = 0.01; 136 | while (movement(alpha, prev_alpha) > tolerance && 137 | movement(beta, prev_beta) > tolerance) 138 | { 139 | prev_alpha = alpha; 140 | prev_beta = beta; 141 | alpha = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + alpha_rhs); 142 | beta = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + beta_rhs); 143 | } 144 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta); 145 | } 146 | 147 | -------------------------------------------------------------------------------- /src/common/BetaBin.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2011 University of Southern California 3 | Authors: Andrew D. Smith, Song Qiang 4 | 5 | This file is part of rmap. 
6 | 7 | rmap is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | rmap is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with rmap; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #ifndef BETABIN_HPP 23 | #define BETABIN_HPP 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | // struct betabin; 30 | struct betabin 31 | { 32 | betabin(); 33 | betabin(const double a, const double b); 34 | betabin(const std::string &str); 35 | double operator()(const std::pair &val) const; 36 | double log_likelihood(const std::pair &val) const; 37 | double sign(const double x); 38 | double invpsi(const double tolerance, const double x); 39 | double movement(const double curr, const double prev); 40 | void fit(const std::vector &vals_a, 41 | const std::vector &vals_b, 42 | const std::vector &p); 43 | std::string tostring() const; 44 | double alpha; 45 | double beta; 46 | double lnbeta_helper; 47 | 48 | static const double tolerance; 49 | }; 50 | 51 | #endif 52 | 53 | -------------------------------------------------------------------------------- /src/common/EmissionDistribution.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2017 University of Southern California 3 | Authors: Andrew D. Smith and Benjamin E. Decato 4 | 5 | This file is part of methpipe. 
6 | 7 | methpipe is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | methpipe is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with rmap; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #include "EmissionDistribution.hpp" 23 | 24 | using std::vector; 25 | using std::pair; 26 | using std::setw; 27 | using std::max; 28 | using std::min; 29 | using std::cerr; 30 | using std::endl; 31 | using std::string; 32 | using std::setprecision; 33 | 34 | EmissionDistribution::EmissionDistribution() : 35 | alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {} 36 | 37 | EmissionDistribution::EmissionDistribution(const double a, const double b) : 38 | alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {} 39 | 40 | EmissionDistribution::EmissionDistribution(const string &str) { 41 | std::istringstream iss(str, std::istringstream::in); 42 | string name; 43 | iss >> name >> alpha >> beta; 44 | if (name != "edtn" || alpha < 0 || beta < 0) 45 | { 46 | cerr << "EmissionDistribution::EmissionDistribution: " 47 | << "bad string representation of emission distribution: " 48 | << str << endl; 49 | throw "bad string representation of emission distribution"; 50 | } 51 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta); 52 | } 53 | 54 | EmissionDistribution::~EmissionDistribution() {} 55 | 56 | string 57 | EmissionDistribution::tostring() const { 58 | std::ostringstream os; 59 | os << "Emission dtn params: " << setprecision(4) << alpha << " " 60 | << 
setprecision(4) << beta; 61 | return os.str(); 62 | } 63 | 64 | 65 | double 66 | EmissionDistribution::sign(const double x) { 67 | return (x >= 0) ? 1.0 : -1.0; 68 | } 69 | 70 | 71 | double 72 | EmissionDistribution::invpsi(const double tolerance, const double x) { 73 | double L = 1.0, Y = std::exp(x); 74 | while (L > tolerance) 75 | { 76 | Y += L*sign(x - gsl_sf_psi(Y)); 77 | L /= 2.0; 78 | } 79 | return Y; 80 | } 81 | 82 | 83 | double 84 | EmissionDistribution::movement(const double curr, const double prev) { 85 | return std::abs(curr - prev)/std::max(std::fabs(curr), std::fabs(prev)); 86 | } 87 | 88 | 89 | void 90 | EmissionDistribution::fit(const vector &vals_a, 91 | const vector &vals_b, const vector &p) { 92 | const double p_total = std::accumulate(p.begin(), p.end(), 0.0); 93 | const double alpha_rhs = inner_product(vals_a.begin(), vals_a.end(), 94 | p.begin(), 0.0)/p_total; 95 | const double beta_rhs = inner_product(vals_b.begin(), vals_b.end(), 96 | p.begin(), 0.0)/p_total; 97 | 98 | double prev_alpha = 0.0, prev_beta = 0.0; 99 | alpha = beta = 0.01; 100 | while (movement(alpha, prev_alpha) > tolerance && 101 | movement(beta, prev_beta) > tolerance) 102 | { 103 | prev_alpha = alpha; 104 | prev_beta = beta; 105 | alpha = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + alpha_rhs); 106 | beta = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + beta_rhs); 107 | } 108 | lnbeta_helper = gsl_sf_lnbeta(alpha, beta); 109 | } 110 | 111 | Beta::Beta() : EmissionDistribution() {} 112 | Beta::Beta(const double a, const double b) : EmissionDistribution(a,b) {} 113 | Beta::Beta(const std::string &str) : EmissionDistribution(str) {} 114 | 115 | double 116 | Beta::operator()(const pair &val) const { 117 | const double p = val.first/val.second; 118 | return (alpha-1.0)*log(p) + (beta-1.0)*log(1.0-p) - gsl_sf_lnbeta(alpha, beta); 119 | } 120 | 121 | double 122 | Beta::log_likelihood(const pair &val) const { 123 | const double p = val.first/val.second; 124 | 
return (alpha-1.0)*log(p) + (beta-1.0)*log(1.0-p) - gsl_sf_lnbeta(alpha, beta); 125 | } 126 | 127 | BetaBinomial::BetaBinomial() : EmissionDistribution() {} 128 | BetaBinomial::BetaBinomial(const double a, const double b) 129 | : EmissionDistribution(a,b) {} 130 | BetaBinomial::BetaBinomial(const std::string &str) 131 | : EmissionDistribution(str) {} 132 | 133 | double 134 | BetaBinomial::operator()(const pair &val) const { 135 | const size_t x = static_cast(val.first); 136 | const size_t n = static_cast(x + val.second); 137 | return gsl_sf_lnchoose(n, x) + 138 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; 139 | } 140 | 141 | double 142 | BetaBinomial::log_likelihood(const pair &val) const { 143 | const size_t x = static_cast(val.first); 144 | const size_t n = static_cast(x + val.second); 145 | return gsl_sf_lnchoose(n, x) + 146 | gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; 147 | } 148 | -------------------------------------------------------------------------------- /src/common/EmissionDistribution.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2017 University of Southern California 3 | Authors: Andrew D. Smith and Benjamin E. Decato 4 | 5 | This file is part of methpipe. 6 | 7 | methpipe is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | methpipe is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License 18 | along with rmap; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #ifndef EM_DTN 23 | #define EM_DTN 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | /** Emission distributions for methylation should be modeled either as 37 | * Beta or Beta Binomial. Since they will be used simultaneously, it is 38 | * helpful to have an abstraction so that we can put them in the same 39 | * container. 40 | */ 41 | class EmissionDistribution 42 | { 43 | public: 44 | EmissionDistribution(); 45 | virtual ~EmissionDistribution(); 46 | EmissionDistribution(const double a, const double b); 47 | EmissionDistribution(const std::string &str); 48 | virtual double operator()(const std::pair &val) const = 0; 49 | virtual double log_likelihood(const std::pair &val) const = 0; 50 | std::string tostring() const; 51 | double getalpha() { return alpha; }; 52 | double getbeta() { return beta; }; 53 | void fit(const std::vector &vals_a, 54 | const std::vector &vals_b, 55 | const std::vector &p); 56 | 57 | protected: 58 | double sign(const double x); 59 | double invpsi(const double tolerance, const double x); 60 | double movement(const double curr, const double prev); 61 | double alpha; 62 | double beta; 63 | double lnbeta_helper; 64 | 65 | const double tolerance = 1e-10; 66 | }; 67 | 68 | class Beta : public EmissionDistribution 69 | { 70 | public: 71 | Beta(); 72 | Beta(const double a, const double b); 73 | Beta(const std::string &str); 74 | double operator()(const std::pair &val) const; 75 | double log_likelihood(const std::pair &val) const; 76 | }; 77 | 78 | class BetaBinomial : public EmissionDistribution 79 | { 80 | public: 81 | BetaBinomial(); 82 | BetaBinomial(const double a, const double b); 83 | BetaBinomial(const std::string 
&str); 84 | double operator()(const std::pair &val) const; 85 | double log_likelihood(const std::pair &val) const; 86 | }; 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /src/common/Epiread.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2011 University of Southern California and 2 | * Andrew D. Smith and Fang Fang 3 | * 4 | * Authors: Fang Fang and Andrew D. Smith 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "Epiread.hpp" 25 | 26 | using std::vector; 27 | using std::string; 28 | 29 | size_t 30 | adjust_read_offsets(vector &reads) { 31 | size_t first_read_offset = std::numeric_limits::max(); 32 | for (size_t i = 0; i < reads.size(); ++i) 33 | first_read_offset = std::min(reads[i].pos, first_read_offset); 34 | for (size_t i = 0; i < reads.size(); ++i) 35 | reads[i].pos -= first_read_offset; 36 | return first_read_offset; 37 | } 38 | 39 | 40 | size_t 41 | get_n_cpgs(const vector &reads) { 42 | size_t n_cpgs = 0; 43 | for (size_t i = 0; i < reads.size(); ++i) 44 | n_cpgs = std::max(n_cpgs, reads[i].end()); 45 | return n_cpgs; 46 | } 47 | 48 | std::istream& 49 | operator>>(std::istream &in, epiread &er) { 50 | string buffer; 51 | if (getline(in, buffer)) { 52 | std::istringstream is(buffer); 53 | if (!(is >> er.chr >> er.pos >> er.seq)) 54 | throw std::runtime_error("malformed epiread line:\n" + buffer); 55 | } 56 | return in; 57 | } 58 | 59 | 60 | std::ostream& 61 | operator<<(std::ostream &out, const epiread &er) { 62 | return out << er.chr << '\t' << er.pos << '\t' << er.seq; 63 | } 64 | -------------------------------------------------------------------------------- /src/common/Epiread.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2011 University of Southern California and 2 | * Andrew D. Smith and Fang Fang 3 | * 4 | * Authors: Fang Fang and Andrew D. Smith 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #ifndef EPIREAD 21 | #define EPIREAD 22 | 23 | #include 24 | #include 25 | #include "smithlab_utils.hpp" 26 | 27 | struct epiread { 28 | std::string chr; 29 | size_t pos; 30 | std::string seq; 31 | epiread() {} 32 | epiread(const size_t p, const std::string &s) : pos(p), seq(s) {} 33 | epiread(const std::string &c, const size_t p, const std::string &s) 34 | : chr(c), pos(p), seq(s) {} 35 | 36 | bool operator<(const epiread &other) const { 37 | return (chr < other.chr || (chr == other.chr && pos < other.pos)); 38 | } 39 | size_t end() const {return pos + seq.length();} 40 | size_t length() const {return seq.length();} 41 | }; 42 | 43 | std::istream& operator>>(std::istream &in, epiread &er); 44 | std::ostream& operator<<(std::ostream &out, const epiread &er); 45 | 46 | size_t 47 | adjust_read_offsets(std::vector &reads); 48 | 49 | size_t 50 | get_n_cpgs(const std::vector &reads); 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/common/EpireadStats.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2014 University of Southern California and 2 | * Andrew D. Smith and Fang Fang and Benjamin Decato 3 | * 4 | * Authors: Fang Fang and Benjamin Decato and Andrew D. Smith 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #include "EpireadStats.hpp" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include 32 | #include 33 | 34 | using std::string; 35 | using std::vector; 36 | using std::isfinite; 37 | 38 | static const double PSEUDOCOUNT = 1e-10; 39 | 40 | inline bool 41 | is_meth(const epiread &r, const size_t pos) {return (r.seq[pos] == 'C');} 42 | 43 | inline bool 44 | un_meth(const epiread &r, const size_t pos) {return (r.seq[pos] == 'T');} 45 | 46 | double 47 | log_likelihood(const epiread &r, const vector &a) { 48 | double ll = 0.0; 49 | for (size_t i = 0; i < r.seq.length(); ++i) 50 | if (is_meth(r, i) || un_meth(r, i)) { 51 | const double val = (is_meth(r, i) ? a[r.pos + i] : (1.0 - a[r.pos + i])); 52 | assert(isfinite(log(val))); 53 | ll += log(val); 54 | } 55 | return ll; 56 | } 57 | 58 | 59 | double 60 | log_likelihood(const epiread &r, const double mixing, 61 | const vector &a1, const vector &a2) { 62 | return log(mixing*exp(log_likelihood(r, a1)) + 63 | (1.0 - mixing)*exp(log_likelihood(r, a2))); 64 | } 65 | 66 | 67 | double 68 | log_likelihood(const vector &reads, const double mixing, 69 | const vector &a1, const vector &a2) { 70 | double ll = 0.0; 71 | for (size_t i = 0; i < reads.size(); ++i) 72 | ll += log_likelihood(reads[i], mixing, a1, a2); 73 | return ll; 74 | } 75 | 76 | 77 | static double 78 | expectation_step(const vector &reads, const double mixing, 79 | const vector &a1, const vector &a2, 80 | vector &indicators) { 81 | const double log_mixing1 = log(mixing); 82 | const double log_mixing2 = log(1.0 - mixing); 83 | assert(isfinite(log_mixing1) && isfinite(log_mixing2)); 84 | 85 | double score = 0; 86 | for (size_t i = 0; i < reads.size(); ++i) { 87 | const double ll1 = log_mixing1 
+ log_likelihood(reads[i], a1); 88 | const double ll2 = log_mixing2 + log_likelihood(reads[i], a2); 89 | assert(isfinite(ll1) && isfinite(ll2)); 90 | const double log_denom = log(exp(ll1) + exp(ll2)); 91 | score += log_denom; 92 | indicators[i] = exp(ll1 - log_denom); 93 | assert(isfinite(log_denom) && isfinite(indicators[i])); 94 | } 95 | return score; 96 | } 97 | 98 | 99 | void 100 | fit_epiallele(double pseudo, const vector &reads, 101 | const vector &indicators, vector &a) { 102 | const size_t n_cpgs = a.size(); 103 | vector meth(n_cpgs, 0.0), total(n_cpgs, 0.0); 104 | for (size_t i = 0; i < reads.size(); ++i) { 105 | const size_t start = reads[i].pos; 106 | const double weight = indicators[i]; 107 | for (size_t j = 0; j < reads[i].seq.length(); ++j) 108 | if (is_meth(reads[i], j) || un_meth(reads[i], j)) { 109 | meth[start + j] += weight*(is_meth(reads[i], j)); 110 | total[start + j] += weight; 111 | } 112 | } 113 | for (size_t i = 0; i < n_cpgs; ++i) 114 | a[i] = (meth[i] + pseudo)/(total[i] + 2*pseudo); 115 | } 116 | 117 | 118 | static void 119 | maximization_step(const vector &reads, const vector &indicators, 120 | vector &a1, vector &a2) { 121 | 122 | vector inverted_indicators(indicators); 123 | for (size_t i = 0; i < inverted_indicators.size(); ++i) 124 | inverted_indicators[i] = 1.0 - inverted_indicators[i]; 125 | 126 | // Fit the regular model parameters. Since the two epialleles' 127 | // likelihoods are summed, we need to make sure the pseudocount 128 | // is proportional to the pseudocount used in the single allele model. 
129 | fit_epiallele(0.5*PSEUDOCOUNT, reads, indicators, a1); 130 | fit_epiallele(0.5*PSEUDOCOUNT, reads, inverted_indicators, a2); 131 | } 132 | 133 | 134 | static void 135 | rescale_indicators(const double mixing, vector &indic) { 136 | const double n_reads = indic.size(); 137 | const double total = accumulate(indic.begin(), indic.end(), 0.0); 138 | const double ratio = total/n_reads; 139 | 140 | if (mixing < ratio) 141 | for (size_t i = 0; i < indic.size(); ++i) 142 | indic[i] *= (mixing/ratio); 143 | 144 | else { 145 | const double adjustment = mixing/(1.0 - ratio); 146 | for (size_t i = 0; i < indic.size(); ++i) 147 | indic[i] = 1.0 - (1.0 - indic[i])*adjustment; 148 | } 149 | } 150 | 151 | 152 | static double 153 | expectation_maximization(const size_t max_itr, const vector &reads, 154 | const double &mixing, vector &indicators, 155 | vector &a1, vector &a2) { 156 | 157 | static const double EPIREAD_STATS_TOLERANCE = 1e-10; 158 | 159 | double prev_score = -std::numeric_limits::max(); 160 | for (size_t i = 0; i < max_itr; ++i) { 161 | 162 | const double score = expectation_step(reads, mixing, a1, a2, indicators); 163 | rescale_indicators(mixing, indicators); 164 | maximization_step(reads, indicators, a1, a2); 165 | 166 | if ((prev_score - score)/prev_score < EPIREAD_STATS_TOLERANCE) 167 | break; 168 | prev_score = score; 169 | } 170 | return prev_score; 171 | } 172 | 173 | 174 | double 175 | resolve_epialleles(const size_t max_itr, const vector &reads, 176 | const double &mixing, vector &indicators, 177 | vector &a1, vector &a2) { 178 | 179 | indicators.clear(); 180 | indicators.resize(reads.size(), 0.0); 181 | for (size_t i = 0; i < reads.size(); ++i) { 182 | const double l1 = log_likelihood(reads[i], a1); 183 | const double l2 = log_likelihood(reads[i], a2); 184 | indicators[i] = exp(l1 - log(exp(l1) + exp(l2))); 185 | } 186 | 187 | return expectation_maximization(max_itr, reads, mixing, 188 | indicators, a1, a2); 189 | } 190 | 191 | 192 | double 193 | 
fit_single_epiallele(const vector &reads, vector &a) { 194 | assert(reads.size() > 0); 195 | vector indicators(reads.size(), 1.0); 196 | fit_epiallele(PSEUDOCOUNT, reads, indicators, a); 197 | 198 | double score = 0.0; 199 | for (size_t i = 0; i < reads.size(); ++i) { 200 | score += log_likelihood(reads[i], a); 201 | assert(isfinite(score)); 202 | } 203 | return score; 204 | } 205 | 206 | 207 | void 208 | compute_model_likelihoods(double &single_score, double &pair_score, 209 | const size_t &max_itr, const double &low_prob, 210 | const double &high_prob, const size_t &n_cpgs, 211 | const vector &reads) { 212 | 213 | static const double mixing = 0.5; 214 | 215 | // try a single epi-allele and compute its log likelihood 216 | vector a0(n_cpgs, 0.5); 217 | single_score = fit_single_epiallele(reads, a0); 218 | 219 | // initialize the pair epi-alleles and indicators, and do the actual 220 | // computation to infer alleles, compute its log likelihood 221 | vector a1(n_cpgs, low_prob), a2(n_cpgs, high_prob), indicators; 222 | resolve_epialleles(max_itr, reads, mixing, indicators, a1, a2); 223 | pair_score = log_likelihood(reads, mixing, a1, a2); 224 | 225 | } 226 | 227 | 228 | double 229 | test_asm_lrt(const size_t max_itr, const double low_prob, 230 | const double high_prob, vector reads) { 231 | double single_score = std::numeric_limits::min(); 232 | double pair_score = std::numeric_limits::min(); 233 | adjust_read_offsets(reads); 234 | const size_t n_cpgs = get_n_cpgs(reads); 235 | 236 | compute_model_likelihoods(single_score, pair_score, max_itr, low_prob, 237 | high_prob, n_cpgs, reads); 238 | 239 | // degrees of freedom = 2*n_cpgs for two-allele model 240 | // minus n_cpgs for one-allele model 241 | const size_t df = n_cpgs; 242 | 243 | const double llr_stat = -2*(single_score - pair_score); 244 | const double p_value = 1.0 - gsl_cdf_chisq_P(llr_stat, df); 245 | return p_value; 246 | } 247 | 248 | 249 | double 250 | test_asm_bic(const size_t max_itr, const double 
low_prob, 251 | const double high_prob, vector reads) { 252 | 253 | double single_score = std::numeric_limits::min(); 254 | double pair_score = std::numeric_limits::min(); 255 | adjust_read_offsets(reads); 256 | const size_t n_cpgs = get_n_cpgs(reads); 257 | 258 | compute_model_likelihoods(single_score, pair_score, max_itr, low_prob, 259 | high_prob, n_cpgs, reads); 260 | 261 | // compute bic scores and compare 262 | const double bic_single = n_cpgs*log(reads.size()) - 2*single_score; 263 | const double bic_pair = 2*n_cpgs*log(reads.size()) - 2*pair_score; 264 | return bic_pair - bic_single; 265 | } 266 | -------------------------------------------------------------------------------- /src/common/EpireadStats.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2011 University of Southern California and 2 | * Andrew D. Smith and Fang Fang 3 | * 4 | * Authors: Fang Fang and Andrew D. Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | */ 19 | 20 | #ifndef EPIREAD_STATS 21 | #define EPIREAD_STATS 22 | 23 | #include "Epiread.hpp" 24 | #include 25 | 26 | //////////////////////////////////////////////////////////////////////// 27 | //////////////////////////////////////////////////////////////////////// 28 | ////// 29 | ////// FUNCTIONS FOR A SINGLE EPITYPE 30 | ////// 31 | 32 | double 33 | log_likelihood(const epiread &r, const std::vector &a); 34 | void 35 | fit_epiallele(const std::vector &reads, 36 | const std::vector &indicators, std::vector &a); 37 | double 38 | fit_single_epiallele(const std::vector &reads, std::vector &a); 39 | 40 | //////////////////////////////////////////////////////////////////////// 41 | //////////////////////////////////////////////////////////////////////// 42 | ////// 43 | ////// FUNCTIONS FOR TWO EPITYPES 44 | ////// 45 | 46 | double 47 | log_likelihood(const epiread &r, const double z, 48 | const std::vector &a1, const std::vector &a2); 49 | double 50 | log_likelihood(const epiread &r, const std::vector &a1, 51 | const std::vector &a2); 52 | double 53 | log_likelihood(const std::vector &reads, const std::vector &indicators, 54 | const std::vector &a1, const std::vector &a2); 55 | 56 | double 57 | resolve_epialleles(const size_t max_itr, 58 | const std::vector &reads, 59 | std::vector &indicators, 60 | std::vector &a1, std::vector &a2); 61 | 62 | double 63 | test_asm_lrt(const size_t max_itr, const double low_prob, 64 | const double high_prob, std::vector reads); 65 | 66 | double 67 | test_asm_bic(const size_t max_itr, const double low_prob, 68 | const double high_prob, std::vector reads); 69 | 70 | 71 | class EpireadStats { 72 | public: 73 | EpireadStats(const double lp, 74 | const double hp, 75 | const double cv, 76 | const size_t mi, 77 | const bool UB) : 78 | low_prob(lp), high_prob(hp), 79 | critical_value(cv), max_itr(mi), 80 | USE_BIC(UB) {} 81 | 82 | double 83 | test_asm(const std::vector &reads, bool &is_significant) const { 84 | const double score = 
(USE_BIC) ? 85 | test_asm_bic(max_itr, low_prob, high_prob, reads) : 86 | test_asm_lrt(max_itr, low_prob, high_prob, reads); 87 | is_significant = (score < critical_value || (USE_BIC && score < 0.0)); 88 | return score; 89 | } 90 | 91 | private: 92 | double low_prob; 93 | double high_prob; 94 | double critical_value; 95 | size_t max_itr; 96 | bool USE_BIC; 97 | }; 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /src/common/LevelsCounter.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2018 Andrew D. Smith 2 | * 3 | * Authors: Andrew D. Smith 4 | * 5 | * This is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This software is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 
14 | */ 15 | 16 | #include "LevelsCounter.hpp" 17 | #include "bsutils.hpp" 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | using std::string; 24 | using std::to_string; 25 | using std::runtime_error; 26 | 27 | void 28 | LevelsCounter::update(const MSite &s) { 29 | if (s.is_mutated()) { 30 | ++mutations; 31 | } 32 | else if (s.n_reads > 0) { 33 | ++sites_covered; 34 | max_depth = std::max(max_depth, s.n_reads); 35 | total_c += s.n_meth(); 36 | total_t += s.n_reads - s.n_meth(); 37 | mean_agg += s.meth; 38 | double lower = 0.0, upper = 0.0; 39 | wilson_ci_for_binomial(alpha, s.n_reads, s.meth, lower, upper); 40 | called_meth += (lower > 0.5); 41 | called_unmeth += (upper < 0.5); 42 | } 43 | ++total_sites; 44 | } 45 | 46 | string 47 | LevelsCounter::tostring() const { 48 | static const string indent = string(2, ' '); 49 | const bool good = (sites_covered != 0); 50 | std::ostringstream oss; 51 | // directly counted values 52 | oss << context + ":\n" 53 | << indent << "total_sites: " << total_sites << '\n' 54 | << indent << "sites_covered: " << sites_covered << '\n' 55 | << indent << "total_c: " << total_c << '\n' 56 | << indent << "total_t: " << total_t << '\n' 57 | << indent << "max_depth: " << max_depth << '\n' 58 | << indent << "mutations: " << mutations << '\n' 59 | << indent << "called_meth: " << called_meth << '\n' 60 | << indent << "called_unmeth: " << called_unmeth << '\n' 61 | << indent << "mean_agg: " << mean_agg << '\n'; 62 | 63 | // derived values 64 | oss << indent << "coverage: " << coverage() << '\n' 65 | << indent << "sites_covered_fraction: " 66 | << static_cast(sites_covered)/total_sites << '\n' 67 | << indent << "mean_depth: " 68 | << static_cast(coverage())/total_sites << '\n' 69 | << indent << "mean_depth_covered: " 70 | << static_cast(coverage())/sites_covered << '\n' 71 | << indent << "mean_meth: " 72 | << (good ? to_string(mean_meth()) : "NA") << '\n' 73 | << indent << "mean_meth_weighted: " 74 | << (good ? 
to_string(mean_meth_weighted()) : "NA") << '\n' 75 | << indent << "fractional_meth: " 76 | << (good ? to_string(fractional_meth()) : "NA"); 77 | return oss.str(); 78 | } 79 | 80 | double LevelsCounter::alpha = 0.95; 81 | 82 | std::ostream & 83 | operator<<(std::ostream &out, const LevelsCounter &cs) { 84 | return out << cs.tostring(); 85 | } 86 | 87 | static void 88 | check_label(const string &observed, const string expected) { 89 | if (observed != expected) 90 | throw runtime_error("bad levels format [" + observed + "," + expected + "]"); 91 | } 92 | 93 | std::istream & 94 | operator>>(std::istream &in, LevelsCounter &cs) { 95 | in >> cs.context; // get the context 96 | cs.context = cs.context.substr(0, cs.context.find_first_of(":")); 97 | 98 | string label; 99 | in >> label >> cs.total_sites; // the total sites 100 | check_label(label, "total_sites:"); 101 | 102 | in >> label >> cs.sites_covered; // the sites covered 103 | check_label(label, "sites_covered:"); 104 | 105 | in >> label >> cs.total_c; // the total c 106 | check_label(label, "total_c:"); 107 | 108 | in >> label >> cs.total_t; // the total t 109 | check_label(label, "total_t:"); 110 | 111 | in >> label >> cs.max_depth; // the max depth 112 | check_label(label, "max_depth:"); 113 | 114 | in >> label >> cs.mutations; // the number of mutations 115 | check_label(label, "mutations:"); 116 | 117 | in >> label >> cs.called_meth; // the number of sites called methylated 118 | check_label(label, "called_meth:"); 119 | 120 | in >> label >> cs.called_unmeth; // the number of sites called unmethylated 121 | check_label(label, "called_unmeth:"); 122 | 123 | in >> label >> cs.mean_agg; // the mean aggregate 124 | check_label(label, "mean_agg:"); 125 | 126 | return in; 127 | } 128 | -------------------------------------------------------------------------------- /src/common/LevelsCounter.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2018 Andrew D. 
Smith 2 | * 3 | * Authors: Andrew D. Smith 4 | * 5 | * This is free software: you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This software is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | */ 15 | 16 | #ifndef LEVELS_COUNTER_HPP 17 | #define LEVELS_COUNTER_HPP 18 | 19 | #include "MethpipeSite.hpp" 20 | 21 | #include 22 | #include 23 | 24 | struct LevelsCounter { 25 | std::string context; 26 | size_t total_sites; 27 | size_t sites_covered; 28 | size_t max_depth; 29 | size_t mutations; 30 | size_t total_c, total_t; 31 | size_t called_meth, called_unmeth; 32 | double mean_agg; 33 | LevelsCounter(const std::string &c) : 34 | context(c), total_sites(0), sites_covered(0), max_depth(0), 35 | mutations(0), total_c(0), total_t(0), 36 | called_meth(0), called_unmeth(0), 37 | mean_agg(0.0) {} 38 | 39 | LevelsCounter() : 40 | total_sites(0), sites_covered(0), max_depth(0), 41 | mutations(0), total_c(0), total_t(0), 42 | called_meth(0), called_unmeth(0), 43 | mean_agg(0.0) {} 44 | 45 | void update(const MSite &s); 46 | 47 | size_t coverage() const {return total_c + total_t;} 48 | size_t total_called() const {return called_meth + called_unmeth;} 49 | 50 | double mean_meth_weighted() const { 51 | return static_cast(total_c)/coverage(); 52 | } 53 | double fractional_meth() const { 54 | return static_cast(called_meth)/total_called(); 55 | } 56 | double mean_meth() const { 57 | return mean_agg/sites_covered; 58 | } 59 | 60 | std::string tostring() const; 61 | 62 | static double alpha; 63 | }; 64 | 65 | std::ostream & 66 | operator<<(std::ostream &out, const LevelsCounter &cs); 67 | 68 | std::istream & 
69 | operator>>(std::istream &in, LevelsCounter &cs); 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /src/common/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2011 University of Southern California and 2 | # Andrew D. Smith 3 | # 4 | # Authors: Andrew D. Smith 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | # 19 | 20 | ifndef SMITHLAB_CPP 21 | $(error SMITHLAB_CPP variable undefined) 22 | endif 23 | 24 | CXX = g++ 25 | CXXFLAGS = -Wall -std=c++11 26 | OPTFLAGS = -O3 27 | DEBUGFLAGS = -g 28 | 29 | INCLUDEDIRS = $(SMITHLAB_CPP)/ 30 | INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS)) 31 | override CPPFLAGS += $(INCLUDEARGS) 32 | 33 | ifdef DEBUG 34 | CXXFLAGS += $(DEBUGFLAGS) 35 | else 36 | CXXFLAGS += $(OPTFLAGS) 37 | endif 38 | 39 | %.o: %.cpp %.hpp 40 | $(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS) 41 | 42 | clean: 43 | @-rm -f *.o *~ 44 | .PHONY: clean 45 | -------------------------------------------------------------------------------- /src/common/MethpipeFiles.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2012 University of Southern California 3 | Authors: Andrew D. 
Smith, Song Qiang, Benjamin Decato 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with This program; if not, write to the Free Software 17 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | */ 19 | 20 | #ifndef METHPIPE_FILES_HPP 21 | #define METHPIPE_FILES_HPP 22 | 23 | #include 24 | #include 25 | #include 26 | #include "GenomicRegion.hpp" 27 | 28 | namespace methpipe { 29 | enum FILETYPE {OLD, NEW}; 30 | 31 | std::string 32 | skip_header(std::istream &in); 33 | 34 | void 35 | load_cpgs(const std::string &cpgs_file, 36 | std::vector &cpgs, 37 | std::vector > &meths, 38 | std::vector &reads); 39 | 40 | void 41 | load_cpgs(const std::string &cpgs_file, 42 | std::vector &cpgs, 43 | std::vector > &meths, 44 | std::vector &reads); 45 | 46 | void 47 | load_cpgs_old(const std::string &cpgs_file, 48 | std::vector &cpgs, 49 | std::vector > &meths, 50 | std::vector &reads); 51 | 52 | void 53 | load_cpgs_old(const std::string &cpgs_file, 54 | std::vector &cpgs, 55 | std::vector > &meths, 56 | std::vector &reads); 57 | 58 | std::istream& 59 | read_site(std::istream &in, std::string &chrom, size_t &pos, 60 | std::string &strand, std::string &seq, 61 | double &meth, size_t &coverage); 62 | 63 | std::istream& 64 | read_site(std::istream &in, std::string &chrom, size_t &pos, 65 | char &strand, std::string &seq, 66 | double &meth, size_t &coverage); 67 | 68 | std::istream& 69 | read_site(std::istream &in, 
std::string &chrom, size_t &pos, 70 | std::string &strand, std::string &seq, 71 | double &meth, size_t &coverage, bool &is_array_data); 72 | 73 | std::ostream& 74 | write_site(std::ostream &out, const std::string &chrom, const size_t &pos, 75 | const std::string &strand, const std::string &seq, 76 | const double &meth, const size_t &coverage); 77 | 78 | // re-locate the file handler point to the first line 79 | // that are at or behind location chrom, pos 80 | void 81 | seek_site(std::istream &in, const std::string &chrom, 82 | const size_t pos); 83 | 84 | bool 85 | is_methpipe_file_single(const std::string &file); 86 | 87 | bool 88 | is_methpipe_file_array(const std::string &file); 89 | 90 | // files to support old format 91 | std::istream& 92 | read_site_old(std::istream &in, std::string &chrom, size_t &pos, 93 | std::string &strand, std::string &seq, 94 | double &meth, size_t &coverage); 95 | 96 | std::ostream & 97 | write_site_old(std::ostream &out, const std::string &chrom, 98 | const size_t &pos, const std::string &strand, 99 | const std::string &seq, const double &meth, 100 | const size_t &coverage); 101 | 102 | // functions for methdiff results I/O 103 | std::ostream & 104 | write_methdiff_site(std::ostream &out, const std::string &chrom, 105 | const size_t pos, const std::string &strand, 106 | const std::string &seq, const double diffscore, 107 | const size_t meth_a, const size_t unmeth_a, 108 | const size_t meth_b, const size_t unmeth_b); 109 | 110 | std::istream & 111 | read_methdiff_site(std::istream &in, std::string &chrom, 112 | size_t &pos, std::string &strand, 113 | std::string &seq, double &diffscore, 114 | size_t &meth_a, size_t &unmeth_a, 115 | size_t &meth_b, size_t &unmeth_b); 116 | } 117 | #endif 118 | -------------------------------------------------------------------------------- /src/common/MethpipeSite.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2015 University of Southern 
California 3 | Authors: Andrew D. Smith 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with This program; if not, write to the Free Software 17 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | */ 19 | 20 | #include "MethpipeSite.hpp" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "smithlab_utils.hpp" 28 | 29 | using std::string; 30 | using std::runtime_error; 31 | 32 | MSite::MSite(const string &line) { 33 | /* GS: this is faster but seems to be genenerating issues when 34 | * compiled with clang 35 | std::istringstream iss; 36 | iss.rdbuf()->pubsetbuf(const_cast(line.c_str()), line.length()); */ 37 | std::istringstream iss(line); 38 | string strand_tmp; 39 | if (!(iss >> chrom >> pos >> strand_tmp >> context >> meth >> n_reads)) 40 | throw std::runtime_error("bad methpipe site line: \"" + line + "\""); 41 | strand = strand_tmp[0]; 42 | if (strand != '-' && strand != '+') 43 | throw std::runtime_error("bad methpipe site line: \"" + line + "\""); 44 | } 45 | 46 | 47 | string 48 | MSite::tostring() const { 49 | std::ostringstream oss; 50 | oss << chrom << '\t' 51 | << pos << '\t' 52 | << strand << '\t' 53 | << context << '\t' 54 | << meth << '\t' 55 | << n_reads; 56 | return oss.str(); 57 | } 58 | 59 | 60 | size_t 61 | distance(const MSite &a, const MSite &b) { 62 | return a.chrom == b.chrom ? 
std::max(a.pos, b.pos) - std::min(a.pos, b.pos) : 63 | std::numeric_limits::max(); 64 | } 65 | -------------------------------------------------------------------------------- /src/common/MethpipeSite.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2015 University of Southern California 3 | Authors: Andrew D. Smith 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with This program; if not, write to the Free Software 17 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | */ 19 | 20 | #ifndef METHPIPE_SITE_HPP 21 | #define METHPIPE_SITE_HPP 22 | 23 | #include 24 | #include 25 | 26 | struct MSite { 27 | 28 | MSite() {} 29 | MSite(const std::string &_chrom, 30 | const size_t _pos, 31 | const char _strand, 32 | const std::string &_context, 33 | const double _meth, 34 | const size_t _n_reads) : 35 | chrom(_chrom), pos(_pos), strand(_strand), 36 | context(_context), meth(_meth), n_reads(_n_reads) {} 37 | explicit MSite(const std::string &line); 38 | 39 | std::string chrom; 40 | size_t pos; 41 | char strand; 42 | std::string context; 43 | double meth; 44 | size_t n_reads; 45 | 46 | bool operator<(const MSite &other) const { 47 | int r = chrom.compare(other.chrom); 48 | return (r < 0 || 49 | (r == 0 && 50 | (pos < other.pos || 51 | (pos == other.pos && strand < other.strand)))); 52 | } 53 | 54 | size_t n_meth() const {return 
std::round(meth*n_reads);} 55 | size_t n_unmeth() const {return n_reads - n_meth();} 56 | 57 | ////////////////////////////////////////////////////////////// 58 | /// FUNCTIONS BELOW ARE FOR MANIPULATING SYMMETRIC CPG SITES 59 | ////////////////////////////////////////////////////////////// 60 | void add(const MSite &other) { 61 | if (!is_mutated() && other.is_mutated()) 62 | context += 'x'; 63 | // ADS: order matters below as n_reads update invalidates n_meth() 64 | // function until meth has been updated 65 | const size_t total_c_reads = n_meth() + other.n_meth(); 66 | n_reads += other.n_reads; 67 | meth = static_cast(total_c_reads)/std::max(1ul, n_reads); 68 | } 69 | 70 | // ADS: function below has redundant check for is_cpg, which is 71 | // expensive and might be ok to remove 72 | bool is_mate_of(const MSite &first) { 73 | return (first.pos + 1 == pos && first.is_cpg() && is_cpg() && 74 | first.strand == '+' && strand == '-'); 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////// 78 | ///// Functions below test the type of site. These are CpG, CHH, and 79 | ///// CHG divided into two kinds: CCG and CXG, the former including a 80 | ///// CpG within. Also included is a function that tests if a site 81 | ///// has a mutation. 
82 | //////////////////////////////////////////////////////////////////////// 83 | bool is_cpg() const { 84 | return context.length() >= 3 && 85 | (context[0] == 'C' && context[1] == 'p' && context[2] == 'G'); 86 | } 87 | bool is_chh() const { 88 | return context.length() >= 3 && 89 | (context[0] == 'C' && context[1] == 'H' && context[2] == 'H'); 90 | } 91 | bool is_ccg() const { 92 | return context.length() >= 3 && 93 | (context[0] == 'C' && context[1] == 'C' && context[2] == 'G'); 94 | } 95 | bool is_cxg() const { 96 | return context.length() >= 3 && 97 | (context[0] == 'C' && context[1] == 'X' && context[2] == 'G'); 98 | } 99 | bool is_mutated() const { 100 | return context.length() == 4 && context[3] == 'x'; 101 | } 102 | 103 | void set_mutated() { 104 | if (!is_mutated()) 105 | context += 'x'; 106 | } 107 | void set_unmutated() { 108 | if (is_mutated()) 109 | context.resize(context.length() - 1); 110 | } 111 | 112 | std::string tostring() const; 113 | }; 114 | 115 | template T & 116 | operator>>(T &in, MSite &s) { 117 | std::string line; 118 | if (getline(in, line)) 119 | s = MSite(line); 120 | return in; 121 | } 122 | 123 | template T & 124 | operator<<(T &out, const MSite &s) { 125 | out << s.tostring(); // seems to be an issue returning this directly 126 | return out; 127 | } 128 | 129 | size_t 130 | distance(const MSite &a, const MSite &b); 131 | 132 | #endif 133 | -------------------------------------------------------------------------------- /src/common/Smoothing.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2008 Cold Spring Harbor Laboratory 3 | Authors: Andrew D. Smith 4 | 5 | This file is part of rmap. 6 | 7 | rmap is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 
11 | 12 | rmap is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with rmap; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #include "Smoothing.hpp" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "smithlab_utils.hpp" 32 | 33 | using std::vector; 34 | using std::transform; 35 | using std::divides; 36 | using std::runtime_error; 37 | 38 | static double 39 | Epanechnikov_kernel(double i, double j, double bandwidth) { 40 | const double u = (j - i)/bandwidth; 41 | return 0.75*(1.0 - u*u); 42 | } 43 | 44 | void 45 | KernelSmoothing(const double bandwidth, 46 | const vector &x_vals, 47 | const vector &y_vals, 48 | const vector &x_target, 49 | vector &y_target) { 50 | assert(x_vals.size() == y_vals.size()); 51 | 52 | // allocate the space for the new y vals 53 | y_target.resize(x_target.size(), 0); 54 | 55 | // set the index into the current point to use for smoothing 56 | size_t x_start = 0; 57 | size_t x_end = 0; 58 | 59 | // iterate over the x target vals 60 | for (size_t i = 0; i < x_target.size(); ++i) { 61 | 62 | // calculate the x starting point 63 | while (x_start < x_vals.size() && x_vals[x_start] < x_target[i] - bandwidth) 64 | ++x_start; 65 | 66 | // calculate the x ending point 67 | while (x_end < x_vals.size() && x_vals[x_end] < x_target[i] + bandwidth) 68 | ++x_end; 69 | 70 | if (x_start >= x_end) 71 | throw runtime_error("smoothing using an interval of size 0"); 72 | 73 | // set the number of points used for smoothing current value 74 | const size_t lim = x_end - x_start; 75 | 76 | // calculate the weights 77 | vector weights(lim); 78 | for (size_t j 
= 0; j < lim; ++j) 79 | weights[j] = Epanechnikov_kernel(x_target[i], x_vals[x_start+j], bandwidth); 80 | const double weight_sum = accumulate(weights.begin(), weights.end(), 0.0); 81 | transform(weights.begin(), weights.end(), weights.begin(), 82 | [weight_sum] (const double w) {return w / weight_sum;}); 83 | 84 | // apply the weights 85 | y_target[i] = 0; 86 | for (size_t j = 0; j < lim; ++j) 87 | y_target[i] += y_vals[x_start + j]*weights[j]; 88 | } 89 | } 90 | 91 | 92 | 93 | void 94 | KernelSmoothing(const double bandwidth, const vector &y_vals, 95 | vector &y_target) { 96 | 97 | // allocate the space for the new y vals 98 | y_target.resize(y_vals.size(), 0); 99 | 100 | // set the index into the current point to use for smoothing 101 | size_t x_start = 0; 102 | size_t x_end = 0; 103 | 104 | // iterate over the x target vals 105 | for (size_t i = 0; i < y_vals.size(); ++i) { 106 | 107 | // calculate the x starting point 108 | while (x_start < y_vals.size() && x_start < i - bandwidth) 109 | ++x_start; 110 | 111 | // calculate the x ending point 112 | while (x_end < y_vals.size() && x_end < i + bandwidth) 113 | ++x_end; 114 | 115 | if (x_start >= x_end) 116 | throw runtime_error("smoothing using an interval of size 0"); 117 | 118 | // set the number of points used for smoothing current value 119 | const size_t lim = x_end - x_start; 120 | 121 | // calculate the weights 122 | vector weights(lim); 123 | for (size_t j = 0; j < lim; ++j) 124 | weights[j] = Epanechnikov_kernel(i, x_start + j, bandwidth); 125 | 126 | const double weight_sum = accumulate(weights.begin(), weights.end(), 0.0); 127 | transform(weights.begin(), weights.end(), weights.begin(), 128 | [weight_sum] (const double w) {return w / weight_sum;}); 129 | 130 | // apply the weights 131 | y_target[i] = 0; 132 | for (size_t j = 0; j < lim; ++j) 133 | y_target[i] += y_vals[x_start + j]*weights[j]; 134 | } 135 | } 136 | 137 | #include 138 | 139 | void 140 | LocalLinearRegression(const double bandwidth, 141 
| const vector &x_vals, 142 | const vector &y_vals, 143 | const vector &x_target, 144 | vector &y_target) { 145 | 146 | // Make sure the x and y vectors are of the same length 147 | assert(x_vals.size() == y_vals.size()); 148 | 149 | // allocate the space for the new y vals 150 | y_target.resize(x_target.size(), 0); 151 | 152 | // set the index into the current point to use for smoothing 153 | size_t x_start = 0, x_end = 0; 154 | 155 | // iterate over the x target vals 156 | for (size_t i = 0; i < x_target.size(); ++i) { 157 | 158 | // calculate the x starting point 159 | while (x_start < x_vals.size() && x_vals[x_start] < x_target[i] - bandwidth) 160 | ++x_start; 161 | 162 | // calculate the x ending point 163 | while (x_end < x_vals.size() && x_vals[x_end] < x_target[i] + bandwidth) 164 | ++x_end; 165 | 166 | if (x_start >= x_end) 167 | throw runtime_error("smoothing using an interval of size 0"); 168 | 169 | // set the number of points used for smoothing current value 170 | const size_t lim = x_end - x_start; 171 | 172 | // calculate the weights 173 | vector weights(lim); 174 | for (size_t j = 0; j < lim; ++j) 175 | weights[j] = Epanechnikov_kernel(x_target[i], x_vals[x_start+j], bandwidth); 176 | const double weight_sum = accumulate(weights.begin(), weights.end(), 0.0); 177 | transform(weights.begin(), weights.end(), weights.begin(), 178 | [weight_sum] (const double w) {return w / weight_sum;}); 179 | 180 | double intercept = 0, slope = 0; 181 | double c00 = 0, c10 = 0, c11 = 0; 182 | double ssq = 0; 183 | gsl_fit_wlinear(&x_vals[x_start], 1, &weights[0], 1, &y_vals[x_start], 1, lim, 184 | &intercept, &slope, &c00, &c10, &c11, &ssq); 185 | y_target[i] = intercept + slope*x_target[i]; 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/common/Smoothing.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2008 Cold Spring Harbor Laboratory 3 | 
  Authors: Andrew D. Smith

  This file is part of rmap.

  rmap is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  rmap is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with rmap; if not, write to the Free Software
  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#ifndef SMOOTHING_HPP
#define SMOOTHING_HPP

#include <vector>

// Kernel smoothing of (x_values, y_values) with an Epanechnikov kernel,
// evaluated at each x_target; y_target is resized to match x_target.
// Throws std::runtime_error when a target has no points within one
// bandwidth.
void
KernelSmoothing(const double bandwidth,
                const std::vector<double> &x_values,
                const std::vector<double> &y_values,
                const std::vector<double> &x_target,
                std::vector<double> &y_target);

// Same interface as the function above, but fits a weighted straight
// line to the points within one bandwidth of each target and evaluates
// that line at the target.
void
LocalLinearRegression(const double bandwidth,
                      const std::vector<double> &x_values,
                      const std::vector<double> &y_values,
                      const std::vector<double> &x_target,
                      std::vector<double> &y_target);


// Kernel smoothing where the position of each value is its index in
// y_vals; y_target is resized to match y_vals.
void
KernelSmoothing(const double bandwidth,
                const std::vector<double> &y_vals,
                std::vector<double> &y_target);


#endif

--------------------------------------------------------------------------------
/src/common/ThreeStateHMM.hpp:
--------------------------------------------------------------------------------
/*
  Copyright (C) 2011 University of Southern California
  Authors: Andrew D. Smith, Song Qiang

  This file is part of rmap.

  rmap is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
11 | 12 | rmap is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with rmap; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #ifndef THREE_STATE_HMM_HPP 23 | #define THREE_STATE_HMM_HPP 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "smithlab_utils.hpp" 31 | #include "Distro.hpp" 32 | #include "BetaBin.hpp" 33 | 34 | enum STATE_LABELS {hypo, HYPER, HYPO}; 35 | 36 | struct Triplet {double hypo, HYPER, HYPO;}; 37 | 38 | class ThreeStateHMM { 39 | public: 40 | 41 | ThreeStateHMM(const std::vector > &_observations, 42 | const std::vector &_reset_points, 43 | const double tol, const size_t max_itr, const bool v); 44 | 45 | void 46 | set_parameters(const betabin & _hypo_emission, 47 | const betabin & _HYPER_emission, 48 | const betabin & _HYPO_emission, 49 | const std::vector > &_trans); 50 | 51 | void 52 | get_parameters(betabin & _hypo_emission, 53 | betabin & _HYPER_emission, 54 | betabin & _HYPO_emission, 55 | std::vector > &_trans) const; 56 | 57 | double 58 | BaumWelchTraining(); 59 | 60 | double 61 | PosteriorDecoding(); 62 | 63 | double 64 | ViterbiDecoding(); 65 | 66 | void 67 | get_state_posteriors(std::vector &scores) const; 68 | 69 | void 70 | get_classes(std::vector &classes) const; 71 | 72 | private: 73 | 74 | //////////// methods //////////// 75 | double 76 | single_iteration(); 77 | double 78 | forward_algorithm(const size_t start, const size_t end); 79 | double 80 | backward_algorithm(const size_t start, const size_t end); 81 | 82 | double 83 | hypo_segment_log_likelihood(const size_t start, const size_t end); 84 | 85 | double 86 | HYPER_segment_log_likelihood(const size_t 
start, const size_t end); 87 | 88 | double 89 | HYPO_segment_log_likelihood(const size_t start, const size_t end); 90 | 91 | void 92 | estimate_state_posterior(const size_t start, const size_t end); 93 | void 94 | estimate_posterior_trans_prob(const size_t start, const size_t end); 95 | void 96 | estimate_parameters(); 97 | void 98 | update_observation_likelihood(); 99 | 100 | double 101 | ViterbiDecoding(const size_t start, const size_t end); 102 | 103 | //////// data //////// 104 | std::vector > observations; 105 | std::vector reset_points; 106 | std::vector meth_lp, unmeth_lp; 107 | std::vector hypo_log_likelihood, HYPER_log_likelihood, HYPO_log_likelihood; 108 | 109 | // HMM internal data 110 | betabin hypo_emission, HYPER_emission, HYPO_emission; 111 | 112 | Triplet lp_start, lp_end; 113 | std::vector > trans; 114 | 115 | std::vector forward; 116 | std::vector backward; 117 | std::vector hypo_posteriors, HYPER_posteriors, HYPO_posteriors; 118 | std::vector hypo_hypo, hypo_HYPER, 119 | HYPER_hypo, HYPER_HYPER, HYPER_HYPO, 120 | HYPO_HYPER, HYPO_HYPO; 121 | 122 | // result 123 | std::vector classes; 124 | std::vector state_posteriors; 125 | 126 | // parameters 127 | double tolerance; 128 | size_t max_iterations; 129 | bool VERBOSE; 130 | }; 131 | 132 | #endif 133 | -------------------------------------------------------------------------------- /src/common/TwoStateHMM.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2019 Andrew D. Smith 3 | Author: Andrew D. Smith 4 | 5 | This is free software; you can redistribute it and/or modify it 6 | under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 
struct betabin;

/**
 * Interface of a two-state hidden Markov model whose states are
 * referred to as foreground ("fg"/"f") and background ("bg"/"b").
 *
 * Only declarations are given here; the member functions are defined
 * in the implementation file. The public overloads take the two
 * per-state distributions as raw (alpha, beta) parameters, while the
 * private overloads of the same names take `betabin` objects instead.
 *
 * NOTE(review): the private overloads look like the back ends of the
 * public ones, and `values` presumably holds per-site
 * (methylated, unmethylated) observations with `reset_points` marking
 * boundaries of independent stretches -- confirm in the definitions.
 */
class TwoStateHMM {
public:

  /// Stores its three arguments in `tolerance`, `max_iterations` and
  /// `VERBOSE` respectively; does no other work.
  TwoStateHMM(const double tol, const size_t max_itr, const bool v) :
    tolerance(tol), max_iterations(max_itr), VERBOSE(v) {}

  /// Decoding with fixed parameters; writes one entry per position to
  /// `ml_classes`.
  double
  ViterbiDecoding(const std::vector<std::pair<double, double> > &values,
                  const std::vector<size_t> &reset_points,
                  const double f_to_b_trans, const double b_to_f_trans,
                  const double fg_alpha, const double fg_beta,
                  const double bg_alpha, const double bg_beta,
                  std::vector<bool> &ml_classes) const;


  /// Training: transition and emission parameters are in/out
  /// (non-const references).
  double
  BaumWelchTraining(const std::vector<std::pair<double, double> > &values,
                    const std::vector<size_t> &reset_points,
                    double &f_to_b_trans, double &b_to_f_trans,
                    double &fg_alpha, double &fg_beta,
                    double &bg_alpha, double &bg_beta) const;

  /// Decoding with fixed parameters; outputs a class label and a score
  /// per position.
  double
  PosteriorDecoding(const std::vector<std::pair<double, double> > &values,
                    const std::vector<size_t> &reset_points,
                    const double f_to_b_trans, const double b_to_f_trans,
                    const double fg_alpha, const double fg_beta,
                    const double bg_alpha, const double bg_beta,
                    std::vector<bool> &classes,
                    std::vector<double> &llr_scores) const;

  /// Per-position scores for the state selected by `class_id`.
  void
  PosteriorScores(const std::vector<std::pair<double, double> > &values,
                  const std::vector<size_t> &reset_points,
                  const double f_to_b_trans, const double b_to_f_trans,
                  const double fg_alpha, const double fg_beta,
                  const double bg_alpha, const double bg_beta,
                  const bool class_id,
                  std::vector<double> &llr_scores) const;

  /// Per-position scores for the transition selected by `transition`.
  /// NOTE(review): the encoding of `transition` is not visible here.
  void
  TransitionPosteriors(const std::vector<std::pair<double, double> > &values,
                       const std::vector<size_t> &reset_points,
                       const double f_to_b_trans, const double b_to_f_trans,
                       const double fg_alpha, const double fg_beta,
                       const double bg_alpha, const double bg_beta,
                       const size_t transition,
                       std::vector<double> &scores) const;

  // FOR MULTIPLE REPLICATES
  // Same operations as above, with one observation vector and one set
  // of (alpha, beta) parameters per replicate.
  double
  BaumWelchTraining(const std::vector<std::vector<std::pair<double, double> > > &values,
                    const std::vector<size_t> &reset_points,
                    double &f_to_b_trans, double &b_to_f_trans,
                    std::vector<double> &fg_alpha,
                    std::vector<double> &fg_beta,
                    std::vector<double> &bg_alpha,
                    std::vector<double> &bg_beta) const;

  double
  PosteriorDecoding(const std::vector<std::vector<std::pair<double, double> > > &values,
                    const std::vector<size_t> &reset_points,
                    const double f_to_b_trans, const double b_to_f_trans,
                    const std::vector<double> &fg_alpha,
                    const std::vector<double> &fg_beta,
                    const std::vector<double> &bg_alpha,
                    const std::vector<double> &bg_beta,
                    std::vector<bool> &classes,
                    std::vector<double> &llr_scores) const;

  // `fg_class` is taken by const reference here (unlike `class_id`
  // above); left as is so the out-of-line definition keeps matching.
  void
  PosteriorScores(const std::vector<std::vector<std::pair<double, double> > > &values,
                  const std::vector<size_t> &reset_points,
                  const double f_to_b_trans, const double b_to_f_trans,
                  const std::vector<double> &fg_alpha,
                  const std::vector<double> &fg_beta,
                  const std::vector<double> &bg_alpha,
                  const std::vector<double> &bg_beta,
                  const bool &fg_class,
                  std::vector<double> &llr_scores) const;


private:

  // Overloads expressed in terms of `betabin` distributions and the
  // two transition probabilities p_fb / p_bf.

  double
  ViterbiDecoding(const std::vector<std::pair<double, double> > &values,
                  const std::vector<size_t> &reset_points,
                  const double p_fb, const double p_bf,
                  const betabin &fg_distro, const betabin &bg_distro,
                  std::vector<bool> &ml_classes) const;

  double
  BaumWelchTraining(const std::vector<std::pair<double, double> > &values,
                    const std::vector<size_t> &reset_points,
                    double &p_fb, double &p_bf,
                    betabin &fg_distro, betabin &bg_distro) const;

  double
  PosteriorDecoding(const std::vector<std::pair<double, double> > &values,
                    const std::vector<size_t> &reset_points,
                    const double p_fb, const double p_bf,
                    const betabin &fg_distro,
                    const betabin &bg_distro,
                    std::vector<bool> &classes,
                    std::vector<double> &llr_scores) const;

  void
  PosteriorScores(const std::vector<std::pair<double, double> > &values,
                  const std::vector<size_t> &reset_points,
                  const double p_fb, const double p_bf,
                  const betabin &fg_distro, const betabin &bg_distro,
                  const bool class_id,
                  std::vector<double> &llr_scores) const;

  void
  TransitionPosteriors(const std::vector<std::pair<double, double> > &values,
                       const std::vector<size_t> &reset_points,
                       const double p_fb, const double p_bf,
                       const betabin &fg_distro, const betabin &bg_distro,
                       const size_t transition,
                       std::vector<double> &scores) const;

  // FOR MULTIPLE REPLICATES

  double
  BaumWelchTraining(const std::vector<std::vector<std::pair<double, double> > > &values,
                    const std::vector<size_t> &reset_points,
                    double &p_fb, double &p_bf,
                    std::vector<betabin> &fg_distro,
                    std::vector<betabin> &bg_distro) const;

  void
  PosteriorScores(const std::vector<std::vector<std::pair<double, double> > > &values,
                  const std::vector<size_t> &reset_points,
                  const double p_fb, const double p_bf,
                  const std::vector<betabin> &fg_distro,
                  const std::vector<betabin> &bg_distro,
                  const bool fg_class,
                  std::vector<double> &llr_scores) const;

  double
  PosteriorDecoding(const std::vector<std::vector<std::pair<double, double> > > &values,
                    const std::vector<size_t> &reset_points,
                    const double p_fb, const double p_bf,
                    const std::vector<betabin> &fg_distro,
                    const std::vector<betabin> &bg_distro,
                    std::vector<bool> &classes,
                    std::vector<double> &llr_scores) const;


  // Set once by the constructor.
  // NOTE(review): presumably the convergence tolerance and iteration
  // cap used by training -- confirm in the implementation.
  double tolerance;
  size_t max_iterations;
  bool VERBOSE;
};
10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #include "bsutils.hpp" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | //// CONFIDENCE INTERVALS //**************//////////////////////// 31 | #include 32 | void 33 | wilson_ci_for_binomial(const double alpha, const double n, 34 | const double p_hat, double &lower, double &upper) { 35 | const double z = gsl_cdf_ugaussian_Pinv(1 - alpha/2); 36 | const double denom = 1 + z*z/n; 37 | const double first_term = p_hat + z*z/(2*n); 38 | const double discriminant = p_hat*(1 - p_hat)/n + z*z/(4*n*n); 39 | lower = std::max(0.0, (first_term - z*std::sqrt(discriminant))/denom); 40 | upper = std::min(1.0, (first_term + z*std::sqrt(discriminant))/denom); 41 | } 42 | //////////////////////////**************//////////////////////// 43 | 44 | 45 | void 46 | adjust_region_ends(const std::vector > &clusters, 47 | std::vector ®ions) { 48 | assert(clusters.size() == regions.size()); 49 | for (size_t i = 0; i < regions.size(); ++i) { 50 | size_t max_pos = regions[i].get_end(); 51 | size_t min_pos = regions[i].get_start(); 52 | for (size_t j = 0; j < clusters[i].size(); ++j) { 53 | max_pos = std::max(clusters[i][j].get_end(), max_pos); 54 | min_pos = std::min(clusters[i][j].get_start(), min_pos); 55 | } 56 | regions[i].set_end(max_pos); 57 | regions[i].set_start(min_pos); 58 | } 59 | } 60 | 61 | 62 | void 63 | relative_sort(const std::vector &mapped_locations, 64 | const std::vector &names, 65 | std::vector &lookup) { 66 | 67 | std::unordered_map names_map; 68 | for (size_t i = 0; i < names.size(); ++i) 69 | names_map[names[i]] 
= i; 70 | 71 | for (size_t i = 0; i < mapped_locations.size(); ++i) { 72 | const std::unordered_map::const_iterator 73 | j(names_map.find(mapped_locations[i].get_name())); 74 | if (j == names_map.end()) 75 | throw std::runtime_error("read sequence not found for: " + names[i]); 76 | lookup.push_back(j->second); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/common/bsutils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 University of Southern California and 3 | * Andrew D. Smith 4 | * 5 | * Authors: Andrew D. Smith 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see . 
// Case-insensitive single-nucleotide tests.
inline bool
is_cytosine(char c) {return (c == 'c' || c == 'C');}

inline bool
is_guanine(char c) {return (c == 'g' || c == 'G');}

inline bool
is_thymine(char c) {return (c == 't' || c == 'T');}

inline bool
is_adenine(char c) {return (c == 'a' || c == 'A');}


//// CONFIDENCE INTERVALS //**************////////////////////////
void
wilson_ci_for_binomial(const double alpha, const double n,
                       const double p_hat, double &lower, double &upper);


// True iff positions i and i+1 both exist in s and hold a C followed
// by a G (either case). The bound is written as `i + 1 < length` so
// that it cannot wrap around for an empty string.
inline bool
is_cpg(const std::string &s, size_t i) {
  return (i + 1 < s.length()) &&
    is_cytosine(s[i]) && is_guanine(s[i + 1]);
}


class GenomicRegion; // full definition comes from GenomicRegion.hpp

void
adjust_region_ends(const std::vector<std::vector<GenomicRegion> > &clusters,
                   std::vector<GenomicRegion> &regions);


void
relative_sort(const std::vector<GenomicRegion> &mapped_locations,
              const std::vector<std::string> &names,
              std::vector<size_t> &lookup);


// Distributes `regions` (and the parallel `seqs`) among the
// `big_regions` they overlap: sep_regions[i] / sep_seqs[i] receive
// every region on the same chromosome as big_regions[i] whose start
// lies before the big region's end and whose end lies after its start.
//
// A single cursor (rr_id) walks forward through `regions` and is never
// rewound, so both inputs are expected to be in ascending
// (chrom, position) order, and each region is assigned to at most one
// big region. `seqs` must have at least as many entries as `regions`
// (checked by assert only).
template <class BigRegion, class Region, class Seq> static void
separate_regions(const std::vector<BigRegion> &big_regions,
                 const std::vector<Region> &regions,
                 const std::vector<Seq> &seqs,
                 std::vector<std::vector<Region> > &sep_regions,
                 std::vector<std::vector<Seq> > &sep_seqs) {
  size_t rr_id = 0;
  const size_t n_regions = regions.size();
  assert(n_regions <= seqs.size());

  const size_t n_big_regions = big_regions.size();
  sep_regions.resize(n_big_regions);
  sep_seqs.resize(n_big_regions);
  for (size_t i = 0; i < n_big_regions; ++i) {
    const std::string current_chrom(big_regions[i].get_chrom());
    const size_t current_start = big_regions[i].get_start();
    const size_t current_end = big_regions[i].get_end();
    // skip everything that ends before this big region begins
    while (rr_id < n_regions &&
           (regions[rr_id].get_chrom() < current_chrom ||
            (regions[rr_id].get_chrom() == current_chrom &&
             regions[rr_id].get_end() <= current_start)))
      ++rr_id;
    // collect everything that starts before this big region ends
    while (rr_id < n_regions &&
           (regions[rr_id].get_chrom() == current_chrom &&
            regions[rr_id].get_start() < current_end)) {
      sep_regions[i].push_back(regions[rr_id]);
      sep_seqs[i].push_back(seqs[rr_id]);
      ++rr_id;
    }
  }
}
// Given the logarithms of a sequence of values in [begin, end),
// returns the logarithm of their sum, computed relative to the largest
// element so that the exponentials cannot overflow.
//
// An empty range yields -infinity (the log of an empty sum). If the
// largest element is itself infinite it is returned directly, because
// subtracting it from itself would otherwise produce NaN.
double
log_sum_log(const std::vector<double>::const_iterator &begin,
            const std::vector<double>::const_iterator &end) {
  if (begin == end)
    return -INFINITY;

  const std::vector<double>::const_iterator max_itr =
    std::max_element(begin, end);
  const double max_val = *max_itr;
  if (std::isinf(max_val))
    return max_val;

  // the largest element contributes exp(0) = 1 and is skipped below
  double sum = 1.0;
  for (std::vector<double>::const_iterator itr = begin; itr != end; ++itr)
    if (itr != max_itr) sum += std::exp(*itr - max_val);

  return max_val + std::log(sum);
}

// Same computation over the first `limit` entries of `vals`. A `limit`
// larger than the vector is treated as "all entries" rather than
// reading past the end.
double
log_sum_log_vec(const std::vector<double> &vals, const size_t limit) {
  const size_t n_used = std::min(limit, vals.size());
  return log_sum_log(vals.begin(), vals.begin() + n_used);
}
// Adds two quantities given as logarithms and returns the logarithm of
// the sum. An argument that is exactly 0 is treated as absent: the
// other argument is then returned unchanged.
inline double
log_sum_log(const double p, const double q)
{
  if (p == 0) return q;
  if (q == 0) return p;
  // factor out the larger term so the exponent is never positive
  if (p > q)
    return p + std::log(1.0 + std::exp(q - p));
  return q + std::log(1.0 + std::exp(p - q));
}

// Three-argument form: folds the first two, then adds the third, with
// the same treatment of 0 at each step.
inline double
log_sum_log(const double p, const double q, const double r)
{
  const double first_two = log_sum_log(p, q);
  return log_sum_log(first_two, r);
}

// Defined in numerical_utils.cpp.
double
log_sum_log_vec(const std::vector<double> &vals, const size_t limit);

double
log_sum_log(const std::vector<double>::const_iterator &begin,
            const std::vector<double>::const_iterator &end);
10 | # 11 | # This software is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this software; if not, write to the Free Software 18 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 19 | # 02110-1301 USA 20 | # 21 | 22 | ifndef SMITHLAB_CPP 23 | $(error SMITHLAB_CPP variable undefined) 24 | endif 25 | 26 | PROGS = dmr-hdhmm 27 | 28 | CXX = g++ 29 | CXXFLAGS = -std=c++11 -Wall 30 | OPTFLAGS = -O3 31 | DEBUGFLAGS = -g 32 | 33 | ifdef DEBUG 34 | CXXFLAGS += $(DEBUGFLAGS) 35 | else 36 | CXXFLAGS += $(OPTFLAGS) 37 | endif 38 | 39 | COMMON_DIR = ../common 40 | EXPERIMENTAL_DIR = ../common-experimental 41 | INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR) $(EXPERIMENTAL_DIR) 42 | 43 | INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS)) 44 | 45 | LIBS = -lgsl -lgslcblas 46 | 47 | all: $(PROGS) 48 | 49 | install: $(PROGS) 50 | @mkdir -p $(SRC_ROOT)/bin 51 | @install -m 755 $(PROGS) $(SRC_ROOT)/bin 52 | 53 | $(PROGS): $(addprefix $(SMITHLAB_CPP)/, \ 54 | smithlab_os.o smithlab_utils.o GenomicRegion.o OptionParser.o) \ 55 | $(addprefix $(COMMON_DIR)/, MethpipeFiles.o) 56 | 57 | dmr-hdhmm: $(addprefix $(SMITHLAB_CPP)/, RNG.o) \ 58 | $(addprefix $(EXPERIMENTAL_DIR)/, ThreeStateHDHMM.o \ 59 | false_discovery_rate.o contingency-table.o nonparametric-test.o) \ 60 | $(addprefix $(COMMON_DIR)/, Smoothing.o Distro.o BetaBin.o) 61 | 62 | %.o: %.cpp %.hpp 63 | $(CXX) $(CXXFLAGS) -c -o $@ $< $(INCLUDEARGS) 64 | 65 | %: %.cpp 66 | $(CXX) $(CXXFLAGS) -o $@ $^ $(INCLUDEARGS) $(LIBS) 67 | 68 | clean: 69 | @-rm -f $(PROGS) *.o *.so *.a *~ 70 | 71 | .PHONY: clean 72 | -------------------------------------------------------------------------------- /src/mlml/Makefile: 
-------------------------------------------------------------------------------- 1 | # Copyright (C) 2014 University of Southern California 2 | # and Andrew D. Smith and Benjamin E. Decato 3 | # 4 | # Authors: Andrew D. Smith and Benjamin E. Decato 5 | # 6 | # This is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This software is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this software; if not, write to the Free Software 18 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 19 | # 02110-1301 USA 20 | # 21 | 22 | ifndef SMITHLAB_CPP 23 | $(error SMITHLAB_CPP variable undefined) 24 | endif 25 | 26 | PROGS = mlml 27 | 28 | CXX = g++ 29 | CXXFLAGS = -Wall -std=c++11 30 | OPTFLAGS = -O3 31 | DEBUGFLAGS = -g 32 | 33 | ifdef DEBUG 34 | CXXFLAGS += $(DEBUGFLAGS) 35 | else 36 | CXXFLAGS += $(OPTFLAGS) 37 | endif 38 | 39 | COMMON_DIR = ../common 40 | INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR) 41 | INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS)) 42 | override CPPFLAGS += $(INCLUDEARGS) 43 | 44 | LDLIBS = -lgsl -lgslcblas 45 | 46 | all: $(PROGS) 47 | 48 | install: $(PROGS) 49 | @mkdir -p $(SRC_ROOT)/bin 50 | @install -m 755 $(PROGS) $(SRC_ROOT)/bin 51 | 52 | $(PROGS): $(addprefix $(SMITHLAB_CPP)/, libsmithlab_cpp.a) 53 | 54 | mlml: $(addprefix $(COMMON_DIR)/, MethpipeSite.o) 55 | 56 | %.o: %.cpp %.hpp 57 | $(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS) 58 | 59 | %: %.cpp 60 | $(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS) $(LDFLAGS) 61 | 62 | clean: 63 | @-rm -f $(PROGS) *.o 
*.so *.a *~ 64 | 65 | .PHONY: clean 66 | -------------------------------------------------------------------------------- /src/radmeth/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2013 University of Southern California and 2 | # Egor Dolzhenko 3 | # Andrew D Smith 4 | # 5 | # Authors: Andrew D. Smith and Egor Dolzhenko 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | 17 | ifndef SMITHLAB_CPP 18 | $(error SMITHLAB_CPP variable undefined) 19 | endif 20 | 21 | PROGS = radmeth radmeth-adjust radmeth-merge methdiff dmr 22 | 23 | CXX = g++ 24 | CXXFLAGS = -Wall -std=c++11 25 | OPTFLAGS = -O3 26 | DEBUGFLAGS = -g 27 | 28 | ifdef DEBUG 29 | CXXFLAGS += $(DEBUGFLAGS) 30 | else 31 | CXXFLAGS += $(OPTFLAGS) 32 | endif 33 | 34 | COMMON_DIR = ../common 35 | INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR) 36 | INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS)) 37 | override CPPFLAGS += $(INCLUDEARGS) 38 | 39 | LDLIBS = -lgsl -lgslcblas -lz 40 | 41 | all: $(PROGS) 42 | 43 | install: $(PROGS) 44 | @mkdir -p $(SRC_ROOT)/bin 45 | @install -m 755 $(PROGS) $(SRC_ROOT)/bin 46 | 47 | $(PROGS): $(addprefix $(SMITHLAB_CPP)/, libsmithlab_cpp.a) 48 | 49 | methdiff: $(addprefix $(COMMON_DIR)/, MethpipeSite.o) 50 | 51 | %.o: %.cpp %.hpp 52 | $(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS) 53 | 54 | %: %.cpp 55 | $(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS) $(LDFLAGS) 56 | 57 | clean: 58 | @-rm -f $(PROGS) *.o *.so *.a *~ 59 | 
-------------------------------------------------------------------------------- /src/radmeth/README.md: -------------------------------------------------------------------------------- 1 | RADMeth: Regression Analysis of Differential Methylation 2 | ======================================================== 3 | 4 | RADMeth: Regression Analysis of Differential Methylation is software for 5 | computing individual differentially methylated sites and genomic regions in 6 | whole genome bisulfite sequencing (WGBS) data. 7 | 8 | Contact Information 9 | ------------------- 10 | 11 | Egor Dolzhenko 12 | dolzhenk@usc.edu 13 | http://smithlabresearch.org/ 14 | 15 | Installation 16 | ------------ 17 | *Before attempting to compile RADMeth please make sure that GNU Scientific 18 | Library (http://www.gnu.org/software/gsl/) is installed on your system.* 19 | Alternatively, you can download pre-compiled binaries for either Linux or Mac 20 | from http://smithlabresearch.org/software/radmeth/ 21 | 22 | To compile RADMeth, enter the program's root directory (e.g. radmeth/) and 23 | execute 24 | 25 | > make 26 | 27 | After the compilation, the binaries can be found in radmeth/bin/ 28 | 29 | Usage 30 | ----- 31 | 32 | Please see the manual, which can be obtained at 33 | http://smithlabresearch.org/software/radmeth/ 34 | 35 | License 36 | ------- 37 | Copyright (C) 2013 University of Southern California and 38 | Egor Dolzhenko 39 | Andrew D Smith 40 | 41 | Authors: Andrew D. Smith and Egor Dolzhenko 42 | 43 | This program is free software: you can redistribute it and/or modify 44 | it under the terms of the GNU General Public License as published by 45 | the Free Software Foundation, either version 3 of the License, or 46 | (at your option) any later version. 47 | 48 | This program is distributed in the hope that it will be useful, 49 | but WITHOUT ANY WARRANTY; without even the implied warranty of 50 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 51 | GNU General Public License for more details. 52 | -------------------------------------------------------------------------------- /src/radmeth/methdiff.cpp: -------------------------------------------------------------------------------- 1 | /* methdiff: Computes probability that individual CpGs have higher 2 | * methylation in file A than in file B, where files A and B 3 | * are specified on the command line. 4 | * 5 | * Copyright (C) 2011-2019 Andrew D Smith 6 | * 7 | * Author: Andrew D. Smith 8 | * 9 | * This is free software; you can redistribute it and/or modify it 10 | * under the terms of the GNU General Public License as published by 11 | * the Free Software Foundation; either version 2 of the License, or 12 | * (at your option) any later version. 13 | * 14 | * This software is distributed in the hope that it will be useful, 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | * General Public License for more details. 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "smithlab_utils.hpp" 26 | #include "smithlab_os.hpp" 27 | #include "GenomicRegion.hpp" 28 | #include "OptionParser.hpp" 29 | #include "zlib_wrapper.hpp" 30 | 31 | #include "MethpipeSite.hpp" 32 | 33 | #include 34 | 35 | using std::string; 36 | using std::vector; 37 | using std::cout; 38 | using std::endl; 39 | using std::cerr; 40 | using std::pair; 41 | using std::runtime_error; 42 | using std::min; 43 | 44 | using std::ostream_iterator; 45 | using std::ofstream; 46 | 47 | static inline double 48 | log_sum_log(const double p, const double q) { 49 | if (p == 0) {return q;} 50 | else if (q == 0) {return p;} 51 | const double larger = (p > q) ? p : q; 52 | const double smaller = (p > q) ? 
q : p; 53 | return larger + log(1.0 + exp(smaller - larger)); 54 | } 55 | 56 | 57 | static double 58 | log_hyper_g_greater(size_t meth_a, size_t unmeth_a, 59 | size_t meth_b, size_t unmeth_b, size_t k) { 60 | return gsl_sf_lnchoose(meth_b + unmeth_b - 1, k) + 61 | gsl_sf_lnchoose(meth_a + unmeth_a - 1, meth_a + meth_b - 1 - k) - 62 | gsl_sf_lnchoose(meth_a + unmeth_a + meth_b + unmeth_b - 2, 63 | meth_a + meth_b - 1); 64 | } 65 | 66 | 67 | static double 68 | test_greater_population(const size_t meth_a, const size_t unmeth_a, 69 | const size_t meth_b, const size_t unmeth_b) { 70 | double p = 0; 71 | for (size_t k = (meth_b > unmeth_a) ? meth_b - unmeth_a : 0; k < meth_b; ++k) 72 | p = log_sum_log(p, log_hyper_g_greater(meth_a, unmeth_a, 73 | meth_b, unmeth_b, k)); 74 | return exp(p); 75 | } 76 | 77 | 78 | template T& 79 | write_methdiff_site(T &out, 80 | const MSite &a, const MSite &b, const double diffscore) { 81 | 82 | MSite c(a); 83 | c.n_reads = a.n_meth(); 84 | c.meth = diffscore; 85 | 86 | std::ostringstream oss; 87 | oss << c; 88 | 89 | oss << '\t' << a.n_unmeth() // a.n_meth() already output with 'c' 90 | << '\t' << b.n_meth() 91 | << '\t' << b.n_unmeth() << endl; 92 | 93 | out << oss.str(); 94 | return out; 95 | } 96 | 97 | 98 | static bool 99 | same_chrom_and_pos(const MSite &a, const MSite &b) { 100 | return a.chrom == b.chrom && a.pos == b.pos; 101 | } 102 | 103 | 104 | template 105 | static void 106 | process_sites(const bool VERBOSE, igzfstream &in_a, igzfstream &in_b, 107 | const bool allow_uncovered, const double pseudocount, T &out) { 108 | 109 | MSite a, b; 110 | string prev_chrom; 111 | in_b >> b; // load first site for "b" 112 | while (in_a >> a) { 113 | 114 | if (VERBOSE && a.chrom != prev_chrom) 115 | cerr << "[processing: " << a.chrom << "]" << endl; 116 | 117 | while (in_b && b < a) in_b >> b; // find appropriate "b" site 118 | 119 | if (same_chrom_and_pos(a, b)) { 120 | 121 | if (allow_uncovered || min(a.n_reads, b.n_reads) > 0) { 122 | 123 
| const size_t meth_a = a.n_meth() + pseudocount; 124 | const size_t unmeth_a = a.n_unmeth() + pseudocount; 125 | const size_t meth_b = b.n_meth() + pseudocount; 126 | const size_t unmeth_b = b.n_unmeth() + pseudocount; 127 | 128 | const double diffscore = test_greater_population(meth_b, unmeth_b, 129 | meth_a, unmeth_a); 130 | 131 | write_methdiff_site(out, a, b, diffscore); 132 | } 133 | } 134 | swap(prev_chrom, a.chrom); 135 | } 136 | } 137 | 138 | 139 | int 140 | main(int argc, const char **argv) { 141 | 142 | try { 143 | 144 | string outfile; 145 | size_t pseudocount = 1; 146 | 147 | // run mode flags 148 | bool allow_uncovered = true; 149 | bool VERBOSE = false; 150 | 151 | /****************** COMMAND LINE OPTIONS ********************/ 152 | OptionParser opt_parse(strip_path(argv[0]), 153 | "compute probability that site " 154 | "has higher methylation in file A than B", 155 | " "); 156 | opt_parse.add_opt("pseudo", 'p', "pseudocount (default: 1)", 157 | false, pseudocount); 158 | opt_parse.add_opt("nonzero-only", 'A', 159 | "process only sites with coveage in both samples", 160 | false, allow_uncovered); 161 | opt_parse.add_opt("out", 'o', "output file", true, outfile); 162 | opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); 163 | vector leftover_args; 164 | opt_parse.parse(argc, argv, leftover_args); 165 | if (argc == 1 || opt_parse.help_requested()) { 166 | cerr << opt_parse.help_message() << endl 167 | << opt_parse.about_message() << endl; 168 | return EXIT_SUCCESS; 169 | } 170 | if (opt_parse.about_requested()) { 171 | cerr << opt_parse.about_message() << endl; 172 | return EXIT_SUCCESS; 173 | } 174 | if (opt_parse.option_missing()) { 175 | cerr << opt_parse.option_missing_message() << endl; 176 | return EXIT_SUCCESS; 177 | } 178 | if (leftover_args.size() != 2) { 179 | cerr << opt_parse.help_message() << endl; 180 | return EXIT_SUCCESS; 181 | } 182 | const string cpgs_file_a = leftover_args[0]; 183 | const string cpgs_file_b = 
leftover_args[1]; 184 | /****************** END COMMAND LINE OPTIONS *****************/ 185 | 186 | if (VERBOSE) 187 | cerr << "[opening methcounts file: " << cpgs_file_a << "]" << endl; 188 | igzfstream in_a(cpgs_file_a); 189 | if (!in_a) 190 | throw runtime_error("cannot open file: " + cpgs_file_a); 191 | 192 | if (VERBOSE) 193 | cerr << "[opening methcounts file: " << cpgs_file_b << "]" << endl; 194 | igzfstream in_b(cpgs_file_b); 195 | if (!in_b) 196 | throw runtime_error("cannot open file: " + cpgs_file_b); 197 | 198 | if (outfile.empty() || !has_gz_ext(outfile)) { 199 | std::ofstream of; 200 | if (!outfile.empty()) of.open(outfile); 201 | std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); 202 | 203 | process_sites(VERBOSE, in_a, in_b, allow_uncovered, pseudocount, out); 204 | } 205 | else { 206 | ogzfstream out(outfile); 207 | process_sites(VERBOSE, in_a, in_b, allow_uncovered, pseudocount, out); 208 | } 209 | } 210 | catch (runtime_error &e) { 211 | cerr << "ERROR:\t" << e.what() << endl; 212 | return EXIT_FAILURE; 213 | } 214 | catch (std::bad_alloc &ba) { 215 | cerr << "ERROR: could not allocate memory" << endl; 216 | return EXIT_FAILURE; 217 | } 218 | return EXIT_SUCCESS; 219 | } 220 | -------------------------------------------------------------------------------- /src/radmeth/radmeth-merge.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013 University of Southern California and 2 | * Egor Dolzhenko 3 | * Andrew D Smith 4 | * 5 | * Authors: Andrew D. Smith and Egor Dolzhenko 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU General Public License as published by 9 | * the Free Software Foundation, either version 3 of the License, or 10 | * (at your option) any later version. 
11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | // smithlab headers 29 | #include "OptionParser.hpp" 30 | #include "smithlab_os.hpp" 31 | #include "smithlab_utils.hpp" 32 | #include "GenomicRegion.hpp" 33 | 34 | using std::string; 35 | using std::vector; 36 | using std::cerr; 37 | using std::cout; 38 | using std::endl; 39 | using std::istringstream; 40 | using std::istream; 41 | using std::ostream; 42 | using std::ifstream; 43 | using std::ofstream; 44 | using std::runtime_error; 45 | 46 | // Attemps to find the next significant CpG site. Returns true if one was found 47 | // and flase otherwise. 48 | static bool 49 | read_next_significant_cpg(istream &cpg_stream, GenomicRegion &cpg, 50 | double cutoff, bool &skipped_any, bool &n_sig_sites, 51 | size_t &test_cov, size_t &test_meth, 52 | size_t &rest_cov, size_t &rest_meth) { 53 | GenomicRegion region; 54 | skipped_any = false; 55 | n_sig_sites = false; 56 | string cpg_encoding; 57 | 58 | while (getline(cpg_stream, cpg_encoding)) { 59 | string record, chrom, name, sign; 60 | size_t position; 61 | double raw_pval, adjusted_pval, corrected_pval; 62 | 63 | istringstream iss(cpg_encoding); 64 | iss.exceptions(std::ios::failbit); 65 | iss >> chrom >> position >> sign >> name >> raw_pval 66 | >> adjusted_pval >> corrected_pval 67 | >> test_cov >> test_meth >> rest_cov >> rest_meth; 68 | 69 | if (0 <= corrected_pval && corrected_pval < cutoff) { 70 | cpg.set_chrom(chrom); 71 | cpg.set_start(position); 72 | cpg.set_end(position + 1); 73 | n_sig_sites = (0 <= raw_pval && raw_pval < cutoff); 74 | return true; 75 | } 76 | skipped_any = true; 77 | } 78 | 79 | return 
false; 80 | } 81 | 82 | static void 83 | merge(istream &cpg_stream, ostream &dmr_stream, double cutoff) { 84 | 85 | GenomicRegion dmr; 86 | dmr.set_name("dmr"); 87 | 88 | size_t dmr_test_cov = 0; 89 | size_t dmr_test_meth = 0; 90 | size_t dmr_rest_cov = 0; 91 | size_t dmr_rest_meth = 0; 92 | 93 | size_t test_cov = 0; 94 | size_t test_meth = 0; 95 | size_t rest_cov = 0; 96 | size_t rest_meth = 0; 97 | 98 | // Find the first significant CpG, or terminate the function if none exist. 99 | bool skipped_last_cpg, n_sig_sites; 100 | if (!read_next_significant_cpg(cpg_stream, dmr, cutoff, skipped_last_cpg, 101 | n_sig_sites, test_cov, test_meth, 102 | rest_cov, rest_meth)) 103 | return; 104 | 105 | dmr.set_score(n_sig_sites); 106 | dmr_test_cov += test_cov; 107 | dmr_test_meth += test_meth; 108 | dmr_rest_cov += rest_cov; 109 | dmr_rest_meth += rest_meth; 110 | 111 | GenomicRegion cpg; 112 | cpg.set_name("dmr"); 113 | 114 | while (read_next_significant_cpg(cpg_stream, cpg, cutoff, skipped_last_cpg, 115 | n_sig_sites, test_cov, test_meth, 116 | rest_cov, rest_meth)) { 117 | 118 | if (skipped_last_cpg || cpg.get_chrom() != dmr.get_chrom()) { 119 | if (dmr.get_score() != 0) 120 | dmr_stream << dmr.get_chrom() << '\t' 121 | << dmr.get_start() << '\t' 122 | << dmr.get_end() << '\t' 123 | << dmr.get_name() << '\t' 124 | << dmr.get_score() << '\t' 125 | << double(dmr_test_meth)/dmr_test_cov - 126 | double(dmr_rest_meth)/dmr_rest_cov << endl; 127 | dmr = cpg; 128 | dmr.set_score(n_sig_sites); 129 | dmr_test_cov = test_cov; 130 | dmr_test_meth = test_meth; 131 | dmr_rest_cov = rest_cov; 132 | dmr_rest_meth = rest_meth; 133 | } 134 | else { 135 | dmr.set_end(cpg.get_end()); 136 | dmr.set_score(dmr.get_score() + n_sig_sites); 137 | dmr_test_cov += test_cov; 138 | dmr_test_meth += test_meth; 139 | dmr_rest_cov += rest_cov; 140 | dmr_rest_meth += rest_meth; 141 | } 142 | } 143 | if (dmr.get_score() != 0) { 144 | dmr_stream << dmr.get_chrom() << '\t' 145 | << dmr.get_start() << '\t' 146 
| << dmr.get_end() << '\t' 147 | << dmr.get_name() << '\t' 148 | << dmr.get_score() << '\t' 149 | << double(dmr_test_meth)/dmr_test_cov - 150 | double(dmr_rest_meth)/dmr_rest_cov << endl; 151 | } 152 | } 153 | 154 | int 155 | main(int argc, const char **argv) { 156 | 157 | try { 158 | 159 | /* FILES */ 160 | string outfile; 161 | string bin_spec = "1:200:25"; 162 | double cutoff = 0.01; 163 | 164 | /**************** GET COMMAND LINE ARGUMENTS *************************/ 165 | OptionParser opt_parse(strip_path(argv[0]), 166 | "merge significantly differentially" 167 | " methylated CpGs into DMRs", 168 | ""); 169 | opt_parse.set_show_defaults(); 170 | opt_parse.add_opt("output", 'o', 171 | "output file (default: stdout)", false, outfile); 172 | opt_parse.add_opt("cutoff", 'p', "p-value cutoff", false , cutoff); 173 | opt_parse.add_opt("bins", 'b', "corrlation bin specs", false , bin_spec); 174 | vector leftover_args; 175 | opt_parse.parse(argc, argv, leftover_args); 176 | if (argc == 1 || opt_parse.help_requested()) { 177 | cerr << opt_parse.help_message() << endl; 178 | return EXIT_SUCCESS; 179 | } 180 | if (opt_parse.about_requested()) { 181 | cerr << opt_parse.about_message() << endl; 182 | return EXIT_SUCCESS; 183 | } 184 | if (opt_parse.option_missing()) { 185 | cerr << opt_parse.option_missing_message() << endl; 186 | return EXIT_SUCCESS; 187 | } 188 | if (leftover_args.size() != 1) { 189 | cerr << opt_parse.help_message() << endl; 190 | return EXIT_SUCCESS; 191 | } 192 | const string bed_filename = leftover_args.front(); 193 | /************************************************************************/ 194 | 195 | ofstream of; 196 | if (!outfile.empty()) of.open(outfile); 197 | ostream out(outfile.empty() ? 
#    Copyright (C) 2011 University of Southern California
#                       and Andrew D. Smith
#
#    Authors: Andrew D. Smith
#
#    This is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This software is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this software; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
#    02110-1301 USA
#

CXX = g++
CXXFLAGS = -Wall -std=c++11
OPTFLAGS = -O3
DEBUGFLAGS = -g

# root of the source tree; binaries are installed into $(SRC_ROOT)/bin
ifndef SRC_ROOT
SRC_ROOT=../..
endif

# location of the smithlab_cpp headers and static library (required)
ifndef SMITHLAB_CPP
$(error SMITHLAB_CPP variable undefined)
endif

PROGS = lc_approx fast-liftover lift-filter \
	merge-bsrate merge-methcounts \
	symmetric-cpgs clean-hairpins selectsites guessprotocol \
	format_reads duplicate-remover

COMMON_DIR = ../common
INCLUDEDIRS = $(SMITHLAB_CPP) $(COMMON_DIR)
INCLUDEARGS = $(addprefix -I,$(INCLUDEDIRS))
override CPPFLAGS += $(INCLUDEARGS)

LDLIBS = -lgsl -lgslcblas -lz -lhts

# "make DEBUG=1" builds with debug symbols instead of optimization
ifdef DEBUG
CXXFLAGS += $(DEBUGFLAGS)
else
CXXFLAGS += $(OPTFLAGS)
endif

all: $(PROGS)

install: $(PROGS)
	@mkdir -p $(SRC_ROOT)/bin
	@install -m 755 $(PROGS) $(SRC_ROOT)/bin

# every program links against the smithlab_cpp static library
$(PROGS): $(addprefix $(SMITHLAB_CPP)/, libsmithlab_cpp.a)

# programs that additionally need the MethpipeSite object file
merge-methcounts symmetric-cpgs selectsites lift-filter \
fast-liftover guessprotocol: \
	$(addprefix $(COMMON_DIR)/, MethpipeSite.o)

%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS)

%: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS) $(LDFLAGS)

clean:
	@-rm -f $(PROGS) *.o *.so *.a *~

# none of these targets produce a file of the same name; declaring
# them phony keeps a stray file called "all" or "install" from
# silently disabling the target
.PHONY: all install clean
def parse_line(line):
    """Split one whitespace-separated track line into its parts.

    Returns a tuple ([chrom, start], value) where chrom is the first
    field, start the second and value the fourth; all three are kept
    as strings. Raises IndexError if the line has fewer than four
    fields.
    """
    field = line.split()
    chr = field[0]
    start = field[1]
    value = field[3]

    return ([chr, start], value)

def write_line(fh, chr, start, meth_level, coverage):
    """Write one tab-separated record to fh.

    The columns are chrom, start, the fixed strand "+", the fixed
    context "CpG", the methylation level and the coverage.
    """
    outline = "\t".join((chr, start, "+", "CpG", \
        str(meth_level), str(coverage))) + "\n"
    fh.write(outline)

def is_exe(file):
    """Return True if file is an existing regular file that may be executed."""
    return os.path.isfile(file) and os.access(file, os.X_OK)

def which(program):
    """Do the same thing as linux "which" command.

    Returns the full path of the first executable called program found
    in the directories listed in the PATH environment variable, or
    None if there is none (including when PATH is not set at all).
    """
    # os.environ["PATH"] would raise KeyError in an environment without
    # PATH; treat that the same as an empty search path
    for path in os.environ.get("PATH", "").split(os.pathsep):
        path = path.strip('"')
        exe_file = os.path.join(path, program)
        if is_exe(exe_file):
            return exe_file

    return None
def main():
    """Convert a methylation track and a read coverage track (both
    bigWig) into a single methcounts file.

    Both tracks are first converted to text with the bigWigToBedGraph
    executable, then merged site by site: a site present in both
    tracks is written with its methylation level and coverage, a site
    present in only one track is written with level 0 and coverage 0.
    """
    usage = "Usage: %prog -m  -r " + \
            " -o  [-p ]"
    parser = OptionParser(usage=usage)
    parser.add_option("-m", "--methylation", action="store", type="string",
                dest="meth", help="MethBase methylation track file.", \
                metavar="")
    parser.add_option("-r", "--read-coverage", action="store", type="string",
                dest="read", help="MethBase read coverage track file.", \
                metavar="")
    parser.add_option("-p", "--path", action="store", type="string", \
                dest="bwtool", \
                help="Path to bigWigToBedGraph executable file. " + \
        "Leave blank if you already have it in environment path.", metavar="")
    parser.add_option("-o", "--output", action="store", type="string", \
                dest="output", \
                help="Output methcounts file.")
    (opt, args) = parser.parse_args(sys.argv)
    opt_validation(parser, opt)
    # the output file is required as well; without this check the
    # script would run both conversions and then fail in open(None)
    if not opt.output:
        parser.print_help()
        sys.exit(0)

    # convert bw files (methylation track first, then coverage track)
    methtmp = tempfile.NamedTemporaryFile()
    readtmp = tempfile.NamedTemporaryFile()
    for (track, tmp) in ((opt.meth, methtmp), (opt.read, readtmp)):
        try:
            subprocess.check_call([opt.bwtool, track, tmp.name])
        except subprocess.CalledProcessError:
            sys.stderr.write(
                "An error occured in converting track file %s\n"%track)
            sys.exit(1)

    def sort_key(coordinate):
        # positions must be compared as numbers: as strings "100"
        # sorts before "99" and the two tracks fall out of step
        return (coordinate[0], int(coordinate[1]))

    # combine converted files; they are re-opened by name in text mode
    # because the NamedTemporaryFile handles are binary, which yields
    # bytes rather than str under Python 3
    methfh = open(methtmp.name, 'r')
    readfh = open(readtmp.name, 'r')
    outputfh = open(opt.output, 'w')
    try:
        meth_line = methfh.readline()
        read_line = readfh.readline()
        while read_line and meth_line:
            (meth_coordinate, meth_value) = parse_line(meth_line)
            (read_coordinate, read_value) = parse_line(read_line)
            meth_key = sort_key(meth_coordinate)
            read_key = sort_key(read_coordinate)
            if meth_key == read_key:
                write_line(outputfh, read_coordinate[0], read_coordinate[1], \
                    meth_value, read_value)
                meth_line = methfh.readline()
                read_line = readfh.readline()
            elif meth_key > read_key:
                # site missing in methylation track
                write_line(outputfh, read_coordinate[0], read_coordinate[1], \
                    0, 0)
                read_line = readfh.readline()
            else:
                # site missing in read track
                write_line(outputfh, meth_coordinate[0], meth_coordinate[1], \
                    0, 0)
                meth_line = methfh.readline()

        # one track is exhausted; the sites left in the other one are
        # missing from the exhausted track and are reported the same
        # way as above instead of being dropped
        while read_line:
            (read_coordinate, read_value) = parse_line(read_line)
            write_line(outputfh, read_coordinate[0], read_coordinate[1], 0, 0)
            read_line = readfh.readline()
        while meth_line:
            (meth_coordinate, meth_value) = parse_line(meth_line)
            write_line(outputfh, meth_coordinate[0], meth_coordinate[1], 0, 0)
            meth_line = methfh.readline()
    finally:
        methfh.close()
        readfh.close()
        outputfh.close()
        methtmp.close()
        readtmp.close()

if __name__ == '__main__':
    main()
// Destination of a lifted site: chromosome, position and strand.
struct SimpleSite {
  std::string chrom;
  uint32_t pos = 0;     // initialised so a default-constructed site
  char strand = '+';    // never carries indeterminate values
  SimpleSite() = default;
  SimpleSite(const std::string &c, const uint32_t p, const char s) :
    chrom(c), pos(p), strand(s) {}
};

// Re-express a site given on the '-' strand on the '+' strand by
// moving it one position to the left; '+' sites are left untouched.
void
flip_strand(SimpleSite &s) {
  if (s.strand == '-') {
    s.pos--;
    s.strand = '+';
  }
}

// position in the source assembly -> site in the target assembly
typedef
std::unordered_map<size_t, SimpleSite> liftover_index;

// Load the liftover index from index_file into index, keyed first by
// source chromosome and then by source position.
//
// Each record consists of five whitespace-separated fields:
//   from_chrom from_pos to_chrom to_pos to_strand
// With plus_strand set, every target site is passed through
// flip_strand before it is stored.
//
// Throws std::runtime_error if the file cannot be opened, or if
// reading stops at a record that cannot be parsed (previously the
// remainder of the file was silently ignored).
static void
read_index_file(const bool plus_strand, const std::string &index_file,
                std::unordered_map<std::string, liftover_index> &index) {

  std::ifstream in(index_file);
  if (!in)
    throw std::runtime_error("problem opening index file: " + index_file);

  size_t from_pos = 0, to_pos = 0;
  std::string from_chrom, to_chrom;
  std::string to_strand;
  while (in >> from_chrom >> from_pos >> to_chrom >> to_pos >> to_strand) {
    SimpleSite the_site(to_chrom, to_pos, to_strand[0]);
    if (plus_strand)
      flip_strand(the_site);
    index[from_chrom][from_pos] = the_site;
  }

  // extraction fails both at end of input and on bad data; only the
  // former is a normal way to leave the loop
  if (!in.eof())
    throw std::runtime_error("malformed line in index file: " + index_file);
}
(pos_index == end(chrom_index->second)) 101 | return false; 102 | 103 | meth_site.chrom = pos_index->second.chrom; 104 | meth_site.pos = pos_index->second.pos; 105 | meth_site.strand = pos_index->second.strand; 106 | return true; 107 | } 108 | 109 | int 110 | main(int argc, const char **argv) { 111 | try { 112 | string indexfile; 113 | string tofile; 114 | string fromfile; 115 | string unlifted_file; 116 | 117 | bool VERBOSE = false; 118 | bool plus_strand = false; 119 | 120 | /****************** COMMAND LINE OPTIONS ********************/ 121 | OptionParser opt_parse(strip_path(argv[0]), 122 | "Fast liftOver-all cytosine-by strand" ); 123 | opt_parse.add_opt("indexfile", 'i', "index file", true, indexfile); 124 | opt_parse.add_opt("from", 'f', "Original file", true, fromfile); 125 | opt_parse.add_opt("to", 't', "Output file liftovered", true, tofile); 126 | opt_parse.add_opt("unlifted", 'u', "(optional) File for unlifted sites", 127 | false, unlifted_file); 128 | opt_parse.add_opt("plus-strand", 'p', "(optional) Report sites on + strand", 129 | false, plus_strand); 130 | opt_parse.add_opt("verbose", 'v', "(optional) Print more information", 131 | false, VERBOSE); 132 | 133 | vector leftover_args; 134 | opt_parse.parse(argc, argv, leftover_args); 135 | if (argc == 1 || opt_parse.help_requested()) { 136 | cerr << opt_parse.help_message() << endl; 137 | return EXIT_SUCCESS; 138 | } 139 | if (opt_parse.about_requested()) { 140 | cerr << opt_parse.about_message() << endl; 141 | return EXIT_SUCCESS; 142 | } 143 | if (opt_parse.option_missing()) { 144 | cerr << opt_parse.option_missing_message() << endl; 145 | return EXIT_SUCCESS; 146 | } 147 | /****************** END COMMAND LINE OPTIONS *****************/ 148 | 149 | unordered_map index; 150 | if (VERBOSE) 151 | cerr << "[loading liftover index file " << indexfile << "]" << endl; 152 | read_index_file(plus_strand, indexfile, index); 153 | 154 | std::ifstream in(fromfile); 155 | if (!in) 156 | throw runtime_error("cannot 
open input file: " + fromfile); 157 | 158 | std::ofstream out(tofile); 159 | if (!out) 160 | throw runtime_error("cannot open output file: " + tofile); 161 | 162 | std::ofstream unlifted; 163 | if (!unlifted_file.empty()) 164 | unlifted.open(unlifted_file.c_str()); 165 | 166 | if (VERBOSE) 167 | cerr << "[lifting from: " << fromfile << " to: " << tofile << "]" << endl; 168 | 169 | MSite lifted, meth_site; 170 | while (in >> meth_site) { 171 | if (lift_site(index, meth_site)) 172 | out << meth_site << endl; 173 | else if (!unlifted_file.empty()) 174 | unlifted << meth_site << endl; 175 | } 176 | } 177 | catch (const runtime_error &e) { 178 | cerr << e.what() << endl; 179 | return EXIT_FAILURE; 180 | } 181 | catch (std::bad_alloc &ba) { 182 | cerr << "ERROR: could not allocate memory" << endl; 183 | return EXIT_FAILURE; 184 | } 185 | return EXIT_SUCCESS; 186 | } 187 | -------------------------------------------------------------------------------- /src/utils/guessprotocol.cpp: -------------------------------------------------------------------------------- 1 | /* guessprotocol: a program for guessing whether a wgbs protocol is 2 | * original, pbat or random pbat 3 | * 4 | * Copyright (C) 2019 5 | * 6 | * Authors: Andrew D. Smith 7 | * 8 | * This program is free software: you can redistribute it and/or 9 | * modify it under the terms of the GNU General Public License as 10 | * published by the Free Software Foundation, either version 3 of the 11 | * License, or (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, but 14 | * WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | * General Public License for more details. 
// store each read from one end
struct FASTQRecord {
  std::string name;
  std::string seq;
};

// See if two reads from two ends match each other: the name of a,
// without its last to_ignore_at_end characters (in case names have
// #0/1 name ends), must be a prefix of the name of b.
//
// Returns false, rather than reading out of bounds, when b's name is
// shorter than the compared prefix or when to_ignore_at_end is not
// smaller than the length of a's name.
static bool
mates(const size_t to_ignore_at_end,
      const FASTQRecord &a, const FASTQRecord &b) {
  if (to_ignore_at_end >= a.name.length())
    return false;
  const size_t n_to_compare = a.name.length() - to_ignore_at_end;
  return b.name.length() >= n_to_compare &&
    std::equal(std::begin(a.name), std::begin(a.name) + n_to_compare,
               std::begin(b.name));
}

// Read 4 lines one time from fastq and fill in the FASTQRecord
// structure. The name is the text between the leading '@' and the
// first space (or the end of the line); the '+' line and the score
// line are read and discarded.
//
// Throws std::runtime_error if the name line does not start with '@'
// or if any of the three lines after it is missing. When no name
// line can be read the stream is returned in its failed state.
std::istream&
operator>>(std::istream& s, FASTQRecord &r) {
  if (std::getline(s, r.name)) {

    if (r.name.empty() || r.name[0] != '@')
      throw std::runtime_error("bad name line: " + r.name);

    // substr takes (position, count): starting after the '@', the
    // count up to but excluding a space at index k is k - 1
    const size_t name_end = r.name.find_first_of(' ');
    r.name = r.name.substr(1, name_end == std::string::npos ?
                           std::string::npos : name_end - 1);

    if (!std::getline(s, r.seq))
      throw std::runtime_error("failed to read expected seq line");

    std::string tmp;
    if (!std::getline(s, tmp))
      throw std::runtime_error("failed to read expected + line");

    if (!std::getline(s, tmp))
      throw std::runtime_error("failed to read expected score line");
  }
  return s;
}


// Name the protocol suggested by the fraction of T-rich reads (or
// read pairs): at least 0.8 is "original", at most 0.2 is "pbat",
// between 0.4 and 0.6 inclusive is "random", anything else is
// "inconclusive".
static std::string
guess_protocol(const double fraction_t_rich) {
  if (fraction_t_rich >= 0.8) {
    return "original";
  }
  if (fraction_t_rich <= 0.2) {
    return "pbat";
  }
  if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6) {
    return "random";
  }
  return "inconclusive";
}
reads_to_check = 1000000; 98 | size_t name_suffix_len = 0; 99 | 100 | /****************** COMMAND LINE OPTIONS ********************/ 101 | OptionParser opt_parse(strip_path(argv[0]), 102 | "guess whether protocol is ordinary, pbat or random", 103 | " []"); 104 | opt_parse.add_opt("nreads", 'n', "number of reads in initial check", 105 | false, reads_to_check); 106 | opt_parse.add_opt("ignore", 'i', "length of read name suffix " 107 | "to ignore when matching", false, name_suffix_len); 108 | vector leftover_args; 109 | opt_parse.parse(argc, argv, leftover_args); 110 | if (argc == 1 || opt_parse.help_requested()) { 111 | cerr << opt_parse.help_message() << endl 112 | << opt_parse.about_message() << endl; 113 | return EXIT_SUCCESS; 114 | } 115 | if (opt_parse.option_missing()) { 116 | cerr << opt_parse.option_missing_message() << endl; 117 | return EXIT_SUCCESS; 118 | } 119 | if (opt_parse.about_requested() || leftover_args.size() > 2) { 120 | cerr << opt_parse.about_message() << endl; 121 | return EXIT_SUCCESS; 122 | } 123 | const vector reads_files(leftover_args); 124 | /****************** END COMMAND LINE OPTIONS *****************/ 125 | 126 | if (reads_files.size() == 2) { 127 | // Input: paired-end reads with end1 and end2 128 | std::ifstream in1(reads_files.front()); 129 | if (!in1) 130 | throw runtime_error("cannot open input file: " + reads_files.front()); 131 | 132 | std::ifstream in2(reads_files.back()); 133 | if (!in2) 134 | throw runtime_error("cannot open input file: " + reads_files.back()); 135 | 136 | size_t n_pairs = 0; 137 | size_t t_rich_pairs = 0; 138 | 139 | FASTQRecord end_one, end_two; 140 | while (in1 >> end_one && in2 >> end_two && n_pairs < reads_to_check) { 141 | ++n_pairs; 142 | 143 | // two reads should be in paired-ends 144 | if (!mates(name_suffix_len, end_one, end_two)) 145 | throw runtime_error("expected mates, got: " + 146 | end_one.name + " and " + end_two.name); 147 | 148 | const double end_one_a = 149 | count(begin(end_one.seq), 
end(end_one.seq), 'A') + 150 | count(begin(end_one.seq), end(end_one.seq), 'C'); 151 | const double end_one_t = 152 | count(begin(end_one.seq), end(end_one.seq), 'T') + 153 | count(begin(end_one.seq), end(end_one.seq), 'G'); 154 | 155 | const double end_two_a = 156 | count(begin(end_two.seq), end(end_two.seq), 'A') + 157 | count(begin(end_two.seq), end(end_two.seq), 'C'); 158 | const double end_two_t = 159 | count(begin(end_two.seq), end(end_two.seq), 'T') + 160 | count(begin(end_two.seq), end(end_two.seq), 'G'); 161 | 162 | const double t_rich_count = (end_one_t + end_two_a); 163 | const double pbat_count = (end_one_a + end_two_t); 164 | 165 | t_rich_pairs += (t_rich_count > pbat_count); 166 | } 167 | const double fraction_t_rich = static_cast(t_rich_pairs)/n_pairs; 168 | cout << guess_protocol(fraction_t_rich) << '\t' 169 | << "fraction_t_rich=" << fraction_t_rich << '\t' 170 | << "t_rich_pairs=" << t_rich_pairs << '\t' 171 | << "pairs_examined=" << n_pairs << endl; 172 | } 173 | else { // if (reads_files.size() == 1) 174 | // Input: single-end reads 175 | std::ifstream in(reads_files.front()); 176 | if (!in) 177 | throw runtime_error("cannot open input file: " + reads_files.front()); 178 | 179 | size_t n_reads = 0; 180 | size_t t_rich_reads = 0; 181 | 182 | FASTQRecord r; 183 | while (in >> r && n_reads < reads_to_check) { 184 | ++n_reads; 185 | const double a = (count(begin(r.seq), end(r.seq), 'A') + 186 | count(begin(r.seq), end(r.seq), 'C')); 187 | const double t = (count(begin(r.seq), end(r.seq), 'T') + 188 | count(begin(r.seq), end(r.seq), 'G')); 189 | t_rich_reads += (t > a); 190 | } 191 | const double fraction_t_rich = static_cast(t_rich_reads)/n_reads; 192 | cout << guess_protocol(fraction_t_rich) << '\t' 193 | << "fraction_t_rich=" << fraction_t_rich << '\t' 194 | << "t_rich_reads=" << t_rich_reads << '\t' 195 | << "reads_examined=" << n_reads << endl; 196 | } 197 | } 198 | catch (const runtime_error &e) { 199 | cerr << e.what() << endl; 200 | return 
EXIT_FAILURE; 201 | } 202 | catch (std::bad_alloc &ba) { 203 | cerr << "ERROR: could not allocate memory" << endl; 204 | return EXIT_FAILURE; 205 | } 206 | return EXIT_SUCCESS; 207 | } 208 | -------------------------------------------------------------------------------- /src/utils/lc_approx.cpp: -------------------------------------------------------------------------------- 1 | /* apxlc: a program for approximately and very quickly counting lines 2 | * 3 | * Copyright (C) 2012 University of Southern California and 4 | * Andrew D. Smith 5 | * 6 | * Authors: Andrew D. Smith 7 | * 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see . 
20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "smithlab_utils.hpp" 31 | #include "smithlab_os.hpp" 32 | #include "OptionParser.hpp" 33 | 34 | using std::string; 35 | using std::ios_base; 36 | using std::vector; 37 | using std::cout; 38 | using std::cerr; 39 | using std::endl; 40 | using std::runtime_error; 41 | 42 | static size_t 43 | get_approx_line_count(const bool VERBOSE, const string &filename, 44 | const size_t n_samples, size_t sample_size) { 45 | 46 | static const size_t megabyte = (1ul << 20); 47 | static const size_t kilobyte = (1ul << 10); 48 | 49 | const size_t filesize = get_filesize(filename); 50 | 51 | if (sample_size == 0) 52 | sample_size = std::min(megabyte/10, filesize/n_samples); 53 | 54 | const size_t increment = 55 | std::floor((filesize - sample_size*n_samples)/ 56 | (n_samples - 1.0)) + sample_size; 57 | 58 | assert(filesize > n_samples && filesize > sample_size && 59 | filesize > n_samples*sample_size); 60 | 61 | if (VERBOSE) { 62 | cerr << "[PROCESSING FILE: " << filename << "]" << endl 63 | << "[FILESIZE: " 64 | << static_cast(filesize)/megabyte << "MB]" << endl 65 | << "[CHUNK SIZE: " 66 | << static_cast(1.0*sample_size/kilobyte) << "KB]" << endl 67 | << "[NUM CHUNKS: " << n_samples << "]" << endl 68 | << "[TOTAL SAMPLE: " 69 | << (1.0*n_samples*sample_size)/megabyte << "MB]" << endl; 70 | } 71 | std::ifstream in(filename.c_str(), ios_base::binary); 72 | if (!in) 73 | throw runtime_error("cannot open input file " + string(filename)); 74 | 75 | vector buffer(sample_size); 76 | double total_lines = 0.0; 77 | for (size_t i = 0; i < filesize && in.good(); i += increment) { 78 | in.seekg(i, ios_base::beg); 79 | in.read(&buffer.front(), sample_size); 80 | if (in.good()) 81 | total_lines += (0.5 + count(buffer.begin(), buffer.end(), '\n')); 82 | } 83 | return (filesize*total_lines)/(n_samples*sample_size); 84 | } 85 | 86 | 87 | 88 | int 89 | main(int argc, const 
char **argv) { 90 | try { 91 | 92 | size_t n_samples = 100; 93 | size_t sample_size = 0; 94 | bool VERBOSE = false; 95 | 96 | /****************** COMMAND LINE OPTIONS ********************/ 97 | OptionParser opt_parse(strip_path(argv[0]), 98 | "approximate line counting in large files", 99 | " ..." ); 100 | opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); 101 | opt_parse.add_opt("samples", 'n', "number of samples", false, n_samples); 102 | opt_parse.add_opt("size", 'z', "sample size (bytes)", false, sample_size); 103 | 104 | vector leftover_args; 105 | opt_parse.parse(argc, argv, leftover_args); 106 | if (argc == 1 || opt_parse.help_requested()) { 107 | cerr << opt_parse.help_message() << endl; 108 | return EXIT_SUCCESS; 109 | } 110 | if (opt_parse.about_requested()) { 111 | cerr << opt_parse.about_message() << endl; 112 | return EXIT_SUCCESS; 113 | } 114 | if (opt_parse.option_missing()) { 115 | cerr << opt_parse.option_missing_message() << endl; 116 | return EXIT_SUCCESS; 117 | } 118 | if (leftover_args.size() < 1) { 119 | cerr << opt_parse.help_message() << endl; 120 | return EXIT_FAILURE; 121 | } 122 | vector filenames(leftover_args); 123 | /****************** END COMMAND LINE OPTIONS *****************/ 124 | ////////////////////////////////////////////////////////////// 125 | 126 | for (size_t i = 0; i < filenames.size(); ++i) 127 | cout << filenames[i] << "\t" 128 | << get_approx_line_count(VERBOSE, filenames[i], 129 | n_samples, sample_size) << endl; 130 | } 131 | catch (const runtime_error &e) { 132 | cerr << e.what() << endl; 133 | return EXIT_FAILURE; 134 | } 135 | catch (std::bad_alloc &ba) { 136 | cerr << "ERROR: could not allocate memory" << endl; 137 | return EXIT_FAILURE; 138 | } 139 | return EXIT_SUCCESS; 140 | } 141 | -------------------------------------------------------------------------------- /src/utils/lift-filter.cpp: -------------------------------------------------------------------------------- 1 | /* lift-filter: 
process lift results 2 | * 3 | * Copyright (C) 2014 University of Southern California and 4 | * Andrew D. Smith 5 | * 6 | * Authors: Jenny Qu 7 | * 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "smithlab_utils.hpp" 27 | #include "smithlab_os.hpp" 28 | #include "OptionParser.hpp" 29 | #include "MethpipeSite.hpp" 30 | 31 | using std::string; 32 | using std::vector; 33 | using std::cerr; 34 | using std::endl; 35 | using std::runtime_error; 36 | 37 | static bool 38 | same_chrom_pos_strand(const MSite &a, const MSite &b) { 39 | return a.pos == b.pos && a.chrom == b.chrom && a.strand == b.strand; 40 | } 41 | 42 | int 43 | main(int argc, const char **argv) { 44 | try{ 45 | string pfile; 46 | bool VERBOSE = false; 47 | bool UNIQUE = false; 48 | 49 | /****************** COMMAND LINE OPTIONS ********************/ 50 | OptionParser opt_parse(strip_path(argv[0]), 51 | "Process duplicated sites from fast-liftover output", 52 | ""); 53 | opt_parse.add_opt("output", 'o', "Output processed methcount", true, pfile); 54 | opt_parse.add_opt("unique", 'u', "keep unique sites", false, UNIQUE); 55 | opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE); 56 | 57 | vector leftover_args; 58 | opt_parse.parse(argc, argv, leftover_args); 59 | if (argc == 1 || opt_parse.help_requested()) { 60 | cerr << opt_parse.help_message() << endl; 61 | return EXIT_SUCCESS; 62 | } 63 | if 
(opt_parse.about_requested()) { 64 | cerr << opt_parse.about_message() << endl; 65 | return EXIT_SUCCESS; 66 | } 67 | if (opt_parse.option_missing()) { 68 | cerr << opt_parse.option_missing_message() << endl; 69 | return EXIT_SUCCESS; 70 | } 71 | if (leftover_args.empty()) { 72 | cerr << opt_parse.help_message() << endl; 73 | return EXIT_SUCCESS; 74 | } 75 | const string mfile(leftover_args.front()); 76 | /****************** END COMMAND LINE OPTIONS *****************/ 77 | 78 | std::ifstream in(mfile); 79 | if (!in) 80 | throw runtime_error("cannot open input file: " + mfile); 81 | 82 | std::ofstream out(pfile); 83 | //if (!of) 84 | // throw runtime_error("cannot open output file: " + pfile); 85 | //std::ostream out(of.rdbuf()); 86 | 87 | // read first site 88 | MSite curr_site; 89 | if (!(in >> curr_site)) 90 | throw runtime_error("failed reading: " + mfile); 91 | 92 | MSite next_site; 93 | bool site_is_unique = true; 94 | while (in >> next_site) { 95 | if (same_chrom_pos_strand(curr_site, next_site)) { 96 | site_is_unique = false; 97 | curr_site.add(next_site); 98 | } 99 | else { 100 | if (!UNIQUE || site_is_unique) 101 | out << curr_site << endl; 102 | site_is_unique = true; 103 | curr_site = next_site; 104 | } 105 | } 106 | if (!UNIQUE || site_is_unique) 107 | out << curr_site << endl; 108 | 109 | } 110 | catch (const runtime_error &e) { 111 | cerr << e.what() << endl; 112 | return EXIT_FAILURE; 113 | } 114 | catch (std::bad_alloc &ba) { 115 | cerr << "ERROR: could not allocate memory" << endl; 116 | return EXIT_FAILURE; 117 | } 118 | return EXIT_SUCCESS; 119 | } 120 | -------------------------------------------------------------------------------- /src/utils/selectsites.cpp: -------------------------------------------------------------------------------- 1 | /* selectsites: program to select sites, specified in a methcounts 2 | * format file, that are contained in given (bed format) intervals 3 | * 4 | * Copyright (C) 2019 Andrew D. 
Smith 5 | * 6 | * Authors: Andrew D. Smith 7 | * 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "OptionParser.hpp" 28 | #include "smithlab_utils.hpp" 29 | #include "smithlab_os.hpp" 30 | #include "GenomicRegion.hpp" 31 | #include "MethpipeSite.hpp" 32 | #include "zlib_wrapper.hpp" 33 | 34 | using std::string; 35 | using std::vector; 36 | using std::cout; 37 | using std::cerr; 38 | using std::endl; 39 | using std::ios_base; 40 | using std::runtime_error; 41 | using std::ifstream; 42 | 43 | 44 | static void 45 | collapsebed(vector ®ions) { 46 | size_t j = 0; 47 | for (size_t i = 1; i < regions.size(); ++i) { 48 | if (regions[j].same_chrom(regions[i]) && 49 | regions[i].get_start() <= regions[j].get_end()) { 50 | regions[j].set_end(std::max(regions[j].get_end(), regions[i].get_end())); 51 | } 52 | else { 53 | regions[++j] = regions[i]; 54 | } 55 | } 56 | regions.erase(begin(regions) + j + 1, end(regions)); 57 | } 58 | 59 | static bool 60 | precedes(const GenomicRegion &r, const MSite &s) { 61 | return (r.get_chrom() < s.chrom || 62 | (r.get_chrom() == s.chrom && r.get_end() <= s.pos)); 63 | } 64 | 65 | 66 | static bool 67 | contains(const GenomicRegion &r, const MSite &s) { 68 | return (r.get_chrom() == s.chrom && 69 | (r.get_start() <= s.pos && s.pos < r.get_end())); 70 | } 71 | 72 | template 73 | static void 74 | process_all_sites(const bool VERBOSE, 75 | const 
string &sites_file, 76 | const vector ®ions, 77 | T &out) { 78 | 79 | igzfstream in(sites_file); 80 | if (!in) 81 | throw runtime_error("cannot open file: " + sites_file); 82 | 83 | MSite the_site; 84 | size_t i = 0; 85 | while (in >> the_site) { 86 | while (i < regions.size() && precedes(regions[i], the_site)) 87 | ++i; 88 | 89 | if (contains(regions[i], the_site)) 90 | out << the_site << "\n"; 91 | } 92 | } 93 | 94 | 95 | //////////////////////////////////////////////////////////////////////// 96 | /// CODE BELOW HERE IS FOR SEARCHING ON DISK 97 | static void 98 | move_to_start_of_line(ifstream &in) { 99 | char next; 100 | while (in.good() && in.get(next) && next != '\n') { 101 | in.unget(); 102 | in.unget(); 103 | } 104 | if (in.bad()) 105 | // hope this only happens when hitting the start of the file 106 | in.clear(); 107 | } 108 | 109 | static void 110 | find_start_line(const string &chr, const size_t idx, ifstream &site_in) { 111 | 112 | site_in.seekg(0, ios_base::beg); 113 | const size_t begin_pos = site_in.tellg(); 114 | site_in.seekg(0, ios_base::end); 115 | const size_t end_pos = site_in.tellg(); 116 | 117 | if (end_pos - begin_pos < 2) 118 | throw runtime_error("empty meth file"); 119 | 120 | size_t step_size = (end_pos - begin_pos)/2; 121 | 122 | site_in.seekg(0, ios_base::beg); 123 | string low_chr; 124 | size_t low_idx = 0; 125 | if (!(site_in >> low_chr >> low_idx)) 126 | throw runtime_error("failed navigating inside file"); 127 | 128 | // MAGIC: need the -2 here to get past the EOF and possibly a '\n' 129 | site_in.seekg(-2, ios_base::end); 130 | move_to_start_of_line(site_in); 131 | string high_chr; 132 | size_t high_idx; 133 | if (!(site_in >> high_chr >> high_idx)) 134 | throw runtime_error("failed navigating inside file"); 135 | 136 | size_t pos = step_size; 137 | site_in.seekg(pos, ios_base::beg); 138 | move_to_start_of_line(site_in); 139 | 140 | while (step_size > 0) { 141 | string mid_chr; 142 | size_t mid_idx = 0; 143 | if (!(site_in >> 
mid_chr >> mid_idx)) 144 | throw runtime_error("failed navigating inside file"); 145 | step_size /= 2; 146 | if (chr < mid_chr || (chr == mid_chr && idx <= mid_idx)) { 147 | std::swap(mid_chr, high_chr); 148 | std::swap(mid_idx, high_idx); 149 | pos -= step_size; 150 | } 151 | else { 152 | std::swap(mid_chr, low_chr); 153 | std::swap(mid_idx, low_idx); 154 | pos += step_size; 155 | } 156 | site_in.seekg(pos, ios_base::beg); 157 | move_to_start_of_line(site_in); 158 | } 159 | } 160 | 161 | static void 162 | get_sites_in_region(ifstream &site_in, const GenomicRegion ®ion, 163 | std::ostream &out) { 164 | 165 | string chrom(region.get_chrom()); 166 | const size_t start_pos = region.get_start(); 167 | const size_t end_pos = region.get_end(); 168 | find_start_line(chrom, start_pos, site_in); 169 | 170 | MSite the_site; 171 | while (site_in >> the_site && (the_site.chrom == chrom && 172 | (the_site.pos < end_pos))) 173 | if (start_pos <= the_site.pos) 174 | out << the_site << endl; 175 | } 176 | 177 | static void 178 | process_with_sites_on_disk(const string &sites_file, 179 | vector ®ions, 180 | std::ostream &out) { 181 | 182 | ifstream in(sites_file); 183 | if (!in) 184 | throw runtime_error("cannot open file: " + sites_file); 185 | 186 | for (size_t i = 0; i < regions.size() && in; ++i) 187 | get_sites_in_region(in, regions[i], out); 188 | } 189 | /// END OF CODE FOR SEARCHING ON DISK 190 | //////////////////////////////////////////////////////////////////////// 191 | 192 | 193 | int 194 | main(int argc, const char **argv) { 195 | 196 | try { 197 | 198 | bool VERBOSE = false; 199 | bool LOAD_ENTIRE_FILE = false; 200 | 201 | string outfile; 202 | 203 | const string description = 204 | "Select sites inside a set of genomic intervals. " 205 | "Sites must be specified in methcounts format. 
" 206 | "Intervals must be specified in bed format."; 207 | 208 | /****************** COMMAND LINE OPTIONS ********************/ 209 | OptionParser opt_parse(strip_path(argv[0]), description, 210 | " ", 2); 211 | opt_parse.add_opt("output", 'o', "output file (default: stdout)", 212 | false, outfile); 213 | opt_parse.add_opt("preload", 'p', 214 | "preload sites (use for large target intervals)", 215 | false, LOAD_ENTIRE_FILE); 216 | opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); 217 | opt_parse.set_show_defaults(); 218 | vector leftover_args; 219 | opt_parse.parse(argc, argv, leftover_args); 220 | if (argc == 1 || opt_parse.help_requested()) { 221 | cerr << opt_parse.help_message() << endl 222 | << opt_parse.about_message() << endl; 223 | return EXIT_SUCCESS; 224 | } 225 | if (opt_parse.about_requested()) { 226 | cerr << opt_parse.about_message() << endl; 227 | return EXIT_SUCCESS; 228 | } 229 | if (opt_parse.option_missing()) { 230 | cerr << opt_parse.option_missing_message() << endl; 231 | return EXIT_SUCCESS; 232 | } 233 | if (leftover_args.size() != 2) { 234 | cerr << opt_parse.help_message() << endl; 235 | return EXIT_SUCCESS; 236 | } 237 | const string regions_file = leftover_args.front(); 238 | const string sites_file = leftover_args.back(); 239 | /****************** END COMMAND LINE OPTIONS *****************/ 240 | 241 | vector regions; 242 | ReadBEDFile(regions_file, regions); 243 | if (!check_sorted(regions)) 244 | throw runtime_error("regions not sorted in file: " + regions_file); 245 | 246 | const size_t n_orig_regions = regions.size(); 247 | collapsebed(regions); 248 | if (VERBOSE && n_orig_regions != regions.size()) 249 | cerr << "[number of regions merged due to overlap: " 250 | << n_orig_regions - regions.size() << "]" << endl; 251 | 252 | if (outfile.empty() || !has_gz_ext(outfile)) { 253 | std::ofstream of; 254 | if (!outfile.empty()) of.open(outfile.c_str()); 255 | std::ostream out(outfile.empty() ? 
cout.rdbuf() : of.rdbuf()); 256 | if (!outfile.empty() && !out) 257 | throw runtime_error("failed to open output file: " + outfile); 258 | 259 | if (LOAD_ENTIRE_FILE) 260 | process_all_sites(VERBOSE, sites_file, regions, out); 261 | else 262 | process_with_sites_on_disk(sites_file, regions, out); 263 | } 264 | else { 265 | // not supporting search on disk for gz file 266 | ogzfstream out(outfile); 267 | process_all_sites(VERBOSE, sites_file, regions, out); 268 | } 269 | } 270 | catch (const runtime_error &e) { 271 | cerr << e.what() << endl; 272 | return EXIT_FAILURE; 273 | } 274 | catch (std::bad_alloc &ba) { 275 | cerr << "ERROR: could not allocate memory" << endl; 276 | return EXIT_FAILURE; 277 | } 278 | return EXIT_SUCCESS; 279 | } 280 | -------------------------------------------------------------------------------- /src/utils/symmetric-cpgs.cpp: -------------------------------------------------------------------------------- 1 | /* symmetric-cpgs: extract the CpG sites from a methcounts output 2 | * file and produce a new one with the CpGs treated unstranded. 3 | * 4 | * Copyright (C) 2014 University of Southern California and 5 | * Andrew D. Smith 6 | * 7 | * Authors: Andrew D. Smith 8 | * 9 | * This program is free software: you can redistribute it and/or 10 | * modify it under the terms of the GNU General Public License as 11 | * published by the Free Software Foundation, either version 3 of the 12 | * License, or (at your option) any later version. 13 | * 14 | * This program is distributed in the hope that it will be useful, but 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | * General Public License for more details. 
18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | // from smithlab_cpp 27 | #include "OptionParser.hpp" 28 | #include "smithlab_utils.hpp" 29 | #include "smithlab_os.hpp" 30 | #include "zlib_wrapper.hpp" 31 | 32 | // from methpipe/common 33 | #include "MethpipeSite.hpp" 34 | 35 | #include "zlib_wrapper.hpp" 36 | 37 | using std::string; 38 | using std::cout; 39 | using std::cerr; 40 | using std::endl; 41 | 42 | 43 | inline bool 44 | found_symmetric(const MSite &prev_cpg, const MSite &curr_cpg) { 45 | // assumes check for CpG already done 46 | return (prev_cpg.strand == '+' && 47 | curr_cpg.strand == '-' && 48 | prev_cpg.pos + 1 == curr_cpg.pos); 49 | } 50 | 51 | template 52 | static void 53 | process_sites(const bool include_mutated, igzfstream &in, T &out) { 54 | 55 | MSite prev_site, curr_site; 56 | bool prev_is_good_cpg = false; 57 | if (in >> prev_site) 58 | if (prev_site.is_cpg() && (include_mutated || !prev_site.is_mutated())) 59 | prev_is_good_cpg = true; 60 | 61 | while (in >> curr_site) { 62 | if (curr_site.is_cpg() && (include_mutated || !curr_site.is_mutated())) { 63 | if (prev_is_good_cpg && found_symmetric(prev_site, curr_site)) { 64 | prev_site.add(curr_site); 65 | out << prev_site << '\n'; 66 | } 67 | prev_is_good_cpg = true; 68 | } 69 | else prev_is_good_cpg = false; 70 | std::swap(prev_site, curr_site); 71 | } 72 | } 73 | 74 | int 75 | main(int argc, const char **argv) { 76 | 77 | try { 78 | 79 | string outfile; 80 | bool VERBOSE; 81 | bool include_mutated = false; 82 | 83 | const string description = 84 | "Get CpG sites and make methylation levels symmetric."; 85 | 86 | /****************** COMMAND LINE OPTIONS ********************/ 87 | OptionParser opt_parse(strip_path(argv[0]), 88 | description, ""); 89 | opt_parse.add_opt("output", 'o', "output file (default: stdout)", 90 | false, outfile); 91 | opt_parse.add_opt("muts", 'm', "include mutated CpG sites", 92 | false, include_mutated); 93 | 
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); 94 | std::vector leftover_args; 95 | opt_parse.parse(argc, argv, leftover_args); 96 | if (argc == 1 || opt_parse.help_requested()) { 97 | cerr << opt_parse.help_message() << endl 98 | << opt_parse.about_message() << endl; 99 | return EXIT_SUCCESS; 100 | } 101 | if (opt_parse.about_requested()) { 102 | cerr << opt_parse.about_message() << endl; 103 | return EXIT_SUCCESS; 104 | } 105 | if (opt_parse.option_missing()) { 106 | cerr << opt_parse.option_missing_message() << endl; 107 | return EXIT_SUCCESS; 108 | } 109 | if (leftover_args.size() != 1) { 110 | cerr << opt_parse.help_message() << endl; 111 | return EXIT_SUCCESS; 112 | } 113 | const string filename(leftover_args.front()); 114 | /****************** END COMMAND LINE OPTIONS *****************/ 115 | 116 | igzfstream in(filename); 117 | if (!in) 118 | throw std::runtime_error("could not open file: " + filename); 119 | 120 | if (outfile.empty() || !has_gz_ext(outfile)) { 121 | std::ofstream of; 122 | if (!outfile.empty()) of.open(outfile.c_str()); 123 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 124 | process_sites(include_mutated, in, out); 125 | } 126 | else { 127 | ogzfstream out(outfile); 128 | process_sites(include_mutated, in, out); 129 | } 130 | } 131 | catch (const std::runtime_error &e) { 132 | cerr << e.what() << endl; 133 | return EXIT_FAILURE; 134 | } 135 | return EXIT_SUCCESS; 136 | } 137 | --------------------------------------------------------------------------------