├── bin ├── pyhammlet │ ├── __init__.py │ ├── palette.pdf │ ├── palette.txt │ ├── io.py │ ├── RLE.py │ └── plotting.py ├── sortStates └── samToCounts ├── lib └── gzstream │ ├── version │ ├── logo.gif │ ├── README │ ├── test_gunzip.C │ ├── test_gzip.C │ ├── Makefile │ ├── gzstream.h │ ├── gzstream.C │ └── index.html ├── logo ├── logo.png ├── logo.xcf ├── logo-inv.png ├── logo-inv.xcf ├── logo-round.png ├── logo-round.xcf ├── logo-inv-noborder.png ├── logo-inv-noborder.xcf ├── logo-round-250px.png ├── logo-round-250px.xcf ├── logo-boxdrawing.txt ├── logo-boxdrawing-centered.groff └── logo-round.base64 ├── doc ├── hammlet-manpage-a4.pdf ├── hammlet-manpage-letter.pdf ├── man-preamble.tex ├── hammlet.bib └── pandoc.css ├── INSTALL.txt ├── src ├── TransitionHyperParam.hpp ├── Blocks.hpp ├── Statistics.hpp ├── InitialHyperParam.hpp ├── tools │ ├── avg.cpp │ ├── maxSegmentation.cpp │ ├── MappedValues.hpp │ ├── GenomeGetter.hpp │ ├── combineCounts.cpp │ └── mapLinesToGenome.cpp ├── Initial.hpp ├── KahanAggregator.hpp ├── Trellis.hpp ├── MultiVector.hpp ├── Blocks │ ├── FixedBlocks.hpp │ ├── SplittableBlocks.hpp │ └── BreakpointArray.hpp ├── Emissions.hpp ├── Transitions.hpp ├── includes.hpp ├── EFD.hpp ├── ThetaHyperParam.hpp ├── Tags.hpp ├── StateSequence.hpp ├── AutoPriors.hpp ├── HMM.hpp ├── uintmath.hpp ├── Mapping.hpp ├── StateMarginalsIterator.hpp ├── StateSequence │ ├── Mixture.hpp │ └── ForwardBackward.hpp ├── utils.hpp ├── Distribution.hpp ├── Conjugate.hpp ├── Theta.hpp ├── wavelet.hpp ├── Records.hpp ├── Statistics │ └── IntegralArray.hpp └── SufficientStatistics.hpp ├── README.md └── Makefile /bin/pyhammlet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lib/gzstream/version: -------------------------------------------------------------------------------- 1 | 1.5 (08 Jan 2003) 2 | 
-------------------------------------------------------------------------------- /logo/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo.png -------------------------------------------------------------------------------- /logo/logo.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo.xcf -------------------------------------------------------------------------------- /logo/logo-inv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv.png -------------------------------------------------------------------------------- /logo/logo-inv.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv.xcf -------------------------------------------------------------------------------- /logo/logo-round.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round.png -------------------------------------------------------------------------------- /logo/logo-round.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round.xcf -------------------------------------------------------------------------------- /lib/gzstream/logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/lib/gzstream/logo.gif -------------------------------------------------------------------------------- /bin/pyhammlet/palette.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/bin/pyhammlet/palette.pdf -------------------------------------------------------------------------------- /doc/hammlet-manpage-a4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/doc/hammlet-manpage-a4.pdf -------------------------------------------------------------------------------- /logo/logo-inv-noborder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv-noborder.png -------------------------------------------------------------------------------- /logo/logo-inv-noborder.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv-noborder.xcf -------------------------------------------------------------------------------- /logo/logo-round-250px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round-250px.png -------------------------------------------------------------------------------- /logo/logo-round-250px.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round-250px.xcf -------------------------------------------------------------------------------- /doc/hammlet-manpage-letter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/doc/hammlet-manpage-letter.pdf -------------------------------------------------------------------------------- /INSTALL.txt: 
-------------------------------------------------------------------------------- 1 | To install, run make, or simply use a C++11-compliant compiler, e.g. 2 | g++ -O3 --std=c++11 -o hammlet main.cpp 3 | 4 | -------------------------------------------------------------------------------- /logo/logo-boxdrawing.txt: -------------------------------------------------------------------------------- 1 | ┏━━━━━┓ ┏━━━━━┓ 2 | ┣━━━━━┫ ┃ ┏━━━┫ 3 | ┃ ┏━━━┫ ┃ ┃ ┏━┫ 4 | ┃ ┃ ┏━┻━━━━━┫ ┃ ┃ ┃ 5 | ┃ ┃ ┃ ┏━━━━━┫ ┃ ┃ ┃ 6 | ┃ ┃ ┃ ┣━━━━━┛ ┃ ┃ ┃ 7 | ┃ ┃ ┃ ┣━━━━━┳━┛ ┃ ┃ 8 | ┣━┛ ┃ ┃ ┣━━━┛ ┃ 9 | ┣━━━┛ ┃ ┣━━━━━┫ 10 | ┗━━━━━┛ ┗━━━━━┛ -------------------------------------------------------------------------------- /src/TransitionHyperParam.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TRANSITIONSHYPERPARAM_HPP 2 | #define TRANSITIONSHYPERPARAM_HPP 3 | 4 | #include "Tags.hpp" 5 | 6 | 7 | template 8 | using TransitionHyperParam = Conjugate; 9 | 10 | 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/Blocks.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BLOCKS_HPP 2 | #define BLOCKS_HPP 3 | 4 | 5 | template 6 | class Blocks; 7 | 8 | #include "Blocks/BreakpointArray.hpp" 9 | #include "Blocks/SplittableBlocks.hpp" 10 | #include "Blocks/FixedBlocks.hpp" 11 | 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /lib/gzstream/README: -------------------------------------------------------------------------------- 1 | 2 | gzstream 3 | C++ iostream classes wrapping the zlib compression library. 4 | =========================================================================== 5 | 6 | See index.html for documentation and installation instructions. 
7 | -------------------------------------------------------------------------------- /bin/sortStates: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Sort by absolute value of last emission means 4 | # $1 - parameter file created by HaMMLET 5 | 6 | echo "#state mean" 7 | tail -n 1 $1 | awk '{ for (i=1;i<=NF;i+=2) print (i-1)/2"\t"$i }' | sed -r 's/-([^-]+)/\1\t-/g;' | sort -k 2 -n -r | awk '{print $1"\t"$3$2}' -------------------------------------------------------------------------------- /src/Statistics.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STATISTICS_HPP 2 | #define STATISTICS_HPP 3 | 4 | #include "SufficientStatistics.hpp" 5 | 6 | template 7 | class Statistics; 8 | 9 | #include "Blocks.hpp" 10 | 11 | #include "Statistics/IntegralArray.hpp" 12 | // #include "Statistics/Fixed.hpp" 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /src/InitialHyperParam.hpp: -------------------------------------------------------------------------------- 1 | #ifndef INITIALHYPERPARAM_HPP 2 | #define INITIALHYPERPARAM_HPP 3 | 4 | #include "includes.hpp" 5 | #include "Tags.hpp" 6 | #include "Conjugate.hpp" 7 | 8 | 9 | // for the time being, tau is an alias for conjugates 10 | template 11 | using InitialHyperParam = Conjugate; 12 | 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /logo/logo-boxdrawing-centered.groff: -------------------------------------------------------------------------------- 1 | .PP 2 | .ce 3 | ┏━━━━━┓ ┏━━━━━┓ 4 | .ce 5 | ┣━━━━━┫ ┃ ┏━━━┫ 6 | .ce 7 | ┃ ┏━━━┫ ┃ ┃ ┏━┫ 8 | .ce 9 | ┃ ┃ ┏━┻━━━━━┫ ┃ ┃ ┃ 10 | .ce 11 | ┃ ┃ ┃ ┏━━━━━┫ ┃ ┃ ┃ 12 | .ce 13 | ┃ ┃ ┃ ┣━━━━━┛ ┃ ┃ ┃ 14 | .ce 15 | ┃ ┃ ┃ ┣━━━━━┳━┛ ┃ ┃ 16 | .ce 17 | ┣━┛ ┃ ┃ ┣━━━┛ ┃ 18 | .ce 19 | ┣━━━┛ ┃ ┣━━━━━┫ 20 | .ce 21 | ┗━━━━━┛ ┗━━━━━┛ 22 | 
-------------------------------------------------------------------------------- /doc/man-preamble.tex: -------------------------------------------------------------------------------- 1 | \usepackage[charter]{mathdesign} 2 | \setlength{\columnsep}{1cm} 3 | \usepackage{microtype} 4 | \DisableLigatures[-]{} % disable replacement of -- 5 | \usepackage{breakurl} 6 | \usepackage{graphicx} 7 | 8 | \usepackage{eso-pic} 9 | \newcommand\AtPageUpperRight[1]{\AtPageUpperLeft{\makebox[\paperwidth][r]{#1}}} 10 | \AddToShipoutPictureBG{ 11 | \AtPageUpperRight{\raisebox{-\height}{\includegraphics[width=0.6in, keepaspectratio]{logo/logo-inv.png}}} 12 | } 13 | 14 | 15 | \hyphenation{white-space} 16 | -------------------------------------------------------------------------------- /bin/pyhammlet/palette.txt: -------------------------------------------------------------------------------- 1 | #a6cee3 2 | #1f78b4 3 | #b2df8a 4 | #33a02c 5 | #fb9a99 6 | #e31a1c 7 | #fdbf6f 8 | #ff7f00 9 | #cab2d6 10 | #6a3d9a 11 | #ffff99 12 | #b15928 13 | #437e6a 14 | #ba78c6 15 | #694d48 16 | #9d840b 17 | #306928 18 | #ba768c 19 | #849b6d 20 | #698af8 21 | #6bf9df 22 | #39fb3f 23 | #bb32a7 24 | #ebca0c 25 | #16b0c9 26 | #8f94b9 27 | #3da88a 28 | #726441 29 | #f0d4c0 30 | #f6208f 31 | #6047f7 32 | #d21b4f 33 | #d0dcca 34 | #c96f63 35 | #6d5d7b 36 | #bba09a 37 | #a0ad3d 38 | #8a3565 39 | #778884 40 | #b99164 41 | #236575 42 | #6c8117 43 | #817577 44 | #f893d0 45 | #525952 46 | #f3edfe 47 | #2cd58b 48 | #27a0ea 49 | #3a5680 50 | #fd28ff 51 | #933b30 52 | #03e8ff 53 | #cdc190 54 | #7d6bb7 55 | #7a4a0f 56 | #8fc0b8 -------------------------------------------------------------------------------- /doc/hammlet.bib: -------------------------------------------------------------------------------- 1 | @article {hammlet, 2 | author = {Wiedenhoeft, John and Brugel, Eric and Schliep, Alexander}, 3 | title = {Fast Bayesian Inference of Copy Number Variants using Hidden Markov Models with Wavelet Compression}, 4 | 
year = {2016}, 5 | doi = {10.1371/journal.pcbi.1004871}, 6 | journal = {PLOS Computational Biology}, 7 | volume={12}, 8 | number={5}, 9 | url={http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004871} 10 | } 11 | 12 | @inproceedings{hammletrecomb, 13 | author = {Wiedenhoeft, John and Brugel, Eric and Schliep, Alexander}, 14 | title = {Fast Bayesian Inference of Copy Number Variants using Hidden Markov Models with Wavelet Compression}, 15 | year = {2016}, 16 | booktitle={Research in Computational Molecular Biology: 20th Annual Conference, RECOMB 2016}, 17 | isbn={9783319319575} 18 | } 19 | -------------------------------------------------------------------------------- /src/tools/avg.cpp: -------------------------------------------------------------------------------- 1 | // reads data stream and puts average of non-overlapping windows to stdout 2 | #include 3 | using std::ostream; 4 | using std::endl; 5 | using std::cin; 6 | using std::cout; 7 | using std::flush; 8 | 9 | 10 | #include 11 | using std::runtime_error; 12 | using std::exception; 13 | 14 | #include 15 | using std::istringstream; 16 | 17 | int main( int argc, const char* argv[] ) { 18 | 19 | 20 | if ( argc <= 1 ) { 21 | throw runtime_error( "Not enough arguments!" 
); 22 | } 23 | 24 | istringstream ss( argv[1] ); 25 | size_t windowSize; 26 | ss >> windowSize ; 27 | float v; 28 | float sum = 0; 29 | size_t pos = 0; 30 | while ( cin >> v ) { 31 | sum += v; 32 | pos++; 33 | if ( pos == windowSize ) { 34 | cout << sum / pos << endl; 35 | pos = 0; 36 | sum = 0; 37 | } 38 | } 39 | if ( pos != 0 ) { 40 | cout << sum / pos << endl; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![HaMMLET](https://github.com/wiedenhoeft/HaMMLET/blob/dev/logo/logo-inv-noborder.png) 2 | 3 | HaMMLET – Fast Bayesian HMM segmentation for big data 4 | ===================================================== 5 | 6 | This software implements Forward-Backward Gibbs sampling for Bayesian segmentation in Hidden Markov Models (HMM). It uses dynamic wavelet compression to drastically improve convergence and memory consumption, making inference possible on large-scale data. 7 | 8 | For instance, HaMMLET can be used on a regular laptop for segmentation of genomic data, such as array-CGH or depth-of coverage from whole-genome sequencing (WGS), to find candidates for copy-number variants (CNV). For details, please refer to the doc/ directory. 9 | 10 | For implementation details and the theory behind this approach, please refer to my [thesis](https://rucore.libraries.rutgers.edu/rutgers-lib/59275/) (DOI: [10.7282/t3-4e1k-ph18](https://doi.org/doi:10.7282/t3-4e1k-ph18)). 11 | -------------------------------------------------------------------------------- /src/Initial.hpp: -------------------------------------------------------------------------------- 1 | #ifndef INITIAL_HPP 2 | #define INITIAL_HPP 3 | 4 | 5 | #include "includes.hpp" 6 | #include "Tags.hpp" 7 | #include "Distribution.hpp" 8 | #include "InitialHyperParam.hpp" 9 | #include "StateSequence.hpp" 10 | #include "Transitions.hpp" 11 | 12 | template // e.g. 
Dirichlet 13 | class Initial { 14 | 15 | Observation mValue; 16 | SufficientStatistics< Categorical > mCounts; 17 | Distribution mDist; 18 | 19 | public: 20 | 21 | // delete copy constructor 22 | Initial( const Initial& that ) = delete; 23 | 24 | Initial( size_t nrStates, rng_t& RNG ) : 25 | mValue( nrStates ), 26 | mCounts( nrStates ), 27 | mDist( RNG ) {} 28 | 29 | Initial( vector< real_t>& vec, rng_t& RNG ) : 30 | mValue( vec ), 31 | mDist( RNG ) {}; 32 | 33 | 34 | template 35 | void sample( 36 | InitialHyperParamType& tau_pi // NOTE tau_pi cannot be constant since we reset it after sampling 37 | ) { 38 | mDist.resample( mValue, tau_pi.posterior() ); 39 | tau_pi.reset(); 40 | } 41 | 42 | 43 | 44 | vector valueVector() const { // NOTE this intermediate level is necessary, since dist might be a more complicated structure than a simple probability vector itself, e.g. when using Dirichlet process priors 45 | return mValue.probs(); 46 | }; 47 | 48 | size_t nrStates() const { 49 | return mValue.domainSize(); 50 | } 51 | 52 | 53 | string str() const { 54 | return mValue.str(); 55 | } 56 | 57 | 58 | }; 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/KahanAggregator.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KAHANAGGREGATOR_HPP 2 | #define KAHANAGGREGATOR_HPP 3 | 4 | // This class aggregates values into a sum, while keeping track of the error using Kahan's algorithm. 
Subtractions are stored separately so that only a single subtraction is used when calling sum() 5 | template 6 | class KahanAggregator { 7 | T mPosSum; 8 | T mNegSum; 9 | T mPosError; 10 | T mNegError; 11 | size_t mN; // number of terms 12 | 13 | 14 | public: 15 | 16 | // TODO operator+, operator() 17 | 18 | KahanAggregator(): 19 | mPosSum( 0 ), 20 | mNegSum( 0 ), 21 | mPosError( 0 ), 22 | mNegError( 0 ), 23 | mN( 0 ) {} 24 | 25 | 26 | void add( T x, size_t N = 1 ) { 27 | T y = x - mPosError; 28 | T temp = mPosSum + y; 29 | mPosError = ( temp - mPosSum ) - y; 30 | mPosSum = temp; 31 | mN += N; 32 | } 33 | 34 | 35 | void subtract( T x, size_t N = 1 ) { 36 | T y = x - mNegError; 37 | T temp = mNegSum + y; 38 | mNegError = ( temp - mNegSum ) - y; 39 | mNegSum = temp; 40 | mN += N; 41 | } 42 | 43 | T sum() const { 44 | return mPosSum - mNegSum; 45 | } 46 | 47 | T error() const { 48 | return mPosError + mNegError; // TODO minus? 49 | } 50 | 51 | size_t nrTerms() const { 52 | return mN; 53 | } 54 | 55 | void setNrTerms( size_t N ) { 56 | mN = N; 57 | } 58 | 59 | void reset() { 60 | mPosSum = T( 0 ); 61 | mPosError = T( 0 ); 62 | mNegSum = T( 0 ); 63 | mNegError = T( 0 ); 64 | mN = 0; 65 | } 66 | }; 67 | 68 | #endif 69 | 70 | -------------------------------------------------------------------------------- /src/Trellis.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TRELLIS_HPP 2 | #define TRELLIS_HPP 3 | 4 | #include 5 | 6 | #include "uintmath.hpp" 7 | 8 | class Trellis { 9 | 10 | vector mVec; 11 | size_t mNrStates; 12 | rng_t& mRNG; 13 | 14 | void assertRange( size_t d ) const { 15 | if ( d >= mNrStates ) { 16 | throw runtime_error( "Trellis dimension index out of bounds!" 
); 17 | } 18 | } 19 | public: 20 | 21 | // delete copy constructor 22 | Trellis( const Trellis& that ) = delete; 23 | 24 | Trellis( rng_t& RNG ): 25 | mNrStates( 2 ) , 26 | mRNG( RNG ) 27 | {}; 28 | 29 | Trellis( size_t nrStates, rng_t& RNG ): 30 | mNrStates( nrStates ) , 31 | mRNG( RNG ) 32 | {}; 33 | 34 | 35 | real_t& operator()( size_t t, size_t d ) { 36 | assertRange( d ); 37 | return mVec[t * mNrStates + d]; 38 | } 39 | 40 | 41 | // return reference to last element at dimension d 42 | real_t& back( size_t d ) { 43 | return mVec[mVec.size() - mNrStates + d]; 44 | } 45 | 46 | 47 | // this interface is for cases when the number of states is not known beforehand, e.g. for Dirichlet Process Priors 48 | void setNrStates( size_t K ) { 49 | mNrStates = K; 50 | } 51 | 52 | size_t size() const { 53 | return divide( mVec.size(), mNrStates ); 54 | } 55 | 56 | 57 | void push_back( const vector& vec ) { 58 | mVec.insert( mVec.end(), vec.begin(), vec.end() ); 59 | } 60 | 61 | size_t sample( size_t t ) const { 62 | discrete_distribution dist( mVec.begin() + ( t * mNrStates ), mVec.begin() + ( ( t + 1 )*mNrStates ) ); 63 | size_t result = dist( mRNG ); 64 | dist.reset(); 65 | return result; 66 | } 67 | 68 | void reserve( size_t N ) { 69 | mVec.reserve( N * mNrStates ); 70 | } 71 | 72 | 73 | void clear() { 74 | mVec.clear(); 75 | } 76 | }; 77 | 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /src/tools/maxSegmentation.cpp: -------------------------------------------------------------------------------- 1 | // Given a file of state marginals, compute the maximum posterior margins. 
2 | 3 | #include "../Parser.hpp" 4 | 5 | 6 | #include 7 | using std::cin; 8 | using std::cout; 9 | using std::endl; 10 | using std::istream; 11 | using std::ostream; 12 | 13 | #include 14 | using std::ifstream; 15 | using std::ofstream; 16 | 17 | #include 18 | using std::stoi; 19 | using std::getline; 20 | 21 | #include 22 | using std::runtime_error; 23 | 24 | #include 25 | using std::stringstream; 26 | 27 | int main( int argc, const char* argv[] ) { 28 | 29 | Parser args( argc, argv ); 30 | args.registerFlags( {"-i", "-infile"}, "" ); 31 | args.registerFlags( {"-h", "--help", "-help"}, "" ); 32 | args.parseArgs(); 33 | 34 | if ( args.isSet( "-h" ) ) { 35 | cout << "Given a marginals file (-i) or input from STDIN, computes the maximum posterior margins segmentation, combining adjacent segments whenever possible." << endl; 36 | return 0; 37 | } 38 | 39 | const bool readFromFile = args.isSet( "-i" ); 40 | ifstream realInFile; 41 | if ( readFromFile ) { 42 | realInFile.open( args.parse( "-i" ), std::ios::in ); 43 | } 44 | istream& inFile = ( readFromFile ? 
realInFile : cin ); 45 | 46 | string line; 47 | size_t count; 48 | size_t RLE = 0; 49 | size_t totalRLE = 0; 50 | size_t col = 0; 51 | size_t maxCol = 0; 52 | size_t index = 0; 53 | size_t maxIndex = 0; 54 | size_t prevIndex = 0; 55 | 56 | while ( getline( inFile, line ) ) { 57 | stringstream ss( line ); 58 | 59 | index = 0; 60 | maxIndex = 0; 61 | col=0; 62 | maxCol = 0; 63 | ss >> RLE; 64 | while ( ss >> col ) { 65 | if ( col > maxCol ) { 66 | maxIndex = index; 67 | maxCol = col; 68 | } 69 | index++; 70 | } 71 | 72 | if ( maxIndex == prevIndex ) { 73 | totalRLE += RLE; 74 | } else { 75 | cout << totalRLE << "\t" << prevIndex << endl; 76 | totalRLE = RLE; 77 | prevIndex = maxIndex; 78 | } 79 | 80 | } 81 | cout << totalRLE << "\t" << maxIndex << endl; 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/MultiVector.hpp: -------------------------------------------------------------------------------- 1 | #ifndef MULTIVECTOR_HPP 2 | #define MULTIVECTOR_HPP 3 | 4 | #include 5 | using std::vector; 6 | 7 | #include 8 | using std::runtime_error; 9 | 10 | template 11 | class MultiVector { 12 | vector mVec; 13 | const size_t mNrDim; 14 | 15 | public: 16 | 17 | 18 | MultiVector( 19 | size_t nrDim 20 | ): 21 | mNrDim( nrDim ) { 22 | if ( mNrDim <= 0 ) { 23 | throw runtime_error( "Number of dimensions in multivector must be positive!" ); 24 | } 25 | } 26 | 27 | MultiVector( 28 | T entry, 29 | size_t size, 30 | size_t nrDim 31 | ): 32 | mVec( nrDim * size, entry ), // vector( count, value ): nrDim * size copies of entry 33 | mNrDim( nrDim ) { 34 | if ( mNrDim <= 0 ) { 35 | throw runtime_error( "Number of dimensions in multivector must be positive!" ); 36 | } 37 | } 38 | 39 | 40 | // direct access to underlying vector 41 | inline T& operator[]( size_t i ) { 42 | if ( i >= mVec.size() ) { 43 | throw runtime_error( "Direct index out of bounds for multivector!" 
); 44 | } 45 | return mVec[i]; 46 | } 47 | 48 | inline const T& operator[]( size_t i ) const { 49 | if ( i >= mVec.size() ) { 50 | throw runtime_error( "Direct index out of bounds for multivector!" ); 51 | } 52 | return mVec[i]; 53 | } 54 | 55 | 56 | inline T& operator()( size_t pos, size_t dim ) { 57 | if ( dim >= mNrDim ) { 58 | throw runtime_error( "Multivector dimension index out of bounds!" ); 59 | } 60 | const size_t i = pos * mNrDim + dim; 61 | if ( i < mVec.size() ) { 62 | return mVec[i]; 63 | } else { 64 | throw runtime_error( "Multivector index out of bounds!" ); 65 | } 66 | } 67 | 68 | size_t size() const { 69 | return mVec.size() / mNrDim; 70 | } 71 | 72 | void reserve( size_t size ) { 73 | mVec.reserve( mNrDim * size ); 74 | } 75 | 76 | 77 | void resize( size_t N ) { 78 | mVec.resize( N * mNrDim ); 79 | } 80 | 81 | 82 | void push_back( T& entry ) { 83 | mVec.reserve( mVec.size() + mNrDim ); 84 | for ( size_t d = 0; d < mNrDim; ++d ) { 85 | mVec.push_back( entry ); 86 | } 87 | } 88 | 89 | 90 | void swap( vector& vec ) { 91 | const size_t s = vec.size(); // check the incoming vector, which becomes the new contents 92 | if ( s != ( s / mNrDim )*mNrDim ) { 93 | throw runtime_error( "Cannot swap into multivector, size is not a multiple of dimensions!" 
); 94 | } 95 | mVec.swap( vec ); 96 | } 97 | 98 | size_t nrDim()const { 99 | return mNrDim; 100 | } 101 | }; 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /src/Blocks/FixedBlocks.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FIXEDBLOCKS_HPP 2 | #define FIXEDBLOCKS_HPP 3 | 4 | template<> 5 | class Blocks { 6 | // number of input positions 7 | const size_t mSize; 8 | const vector& mSizes; 9 | 10 | size_t mBlockCounter; 11 | // the maximum size that an iterator can jump forward (pruning limit) 12 | Direction mDirection; 13 | 14 | // the boundaries of the current block 15 | size_t mBlockStart; 16 | size_t mBlockEnd; 17 | size_t mBlockSize; 18 | 19 | public: 20 | 21 | // delete copy constructor 22 | Blocks( const Blocks& that ) = delete; 23 | 24 | 25 | // NOTE this constructor only stores a reference to its input vector, i.e. the caller's vector must outlive this object 26 | Blocks( 27 | vector& sizes 28 | ) : 29 | mSizes( sizes ), 30 | mBlockCounter( 0 ), 31 | mSize( accumulate( sizes.begin(), sizes.end(), size_t( 0 ) ) ), // size_t init so the total is not accumulated in int 32 | mDirection( unset ), 33 | mBlockStart( 0 ), 34 | mBlockEnd( 0 ), mBlockSize( 0 ) { 35 | 36 | // check that weights contain data 37 | if ( mSize <= 0 ) { 38 | throw runtime_error( "Input vector for breakpoint weights is empty!" 
); 39 | } 40 | }; 41 | 42 | 43 | template 44 | void createBlocks( Theta& param ); 45 | 46 | void initForward() { 47 | mDirection = forward; 48 | mBlockStart = 0; 49 | mBlockEnd = 0; 50 | mBlockSize = 0; 51 | mBlockCounter = 0; 52 | // initialize block at 0 53 | } 54 | 55 | 56 | 57 | // get the end of a block starting at for a given threshold 58 | // return false if the block end is the last possible value 59 | inline bool next() { 60 | if ( mBlockEnd >= mSize ) { 61 | mDirection = unset; 62 | return false; 63 | } else { 64 | mBlockStart = mBlockEnd; 65 | mBlockEnd = mBlockStart + mSizes[mBlockCounter]; 66 | mBlockCounter++; 67 | mBlockSize = mBlockEnd - mBlockStart; 68 | return true; 69 | } 70 | } 71 | 72 | 73 | size_t nrBlocks() const { 74 | return mSizes.size(); 75 | } 76 | 77 | size_t start() const { 78 | return mBlockStart; 79 | } 80 | 81 | size_t end() const { 82 | return mBlockEnd; 83 | } 84 | 85 | size_t pos() const { 86 | if ( mBlockCounter > 0 ) { 87 | return mBlockCounter - 1; 88 | } else { 89 | throw runtime_error( "No blocks created yet, position is undefined!" 
); 90 | } 91 | } 92 | 93 | size_t N() const { 94 | return mBlockSize ; 95 | } 96 | 97 | size_t T() const { 98 | return mSize; 99 | } 100 | 101 | void printBlock() const { 102 | cout << "[" << mBlockStart << ":" << mBlockEnd << ") " << mBlockSize << " "; 103 | } 104 | 105 | 106 | }; 107 | 108 | #endif 109 | -------------------------------------------------------------------------------- /src/Emissions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EMISSIONS_HPP 2 | #define EMISSIONS_HPP 3 | 4 | #include "includes.hpp" 5 | #include "Tags.hpp" 6 | #include "Blocks.hpp" 7 | #include "Statistics.hpp" 8 | 9 | 10 | // A wrapper around a combination of a data structure holding data points/sufficient statistics and an associated block structure 11 | template 12 | class Emissions, Blocks> { 13 | 14 | Statistics& mStats; 15 | Blocks& mBlocks; 16 | 17 | public: 18 | 19 | Emissions( 20 | Statistics& stats, 21 | Blocks& blocks 22 | ): 23 | mStats( stats ), 24 | mBlocks( blocks ) { 25 | if ( mStats.size() != mBlocks.size() ) { 26 | throw runtime_error( "Block structure and statistics have different number of data points!" 
); 27 | } 28 | } 29 | 30 | 31 | Statistics& stats() { 32 | return mStats; 33 | } 34 | 35 | Blocks& blocks() { 36 | return mBlocks; 37 | } 38 | 39 | 40 | const Statistics& stats() const { 41 | return mStats; 42 | } 43 | 44 | const Blocks& blocks() const { 45 | return mBlocks; 46 | } 47 | 48 | 49 | 50 | void createBlocks( real_t thresh ) { 51 | mBlocks.createBlocks( thresh ); 52 | } 53 | 54 | template 55 | void createBlocks( const Theta& theta ) { 56 | mBlocks.createBlocks( theta ); 57 | } 58 | 59 | 60 | size_t nrBlocks() const { 61 | return mBlocks.nrBlocks(); 62 | } 63 | 64 | size_t nrDim() const { 65 | return mStats.nrDim(); 66 | } 67 | 68 | size_t start() const { 69 | return mBlocks.start(); 70 | } 71 | 72 | size_t end() const { 73 | return mBlocks.end(); 74 | } 75 | 76 | size_t blockSize() const { 77 | return mBlocks.blockSize(); 78 | } 79 | 80 | size_t size() const { 81 | return mBlocks.size(); 82 | } 83 | 84 | void initForward() { 85 | mBlocks.initForward(); 86 | } 87 | 88 | bool next() { 89 | if ( mBlocks.next() ) { 90 | mStats.setStats( mBlocks ); 91 | return true; 92 | } else { 93 | return false; 94 | } 95 | } 96 | 97 | 98 | const SufficientStatistics& suffStat( size_t dim ) const { 99 | return mStats.suffStat( dim ); 100 | } 101 | 102 | 103 | template 104 | void aggregateStatistics( 105 | const StateSequenceType& q, 106 | HyperParamType& tau_theta, 107 | const Mapping& mapping, 108 | const size_t ignoreBlockSize = 1 109 | ) { 110 | mStats.aggregateStatistics( q, mBlocks, tau_theta, mapping, ignoreBlockSize ); 111 | } 112 | }; 113 | 114 | 115 | 116 | 117 | 118 | 119 | #endif 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /src/tools/MappedValues.hpp: -------------------------------------------------------------------------------- 1 | // Values mapped to a genome, such as read-depth, start counts etc. 
2 | 3 | #include 4 | using std::vector; 5 | 6 | 7 | #include 8 | using std::string; 9 | 10 | #include 11 | using std::map; 12 | 13 | #include 14 | using std::runtime_error; 15 | 16 | #include 17 | using std::sort; 18 | 19 | 20 | template 21 | class MappedValueEntry { 22 | public: 23 | size_t pos; 24 | T entry; 25 | 26 | 27 | MappedValueEntry(): pos( 0 ), entry() {} 28 | 29 | MappedValueEntry( size_t p, T v ): pos( p ), entry( v ) {} 30 | 31 | 32 | bool operator<( const MappedValueEntry& b ) const { 33 | return pos < b.pos; 34 | } 35 | 36 | 37 | bool operator>( const MappedValueEntry& b ) const { 38 | return pos > b.pos; 39 | } 40 | 41 | MappedValueEntry& operator+=( const MappedValueEntry& rhs ) { 42 | if ( pos != rhs.pos ) { 43 | throw runtime_error( "Cannot add values, positions don't match!" ); 44 | } 45 | entry += rhs.entry; 46 | return *this; 47 | } 48 | 49 | const MappedValueEntry operator+( const MappedValueEntry& other ) const { 50 | return MappedValueEntry( *this ) += other; 51 | } 52 | 53 | MappedValueEntry& operator-=( const MappedValueEntry& rhs ) { 54 | if ( pos != rhs.pos ) { 55 | throw runtime_error( "Cannot subtract values, positions don't match!" 
); 56 | } 57 | entry -= rhs.entry; 58 | return *this; 59 | } 60 | 61 | const MappedValueEntry operator-( const MappedValueEntry& other ) const { 62 | return MappedValueEntry( *this ) -= other; 63 | } 64 | }; 65 | 66 | 67 | template 68 | void sortAddAndCompress( 69 | vector>& vec 70 | ) { 71 | sort( vec.begin(), vec.end() ); 72 | size_t L = 0; 73 | size_t R = 1; 74 | while ( R < vec.size() ) { 75 | if ( vec[L].pos == vec[R].pos ) { 76 | vec[L].entry += vec[R].entry; 77 | } else { 78 | ++L; 79 | vec[L] = vec[R]; 80 | } 81 | ++R; 82 | } 83 | vec.resize( vec.empty() ? 0 : L + 1 ); // an empty input must stay empty 84 | } 85 | 86 | template 87 | void sortMultiplyAndCompress( 88 | vector>& vec 89 | ) { 90 | sort( vec.begin(), vec.end() ); 91 | size_t L = 0; 92 | size_t R = 1; 93 | while ( R < vec.size() ) { 94 | if ( vec[L].pos == vec[R].pos ) { 95 | vec[L].entry *= vec[R].entry; 96 | } else { 97 | ++L; 98 | vec[L] = vec[R]; 99 | } 100 | ++R; 101 | } 102 | vec.resize( vec.empty() ? 0 : L + 1 ); // an empty input must stay empty 103 | } 104 | 105 | 106 | 107 | template 108 | class MappedValues { 109 | map>> mEntries; // map refseq ID to vector of (pos, T) tuples 110 | 111 | 112 | void update( const MappedValues& other ) {}; 113 | 114 | // TODO add(), subtract(bool removeZero=false) 115 | }; 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /bin/samToCounts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Coverage information is extracted from SAM file. Three files are created: 4 | # 5 | # *-count.csv.gz contains the counts for each position in the genome, ordered by refseq name (chromosome name) 6 | # *-pos.csv.gz contains the corresponding positions within each chromosome. 
#!/usr/bin/env bash

# Coverage information is extracted from a SAM (or BAM) file. Three files are created:
#
# *-count.csv.gz contains the counts for each position in the genome, ordered by refseq name (chromosome name)
# *-pos.csv.gz contains the corresponding positions within each chromosome.
# *-size.csv contains 3 columns: the names of the refseqs in alphabetic order, the number of mapped start positions (leftmost), as well as the cumulative sum of the second column (easier for stream processing, and guarantees that the refseq order can always be restored). This corresponds to the lines in the other two files. For instance, "chr1 1000 1000" means that the first 1000 counts and positions come from chromosome 1. Using this file, each line in the pos file can be assigned its chromosome, and hence each count can be mapped uniquely to its genomic position.
#
# Usage: samToCounts <sam-or-bam-file> <output-prefix> [filter-bits] [sort-tmp-dir]

shopt -s expand_aliases

# uniq -c with tab-separated output: remove the leading whitespace before the
# count and replace the space after it by a tab
function tabuniq(){
	uniq -c | sed -e 's/^\s\+\(\w\+\) /\1\t/g'
}


if [ "$#" -lt 2 ]; then
	echo "Usage: $(basename "$0") <sam-or-bam-file> <output-prefix> [filter-bits] [sort-tmp-dir]" >&2
	exit 1
fi

# set I/O
samfile=$1	# or bamfile
outprefix=$2
# set the following bits to be ignored:
# 4		read unmapped
# 256	not primary alignment
# 512	read fails platform/vendor quality checks
# 1024	read is PCR or optical duplicate
# 2048	supplementary alignment
filterbits=${3:-3844}

sortdir=${4:-/tmp}	# directory for temporary sort files


# Create the read-depth using a system of named pipes, for space efficiency:
colfifo=${outprefix}-cols.fifo
sizefifo=${outprefix}-size.fifo
mapfifo=${outprefix}-map.fifo

sizeoutfile=${outprefix}-size.csv
posoutfile=${outprefix}-pos.csv.gz
countoutfile=${outprefix}-count.csv.gz

function cleanup(){
	rm -f "${colfifo}" "${sizefifo}" "${mapfifo}"
}

mkfifo "${colfifo}" "${sizefifo}" "${mapfifo}" || {
	echo "Cannot create named pipes with prefix '${outprefix}' (leftover *.fifo files from an earlier run?)" >&2
	exit 1
}
# from here on the pipes are ours: remove them however the script ends
trap cleanup EXIT


# sort by refseq, then leftmost mapping, and count; if a read maps to the exact same position, only count it once, since this will be due to alternative alignment
( samtools view -F "${filterbits}" "${samfile}" | cut -f 1,3,4 | sort -T "${sortdir}" -k2,2V -k3,3n -k1,1 -u | cut -f 2,3 | tabuniq > "${colfifo}" )&

# split into two pipes: sizefifo will proceed to get number of positions per refseq, the other will contain positions and counts
( tee "${sizefifo}" < "${colfifo}" > "${mapfifo}" )&

# write a file containing the refseq and the number of mapped positions
( cut -f 2 "${sizefifo}" | tabuniq | awk '{total += $1; print $2"\t"$1"\t"total}' > "${sizeoutfile}" )&


# write one gzip file containing all counts, and one containing all positions, without refseq. Positions are ordered by refseq as in the file output above.
( tee >( cut -f 3 | gzip -c > "${posoutfile}" ) < "${mapfifo}" | cut -f 1 | gzip -c > "${countoutfile}" )&

wait
get number of positions per refseq, the other will contain positions and counts 48 | ( cat ${colfifo} | tee ${sizefifo} > ${mapfifo} )& 49 | 50 | # write a file containing the refseq and the number of mapped positions 51 | ( cut -f 2 ${sizefifo} | tabuniq | awk '{total += $1; print $2"\t"$1"\t"total}' > ${sizeoutfile} )& 52 | 53 | 54 | # write one gzip file containing all counts, and one containing all positions, without refseq. Positions are for ordered by refseq as in the file output above. 55 | ( cat ${mapfifo} | tee >( cut -f 3 | gzip -c > ${posoutfile}) | cut -f 1 | gzip -c > ${countoutfile} )& 56 | 57 | wait 58 | rm ${colfifo} 59 | rm ${sizefifo} 60 | rm ${mapfifo} 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /src/Transitions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TRANSITIONS_HPP 2 | #define TRANSITIONS_HPP 3 | 4 | #include "includes.hpp" 5 | #include "Tags.hpp" 6 | #include "TransitionHyperParam.hpp" 7 | #include "Distribution.hpp" 8 | #include "Observation.hpp" 9 | #include "StateSequence.hpp" 10 | 11 | 12 | 13 | template 14 | ostream& operator<<( 15 | ostream& output, 16 | const Transitions& D ) { 17 | output << D.str(); 18 | return output; 19 | } 20 | 21 | 22 | template // e.g. DirichletVector 23 | class Transitions { 24 | 25 | size_t mNrStates; 26 | Distribution mDist; 27 | Observation mValue; 28 | SufficientStatistics mCounts; // the count matrix TODO are there any cases where this is not CategoricalVector? 
29 | 30 | 31 | public: 32 | 33 | // delete copy constructor 34 | Transitions( const Transitions& that ) = delete; 35 | 36 | ////////// constructors ////////// 37 | 38 | Transitions( size_t nrStates, 39 | rng_t& RNG 40 | ) : 41 | mNrStates( nrStates ), 42 | mDist( RNG ), 43 | mValue( nrStates ), 44 | mCounts( nrStates ) {}; 45 | 46 | inline const real_t& operator()( 47 | const size_t from, 48 | const size_t to ) const { 49 | return mValue( from, to ); 50 | }; 51 | 52 | inline real_t& operator()( 53 | const size_t from, 54 | const size_t to ) { 55 | return mValue( from, to ); 56 | }; 57 | 58 | 59 | 60 | 61 | ////////// const methods ////////// 62 | 63 | size_t nrStates() const { 64 | return mNrStates; 65 | } 66 | 67 | string str() const { 68 | return mValue.str(); 69 | } 70 | 71 | 72 | ////////// non-const methods ////////// 73 | 74 | template 75 | void sample( 76 | TransitionHyperParam& tau_A ) { // NOTE tau_A cannot be const since we update the parameters 77 | mDist.resample( mValue, tau_A.posterior() ); 78 | tau_A.reset(); 79 | } 80 | 81 | // template 82 | // void sample( 83 | // TransitionParamType& tau_A 84 | // ) { 85 | // mDist.resample( mValue, tau_A.posterior() ); 86 | // tau_A.reset(); 87 | // } 88 | 89 | 90 | }; 91 | 92 | 93 | 94 | // dummy specializations 95 | 96 | template<> template 97 | void Transitions::sample( 98 | TransitionHyperParam& tau_A ) { // NOTE tau_A cannot be const since we update the parameters 99 | } 100 | /* 101 | template<> template< typename StateSequenceType, typename EmissionType, typename InitialType, typename TransitionParamType> 102 | void Transitions::sample( 103 | const StateSequenceType& q, 104 | EmissionType& y, // NOTE A does not stochastically depend on y; y is only passed because it contains the block sizes which we need for self-transitions 105 | const InitialType& pi, 106 | TransitionParamType& tau_A ) { 107 | }*/ 108 | 109 | 110 | #endif 111 | 112 | 
-------------------------------------------------------------------------------- /lib/gzstream/test_gunzip.C: -------------------------------------------------------------------------------- 1 | // ============================================================================ 2 | // gzstream, C++ iostream classes wrapping the zlib compression library. 3 | // Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner 4 | // 5 | // This library is free software; you can redistribute it and/or 6 | // modify it under the terms of the GNU Lesser General Public 7 | // License as published by the Free Software Foundation; either 8 | // version 2.1 of the License, or (at your option) any later version. 9 | // 10 | // This library is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | // Lesser General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU Lesser General Public 16 | // License along with this library; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 | // ============================================================================ 19 | // 20 | // File : test_gunzip.C 21 | // Revision : $Revision: 1.3 $ 22 | // Revision_date : $Date: 2001/10/04 15:09:28 $ 23 | // Author(s) : Deepak Bandyopadhyay, Lutz Kettner 24 | // 25 | // Short test program reading a file, uncompressing it, and writing it. 26 | // ============================================================================ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | int main( int argc, char*argv[]) { 34 | if ( argc != 3) { 35 | std::cerr << "Usage: " << argv[0] <<" \n"; 36 | return EXIT_FAILURE; 37 | } 38 | // check alternate way of opening file 39 | igzstream in2; 40 | in2.open( argv[1]); 41 | if ( ! 
in2.good()) { 42 | std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; 43 | return EXIT_FAILURE; 44 | } 45 | in2.close(); 46 | if ( ! in2.good()) { 47 | std::cerr << "ERROR: Closing file `" << argv[1] << "' failed.\n"; 48 | return EXIT_FAILURE; 49 | } 50 | // now use the shorter way with the constructor to open the same file 51 | igzstream in( argv[1]); 52 | if ( ! in.good()) { 53 | std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; 54 | return EXIT_FAILURE; 55 | } 56 | std::ofstream out( argv[2]); 57 | if ( ! out.good()) { 58 | std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; 59 | return EXIT_FAILURE; 60 | } 61 | char c; 62 | while ( in.get(c)) 63 | out << c; 64 | in.close(); 65 | out.close(); 66 | if ( ! in.eof()) { 67 | std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n"; 68 | return EXIT_FAILURE; 69 | } 70 | if ( ! out.good()) { 71 | std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n"; 72 | return EXIT_FAILURE; 73 | } 74 | return EXIT_SUCCESS; 75 | } 76 | 77 | // ============================================================================ 78 | // EOF 79 | -------------------------------------------------------------------------------- /lib/gzstream/test_gzip.C: -------------------------------------------------------------------------------- 1 | // ============================================================================ 2 | // gzstream, C++ iostream classes wrapping the zlib compression library. 3 | // Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner 4 | // 5 | // This library is free software; you can redistribute it and/or 6 | // modify it under the terms of the GNU Lesser General Public 7 | // License as published by the Free Software Foundation; either 8 | // version 2.1 of the License, or (at your option) any later version. 
9 | // 10 | // This library is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | // Lesser General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU Lesser General Public 16 | // License along with this library; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 | // ============================================================================ 19 | // 20 | // File : test_gzip.C 21 | // Revision : $Revision: 1.3 $ 22 | // Revision_date : $Date: 2001/10/04 15:09:28 $ 23 | // Author(s) : Deepak Bandyopadhyay, Lutz Kettner 24 | // 25 | // Short test program reading a file, compressing it, and writing it. 26 | // ============================================================================ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | int main( int argc, char*argv[]) { 34 | if ( argc != 3) { 35 | std::cerr << "Usage: " << argv[0] <<" \n"; 36 | return EXIT_FAILURE; 37 | } 38 | // check alternate way of opening file 39 | ogzstream out2; 40 | out2.open( argv[2]); 41 | if ( ! out2.good()) { 42 | std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; 43 | return EXIT_FAILURE; 44 | } 45 | out2.close(); 46 | if ( ! out2.good()) { 47 | std::cerr << "ERROR: Closing file `" << argv[2] << "' failed.\n"; 48 | return EXIT_FAILURE; 49 | } 50 | // now use the shorter way with the constructor to open the same file 51 | ogzstream out( argv[2]); 52 | if ( ! out.good()) { 53 | std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; 54 | return EXIT_FAILURE; 55 | } 56 | std::ifstream in( argv[1]); 57 | if ( ! 
in.good()) { 58 | std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; 59 | return EXIT_FAILURE; 60 | } 61 | char c; 62 | while ( in.get(c)) 63 | out << c; 64 | in.close(); 65 | out.close(); 66 | if ( ! in.eof()) { 67 | std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n"; 68 | return EXIT_FAILURE; 69 | } 70 | if ( ! out.good()) { 71 | std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n"; 72 | return EXIT_FAILURE; 73 | } 74 | return EXIT_SUCCESS; 75 | } 76 | 77 | // ============================================================================ 78 | // EOF 79 | -------------------------------------------------------------------------------- /src/tools/GenomeGetter.hpp: -------------------------------------------------------------------------------- 1 | // Class that reads compressed genome representations and serves as a kind of iterator. 2 | 3 | #include 4 | using std:: string; 5 | 6 | #include 7 | using std::vector; 8 | 9 | #include 10 | using std::ifstream; 11 | using std::getline; 12 | 13 | #include 14 | using std::stringstream; 15 | 16 | #include 17 | using std::runtime_error; 18 | 19 | #include "gzstream.h" 20 | 21 | class GenomeGetter { 22 | string mRefSeq; 23 | string mPrevRefSeq; 24 | size_t mPos; 25 | size_t mPrevPos; 26 | 27 | bool mIsNewRefSeq; 28 | 29 | // size of the current refseq, and the index of the line within that refseq 30 | size_t mRefSeqSize; 31 | size_t mRefSeqIndex; 32 | 33 | // same as above, but cumulative 34 | size_t mTotalSize; 35 | size_t mTotalIndex; 36 | 37 | igzstream posfile; 38 | igzstream sizefile; 39 | 40 | string line; 41 | public: 42 | 43 | // if only a prefix is provided, the input is assumed to consist of 3 files: size, pos, and count 44 | GenomeGetter( const string& prefix ): 45 | mRefSeq( "" ), 46 | mPrevRefSeq( "" ), 47 | mPos( 0 ), 48 | mPrevPos( 0 ), 49 | mRefSeqSize( 0 ), 50 | mRefSeqIndex( 0 ), 51 | mTotalSize( 0 ), 52 | mTotalIndex( 0 ), 53 | mIsNewRefSeq( false ) { 54 | 
sizefile.open( ( prefix + "-size.csv" ).c_str() ); 55 | if ( !sizefile ) { 56 | throw runtime_error( "Cannot read " + prefix + "-size.csv!" ); 57 | } 58 | posfile.open( ( prefix + "-pos.csv.gz" ).c_str() ); 59 | if ( !posfile ) { 60 | throw runtime_error( "Cannot read " + prefix + "-pos.csv.gz!" ); 61 | } 62 | } 63 | 64 | 65 | bool next() { 66 | stringstream ss( line ); 67 | if ( mRefSeqIndex == mRefSeqSize ) { // start a new refseq 68 | mIsNewRefSeq = true; 69 | mPrevRefSeq = mRefSeq; 70 | if ( getline( sizefile, line ) ) { 71 | ss.str( line ); 72 | ss >> mRefSeq; 73 | ss >> mRefSeqSize; 74 | ss >> mTotalSize; // TODO check that this increased, adds up etc. 75 | mRefSeqIndex = 0; 76 | ss.clear(); 77 | 78 | } else { // no more lines 79 | // TODO assert there is nothing left in the other files 80 | mRefSeq = ""; 81 | mPos = 0; 82 | return false; 83 | } 84 | } else { 85 | mIsNewRefSeq = false; 86 | } 87 | if ( getline( posfile, line ) ) { 88 | ss.str( line ); 89 | mPrevPos = mPos; 90 | ss >> mPos; 91 | ss.clear(); 92 | } else { 93 | throw runtime_error( "Not enough entries in position file!" ); 94 | } 95 | 96 | mRefSeqIndex++; 97 | mTotalIndex++; 98 | return true; 99 | } 100 | 101 | const string& refseq() const { 102 | return mRefSeq; 103 | } 104 | 105 | const string& prevRefseq() const { 106 | return mPrevRefSeq; 107 | } 108 | 109 | const size_t& pos() const { 110 | return mPos; 111 | } 112 | 113 | const size_t& prevPos() const { 114 | return mPrevPos; 115 | } 116 | 117 | bool refseqChanged() const { 118 | return mIsNewRefSeq; 119 | } 120 | 121 | 122 | }; 123 | -------------------------------------------------------------------------------- /src/includes.hpp: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDES_HPP 2 | #define INCLUDES_HPP 3 | 4 | #include 5 | // using int_16_t; 6 | 7 | // This file contains header inclusions, because we are lazy (and consistent), as well as some basic constants and typedefs. 
#ifndef INCLUDES_HPP
#define INCLUDES_HPP

#include <cstdint>
using std::int16_t;

// This file contains header inclusions, because we are lazy (and consistent), as well as some basic constants and typedefs.

// Type to use for real numbers. Note that we don't introduce separate types for data, wavelet coefficients and probabilities, as one might be tempted, because that would lead to many implicit type conversions in likelihood computations, wavelet thresholding etc.
typedef float real_t;

// Type to use for the marginal counts. Has to be a signed integer type. Its maximum is the maximum number of counts per state and position as well as the maximum number of recorded states.
typedef int16_t marginal_t;	// the type used to record marginal counts


#include <type_traits>
using std::is_integral;
using std::is_unsigned;

#include <cstddef>
using std::size_t;


#include <vector>
using std::vector;

#include <queue>
using std::queue;

#include <deque>
using std::deque;

#include <array>
using std::array;


#include <string>
using std::string;
using std::to_string;


#include <sstream>
using std::stringstream;
using std::istringstream;


#include <iostream>
using std::istream;
using std::ostream;
using std::endl;
using std::cin;
using std::cout;
using std::cerr;
using std::clog;
using std::wcout;
using std::flush;
using std::boolalpha;
using std::ios;


#include <fstream>
using std::ifstream;
using std::ofstream;


#include <stdexcept>
using std::runtime_error;	// TODO throw the appropriate errors, like logic_error etc.
#include <exception>
using std::exception;

#include <cmath>
using std::pow;
using std::exp;	// e^x
using std::exp2;
using std::log;	// natural log
using std::log2;
using std::log10;
using std::sqrt;
using std::ceil;
using std::floor;
using std::abs;

using std::isfinite;



#include <algorithm>
using std::min;
using std::max;
using std::nth_element;
using std::reverse;
using std::fill;


#include <numeric>
using std::partial_sum;
using std::accumulate;	//e.g. sum of vector
#include <functional>	// std::plus lives here, not in <numeric>
using std::plus;


#include <iterator>
using std::istream_iterator;
using std::ostream_iterator;
using std::back_inserter;


#include <unordered_map>
using std::unordered_map;


#include <stack>
using std::stack;


#include <limits>
using std::numeric_limits;


#include <ctime>
using std::time;


#include <iomanip>
using std::setprecision;


const real_t inf = numeric_limits<real_t>::infinity();
const real_t sqrt2 = sqrt( 2.0 );
const real_t sqrt2half = sqrt2 / 2.0;	// sqrt(2)/2 = 1/sqrt(2)


#endif
68 | using std::exception; 69 | 70 | #include 71 | using std::pow; 72 | using std::exp; // e^x 73 | using std::exp2; 74 | using std::log; // natural log 75 | using std::log2; 76 | using std::log10; 77 | using std::sqrt; 78 | using std::ceil; 79 | using std::floor; 80 | using std::abs; 81 | 82 | using std::isfinite; 83 | 84 | 85 | 86 | #include 87 | using std::min; 88 | using std::max; 89 | using std::nth_element; 90 | using std::reverse; 91 | using std::fill; 92 | 93 | 94 | #include 95 | using std::partial_sum; 96 | using std::plus; 97 | using std::accumulate; //e.g. sum of vector 98 | 99 | 100 | #include 101 | using std::istream_iterator; 102 | using std::ostream_iterator; 103 | using std::back_inserter; 104 | 105 | 106 | #include 107 | using std::unordered_map; 108 | 109 | 110 | #include 111 | using std::stack; 112 | 113 | 114 | #include 115 | using std::numeric_limits; 116 | 117 | 118 | #include 119 | using std::time; 120 | 121 | 122 | #include 123 | using std::setprecision; 124 | 125 | 126 | const real_t inf = numeric_limits::infinity(); 127 | const real_t sqrt2 = sqrt( 2.0 ); 128 | const real_t sqrt2half = sqrt2 / 2.0; // sqrt(2)/2 = 1/sqrt(2) 129 | 130 | 131 | #endif 132 | 133 | -------------------------------------------------------------------------------- /src/EFD.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EFD_HPP 2 | #define EFD_HPP 3 | 4 | // Functionality related to exponential family distributions 5 | 6 | #include "SufficientStatistics.hpp" 7 | #include "Observation.hpp" 8 | 9 | 10 | ////////// Inner Products of parameters and sufficient statistics ////////// 11 | 12 | template 13 | real_t innerProduct( 14 | const SufficientStatistics& suffstat, 15 | const Observation& param ); 16 | 17 | 18 | 19 | 20 | 21 | ////////// Normal ////////// 22 | 23 | real_t innerProduct( 24 | const SufficientStatistics& suffstat, 25 | const Observation& param 26 | ) { 27 | real_t result = ( 2.0 * param.mean() * 
suffstat.sum() - suffstat.sumSq() ) / ( 2.0 * param.var( ) ); 28 | if ( !isfinite( result ) ) { 29 | throw runtime_error( "Result of Normal inner product is not finite!" ); 30 | } 31 | return result; 32 | } 33 | 34 | 35 | real_t logNormalizer( 36 | const Observation& param ) { 37 | return log( param.stdev() ) + param.mean() * param.mean() / ( 2 * param.var() ); 38 | } 39 | 40 | 41 | real_t sampleMean( 42 | const SufficientStatistics& suffstat, 43 | size_t N ) { 44 | if ( N <= 0 ) { 45 | throw runtime_error( "Cannot calculate mean from zero observations!" ); 46 | } 47 | double n = N; 48 | return suffstat.sum() / n; 49 | } 50 | 51 | real_t sampleVariance( 52 | const SufficientStatistics& suffstat, 53 | size_t N ) { 54 | if ( N <= 0 ) { 55 | throw runtime_error( "Cannot calculate variance from zero observations!" ); 56 | } 57 | double n = N; 58 | double avg = sampleMean( suffstat, N ); 59 | return suffstat.sumSq() / n - ( avg * avg ); 60 | } 61 | 62 | 63 | 64 | ////////// Geometric distribution ////////// 65 | 66 | real_t innerProduct( 67 | const SufficientStatistics& suffstat, 68 | const Observation& param ) { 69 | real_t result = suffstat.sum() * param.value(); 70 | return result; 71 | } 72 | 73 | 74 | real_t logNormalizer( 75 | const Observation& param ) { 76 | return log( param.value() ); 77 | } 78 | 79 | 80 | 81 | 82 | // calculates the inner product in the PDF of an EFD between the current sufficient statistics and a set of parameters under a current mapping 83 | template 84 | real_t innerProduct( 85 | const EmissionObject& y, 86 | const vector>& param, // TODO ParamType in template definition? 87 | const vector& mapping ) { 88 | real_t result = 0; 89 | for ( auto dim = 0; dim < y.nrDim(); dim++ ) { 90 | result += innerProduct( y.suffStat( dim ), param[mapping[dim]] ); 91 | } 92 | return result; 93 | } 94 | 95 | 96 | template 97 | real_t innerProduct( 98 | const EmissionObject& y, 99 | const vector>& param // TODO ParamType in template definition? 
100 | ) { 101 | real_t result = 0; 102 | for ( auto dim = 0; dim < y.nrDim(); dim++ ) { 103 | result += innerProduct( y.suffStat( dim ), param[dim] ); 104 | } 105 | return result; 106 | } 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | #endif 117 | 118 | 119 | -------------------------------------------------------------------------------- /src/ThetaHyperParam.hpp: -------------------------------------------------------------------------------- 1 | #ifndef THETAHYPERPARAM_HPP 2 | #define THETAHYPERPARAM_HPP 3 | 4 | #include "Tags.hpp" 5 | #include "Mapping.hpp" 6 | #include "Conjugate.hpp" 7 | #include "Observation.hpp" 8 | #include "SufficientStatistics.hpp" 9 | 10 | 11 | 12 | 13 | template // e.g. NormalInverseGammaParam 14 | class ThetaHyperParam { 15 | 16 | const size_t mNrParams; 17 | 18 | vector> mParams; 19 | 20 | public: 21 | 22 | 23 | // automatic priors 24 | ThetaHyperParam( 25 | const vector< vector< real_t > >& hyperparams 26 | ) : mNrParams( hyperparams.size() ) { 27 | 28 | if ( mNrParams <= 0 ) { 29 | throw runtime_error( "Number of hyperparameters must be positive" ); 30 | } 31 | 32 | if ( mNrParams <= 0 ) { 33 | throw runtime_error( "Number of emission hyperparameters must be positive! Did you forget to provide them, or to use -a?" 
); 34 | } 35 | 36 | 37 | for ( const auto & hp : hyperparams ) { 38 | mParams.push_back( Conjugate( hp ) ); 39 | } 40 | } 41 | 42 | ThetaHyperParam( 43 | const vector < Observation>& hyperparams 44 | ) : mNrParams( hyperparams.size() ) { 45 | 46 | if ( mNrParams <= 0 ) { 47 | throw runtime_error( "Number of hyperparameters must be positive" ); 48 | } 49 | 50 | for ( const auto & hp : hyperparams ) { 51 | mParams.push_back( Conjugate( hp ) ); 52 | } 53 | } 54 | 55 | 56 | ThetaHyperParam( 57 | const Observation& hyperparams, 58 | const size_t nrDim 59 | ) : mNrParams( nrParams ), 60 | mParams( hyperparams, nrDim ) { 61 | 62 | if ( mNrParams <= 0 ) { 63 | throw runtime_error( "Number of hyperparameters must be positive" ); 64 | } 65 | 66 | } 67 | 68 | size_t nrParams() const { 69 | return mNrParams; 70 | } 71 | 72 | 73 | 74 | ////////// accessors ////////// 75 | //NOTE round parentheses access the data through the mapping, square brackets access the parameters directly 76 | 77 | template 78 | inline void addObservation( 79 | const SufficientStatistics& suffStat, 80 | const size_t N, 81 | const size_t dim ) { 82 | 83 | mParams[dim].addObservation( suffStat, N ); 84 | } 85 | 86 | 87 | // TODO some objects use posterior(), make consistent 88 | // TODO implicit conversion? 
89 | const Observation& posterior( 90 | const size_t d ) const { 91 | return mParams[d].posterior(); 92 | } 93 | 94 | 95 | const Observation& prior( 96 | const size_t d ) const { 97 | return mParams[d].prior(); 98 | } 99 | 100 | void reset() { 101 | for ( auto & p : mParams ) { 102 | p.reset(); 103 | } 104 | } 105 | 106 | 107 | string str() const { 108 | return concat( mParams, "\t", "\n" ); 109 | } 110 | 111 | const Conjugate operator[]( size_t d ) const { 112 | return mParams[d]; 113 | } 114 | }; 115 | 116 | 117 | 118 | 119 | ////////////////////////////////////////////////// TEMPLATE SPECIALIZATIONS ////////////////////////////////////////////////// 120 | // #include "WaveletTree.hpp" // required implementation 121 | 122 | 123 | 124 | #endif 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /src/Blocks/SplittableBlocks.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SPLITTABLEBLOCKS_HPP 2 | #define SPLITTABLEBLOCKS_HPP 3 | 4 | template<> 5 | class Blocks { 6 | const size_t mSize; 7 | deque mSizes; 8 | size_t mStart; 9 | size_t mEnd; 10 | size_t mPos; // the current position as seen from the outside. 11 | size_t mIndex; // the index in mSizes corresponding to mPos 12 | bool mFirstIter; 13 | 14 | void rotate() { 15 | mSizes.push_back( mSizes.front() ); 16 | mSizes.pop_front(); 17 | if ( mIndex > 0 ) { 18 | mIndex--; 19 | } else { 20 | mIndex = mSizes.size() - 1; 21 | } 22 | } 23 | 24 | public: 25 | 26 | 27 | Blocks( size_t size ): 28 | mSize( size ), 29 | mPos( 0 ), 30 | mStart( 0 ), 31 | mEnd( 0 ), 32 | mIndex( 0 ), 33 | mFirstIter( true ) { 34 | if ( size == 0 ) { 35 | throw runtime_error( "Cannot create block structure for 0 positions!" ); 36 | } 37 | mSizes.push_back( size ); 38 | } 39 | 40 | // TODO is there any way to swap this? 
41 | Blocks( const vector& sizes ): 42 | mSize( accumulate( sizes.begin(), sizes.end(), 0 ) ), 43 | mPos( 0 ), 44 | mStart( 0 ), 45 | mEnd( 0 ), 46 | mIndex( 0 ), 47 | mFirstIter( true ) { 48 | for ( auto & x : sizes ) { 49 | mSizes.push_back( x ); 50 | } 51 | } 52 | 53 | // Split the current block such that the first new block is of size s, and move to the first block; this preserves pos(). If s is >= the size of the block, an exception is thrown 54 | void split( size_t s ) { 55 | if ( mSizes[mIndex] <= s ) { 56 | throw runtime_error( "Cannot split block into this size!" ); 57 | } 58 | 59 | // rotate until current position is at the front 60 | while ( mIndex != 0 ) { 61 | rotate(); 62 | } 63 | 64 | mSizes.push_back( s ); 65 | mSizes[0] = mSizes[0] - s; 66 | mIndex = mSizes.size() - 1; 67 | }; 68 | 69 | 70 | // Move to the first position, so pos()==0. 71 | void initForward() { 72 | mFirstIter = true; 73 | if ( mIndex >= mPos ) { 74 | mIndex -= mPos; 75 | } else { 76 | mIndex += ( mSizes.size() - mPos ); 77 | } 78 | mPos = 0; 79 | mStart = 0; 80 | mEnd = 0; 81 | } 82 | 83 | // move to the next block, in modular fashion (wrap around) 84 | // returns false if wrapped around, otherwise true. 
#ifndef TAGS_HPP
#define TAGS_HPP


enum MappingType {combinations, independent};


// these empty classes are used as tags for template specialization, allowing the compiler to inline and optimize for different use cases

////////// These classes don't do anything by themselves, they are used as tags for template specialization //////////

class Dummy {};	// a tag for when the class is omitted, e.g. a transition matrix when no transitions are used

class Normal {};
class NormalParam {};
using NormalInverseGamma = NormalParam;
class NormalInverseGammaParam {};


class NormalVector {};
class NormalParamVector {};
using NormalInverseGammaVector = NormalParamVector;
class NormalInverseGammaParamVector {};


class InverseGamma {};
class InverseGammaParam {};
class NormalGammaParam {};


// using NormalGamma = NormalParam;

class Categorical {};
class CategoricalParam {};
using Dirichlet = CategoricalParam;
class DirichletParam {};


class CategoricalVector {};	// transition counts are SufficientStatistics<CategoricalVector>
class CategoricalParamVector {};
using DirichletVector = CategoricalParamVector;
class DirichletParamVector {};


class Geometric {};
class Beta {};
class BetaParam {};



// sampling tags for state sequence
class ForwardBackward {};	// forward-backward sampling
class Mixture {};	// sampling of each block individually
class DirectGibbs {};	// sample direct Gibbs, i.e. including transitions into and out of the state

////////// tags for data structures //////////
class Vector {};	// plain data structure for uncompressed sampling
class WaveletTree {};
class Fixed {};
class IntegralArray {};
class Splittable {};
class BreakpointArray {};



////////// forward declarations //////////
// NOTE(review): the parameter lists of the single-parameter templates below
// were restored as one type parameter each, matching how they are used
// elsewhere in the sources; confirm against the definitions.

template < typename StateSequenceType,
           typename EmissionDataStructure,
           typename EmissionDistType,	// e.g. Normal
           typename ThetaDistType,	// e.g. NormalInverseGamma
           typename TransitionDistType,	// e.g. DirichletVector
           typename InitialDistType,	// e.g. Dirichlet
           typename ThetaParamType,	// e.g. NormalInverseGammaParam
           typename TransitionParamType,	// e.g. DirichletParam
           typename InitialParamType	// e.g. DirichletParam
           >
class HMM;


template < typename Type >
class StateSequence;

template < typename DataStructure, typename DistType >
class Emissions;

template < typename DistType >
class Transitions;

template < typename DistType >
class Theta;

template < typename DistType >
class Initial;



template < typename ParamType >
class Observation;

template < typename DistType >
class SufficientStatistics;

template < typename DistType >
class Distribution;

template < typename ParamType >
class Conjugate;

template < typename ParamType >
class ThetaHyperParam;


class Mapping;



#endif
# ============================================================================
# gzstream, C++ iostream classes wrapping the zlib compression library.
# Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
# ============================================================================
#
# File          : Makefile
# Revision      : $Revision: 1.3 $
# Revision_date : $Date: 2001/10/04 15:09:28 $
# Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
#
# ============================================================================

# ----------------------------------------------------------------------------
# adapt these settings to your need:
# add '-DGZSTREAM_NAMESPACE=name' to CPPFLAGS to place the classes
# in its own namespace. Note, this macro needs to be set while creating
# the library as well while compiling applications based on it.
# As an alternative, gzstream.C and gzstream.h can be edited.
# ----------------------------------------------------------------------------

# CXX = CC -n32 -LANG:std   # for SGI Irix 6.5, MIPSpro CC version 7.30
CXX = g++   # for Linux RedHat 6.1, g++ version 2.95.2

CPPFLAGS = -I. -O
LDFLAGS  = -L. -lgzstream -lz
AR       = ar cr

# ----------------------------------------------------------------------------
# plain simple rules to make and cleanup the library:
# make default;   compiles the library
# make test;      compiles and executes test. O.K. message marks success.
# make clean;     removes temporary files
# make cleanall;  removes temporary files, the library, and programs
# ----------------------------------------------------------------------------

# these targets do not name files; never treat a same-named file as up to date
.PHONY: default test clean cleanall

default: libgzstream.a

test: test_gzip test_gunzip
	./test_gzip COPYING.LIB gz.tmp.gz
	gunzip gz.tmp.gz
	diff COPYING.LIB gz.tmp
	gzip gz.tmp
	./test_gunzip gz.tmp.gz gz.tmp
	diff COPYING.LIB gz.tmp
	rm gz.tmp.gz gz.tmp
	# *** O.K. Test finished successfully. ***

gzstream.o : gzstream.C gzstream.h
	${CXX} ${CPPFLAGS} -c -o gzstream.o gzstream.C

test_gzip.o : test_gzip.C gzstream.h
	${CXX} ${CPPFLAGS} -c -o test_gzip.o test_gzip.C

test_gunzip.o : test_gunzip.C gzstream.h
	${CXX} ${CPPFLAGS} -c -o test_gunzip.o test_gunzip.C

libgzstream.a : gzstream.o
	${AR} libgzstream.a gzstream.o

test_gzip : test_gzip.o libgzstream.a
	${CXX} -o test_gzip test_gzip.o ${LDFLAGS}

test_gunzip : test_gunzip.o libgzstream.a
	${CXX} -o test_gunzip test_gunzip.o ${LDFLAGS}

# -f: cleaning an already clean tree is not an error
clean :
	rm -f *.o

cleanall :
	rm -f *.o libgzstream.a test_gzip test_gunzip

# ============================================================================
# EOF
46 | # make clean; removes temporary files 47 | # make cleanall; removes temporary files, the library, and programs 48 | # ---------------------------------------------------------------------------- 49 | 50 | default: libgzstream.a 51 | 52 | test: test_gzip test_gunzip 53 | ./test_gzip COPYING.LIB gz.tmp.gz 54 | gunzip gz.tmp.gz 55 | diff COPYING.LIB gz.tmp 56 | gzip gz.tmp 57 | ./test_gunzip gz.tmp.gz gz.tmp 58 | diff COPYING.LIB gz.tmp 59 | rm gz.tmp.gz gz.tmp 60 | # *** O.K. Test finished successfully. *** 61 | 62 | gzstream.o : gzstream.C gzstream.h 63 | ${CXX} ${CPPFLAGS} -c -o gzstream.o gzstream.C 64 | 65 | test_gzip.o : test_gzip.C gzstream.h 66 | ${CXX} ${CPPFLAGS} -c -o test_gzip.o test_gzip.C 67 | 68 | test_gunzip.o : test_gunzip.C gzstream.h 69 | ${CXX} ${CPPFLAGS} -c -o test_gunzip.o test_gunzip.C 70 | 71 | libgzstream.a : gzstream.o 72 | ${AR} libgzstream.a gzstream.o 73 | 74 | test_gzip : test_gzip.o libgzstream.a 75 | ${CXX} -o test_gzip test_gzip.o ${LDFLAGS} 76 | 77 | test_gunzip : test_gunzip.o libgzstream.a 78 | ${CXX} -o test_gunzip test_gunzip.o ${LDFLAGS} 79 | 80 | clean : 81 | rm *.o 82 | 83 | cleanall : 84 | rm *.o libgzstream.a test_gzip test_gunzip 85 | 86 | # ============================================================================ 87 | # EOF 88 | 89 | -------------------------------------------------------------------------------- /src/StateSequence.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STATESEQUENCE_HPP 2 | #define STATESEQUENCE_HPP 3 | 4 | #include "includes.hpp" 5 | #include "Emissions.hpp" 6 | #include "Theta.hpp" 7 | #include "Transitions.hpp" 8 | #include "Initial.hpp" 9 | #include"Blocks.hpp" 10 | #include "Statistics.hpp" 11 | #include "KahanAggregator.hpp" 12 | #include "Trellis.hpp" 13 | #include "Records.hpp" 14 | 15 | 16 | 17 | 18 | template 19 | class StateSequence { 20 | 21 | vector mStates; 22 | rng_t& mRNG; 23 | vector mPrevStateSequence; // for direct Gibbs 
// NOTE(review): this span is the interior of class StateSequence. Template
// argument lists (e.g. after "vector", "Emissions") appear to have been lost
// in this copy of the file; the tokens below are kept exactly as found.
	Trellis mTrellis;	// implementation as member avoids frequent allocations

public:

	// delete copy constructor
	StateSequence( const StateSequence& that ) = delete;

	// Binds the sequence to an external random number generator and hands the
	// same generator to the trellis.
	// NOTE(review): members are initialized in declaration order, and mRNG is
	// declared before mTrellis, so mRNG is set first even though mTrellis is
	// written first in the initializer list.
	StateSequence( rng_t& RNG ) : mTrellis( RNG ), mRNG( RNG ) {};

	// 	template
	// 	void sample(
	// 	    EmissionsType& Y,	// TODO cannot be const due to next()
	// 	    const ThetaType& theta,
	// 	    const TransitionsType& A,
	// 	    const InitialType& pi,
	// 	    const bool useSelfTransitions = true
	// 	);



	// Declaration only; no definition is given inside the class body.
	// NOTE(review): presumably defined per specialization in the headers
	// included after the class -- confirm.
	// theta, A and pi are taken by const reference; y, tau_theta, tau_A,
	// tau_pi and records are taken by non-const reference.
	template <
	    typename StatsStructure,
	    typename StatsType,
	    typename BlocksType,
	    typename ThetaType,
	    typename TauThetaType,
	    typename TransitionsType,
	    typename TauAType,
	    typename InitialType,
	    typename TauPiType >
	void sample(
	    Emissions, Blocks>& y,
	    const ThetaType& theta,
	    TauThetaType& tau_theta,
	    const TransitionsType& A,
	    TauAType& tau_A,
	    const InitialType& pi,
	    TauPiType& tau_pi,
	    const Mapping& mapping,
	    Records& records,
	    const bool doRecord,
	    const bool useSelfTransitions );


	// Number of entries currently held in mStates.
	size_t size() const {
		return mStates.size();
	}

	// Read-only access to the whole state vector.
	const vector& states() const {
		return mStates;
	}

	// Bounds-checked read access: returns a const reference to entry s and
	// throws runtime_error if s is not a valid index into mStates.
	const marginal_t& operator[](
	    const size_t s ) const {
		if ( s >= mStates.size() ) {
			throw runtime_error( "State sequence index " + to_string( s ) + " out of bounds!" );
		}
		return mStates[s];
	}

	// Non-const overload with the same bounds check; the return type is a
	// value, not a reference. The statement below continues past this span.
	marginal_t operator[](
	    const size_t s ) {
		if ( s >= mStates.size() ) {
			throw runtime_error( "State sequence index " + to_string( s ) + " out of bounds!"
); 88 | } 89 | return mStates[s]; 90 | } 91 | 92 | string str() const { 93 | stringstream ss; 94 | copy( mStates.begin(), mStates.end(), ostream_iterator( ss, " " ) ); 95 | string s = ss.str(); 96 | s = s.substr( 0, s.length() - 1 ); // TODO inefficient, copies all but the last character 97 | return s; 98 | }; 99 | 100 | void clear() { 101 | deleteVector( mStates ); 102 | deleteVector( mPrevStateSequence ); 103 | mTrellis.clear(); 104 | } 105 | 106 | }; 107 | 108 | 109 | 110 | 111 | 112 | ////////////////////////////////////////////////// TEMPLATE SPECIALIZATIONS ////////////////////////////////////////////////// 113 | 114 | 115 | #include "StateSequence/ForwardBackward.hpp" 116 | #include "StateSequence/Mixture.hpp" 117 | // #include "StateSequence/DirectGibbs.hpp" 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | #endif 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /src/AutoPriors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef AUTOPRIORS_HPP 2 | #define AUTOPRIORS_HPP 3 | 4 | #include 5 | using std::runtime_error; 6 | 7 | #include 8 | using std::vector; 9 | 10 | #include "includes.hpp" 11 | 12 | #include "Emissions.hpp" 13 | #include "SufficientStatistics.hpp" 14 | #include "Tags.hpp" 15 | 16 | 17 | // Normal, breakpoint array 18 | vector NormalInverseGammaAutoPrior( 19 | real_t s2 , // desired variance 20 | real_t p , // desired probability of sampling a variance below s^2 21 | real_t dataMean, 22 | real_t dataVar // in wavelet tree autopriors, this is max(sample variance of data, variance of block means) 23 | ) { 24 | 25 | if ( p < 0 || p > 1 ) { 26 | throw runtime_error( "Parameter p for automatic priors is a probability and must be in [0,1]!" ); 27 | } 28 | 29 | if ( s2 <= 0 ) { 30 | throw runtime_error( "Parameter s2 for automatic priors is a variance and must be positive!" 
); 31 | } 32 | 33 | if ( dataVar <= 0 ) { 34 | throw runtime_error( "Data variance provided to autoprior must be positive!" ); 35 | } 36 | 37 | 38 | const real_t M1 = 0.3361; 39 | const real_t M2 = -0.0042; 40 | const real_t M3 = -0.0201; 41 | 42 | const real_t b = -log( p ); 43 | 44 | const real_t alpha = 2.0; 45 | const real_t beta = s2 * ( ( 2.0 * sqrt( b ) ) / ( M1 * sqrt( b ) + sqrt( 2.0 ) * ( M2 * b * exp( M3 * sqrt( b ) ) + 1 ) ) + b ); 46 | const real_t mu0 = dataMean; 47 | const real_t nu = beta / dataVar; 48 | 49 | if ( alpha <= 0 ) { 50 | throw runtime_error( "Autoprior yields non-positive alpha!" ); 51 | } 52 | 53 | if ( beta <= 0 ) { 54 | throw runtime_error( "Autoprior yields non-positive beta!" ); 55 | } 56 | 57 | if ( nu <= 0 ) { 58 | throw runtime_error( "Autoprior yields non-positive nu!" ); 59 | } 60 | 61 | if ( !isfinite( alpha ) ) { 62 | throw runtime_error( "Autoprior yields non-finite alpha!" ); 63 | } 64 | 65 | if ( !isfinite( beta ) ) { 66 | throw runtime_error( "Autoprior yields non-finite beta!" ); 67 | } 68 | 69 | if ( !isfinite( mu0 ) ) { 70 | throw runtime_error( "Autoprior yields non-finite mu0!" ); 71 | } 72 | 73 | if ( !isfinite( nu ) ) { 74 | throw runtime_error( "Autoprior yields non-finite nu!" ); 75 | } 76 | 77 | vector v {alpha, beta, mu0, nu}; 78 | return v; 79 | 80 | } 81 | 82 | 83 | // auto prior for Gaussian statistics 84 | // template