├── bin
    ├── pyhammlet
    │   ├── __init__.py
    │   ├── palette.pdf
    │   ├── palette.txt
    │   ├── io.py
    │   ├── RLE.py
    │   └── plotting.py
    ├── sortStates
    └── samToCounts
├── lib
    └── gzstream
    │   ├── version
    │   ├── logo.gif
    │   ├── README
    │   ├── test_gunzip.C
    │   ├── test_gzip.C
    │   ├── Makefile
    │   ├── gzstream.h
    │   ├── gzstream.C
    │   └── index.html
├── logo
    ├── logo.png
    ├── logo.xcf
    ├── logo-inv.png
    ├── logo-inv.xcf
    ├── logo-round.png
    ├── logo-round.xcf
    ├── logo-inv-noborder.png
    ├── logo-inv-noborder.xcf
    ├── logo-round-250px.png
    ├── logo-round-250px.xcf
    ├── logo-boxdrawing.txt
    ├── logo-boxdrawing-centered.groff
    └── logo-round.base64
├── doc
    ├── hammlet-manpage-a4.pdf
    ├── hammlet-manpage-letter.pdf
    ├── man-preamble.tex
    ├── hammlet.bib
    └── pandoc.css
├── INSTALL.txt
├── src
    ├── TransitionHyperParam.hpp
    ├── Blocks.hpp
    ├── Statistics.hpp
    ├── InitialHyperParam.hpp
    ├── tools
    │   ├── avg.cpp
    │   ├── maxSegmentation.cpp
    │   ├── MappedValues.hpp
    │   ├── GenomeGetter.hpp
    │   ├── combineCounts.cpp
    │   └── mapLinesToGenome.cpp
    ├── Initial.hpp
    ├── KahanAggregator.hpp
    ├── Trellis.hpp
    ├── MultiVector.hpp
    ├── Blocks
    │   ├── FixedBlocks.hpp
    │   ├── SplittableBlocks.hpp
    │   └── BreakpointArray.hpp
    ├── Emissions.hpp
    ├── Transitions.hpp
    ├── includes.hpp
    ├── EFD.hpp
    ├── ThetaHyperParam.hpp
    ├── Tags.hpp
    ├── StateSequence.hpp
    ├── AutoPriors.hpp
    ├── HMM.hpp
    ├── uintmath.hpp
    ├── Mapping.hpp
    ├── StateMarginalsIterator.hpp
    ├── StateSequence
    │   ├── Mixture.hpp
    │   └── ForwardBackward.hpp
    ├── utils.hpp
    ├── Distribution.hpp
    ├── Conjugate.hpp
    ├── Theta.hpp
    ├── wavelet.hpp
    ├── Records.hpp
    ├── Statistics
    │   └── IntegralArray.hpp
    └── SufficientStatistics.hpp
├── README.md
└── Makefile


/bin/pyhammlet/__init__.py:
--------------------------------------------------------------------------------
1 |  
2 | 


--------------------------------------------------------------------------------
/lib/gzstream/version:
--------------------------------------------------------------------------------
1 | 1.5 (08 Jan 2003)
2 | 


--------------------------------------------------------------------------------
/logo/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo.png


--------------------------------------------------------------------------------
/logo/logo.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo.xcf


--------------------------------------------------------------------------------
/logo/logo-inv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv.png


--------------------------------------------------------------------------------
/logo/logo-inv.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv.xcf


--------------------------------------------------------------------------------
/logo/logo-round.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round.png


--------------------------------------------------------------------------------
/logo/logo-round.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round.xcf


--------------------------------------------------------------------------------
/lib/gzstream/logo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/lib/gzstream/logo.gif


--------------------------------------------------------------------------------
/bin/pyhammlet/palette.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/bin/pyhammlet/palette.pdf


--------------------------------------------------------------------------------
/doc/hammlet-manpage-a4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/doc/hammlet-manpage-a4.pdf


--------------------------------------------------------------------------------
/logo/logo-inv-noborder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv-noborder.png


--------------------------------------------------------------------------------
/logo/logo-inv-noborder.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-inv-noborder.xcf


--------------------------------------------------------------------------------
/logo/logo-round-250px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round-250px.png


--------------------------------------------------------------------------------
/logo/logo-round-250px.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/logo/logo-round-250px.xcf


--------------------------------------------------------------------------------
/doc/hammlet-manpage-letter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wiedenhoeft/HaMMLET/HEAD/doc/hammlet-manpage-letter.pdf


--------------------------------------------------------------------------------
/INSTALL.txt:
--------------------------------------------------------------------------------
1 | To install, run make, or simply use a C++11-compliant compiler, e.g.
2 | g++ -O3 --std=c++11 -o hammlet main.cpp 
3 | 
4 | 


--------------------------------------------------------------------------------
/logo/logo-boxdrawing.txt:
--------------------------------------------------------------------------------
 1 | ┏━━━━━┓     ┏━━━━━┓
 2 | ┣━━━━━┫     ┃ ┏━━━┫
 3 | ┃ ┏━━━┫     ┃ ┃ ┏━┫
 4 | ┃ ┃ ┏━┻━━━━━┫ ┃ ┃ ┃
 5 | ┃ ┃ ┃ ┏━━━━━┫ ┃ ┃ ┃
 6 | ┃ ┃ ┃ ┣━━━━━┛ ┃ ┃ ┃
 7 | ┃ ┃ ┃ ┣━━━━━┳━┛ ┃ ┃
 8 | ┣━┛ ┃ ┃     ┣━━━┛ ┃
 9 | ┣━━━┛ ┃     ┣━━━━━┫
10 | ┗━━━━━┛     ┗━━━━━┛


--------------------------------------------------------------------------------
/src/TransitionHyperParam.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef TRANSITIONSHYPERPARAM_HPP
 2 | #define TRANSITIONSHYPERPARAM_HPP
 3 | 
 4 | #include "Tags.hpp"
 5 | 
 6 | 
 7 | template <typename ParamType>
 8 | using TransitionHyperParam = Conjugate<ParamType>;
 9 | 
10 | 
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/src/Blocks.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef BLOCKS_HPP
 2 | #define BLOCKS_HPP
 3 | 
 4 | 
 5 | template<typename T>
 6 | class Blocks;
 7 | 
 8 | #include "Blocks/BreakpointArray.hpp"
 9 | #include "Blocks/SplittableBlocks.hpp"
10 | #include "Blocks/FixedBlocks.hpp"
11 | 
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/lib/gzstream/README:
--------------------------------------------------------------------------------
1 | 
2 |                               gzstream
3 |       C++ iostream classes wrapping the zlib compression library.
4 | ===========================================================================
5 | 
6 |     See index.html for documentation and installation instructions.
7 | 


--------------------------------------------------------------------------------
/bin/sortStates:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Sort states by the absolute value of their last emission means, largest first.
#
# $1 - parameter file created by HaMMLET; only its last line is read.
#      Without an argument the input is taken from stdin.
#
# Output: the header "#state<TAB>mean", then one "<state><TAB><mean>" line per
# state, where <state> is the 0-based index of the mean among the odd-numbered
# fields of the last line and <mean> is printed exactly as it appears there.

echo "#state	mean"

# 1. tail: keep only the last line of the input
# 2. awk:  for every odd-numbered field emit "<state>\t<value>\t<|value|>"; the
#          absolute value is computed numerically, so exponent signs such as in
#          "1e-05" are left alone, and it is printed in fixed-point notation
#          because "sort -n" does not understand exponents
# 3. sort: numerically descending on the absolute value
# 4. cut:  drop the helper column again
tail -n 1 ${1:+"$1"} \
	| awk '{ for (i = 1; i <= NF; i += 2) { v = $i + 0; if (v < 0) v = -v; printf "%d\t%s\t%.17f\n", (i - 1) / 2, $i, v } }' \
	| sort -t "$(printf '\t')" -k 3,3nr \
	| cut -f 1,2


--------------------------------------------------------------------------------
/src/Statistics.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef STATISTICS_HPP
 2 | #define STATISTICS_HPP
 3 | 
 4 | #include "SufficientStatistics.hpp"
 5 | 
 6 | template<typename T, typename StatsType>
 7 | class Statistics;
 8 | 
 9 | #include "Blocks.hpp"
10 | 
11 | #include "Statistics/IntegralArray.hpp"
12 | // #include "Statistics/Fixed.hpp"
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/src/InitialHyperParam.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef INITIALHYPERPARAM_HPP
 2 | #define INITIALHYPERPARAM_HPP
 3 | 
 4 | #include "includes.hpp"
 5 | #include "Tags.hpp"
 6 | #include "Conjugate.hpp"
 7 | 
 8 | 
 9 | // for the time being, tau is an alias for conjugates
10 | template <typename ParamType>
11 | using InitialHyperParam = Conjugate<ParamType>;
12 | 
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/logo/logo-boxdrawing-centered.groff:
--------------------------------------------------------------------------------
 1 | .PP
 2 | .ce
 3 | ┏━━━━━┓     ┏━━━━━┓ 
 4 | .ce
 5 | ┣━━━━━┫     ┃ ┏━━━┫
 6 | .ce
 7 | ┃ ┏━━━┫     ┃ ┃ ┏━┫
 8 | .ce
 9 | ┃ ┃ ┏━┻━━━━━┫ ┃ ┃ ┃
10 | .ce
11 | ┃ ┃ ┃ ┏━━━━━┫ ┃ ┃ ┃
12 | .ce
13 | ┃ ┃ ┃ ┣━━━━━┛ ┃ ┃ ┃
14 | .ce
15 | ┃ ┃ ┃ ┣━━━━━┳━┛ ┃ ┃
16 | .ce
17 | ┣━┛ ┃ ┃     ┣━━━┛ ┃
18 | .ce
19 | ┣━━━┛ ┃     ┣━━━━━┫
20 | .ce
21 | ┗━━━━━┛     ┗━━━━━┛ 
22 | 


--------------------------------------------------------------------------------
/doc/man-preamble.tex:
--------------------------------------------------------------------------------
 1 | \usepackage[charter]{mathdesign} 
 2 | \setlength{\columnsep}{1cm}
 3 | \usepackage{microtype}
 4 | \DisableLigatures[-]{}	% disable replacement of --
 5 | \usepackage{breakurl}
 6 | \usepackage{graphicx}
 7 | 
 8 | \usepackage{eso-pic}
 9 | \newcommand\AtPageUpperRight[1]{\AtPageUpperLeft{\makebox[\paperwidth][r]{#1}}}
10 | \AddToShipoutPictureBG{
11 | 	\AtPageUpperRight{\raisebox{-\height}{\includegraphics[width=0.6in, keepaspectratio]{logo/logo-inv.png}}}
12 | }
13 | 
14 | 
15 | \hyphenation{white-space}
16 | 


--------------------------------------------------------------------------------
/bin/pyhammlet/palette.txt:
--------------------------------------------------------------------------------
 1 | #a6cee3
 2 | #1f78b4
 3 | #b2df8a
 4 | #33a02c
 5 | #fb9a99
 6 | #e31a1c
 7 | #fdbf6f
 8 | #ff7f00
 9 | #cab2d6
10 | #6a3d9a
11 | #ffff99
12 | #b15928
13 | #437e6a
14 | #ba78c6
15 | #694d48
16 | #9d840b
17 | #306928
18 | #ba768c
19 | #849b6d
20 | #698af8
21 | #6bf9df
22 | #39fb3f
23 | #bb32a7
24 | #ebca0c
25 | #16b0c9
26 | #8f94b9
27 | #3da88a
28 | #726441
29 | #f0d4c0
30 | #f6208f
31 | #6047f7
32 | #d21b4f
33 | #d0dcca
34 | #c96f63
35 | #6d5d7b
36 | #bba09a
37 | #a0ad3d
38 | #8a3565
39 | #778884
40 | #b99164
41 | #236575
42 | #6c8117
43 | #817577
44 | #f893d0
45 | #525952
46 | #f3edfe
47 | #2cd58b
48 | #27a0ea
49 | #3a5680
50 | #fd28ff
51 | #933b30
52 | #03e8ff
53 | #cdc190
54 | #7d6bb7
55 | #7a4a0f
56 | #8fc0b8


--------------------------------------------------------------------------------
/doc/hammlet.bib:
--------------------------------------------------------------------------------
 1 | @article {hammlet,
 2 | 	author = {Wiedenhoeft, John and Brugel, Eric and Schliep, Alexander},
 3 | 	title = {Fast Bayesian Inference of Copy Number Variants using Hidden Markov Models with Wavelet Compression},
 4 | 	year = {2016},
 5 | 	doi = {10.1371/journal.pcbi.1004871},
 6 | 	journal = {PLOS Computational Biology},
 7 | 	volume={12},
 8 | 	number={5},
 9 | 	url={http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004871}
10 | }
11 | 
12 | @inproceedings{hammletrecomb,
13 | 	author = {Wiedenhoeft, John and Brugel, Eric and Schliep, Alexander},
14 | 	title = {Fast Bayesian Inference of Copy Number Variants using Hidden Markov Models with Wavelet Compression},
15 | 	year = {2016},
16 | 	booktitle={Research in Computational Molecular Biology: 20th Annual Conference, RECOMB 2016},
17 | 	isbn={9783319319575}
18 | } 
19 | 


--------------------------------------------------------------------------------
/src/tools/avg.cpp:
--------------------------------------------------------------------------------
 1 | // reads data stream and puts average of non-overlapping windows to stdout
 2 | #include <iostream>
 3 | using std::ostream;
 4 | using std::endl;
 5 | using std::cin;
 6 | using std::cout;
 7 | using std::flush;
 8 | 
 9 | 
10 | #include <stdexcept>
11 | using std::runtime_error;
12 | using std::exception;
13 | 
14 | #include <sstream>
15 | using std::istringstream;
16 | 
// Reads numbers from stdin and writes the average of each consecutive,
// non-overlapping window of values to stdout, one average per line. A
// trailing, incomplete window is averaged over the values it contains.
//
// argv[1] - window size, a non-negative integer. With a window size of 0 the
//           window-full test below practically never fires, so one average
//           over the whole input is printed.
//
// Throws runtime_error if the window size is missing or is not a
// non-negative integer.
int main( int argc, const char* argv[] ) {

	if ( argc <= 1 ) {
		throw runtime_error( "Not enough arguments!" );
	}

	istringstream ss( argv[1] );
	size_t windowSize = 0;
	// Extraction into an unsigned type accepts a leading minus sign and wraps
	// the value around, so negative input has to be rejected explicitly.
	ss >> std::ws;
	if ( ss.peek() == '-' || !( ss >> windowSize ) ) {
		throw runtime_error( "Window size must be a non-negative integer!" );
	}

	float v;
	float sum = 0;
	size_t pos = 0;	// number of values in the current window
	while ( cin >> v ) {
		sum += v;
		pos++;
		if ( pos == windowSize ) {
			// window is full: emit its average and start a new one
			cout << sum / pos << endl;
			pos = 0;
			sum = 0;
		}
	}

	// flush the last, partially filled window
	if ( pos != 0 ) {
		cout << sum / pos << endl;
	}

	return 0;
}
43 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![HaMMLET](https://github.com/wiedenhoeft/HaMMLET/blob/dev/logo/logo-inv-noborder.png)
 2 | 
 3 | HaMMLET – Fast Bayesian HMM segmentation for big data
 4 | =====================================================
 5 | 
 6 | This software implements Forward-Backward Gibbs sampling for Bayesian segmentation in Hidden Markov Models (HMM). It uses dynamic wavelet compression to drastically improve convergence and memory consumption, making inference possible on large-scale data. 
 7 | 
 8 | For instance, HaMMLET can be used on a regular laptop for segmentation of genomic data, such as array-CGH or depth-of-coverage from whole-genome sequencing (WGS), to find candidates for copy-number variants (CNV). For details, please refer to the doc/ directory.
 9 | 
10 | For implementation details and the theory behind this approach, please refer to my [thesis](https://rucore.libraries.rutgers.edu/rutgers-lib/59275/) (DOI: [10.7282/t3-4e1k-ph18](https://doi.org/doi:10.7282/t3-4e1k-ph18)).
11 | 


--------------------------------------------------------------------------------
/src/Initial.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef INITIAL_HPP
 2 | #define INITIAL_HPP
 3 | 
 4 | 
 5 | #include "includes.hpp"
 6 | #include "Tags.hpp"
 7 | #include "Distribution.hpp"
 8 | #include "InitialHyperParam.hpp"
 9 | #include "StateSequence.hpp"
10 | #include "Transitions.hpp"
11 | 
12 | template <typename  DistType>	// e.g. Dirichlet
13 | class Initial {
14 | 
15 | 		Observation<DistType> mValue;
16 | 		SufficientStatistics< Categorical > mCounts;
17 | 		Distribution<DistType> mDist;
18 | 
19 | 	public:
20 | 
21 | 		// delete copy constructor
22 | 		Initial( const Initial& that ) = delete;
23 | 
24 | 		Initial( size_t nrStates, rng_t& RNG ) :
25 | 			mValue( nrStates ),
26 | 			mCounts( nrStates ),
27 | 			mDist( RNG ) {}
28 | 
29 | 		Initial( vector< real_t>& vec, rng_t& RNG ) :
30 | 			mValue( vec ),
31 | 			mDist( RNG ) {};
32 | 
33 | 
34 | 		template<typename InitialHyperParamType>
35 | 		void sample(
36 | 		    InitialHyperParamType& tau_pi // NOTE tau_pi cannot be constant since we
37 | 		) {
38 | 			mDist.resample( mValue, tau_pi.posterior() );
39 | 			tau_pi.reset();
40 | 		}
41 | 
42 | 
43 | 
44 | 		vector<real_t> valueVector() const {	// NOTE this is intermediate level is necessary, since dist might be a more complicated structure than a simple probability vector itself, e.g. when using Dirichlet process priors
45 | 			return mValue.probs();
46 | 		};
47 | 
48 | 		size_t nrStates() const {
49 | 			return mValue.domainSize();
50 | 		}
51 | 
52 | 
53 | 		string str() const {
54 | 			return mValue.str();
55 | 		}
56 | 		
57 | 
58 | };
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/KahanAggregator.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef KAHANAGGREGATOR_HPP
 2 | #define KAHANAGGREGATOR_HPP
 3 | 
 4 | // This class aggregates values into a sum, while keeping track of the error using Kahan's algorithm. Subtractions are stored separately so that only a single subtraction is used when calling sum()
 5 | template <typename T>
 6 | class KahanAggregator {
 7 | 		T mPosSum;
 8 | 		T mNegSum;
 9 | 		T mPosError;
10 | 		T mNegError;
11 | 		size_t mN;	// number of terms
12 | 
13 | 
14 | 	public:
15 | 
16 | 		// TODO operator+, operator()
17 | 
18 | 		KahanAggregator():
19 | 			mPosSum( 0 ),
20 | 			mNegSum( 0 ),
21 | 			mPosError( 0 ),
22 | 			mNegError( 0 ),
23 | 			mN( 0 ) {}
24 | 
25 | 
26 | 		void add( T x, size_t N = 1 ) {
27 | 			T y = x - mPosError;
28 | 			T temp = mPosSum + y;
29 | 			mPosError = ( temp - mPosSum ) - y;
30 | 			mPosSum = temp;
31 | 			mN += N;
32 | 		}
33 | 
34 | 
35 | 		void subtract( T x, size_t N = 1 ) {
36 | 			T y = x - mNegError;
37 | 			T temp = mNegSum + y;
38 | 			mNegError = ( temp - mNegSum ) - y;
39 | 			mNegSum = temp;
40 | 			mN += N;
41 | 		}
42 | 
43 | 		T sum() const {
44 | 			return mPosSum - mNegSum;
45 | 		}
46 | 
47 | 		T error() const {
48 | 			return mPosError + mNegError;	// TODO minus?
49 | 		}
50 | 
51 | 		size_t nrTerms() const {
52 | 			return mN;
53 | 		}
54 | 
55 | 		void setNrTerms( size_t N ) {
56 | 			mN = N;
57 | 		}
58 | 
59 | 		void reset() {
60 | 			mPosSum = T( 0 );
61 | 			mPosError = T( 0 );
62 | 			mNegSum = T( 0 );
63 | 			mNegError = T( 0 );
64 | 			mN = 0;
65 | 		}
66 | };
67 | 
68 | #endif
69 | 
70 | 


--------------------------------------------------------------------------------
/src/Trellis.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef TRELLIS_HPP
 2 | #define TRELLIS_HPP
 3 | 
 4 | #include <stdexcept>
 5 | 
 6 | #include "uintmath.hpp"
 7 | 
 8 | class Trellis {
 9 | 
10 | 		vector<real_t> mVec;
11 | 		size_t mNrStates;
12 | 		rng_t& mRNG;
13 | 
14 | 		void assertRange( size_t d ) const {
15 | 			if ( d >= mNrStates ) {
16 | 				throw runtime_error( "Trellis dimension index out of bounds!" );
17 | 			}
18 | 		}
19 | 	public:
20 | 
21 | 		// delete copy constructor
22 | 		Trellis( const Trellis& that ) = delete;
23 | 
24 | 		Trellis( rng_t& RNG ):
25 | 			mNrStates( 2 ) ,
26 | 			mRNG( RNG )
27 | 		{};
28 | 
29 | 		Trellis( size_t nrStates, rng_t& RNG ):
30 | 			mNrStates( nrStates ) ,
31 | 			mRNG( RNG )
32 | 		{};
33 | 
34 | 
35 | 		real_t& operator()( size_t t, size_t d ) {
36 | 			assertRange( d );
37 | 			return mVec[t * mNrStates + d];
38 | 		}
39 | 
40 | 
41 | 		// return reference to last element at dimension d
42 | 		real_t& back( size_t d ) {
43 | 			return mVec[mVec.size() - mNrStates + d];
44 | 		}
45 | 
46 | 
47 | 		// this interface is for cases when the number of states is not known beforehand, e.g. for Dirichlet Process Priors
48 | 		void setNrStates( size_t K ) {
49 | 			mNrStates = K;
50 | 		}
51 | 
52 | 		size_t size() const {
53 | 			return divide( mVec.size(), mNrStates );
54 | 		}
55 | 
56 | 
57 | 		void push_back( const vector<real_t>& vec ) {
58 | 			mVec.insert( mVec.end(), vec.begin(), vec.end() );
59 | 		}
60 | 
61 | 		size_t sample( size_t t ) const {
62 | 			discrete_distribution<size_t> dist( mVec.begin() + ( t * mNrStates ), mVec.begin() + ( ( t + 1 )*mNrStates ) );
63 | 			size_t result = dist( mRNG );
64 | 			dist.reset();
65 | 			return result;
66 | 		}
67 | 
68 | 		void reserve( size_t N ) {
69 | 			mVec.reserve( N * mNrStates );
70 | 		}
71 | 
72 | 
73 | 		void clear() {
74 | 			mVec.clear();
75 | 		}
76 | };
77 | 
78 | 
79 | #endif
80 | 


--------------------------------------------------------------------------------
/src/tools/maxSegmentation.cpp:
--------------------------------------------------------------------------------
 1 | // Given a file of state marginals, compute the maximum posterior margins.
 2 | 
 3 | #include "../Parser.hpp"
 4 | 
 5 | 
 6 | #include <iostream>
 7 | using std::cin;
 8 | using std::cout;
 9 | using std::endl;
10 | using std::istream;
11 | using std::ostream;
12 | 
13 | #include <fstream>
14 | using std::ifstream;
15 | using std::ofstream;
16 | 
17 | #include <string>
18 | using std::stoi;
19 | using std::getline;
20 | 
21 | #include <stdexcept>
22 | using std::runtime_error;
23 | 
24 | #include <sstream>
25 | using std::stringstream;
26 | 
27 | int main( int argc, const char* argv[] ) {
28 | 
29 | 	Parser args( argc, argv );
30 | 	args.registerFlags( {"-i", "-infile"}, "" );
31 | 	args.registerFlags( {"-h", "--help", "-help"}, "" );
32 | 	args.parseArgs();
33 | 
34 | 	if ( args.isSet( "-h" ) ) {
35 | 		cout << "Given a marginals file (-i) or input from STDIN, computes the maximum posterior margins segmentation, combining adjacent segments whenever possible." << endl;
36 | 		return 0;
37 | 	}
38 | 
39 | 	const bool readFromFile = args.isSet( "-i" );
40 | 	ifstream realInFile;
41 | 	if ( readFromFile ) {
42 | 		realInFile.open( args.parse<string>( "-i" ), std::ios::in );
43 | 	}
44 | 	istream& inFile = ( readFromFile ? realInFile : cin );
45 | 
46 | 	string line;
47 | 	size_t count;
48 | 	size_t RLE = 0;
49 | 	size_t totalRLE = 0;
50 | 	size_t col = 0;
51 | 	size_t maxCol = 0;
52 | 	size_t index = 0;
53 | 	size_t maxIndex = 0;
54 | 	size_t prevIndex = 0;
55 | 
56 | 	while ( getline( inFile, line ) ) {
57 | 		stringstream ss( line );
58 | 
59 | 		index = 0;
60 | 		maxIndex = 0;
61 | 		col=0;
62 | 		maxCol = 0;
63 | 		ss >> RLE;
64 | 		while ( ss >> col ) {
65 | 			if ( col > maxCol ) {
66 | 				maxIndex = index;
67 | 				maxCol = col;
68 | 			}
69 | 			index++;
70 | 		}
71 | 
72 | 		if ( maxIndex == prevIndex ) {
73 | 			totalRLE += RLE;
74 | 		} else {
75 | 			cout << totalRLE << "\t" << prevIndex << endl;
76 | 			totalRLE = RLE;
77 | 			prevIndex = maxIndex;
78 | 		}
79 | 
80 | 	}
81 | 	cout << totalRLE << "\t" << maxIndex << endl;
82 | 
83 | }
84 | 


--------------------------------------------------------------------------------
/src/MultiVector.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef MULTIVECTOR_HPP
  2 | #define MULTIVECTOR_HPP
  3 | 
  4 | #include <vector>
  5 | using std::vector;
  6 | 
  7 | #include <stdexcept>
  8 | using std::runtime_error;
  9 | 
 10 | template <typename T>
 11 | class MultiVector {
 12 | 		vector<T> mVec;
 13 | 		const size_t mNrDim;
 14 | 
 15 | 	public:
 16 | 
 17 | 
 18 | 		MultiVector(
 19 | 		    size_t nrDim
 20 | 		):
 21 | 			mNrDim( nrDim ) {
 22 | 			if ( mNrDim <= 0 ) {
 23 | 				throw runtime_error( "Number of dimensions in multivector must be positive!" );
 24 | 			}
 25 | 		}
 26 | 
 27 | 		MultiVector(
 28 | 		    T entry,
 29 | 		    size_t size,
 30 | 		    size_t nrDim
 31 | 		):
 32 | 			mVec( entry, nrDim* size ),
 33 | 			mNrDim( nrDim ) {
 34 | 			if ( mNrDim <= 0 ) {
 35 | 				throw runtime_error( "Number of dimensions in multivector must be positive!" );
 36 | 			}
 37 | 		}
 38 | 
 39 | 
 40 | 		// direct access to underlying vector
 41 | 		inline T& operator[]( size_t i ) {
 42 | 			if ( i >= mVec.size() ) {
 43 | 				throw runtime_error( "Direct index out of bounds for multivector!" );
 44 | 			}
 45 | 			return mVec[i];
 46 | 		}
 47 | 
 48 | 		inline const T& operator[]( size_t i ) const {
 49 | 			if ( i >= mVec.size() ) {
 50 | 				throw runtime_error( "Direct index out of bounds for multivector!" );
 51 | 			}
 52 | 			return mVec[i];
 53 | 		}
 54 | 
 55 | 
 56 | 		inline T& operator()( size_t pos, size_t dim ) {
 57 | 			if ( dim >= mNrDim ) {
 58 | 				throw runtime_error( "Multivector dimension index out of bounds!" );
 59 | 			}
 60 | 			const size_t i = pos * mNrDim + dim;
 61 | 			if ( i < mVec.size() ) {
 62 | 				return mVec[i];
 63 | 			} else {
 64 | 				throw runtime_error( "Multivector index out of bounds!" );
 65 | 			}
 66 | 		}
 67 | 
 68 | 		size_t size() const {
 69 | 			return mVec.size() / mNrDim;
 70 | 		}
 71 | 
 72 | 		void reserve( size_t size ) {
 73 | 			mVec.reserve( mNrDim * size );
 74 | 		}
 75 | 
 76 | 
 77 | 		void resize( size_t N ) {
 78 | 			mVec.resize( N * mNrDim );
 79 | 		}
 80 | 
 81 | 
 82 | 		void push_back( T& entry ) {
 83 | 			mVec.reserve( mVec.size() + mNrDim );
 84 | 			for ( size_t d = 0; d < mNrDim; ++d ) {
 85 | 				mVec.push_back( entry );
 86 | 			}
 87 | 		}
 88 | 
 89 | 
 90 | 		void swap( vector<T>& vec ) {
 91 | 			const size_t s = mVec.size();
 92 | 			if ( s != ( s / mNrDim )*mNrDim ) {
 93 | 				throw runtime_error( "Cannot swap into multivector, size is not a multiple of dimensions!" );
 94 | 			}
 95 | 			mVec.swap( vec );
 96 | 		}
 97 | 
 98 | 		size_t nrDim()const {
 99 | 			return mNrDim;
100 | 		}
101 | };
102 | 
103 | #endif
104 | 


--------------------------------------------------------------------------------
/src/Blocks/FixedBlocks.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef FIXEDBLOCKS_HPP
  2 | #define FIXEDBLOCKS_HPP
  3 | 
  4 | template<>
  5 | class Blocks<Fixed> {
  6 | 		// number of input positions
  7 | 		const size_t mSize;
  8 | 		const vector<size_t>& mSizes;
  9 | 
 10 | 		size_t mBlockCounter;
 11 | 		// the maximum size that an iterator can jump forward (pruning limit)
 12 | 		Direction mDirection;
 13 | 
 14 | 		// the boundaries of the current block
 15 | 		size_t mBlockStart;
 16 | 		size_t mBlockEnd;
 17 | 		size_t mBlockSize;
 18 | 
 19 | 	public:
 20 | 
 21 | 		// delete copy constructor
 22 | 		Blocks( const Blocks& that ) = delete;
 23 | 
 24 | 
 25 | 		// NOTE this constructor swaps its input vectors, i.e. they are empty outsize of this class
 26 | 		Blocks(
 27 | 		    vector<size_t>& sizes
 28 | 		) :
 29 | 			mSizes( sizes ),
 30 | 			mBlockCounter( 0 ),
 31 | 			mSize( accumulate( sizes.begin(), sizes.end(), 0 ) ),
 32 | 			mDirection( unset ),
 33 | 			mBlockStart( 0 ),
 34 | 			mBlockEnd( 0 ) {
 35 | 
 36 | 			// check that weights contain data
 37 | 			if ( mSize <= 0 ) {
 38 | 				throw runtime_error( "Input vector for breakpoint weights is empty!" );
 39 | 			}
 40 | 		};
 41 | 
 42 | 
 43 | 		template<typename ParamType>
 44 | 		void createBlocks( Theta<ParamType>& param );
 45 | 
 46 | 		void initForward() {
 47 | 			mDirection = forward;
 48 | 			mBlockStart = 0;
 49 | 			mBlockEnd = 0;
 50 | 			mBlockSize = 0;
 51 | 			mBlockCounter = 0;
 52 | 			// initialize block at 0
 53 | 		}
 54 | 
 55 | 
 56 | 
 57 | 		// get the end of a block starting at <start> for a given threshold
 58 | 		// return false if the block end is the last possible value
 59 | 		inline bool next() {
 60 | 			if ( mBlockEnd >= mSize ) {
 61 | 				mDirection = unset;
 62 | 				return false;
 63 | 			} else {
 64 | 				mBlockStart = mBlockEnd;
 65 | 				mBlockEnd = mBlockStart + mSizes[mBlockCounter];
 66 | 				mBlockCounter++;
 67 | 				mBlockSize = mBlockEnd - mBlockStart;
 68 | 				return true;
 69 | 			}
 70 | 		}
 71 | 
 72 | 
 73 | 		size_t nrBlocks() const {
 74 | 			return mSizes.size();
 75 | 		}
 76 | 
 77 | 		size_t start() const {
 78 | 			return mBlockStart;
 79 | 		}
 80 | 
 81 | 		size_t end() const {
 82 | 			return mBlockEnd;
 83 | 		}
 84 | 
 85 | 		size_t pos() const {
 86 | 			if ( mBlockCounter > 0 ) {
 87 | 				return mBlockCounter - 1;
 88 | 			} else {
 89 | 				throw runtime_error( "No blocks created yet, position is undefined!" );
 90 | 			}
 91 | 		}
 92 | 
 93 | 		size_t N() const {
 94 | 			return mBlockSize ;
 95 | 		}
 96 | 
 97 | 		size_t T() const {
 98 | 			return mSize;
 99 | 		}
100 | 
101 | 		void printBlock() const {
102 | 			cout << "[" << mBlockStart << ":" << mBlockEnd << ") " << mBlockSize << " ";
103 | 		}
104 | 
105 | 
106 | };
107 | 
108 | #endif
109 | 


--------------------------------------------------------------------------------
/src/Emissions.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef EMISSIONS_HPP
  2 | #define EMISSIONS_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | #include "Tags.hpp"
  6 | #include "Blocks.hpp"
  7 | #include "Statistics.hpp"
  8 | 
  9 | 
 10 | // A wrapper around a combination of a data structure holding data points/sufficient statistics and an associated block structure
 11 | template<typename S, typename T, typename B>
 12 | class Emissions<Statistics<S, T>, Blocks<B>> {
 13 | 
 14 | 		Statistics<S, T>& mStats;
 15 | 		Blocks<B>& mBlocks;
 16 | 
 17 | 	public:
 18 | 
 19 | 		Emissions(
 20 | 		    Statistics<S, T>& stats,
 21 | 		    Blocks<B>& blocks
 22 | 		):
 23 | 			mStats( stats ),
 24 | 			mBlocks( blocks ) {
 25 | 			if ( mStats.size() != mBlocks.size() ) {
 26 | 				throw runtime_error( "Block structure and statistics have different number of data points!" );
 27 | 			}
 28 | 		}
 29 | 
 30 | 
 31 | 		Statistics<S, T>& stats()  {
 32 | 			return mStats;
 33 | 		}
 34 | 
 35 | 		Blocks<B>& blocks()  {
 36 | 			return mBlocks;
 37 | 		}
 38 | 
 39 | 
 40 | 		const Statistics<S, T>& stats() const {
 41 | 			return mStats;
 42 | 		}
 43 | 
 44 | 		const Blocks<B>& blocks() const {
 45 | 			return mBlocks;
 46 | 		}
 47 | 
 48 | 
 49 | 
 50 | 		void createBlocks( real_t thresh ) {
 51 | 			mBlocks.createBlocks( thresh );
 52 | 		}
 53 | 
 54 | 		template <typename ParamType>
 55 | 		void createBlocks( const Theta<ParamType>& theta ) {
 56 | 			mBlocks.createBlocks( theta );
 57 | 		}
 58 | 
 59 | 
 60 | 		size_t nrBlocks() const {
 61 | 			return mBlocks.nrBlocks();
 62 | 		}
 63 | 
 64 | 		size_t nrDim() const {
 65 | 			return mStats.nrDim();
 66 | 		}
 67 | 
 68 | 		size_t start() const {
 69 | 			return mBlocks.start();
 70 | 		}
 71 | 
 72 | 		size_t end() const {
 73 | 			return mBlocks.end();
 74 | 		}
 75 | 
 76 | 		size_t blockSize() const {
 77 | 			return mBlocks.blockSize();
 78 | 		}
 79 | 
 80 | 		size_t size() const {
 81 | 			return mBlocks.size();
 82 | 		}
 83 | 
 84 | 		void initForward() {
 85 | 			mBlocks.initForward();
 86 | 		}
 87 | 
 88 | 		bool next() {
 89 | 			if ( mBlocks.next() ) {
 90 | 				mStats.setStats( mBlocks );
 91 | 				return true;
 92 | 			} else {
 93 | 				return false;
 94 | 			}
 95 | 		}
 96 | 
 97 | 
 98 | 		const SufficientStatistics<T>& suffStat( size_t dim ) const {
 99 | 			return mStats.suffStat( dim );
100 | 		}
101 | 
102 | 
103 | 		template<typename StateSequenceType, typename HyperParamType>
104 | 		void aggregateStatistics(
105 | 		    const StateSequenceType& q,
106 | 		    HyperParamType& tau_theta,
107 | 		    const Mapping& mapping,
108 | 		    const size_t ignoreBlockSize = 1
109 | 		) {
110 | 			mStats.aggregateStatistics( q, mBlocks, tau_theta, mapping, ignoreBlockSize );
111 | 		}
112 | };
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | #endif
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/src/tools/MappedValues.hpp:
--------------------------------------------------------------------------------
  1 | // Values mapped to a genome, such as read-depth, start counts etc.
  2 | 
  3 | #include <vector>
  4 | using std::vector;
  5 | 
  6 | 
  7 | #include <string>
  8 | using std::string;
  9 | 
 10 | #include <map>
 11 | using std::map;
 12 | 
 13 | #include <stdexcept>
 14 | using std::runtime_error;
 15 | 
 16 | #include <algorithm>
 17 | using std::sort;
 18 | 
 19 | 
 20 | template <typename T>
 21 | class MappedValueEntry {
 22 | 	public:
 23 | 		size_t pos;
 24 | 		T entry;
 25 | 
 26 | 
 27 | 		MappedValueEntry(): pos( 0 ), entry() {}
 28 | 
 29 | 		MappedValueEntry( size_t p, T v ): pos( p ), entry( v ) {}
 30 | 
 31 | 
 32 | 		bool operator<( const MappedValueEntry<T>& b ) const {
 33 | 			return pos < b.pos;
 34 | 		}
 35 | 
 36 | 
 37 | 		bool operator>( const MappedValueEntry<T>& b ) const {
 38 | 			return pos > b.pos;
 39 | 		}
 40 | 
 41 | 		MappedValueEntry<T>& operator+=( const MappedValueEntry<T>& rhs ) {
 42 | 			if ( pos != rhs.pos ) {
 43 | 				throw runtime_error( "Cannot add values, positions don't match!" );
 44 | 			}
 45 | 			entry += rhs.entry;
 46 | 			return *this;
 47 | 		}
 48 | 
 49 | 		const MappedValueEntry<T> operator+( const MappedValueEntry<T>& other ) const {
 50 | 			return MappedValueEntry<T>( *this ) += other;
 51 | 		}
 52 | 
 53 | 		MappedValueEntry<T>& operator-=( const MappedValueEntry<T>& rhs ) {
 54 | 			if ( pos != rhs.pos ) {
 55 | 				throw runtime_error( "Cannot add values, positions don't match!" );
 56 | 			}
 57 | 			entry -= rhs.entry;
 58 | 			return *this;
 59 | 		}
 60 | 
 61 | 		const MappedValueEntry<T> operator-( const MappedValueEntry<T>& other ) const {
 62 | 			return MappedValueEntry<T>( *this ) -= other;
 63 | 		}
 64 | };
 65 | 
 66 | 
 67 | template<typename T>
 68 | void sortAddAndCompress(
 69 |     vector<MappedValueEntry<T>>& vec
 70 | ) {
 71 | 	sort( vec.begin(), vec.end() );
 72 | 	size_t L = 0;
 73 | 	size_t R = 1;
 74 | 	while ( R < vec.size() ) {
 75 | 		if ( vec[L].pos == vec[R].pos )  {
 76 | 			vec[L].entry += vec[R].entry;
 77 | 		} else {
 78 | 			++L;
 79 | 			vec[L] = vec[R];
 80 | 		}
 81 | 		++R;
 82 | 	}
 83 | 	vec.resize( L + 1 );
 84 | }
 85 | 
 86 | template<typename T>
 87 | void sortMultiplyAndCompress(
 88 |     vector<MappedValueEntry<T>>& vec
 89 | ) {
 90 | 	sort( vec.begin(), vec.end() );
 91 | 	size_t L = 0;
 92 | 	size_t R = 1;
 93 | 	while ( R < vec.size() ) {
 94 | 		if ( vec[L].pos == vec[R].pos )  {
 95 | 			vec[L].entry *= vec[R].entry;
 96 | 		} else {
 97 | 			++L;
 98 | 			vec[L] = vec[R];
 99 | 		}
100 | 		++R;
101 | 	}
102 | 	vec.resize( L + 1 );
103 | }
104 | 
105 | 
106 | 
// Container for per-refseq lists of (position, value) entries.
// NOTE(review): this class is an unfinished stub. All members are private (class default)
// and update() has an empty body, so the class currently offers no usable interface.
template <typename T>
class MappedValues {
		map<string, vector<MappedValueEntry<T>>> mEntries;	// map refseq ID to vector of (pos, T) tuples


		// Placeholder: does nothing yet.
		void update( const MappedValues<T>& other ) {};

		// TODO add(), subtract(bool removeZero=false)
};
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/bin/samToCounts:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Coverage information is extracted from SAM file. Three files are created:
 4 | #
 5 | # *-count.csv.gz contains the counts for each position in the genome, ordered by refseq name (chromosome name)
 6 | # *-pos.csv.gz contains the corresponding positions within each chromosome.
# *-size.csv contains 3 columns: the names of the refseqs in the order produced by version sort (sort -V), the number of mapped start positions (leftmost), as well as the cumulative sum of the second column (easier for stream processing, and guarantees that the refseq order can always be restored). This corresponds to the lines in the other two files. For instance, "chr1 1000 1000" means that the first 1000 counts and positions come from chromosome 1. Using this file, each line in the pos file can be assigned its chromosome, and hence each count can be mapped uniquely to its genomic position.
 8 | 
 9 | # to handle uniq's weird output: remove leading whitespace, merge multiple spaces into one and replace spaces by tabs
10 | shopt -s expand_aliases
11 | 
# Collapse adjacent duplicate input lines with `uniq -c` and emit "count<TAB>line".
# uniq right-aligns the count in a fixed-width column, so the leading padding is
# optional in the pattern: a count wide enough to fill the column has no leading
# whitespace and must still get its separator converted to a tab.
# Relies on GNU sed extensions (\s, \w, \+ and \t).
function tabuniq(){
	uniq -c | sed -e 's/^\s*\(\w\+\) /\1\t/'
}
15 | 
16 | 
# set I/O
if [ "$#" -lt 2 ]; then
	echo "Usage: $0 <sam-or-bam-file> <output-prefix> [filterbits] [sortdir]" >&2
	exit 1
fi
samfile=$1	# or bamfile
outprefix=$2
# set the following bits to be ignored:
# 4 	read unmapped
# 256	not primary alignment
# 512	read fails platform/vendor quality checks
# 1024	read is PCR or optical duplicate
# 2048	supplementary alignment
filterbits=${3:-3844}

sortdir=${4:-/tmp}	# directory for temporary sort files


# Create the read-depth using a system of named pipes, for space efficiency:
colfifo="${outprefix}-cols.fifo"
sizefifo="${outprefix}-size.fifo"
mapfifo="${outprefix}-map.fifo"

# remove the pipes whenever the script exits, including after an error or interruption
trap 'rm -f "${colfifo}" "${sizefifo}" "${mapfifo}"' EXIT

# a pipe left over from an interrupted run is reused; anything else in the way is an error
for fifo in "${colfifo}" "${sizefifo}" "${mapfifo}"; do
	[ -p "${fifo}" ] || mkfifo "${fifo}" || exit 1
done

sizeoutfile="${outprefix}-size.csv"
posoutfile="${outprefix}-pos.csv.gz"
countoutfile="${outprefix}-count.csv.gz"


# sort by refseq, then leftmost mapping, and count; if a read maps to the exact same position, only count it once, since this will be due to alternative alignment
( samtools view -F "${filterbits}" "${samfile}" | cut -f 1,3,4 | sort -T "${sortdir}" -k2,2V -k3,3n -k1,1 -u | cut -f 2,3 | tabuniq > "${colfifo}" )&

# split into two pipes: sizefifo will proceed to get number of positions per refseq, the other will contain positions and counts
( cat "${colfifo}" | tee "${sizefifo}" > "${mapfifo}" )&

# write a file containing the refseq and the number of mapped positions
( cut -f 2 "${sizefifo}" | tabuniq | awk '{total += $1; print $2"\t"$1"\t"total}' > "${sizeoutfile}" )&


# write one gzip file containing all counts, and one containing all positions, without refseq. Positions are ordered by refseq as in the file output above.
( cat "${mapfifo}" | tee >( cut -f 3 | gzip -c > "${posoutfile}" ) | cut -f 1 | gzip -c > "${countoutfile}" )&

# block until all background pipelines have finished; the EXIT trap then removes the pipes
wait
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/src/Transitions.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef TRANSITIONS_HPP
  2 | #define TRANSITIONS_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | #include "Tags.hpp"
  6 | #include "TransitionHyperParam.hpp"
  7 | #include "Distribution.hpp"
  8 | #include "Observation.hpp"
  9 | #include "StateSequence.hpp"
 10 | 
 11 | 
 12 | 
 13 | template <typename ParamType>
 14 | ostream& operator<<(
 15 |     ostream& output,
 16 |     const Transitions<ParamType>& D )  {
 17 | 	output << D.str();
 18 | 	return output;
 19 | }
 20 | 
 21 | 
 22 | template <typename DistType> // e.g. DirichletVector
 23 | class Transitions {
 24 | 
 25 | 		size_t mNrStates;
 26 | 		Distribution<DistType> mDist;
 27 | 		Observation<DistType> mValue;
 28 | 		SufficientStatistics<CategoricalVector> mCounts;	// the count matrix TODO are there any cases where this is not CategoricalVector?
 29 | 
 30 | 
 31 | 	public:
 32 | 
 33 | 		// delete copy constructor
 34 | 		Transitions( const Transitions& that ) = delete;
 35 | 
 36 | 		////////// constructors //////////
 37 | 
 38 | 		Transitions( size_t nrStates,
 39 | 		             rng_t& RNG
 40 | 		           ) :
 41 | 			mNrStates( nrStates ),
 42 | 			mDist( RNG ),
 43 | 			mValue( nrStates ),
 44 | 			mCounts( nrStates ) {};
 45 | 
 46 | 		inline const real_t& operator()(
 47 | 		    const size_t from,
 48 | 		    const size_t to ) const {
 49 | 			return mValue( from, to );
 50 | 		};
 51 | 
 52 | 		inline real_t& operator()(
 53 | 		    const size_t from,
 54 | 		    const size_t to ) {
 55 | 			return mValue( from, to );
 56 | 		};
 57 | 
 58 | 
 59 | 
 60 | 
 61 | 		//////////  const methods //////////
 62 | 
 63 | 		size_t nrStates() const {
 64 | 			return mNrStates;
 65 | 		}
 66 | 
 67 | 		string str() const {
 68 | 			return mValue.str();
 69 | 		}
 70 | 
 71 | 
 72 | 		//////////  non-const methods //////////
 73 | 
 74 | 		template<typename TransitionParamType>
 75 | 		void sample(
 76 | 		    TransitionHyperParam<TransitionParamType>& tau_A ) {	// NOTE tau_A cannot be const since we update the parameters
 77 | 			mDist.resample( mValue, tau_A.posterior() );
 78 | 			tau_A.reset();
 79 | 		}
 80 | 
 81 | // 		template<typename TransitionParamType>
 82 | // 		void sample(
 83 | // 		    TransitionParamType& tau_A
 84 | // 		) {
 85 | // 			mDist.resample( mValue,  tau_A.posterior() );
 86 | // 			tau_A.reset();
 87 | // 		}
 88 | 
 89 | 
 90 | };
 91 | 
 92 | 
 93 | 
 94 | // dummy specializations
 95 | 
// For the Dummy distribution type sampling is a no-op: nothing is resampled and tau_A
// is left untouched (in particular it is not reset).
template<> template<typename TransitionParamType>
void Transitions<Dummy>::sample(
    TransitionHyperParam<TransitionParamType>& tau_A ) {	// NOTE tau_A cannot be const since we update the parameters
}
100 | /*
101 | template<> template< typename StateSequenceType,  typename EmissionType, typename InitialType,  typename TransitionParamType>
102 | void Transitions<Dummy>::sample(
103 |     const StateSequenceType& q,
104 |     EmissionType& y,	// NOTE A does not stochastically depend on y; y is only passed because it contains the block sizes which we need for self-transitions
105 |     const InitialType& pi,
106 |     TransitionParamType& tau_A ) {
107 | }*/
108 | 
109 | 
110 | #endif
111 | 
112 | 


--------------------------------------------------------------------------------
/lib/gzstream/test_gunzip.C:
--------------------------------------------------------------------------------
 1 | // ============================================================================
 2 | // gzstream, C++ iostream classes wrapping the zlib compression library.
 3 | // Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
 4 | //
 5 | // This library is free software; you can redistribute it and/or
 6 | // modify it under the terms of the GNU Lesser General Public
 7 | // License as published by the Free Software Foundation; either
 8 | // version 2.1 of the License, or (at your option) any later version.
 9 | //
10 | // This library is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | // Lesser General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU Lesser General Public
16 | // License along with this library; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 | // ============================================================================
19 | //
20 | // File          : test_gunzip.C
21 | // Revision      : $Revision: 1.3 $
22 | // Revision_date : $Date: 2001/10/04 15:09:28 $
23 | // Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
24 | // 
25 | // Short test program reading a file, uncompressing it, and writing it.
26 | // ============================================================================
27 | 
28 | #include <gzstream.h>
29 | #include <iostream>
30 | #include <fstream>
31 | #include <stdlib.h>
32 | 
33 | int main( int argc, char*argv[]) {
34 |     if ( argc != 3) {
35 | 	std::cerr << "Usage: " << argv[0] <<" <in-file> <out-file>\n";
36 | 	return EXIT_FAILURE;
37 |     }
38 |     // check alternate way of opening file
39 |     igzstream    in2;
40 |     in2.open( argv[1]);
41 |     if ( ! in2.good()) {
42 |         std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n";
43 | 	return EXIT_FAILURE;
44 |     }
45 |     in2.close();
46 |     if ( ! in2.good()) {
47 |         std::cerr << "ERROR: Closing file `" << argv[1] << "' failed.\n";
48 | 	return EXIT_FAILURE;
49 |     }
50 |     // now use the shorter way with the constructor to open the same file
51 |     igzstream in(  argv[1]);
52 |     if ( ! in.good()) {
53 |         std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n";
54 | 	return EXIT_FAILURE;
55 |     }
56 |     std::ofstream  out( argv[2]);
57 |     if ( ! out.good()) {
58 |         std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n";
59 | 	return EXIT_FAILURE;
60 |     }
61 |     char c;
62 |     while ( in.get(c))
63 | 	out << c;
64 |     in.close();
65 |     out.close();
66 |     if ( ! in.eof()) {
67 |         std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n";
68 | 	return EXIT_FAILURE;
69 |     }
70 |     if ( ! out.good()) {
71 |         std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n";
72 | 	return EXIT_FAILURE;
73 |     }
74 |     return EXIT_SUCCESS;
75 | }
76 | 
77 | // ============================================================================
78 | // EOF
79 | 


--------------------------------------------------------------------------------
/lib/gzstream/test_gzip.C:
--------------------------------------------------------------------------------
 1 | // ============================================================================
 2 | // gzstream, C++ iostream classes wrapping the zlib compression library.
 3 | // Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
 4 | //
 5 | // This library is free software; you can redistribute it and/or
 6 | // modify it under the terms of the GNU Lesser General Public
 7 | // License as published by the Free Software Foundation; either
 8 | // version 2.1 of the License, or (at your option) any later version.
 9 | //
10 | // This library is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | // Lesser General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU Lesser General Public
16 | // License along with this library; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 | // ============================================================================
19 | //
20 | // File          : test_gzip.C
21 | // Revision      : $Revision: 1.3 $
22 | // Revision_date : $Date: 2001/10/04 15:09:28 $
23 | // Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
24 | // 
25 | // Short test program reading a file, compressing it, and writing it.
26 | // ============================================================================
27 | 
28 | #include <gzstream.h>
29 | #include <iostream>
30 | #include <fstream>
31 | #include <stdlib.h>
32 | 
33 | int main( int argc, char*argv[]) {
34 |     if ( argc != 3) {
35 | 	std::cerr << "Usage: " << argv[0] <<" <in-file> <out-file>\n";
36 | 	return EXIT_FAILURE;
37 |     }
38 |     // check alternate way of opening file
39 |     ogzstream    out2;
40 |     out2.open( argv[2]);
41 |     if ( ! out2.good()) {
42 |         std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n";
43 | 	return EXIT_FAILURE;
44 |     }
45 |     out2.close();
46 |     if ( ! out2.good()) {
47 |         std::cerr << "ERROR: Closing file `" << argv[2] << "' failed.\n";
48 | 	return EXIT_FAILURE;
49 |     }
50 |     // now use the shorter way with the constructor to open the same file
51 |     ogzstream  out( argv[2]);
52 |     if ( ! out.good()) {
53 |         std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n";
54 | 	return EXIT_FAILURE;
55 |     }
56 |     std::ifstream in(  argv[1]);
57 |     if ( ! in.good()) {
58 |         std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n";
59 | 	return EXIT_FAILURE;
60 |     }
61 |     char c;
62 |     while ( in.get(c))
63 | 	out << c;
64 |     in.close();
65 |     out.close();
66 |     if ( ! in.eof()) {
67 |         std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n";
68 | 	return EXIT_FAILURE;
69 |     }
70 |     if ( ! out.good()) {
71 |         std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n";
72 | 	return EXIT_FAILURE;
73 |     }
74 |     return EXIT_SUCCESS;
75 | }
76 | 
77 | // ============================================================================
78 | // EOF
79 | 


--------------------------------------------------------------------------------
/src/tools/GenomeGetter.hpp:
--------------------------------------------------------------------------------
  1 | // Class that reads compressed genome representations and serves as a kind of iterator.
  2 | 
  3 | #include <string>
  4 | using std:: string;
  5 | 
  6 | #include <vector>
  7 | using std::vector;
  8 | 
  9 | #include <fstream>
 10 | using std::ifstream;
 11 | using std::getline;
 12 | 
 13 | #include <sstream>
 14 | using std::stringstream;
 15 | 
 16 | #include <stdexcept>
 17 | using std::runtime_error;
 18 | 
 19 | #include "gzstream.h"
 20 | 
 21 | class GenomeGetter {
 22 | 		string mRefSeq;
 23 | 		string mPrevRefSeq;
 24 | 		size_t mPos;
 25 | 		size_t mPrevPos;
 26 | 
 27 | 		bool mIsNewRefSeq;
 28 | 
 29 | 		// size of the current refseq, and the index of the line within that refseq
 30 | 		size_t mRefSeqSize;
 31 | 		size_t mRefSeqIndex;
 32 | 
 33 | 		// same as above, but cumulative
 34 | 		size_t mTotalSize;
 35 | 		size_t mTotalIndex;
 36 | 
 37 | 		igzstream posfile;
 38 | 		igzstream sizefile;
 39 | 
 40 | 		string line;
 41 | 	public:
 42 | 
 43 | 		// if only a prefix is provided, the input is assumed to consist of 3 files: size, pos, and count
 44 | 		GenomeGetter( const string& prefix ):
 45 | 			mRefSeq( "" ),
 46 | 			mPrevRefSeq( "" ),
 47 | 			mPos( 0 ),
 48 | 			mPrevPos( 0 ),
 49 | 			mRefSeqSize( 0 ),
 50 | 			mRefSeqIndex( 0 ),
 51 | 			mTotalSize( 0 ),
 52 | 			mTotalIndex( 0 ),
 53 | 			mIsNewRefSeq( false ) {
 54 | 			sizefile.open( ( prefix + "-size.csv" ).c_str() );
 55 | 			if ( !sizefile ) {
 56 | 				throw runtime_error( "Cannot read " + prefix + "-size.csv!" );
 57 | 			}
 58 | 			posfile.open( ( prefix + "-pos.csv.gz" ).c_str() );
 59 | 			if ( !posfile ) {
 60 | 				throw runtime_error( "Cannot read " + prefix + "-pos.csv.gz!" );
 61 | 			}
 62 | 		}
 63 | 
 64 | 
 65 | 		bool next() {
 66 | 			stringstream ss( line );
 67 | 			if ( mRefSeqIndex == mRefSeqSize ) { // start a new refseq
 68 | 				mIsNewRefSeq = true;
 69 | 				mPrevRefSeq = mRefSeq;
 70 | 				if ( getline( sizefile,  line ) ) {
 71 | 					ss.str( line );
 72 | 					ss >> mRefSeq;
 73 | 					ss >> mRefSeqSize;
 74 | 					ss >> mTotalSize;	// TODO check that this increased, adds up etc.
 75 | 					mRefSeqIndex = 0;
 76 | 					ss.clear();
 77 | 
 78 | 				} else {	// no more lines
 79 | 					// TODO assert there is nothing left in the other files
 80 | 					mRefSeq = "";
 81 | 					mPos = 0;
 82 | 					return false;
 83 | 				}
 84 | 			} else {
 85 | 				mIsNewRefSeq = false;
 86 | 			}
 87 | 			if ( getline( posfile, line ) ) {
 88 | 				ss.str( line );
 89 | 				mPrevPos = mPos;
 90 | 				ss >> mPos;
 91 | 				ss.clear();
 92 | 			} else {
 93 | 				throw runtime_error( "Not enough entries in position file!" );
 94 | 			}
 95 | 
 96 | 			mRefSeqIndex++;
 97 | 			mTotalIndex++;
 98 | 			return true;
 99 | 		}
100 | 
101 | 		const string& refseq() const {
102 | 			return mRefSeq;
103 | 		}
104 | 
105 | 		const string& prevRefseq() const {
106 | 			return mPrevRefSeq;
107 | 		}
108 | 
109 | 		const size_t& pos() const {
110 | 			return mPos;
111 | 		}
112 | 
113 | 		const size_t& prevPos() const {
114 | 			return mPrevPos;
115 | 		}
116 | 
117 | 		bool refseqChanged() const {
118 | 			return mIsNewRefSeq;
119 | 		}
120 | 
121 | 
122 | };
123 | 


--------------------------------------------------------------------------------
/src/includes.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef INCLUDES_HPP
  2 | #define INCLUDES_HPP
  3 | 
  4 | #include <cstdint>
  5 | // using int_16_t;
  6 | 
  7 | // This file contains header inclusions, because we are lazy (and consistent), as well as some basic constants and typedefs.
  8 | 
  9 | // Type to use for real numbers. Note that we don't introduce separate types for data, wavelet coefficients and probabilities, as one might be tempted, because that would lead to many implicit type conversions in likelihood computations, wavelet thresholding etc.
 10 | typedef float real_t;
 11 | 
 12 | // Type to use for the marginal counts. Has to be a signed integer type. Its maximum is the maximum number of counts per state and position as well as the maximum number of recorded states.
 13 | typedef int16_t marginal_t;	// the type used to record marginal counts
 14 | 
 15 | 
 16 | #include <type_traits>
 17 | using std::is_integral;
 18 | using std::is_unsigned;
 19 | 
 20 | #include <cstddef>
 21 | using std::size_t;
 22 | 
 23 | 
 24 | #include <vector>
 25 | using std::vector;
 26 | 
 27 | #include <queue>
 28 | using std::queue;
 29 | 
 30 | #include <deque>
 31 | using std::deque;
 32 | 
 33 | #include <array>
 34 | using std::array;
 35 | 
 36 | 
 37 | #include <string>
 38 | using std::string;
 39 | using std::to_string;
 40 | 
 41 | 
 42 | #include <sstream>
 43 | using std::stringstream;
 44 | using std::istringstream;
 45 | 
 46 | 
 47 | #include <iostream>
 48 | using std::istream;
 49 | using std::ostream;
 50 | using std::endl;
 51 | using std::cin;
 52 | using std::cout;
 53 | using std::cerr;
 54 | using std::clog;
 55 | using std::wcout;
 56 | using std::flush;
 57 | using std::boolalpha;
 58 | using std::ios;
 59 | 
 60 | 
 61 | #include <fstream>
 62 | using std::ifstream;
 63 | using std::ofstream;
 64 | 
 65 | 
 66 | #include <stdexcept>
 67 | using std::runtime_error;	// TODO throw the appropriate errors, like logic_error etc.
 68 | using std::exception;
 69 | 
 70 | #include <cmath>
 71 | using std::pow;
 72 | using std::exp;	// e^x
 73 | using std::exp2;
 74 | using std::log;	// natural log
 75 | using std::log2;
 76 | using std::log10;
 77 | using std::sqrt;
 78 | using std::ceil;
 79 | using std::floor;
 80 | using std::abs;
 81 | 
 82 | using std::isfinite;
 83 | 
 84 | 
 85 | 
 86 | #include <algorithm>
 87 | using std::min;
 88 | using std::max;
 89 | using std::nth_element;
 90 | using std::reverse;
 91 | using std::fill;
 92 | 
 93 | 
 94 | #include <numeric>
 95 | using std::partial_sum;
 96 | using std::plus;
 97 | using std::accumulate; //e.g. sum of vector
 98 | 
 99 | 
100 | #include <iterator>
101 | using std::istream_iterator;
102 | using std::ostream_iterator;
103 | using std::back_inserter;
104 | 
105 | 
106 | #include <unordered_map>
107 | using std::unordered_map;
108 | 
109 | 
110 | #include <stack>
111 | using std::stack;
112 | 
113 | 
#include <climits>
#include <limits>
using std::numeric_limits;
116 | 
117 | 
118 | #include <ctime>
119 | using std::time;
120 | 
121 | 
122 | #include <iomanip>
123 | using std::setprecision;
124 | 
125 | 
// Positive infinity in real_t precision.
const real_t inf = numeric_limits<real_t>::infinity();
// sqrt(2); the argument is a double literal and the result is narrowed to real_t.
const real_t sqrt2 = sqrt( 2.0 );
const real_t sqrt2half = sqrt2 / 2.0;	// sqrt(2)/2 = 1/sqrt(2)
129 | 
130 | 
131 | #endif
132 | 
133 | 


--------------------------------------------------------------------------------
/src/EFD.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef EFD_HPP
  2 | #define EFD_HPP
  3 | 
  4 | // Functionality related to exponential family distributions
  5 | 
  6 | #include "SufficientStatistics.hpp"
  7 | #include "Observation.hpp"
  8 | 
  9 | 
 10 | ////////// Inner Products of parameters and sufficient statistics //////////
 11 | 
// Generic declaration of the inner product between sufficient statistics and a
// parameter object. No generic definition is given in this header; the overloads
// below supply the implementations for specific type pairs.
template<class SuffStatType, class ParamType>
real_t innerProduct(
    const SufficientStatistics<SuffStatType>& suffstat,
    const Observation<ParamType>& param );
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | ////////// Normal //////////
 22 | 
 23 | real_t innerProduct(
 24 |     const SufficientStatistics<Normal>& suffstat,
 25 |     const Observation<NormalParam>& param
 26 | )  {
 27 | 	real_t result = ( 2.0 * param.mean() * suffstat.sum() - suffstat.sumSq() ) / ( 2.0 * param.var( ) );
 28 | 	if ( !isfinite( result ) ) {
 29 | 		throw runtime_error( "Result of Normal inner product is not finite!" );
 30 | 	}
 31 | 	return result;
 32 | }
 33 | 
 34 | 
 35 | real_t logNormalizer(
 36 |     const Observation<NormalParam>& param ) {
 37 | 	return log( param.stdev() ) + param.mean() * param.mean() / ( 2 * param.var() );
 38 | }
 39 | 
 40 | 
 41 | real_t sampleMean(
 42 |     const SufficientStatistics<Normal>& suffstat,
 43 |     size_t N	) {
 44 | 	if ( N <= 0 ) {
 45 | 		throw runtime_error( "Cannot calculate mean from zero observations!" );
 46 | 	}
 47 | 	double n = N;
 48 | 	return suffstat.sum() / n;
 49 | }
 50 | 
 51 | real_t sampleVariance(
 52 |     const SufficientStatistics<Normal>& suffstat,
 53 |     size_t N	) {
 54 | 	if ( N <= 0 ) {
 55 | 		throw runtime_error( "Cannot calculate variance from zero observations!" );
 56 | 	}
 57 | 	double n = N;
 58 | 	double avg = sampleMean( suffstat, N );
 59 | 	return suffstat.sumSq() / n - ( avg * avg );
 60 | }
 61 | 
 62 | 
 63 | 
 64 | ////////// Geometric distribution //////////
 65 | 
 66 | real_t innerProduct(
 67 |     const SufficientStatistics<Geometric>& suffstat,
 68 |     const Observation<Beta>& param )  {
 69 | 	real_t result = suffstat.sum() * param.value();
 70 | 	return result;
 71 | }
 72 | 
 73 | 
 74 | real_t logNormalizer(
 75 |     const Observation<Beta>& param )  {
 76 | 	return log( param.value() );
 77 | }
 78 | 
 79 | 
 80 | 
 81 | 
 82 | // calculates the inner product in the PDF of an EFD between the current sufficient statistics and a set of parameters under a current mapping
 83 | template<class EmissionObject, class ParamType>
 84 | real_t innerProduct(
 85 |     const EmissionObject& y,
 86 |     const vector<Observation<ParamType>>& param,	// TODO ParamType in template definition?
 87 |     const vector<size_t>& mapping )  {
 88 | 	real_t result = 0;
 89 | 	for ( auto dim = 0; dim < y.nrDim(); dim++ ) {
 90 | 		result += innerProduct( y.suffStat( dim ),  param[mapping[dim]] );	
 91 | 	}
 92 | 	return result;
 93 | }
 94 | 
 95 | 
 96 | template<class EmissionObject, class ParamType>
 97 | real_t innerProduct(
 98 |     const EmissionObject& y,
 99 |     const vector<Observation<ParamType>>& param	// TODO ParamType in template definition?
100 | )  {
101 | 	real_t result = 0;
102 | 	for ( auto dim = 0; dim < y.nrDim(); dim++ ) {
103 | 		result += innerProduct( y.suffStat( dim ),  param[dim] );	
104 | 	}
105 | 	return result;
106 | }
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | #endif
117 | 
118 | 
119 | 


--------------------------------------------------------------------------------
/src/ThetaHyperParam.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef THETAHYPERPARAM_HPP
  2 | #define THETAHYPERPARAM_HPP
  3 | 
  4 | #include "Tags.hpp"
  5 | #include "Mapping.hpp"
  6 | #include "Conjugate.hpp"
  7 | #include "Observation.hpp"
  8 | #include "SufficientStatistics.hpp"
  9 | 
 10 | 
 11 | 
 12 | 
 13 | template <typename ParamType>	// e.g. NormalInverseGammaParam
 14 | class ThetaHyperParam {
 15 | 
 16 | 		const size_t mNrParams;
 17 | 
 18 | 		vector<Conjugate<ParamType>> mParams;
 19 | 
 20 | 	public:
 21 | 
 22 | 
 23 | 		// automatic priors
 24 | 		ThetaHyperParam(
 25 | 		    const vector< vector< real_t > >& hyperparams
 26 | 		) : mNrParams( hyperparams.size() ) {
 27 | 
 28 | 			if ( mNrParams <= 0 ) {
 29 | 				throw runtime_error( "Number of hyperparameters must be positive" );
 30 | 			}
 31 | 
 32 | 			if ( mNrParams <= 0 ) {
 33 | 				throw runtime_error( "Number of emission hyperparameters must be positive! Did you forget to provide them, or to use -a?" );
 34 | 			}
 35 | 
 36 | 
 37 | 			for ( const auto & hp : hyperparams ) {
 38 | 				mParams.push_back( Conjugate<ParamType>( hp ) );
 39 | 			}
 40 | 		}
 41 | 
 42 | 		ThetaHyperParam(
 43 | 		    const vector < Observation<ParamType>>& hyperparams
 44 | 		) : mNrParams( hyperparams.size() ) {
 45 | 
 46 | 			if ( mNrParams <= 0 ) {
 47 | 				throw runtime_error( "Number of hyperparameters must be positive" );
 48 | 			}
 49 | 
 50 | 			for ( const auto & hp : hyperparams ) {
 51 | 				mParams.push_back( Conjugate<ParamType>( hp ) );
 52 | 			}
 53 | 		}
 54 | 
 55 | 
 56 | 		ThetaHyperParam(
 57 | 		    const  Observation<ParamType>& hyperparams,
 58 | 		    const size_t nrDim
 59 | 		) : mNrParams( nrParams ),
 60 | 			mParams( hyperparams, nrDim ) {
 61 | 
 62 | 			if ( mNrParams <= 0 ) {
 63 | 				throw runtime_error( "Number of hyperparameters must be positive" );
 64 | 			}
 65 | 
 66 | 		}
 67 | 
 68 | 		size_t nrParams() const {
 69 | 			return mNrParams;
 70 | 		}
 71 | 
 72 | 
 73 | 
 74 | 		////////// accessors //////////
 75 | 		//NOTE round parentheses access the data through the mapping, square brackets access the parameters directly
 76 | 
 77 | 		template<typename EmissionsType>
 78 | 		inline void addObservation(
 79 | 		    const SufficientStatistics<EmissionsType>& suffStat,
 80 | 		    const size_t N,
 81 | 		    const size_t dim ) {
 82 | 
 83 | 			mParams[dim].addObservation( suffStat, N );
 84 | 		}
 85 | 
 86 | 
 87 | 		// TODO some objects use posterior(), make consistent
 88 | 		// TODO implicit conversion?
 89 | 		const Observation<ParamType>& posterior(
 90 | 		    const size_t d	) const {
 91 | 			return mParams[d].posterior();
 92 | 		}
 93 | 
 94 | 
 95 | 		const Observation<ParamType>& prior(
 96 | 		    const size_t d	) const {
 97 | 			return mParams[d].prior();
 98 | 		}
 99 | 
100 | 		void reset() {
101 | 			for ( auto & p : mParams ) {
102 | 				p.reset();
103 | 			}
104 | 		}
105 | 
106 | 
107 | 		string str() const {
108 | 			return concat( mParams, "\t", "\n" );
109 | 		}
110 | 
111 | 		const Conjugate<ParamType> operator[]( size_t d ) const {
112 | 			return mParams[d];
113 | 		}
114 | };
115 | 
116 | 
117 | 
118 | 
119 | ////////////////////////////////////////////////// TEMPLATE SPECIALIZATIONS //////////////////////////////////////////////////
120 | // #include "WaveletTree.hpp"	// required implementation
121 | 
122 | 
123 | 
124 | #endif
125 | 
126 | 
127 | 
128 | 


--------------------------------------------------------------------------------
/src/Blocks/SplittableBlocks.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SPLITTABLEBLOCKS_HPP
  2 | #define SPLITTABLEBLOCKS_HPP
  3 | 
  4 | template<>
  5 | class Blocks<Splittable> {
  6 | 		const size_t mSize;
  7 | 		deque<size_t> mSizes;
  8 | 		size_t mStart;
  9 | 		size_t mEnd;
 10 | 		size_t mPos;	// the current position as seen from the outside.
 11 | 		size_t mIndex; // the index in mSizes corresponding to mPos
 12 | 		bool mFirstIter;
 13 | 
 14 | 		void rotate() {
 15 | 			mSizes.push_back( mSizes.front() );
 16 | 			mSizes.pop_front();
 17 | 			if ( mIndex > 0 ) {
 18 | 				mIndex--;
 19 | 			} else {
 20 | 				mIndex = mSizes.size() - 1;
 21 | 			}
 22 | 		}
 23 | 
 24 | 	public:
 25 | 
 26 | 
 27 | 		Blocks( size_t size ):
 28 | 			mSize( size ),
 29 | 			mPos( 0 ),
 30 | 			mStart( 0 ),
 31 | 			mEnd( 0 ),
 32 | 			mIndex( 0 ),
 33 | 			mFirstIter( true ) {
 34 | 			if ( size == 0 ) {
 35 | 				throw runtime_error( "Cannot create block structure for 0 positions!" );
 36 | 			}
 37 | 			mSizes.push_back( size );
 38 | 		}
 39 | 
 40 | 		// TODO is there any way to swap this?
 41 | 		Blocks( const vector<size_t>& sizes ):
 42 | 			mSize( accumulate( sizes.begin(), sizes.end(), 0 ) ),
 43 | 			mPos( 0 ),
 44 | 			mStart( 0 ),
 45 | 			mEnd( 0 ),
 46 | 			mIndex( 0 ),
 47 | 			mFirstIter( true ) {
 48 | 			for ( auto & x : sizes ) {
 49 | 				mSizes.push_back( x );
 50 | 			}
 51 | 		}
 52 | 
 53 | 		// Split the current block such that the first new block is of size s, and move to the first block; this preserves pos(). If s is >= the size of the block, an exception is thrown
 54 | 		void split( size_t s ) {
 55 | 			if ( mSizes[mIndex] <= s ) {
 56 | 				throw runtime_error( "Cannot split block into this size!" );
 57 | 			}
 58 | 
 59 | 			// rotate until current position is at the front
 60 | 			while ( mIndex != 0 ) {
 61 | 				rotate();
 62 | 			}
 63 | 
 64 | 			mSizes.push_back( s );
 65 | 			mSizes[0] = mSizes[0] - s;
 66 | 			mIndex = mSizes.size() - 1;
 67 | 		};
 68 | 
 69 | 
 70 | 		// Move to the first position, so pos()==0.
 71 | 		void initForward() {
 72 | 			mFirstIter = true;
 73 | 			if ( mIndex >= mPos ) {
 74 | 				mIndex -= mPos;
 75 | 			} else {
 76 | 				mIndex += ( mSizes.size() - mPos );
 77 | 			}
 78 | 			mPos = 0;
 79 | 			mStart = 0;
 80 | 			mEnd = 0;
 81 | 		}
 82 | 
 83 | 		// move to the next block, in modular fashion (wrap around)
 84 | 		// returns false if wrapped around, otherwise true.
 85 | 		bool next() {
 86 | 			if ( mFirstIter ) {
 87 | 				mFirstIter = false;
 88 | 				mEnd = mStart + mSizes[mIndex];
 89 | 			} else {
 90 | 				mIndex = ( mIndex + 1 ) % mSizes.size();
 91 | 				mPos++;
 92 | 				mStart = mEnd;
 93 | 				mEnd += mSizes[mIndex];
 94 | 			}
 95 | 			if ( mPos == mSizes.size() ) {
 96 | 				mPos = 0;
 97 | 				return false;
 98 | 			} else {
 99 | 				return true;
100 | 			}
101 | 		};
102 | 
103 | 		size_t start() const {
104 | 			return mStart;
105 | 		}
106 | 
107 | 		size_t end() const {
108 | 			return mEnd;
109 | 		}
110 | 
111 | 
112 | 		// return the number of blocks
113 | 		size_t nrBlocks()const {
114 | 			return mSizes.size();
115 | 		};
116 | 
117 | 
118 | 		// return the current block size
119 | 		size_t size() const {
120 | 			return mSizes[mIndex];
121 | 		};
122 | 
123 | 
124 | 		// return the index of the current block
125 | 		size_t pos() const {
126 | 			return mPos % mSizes.size();
127 | 		}
128 | };
129 | 
130 | 
131 | #endif
132 | 


--------------------------------------------------------------------------------
/src/Tags.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef TAGS_HPP
  2 | #define TAGS_HPP
  3 | 
  4 | 
// Selects how parameters are assigned to (state, data dimension) pairs; consumed by the Mapping class.
// NOTE(review): only `combinations` appears to be implemented by Mapping; `independent` is rejected there.
enum MappingType {combinations, independent};


// these empty classes are used as tags for template specialization, allowing the compiler to inline and optimize for different use cases

////////// These classes don't do anything by themselves, they are used as tags for template specialization //////////

class Dummy {};	// a tag for when the class is omitted, e.g. a transition matrix when no transitions are used

// Tags for a normal distribution, its parameters, and the hyperparameters of those.
// The alias makes NormalInverseGamma and NormalParam the same type, so a
// specialization written for one also applies to the other.
class Normal {};
class NormalParam {};
using NormalInverseGamma = NormalParam;
class NormalInverseGammaParam {};


// Vector-valued counterparts of the tags above, with the same aliasing scheme.
class NormalVector {};
class NormalParamVector {};
using NormalInverseGammaVector = NormalParamVector;
class NormalInverseGammaParamVector {};


class InverseGamma {};
class InverseGammaParam {};
class NormalGammaParam {};


// using NormalGamma = NormalParam;

// Tags for a categorical distribution; Dirichlet is an alias for CategoricalParam.
class Categorical {};
class CategoricalParam {};
using Dirichlet = CategoricalParam;
class DirichletParam {};


class CategoricalVector {};	// transition counts are SufficientStatistics<CategoricalVector>
class CategoricalParamVector {};
using DirichletVector = CategoricalParamVector;
class DirichletParamVector {};


class Geometric {};
class Beta {};
class BetaParam {};



// sampling tags for state sequence
class ForwardBackward {};	// forward-backward sampling
class Mixture {}; // sampling of each block individually
class DirectGibbs {};	// sample direct Gibbs, i.e. including transitions into and out of the state

////////// tags for data structures //////////
class Vector {}; // plain data structure for uncompressed sampling
class WaveletTree {};
class Fixed {};
class IntegralArray {};
class Splittable {};	// selects the Blocks<Splittable> specialization
class BreakpointArray {};
 63 | 
 64 | 
 65 | 
 66 | //////////  forward declarations //////////
 67 | 
// Forward declarations of the class templates that are specialized on the tags
// declared in this header. Only the names are introduced here; the definitions
// live in the respective headers.

template < typename StateSequenceType,
         typename EmissionDataStructure,
         typename EmissionDistType, // e.g. Normal
         typename ThetaDistType,	// e.g. NormalInverseGamma
         typename TransitionDistType, // e.g. DirichletVector
         typename InitialDistType,	// e.g. Dirichlet
         typename ThetaParamType,	// e.g. NormalInverseGammaParam
         typename TransitionParamType,	// e.g. DirichletParam
         typename InitialParamType 	// e.g. DirichletParam
         >
class HMM;


// state sequence, specialized on the sampling tag (e.g. ForwardBackward, Mixture)
template < typename Type >
class StateSequence;

template < typename DataStructure,  typename DistType >
class Emissions;

template < typename DistType >
class Transitions;

template < typename DistType >
class Theta;

template < typename DistType >
class Initial;



template <typename T>
class Observation;

template <typename T>
class SufficientStatistics;

template <typename T>
class Distribution;

template <typename T>
class Conjugate;

template <typename DistType>
class ThetaHyperParam;


// not a template; maps states and data dimensions to parameter indices
class Mapping;
115 | 
116 | 
117 | 
118 | #endif
119 | 


--------------------------------------------------------------------------------
/lib/gzstream/Makefile:
--------------------------------------------------------------------------------
 1 | # ============================================================================
 2 | # gzstream, C++ iostream classes wrapping the zlib compression library.
 3 | # Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
 4 | # 
 5 | # This library is free software; you can redistribute it and/or
 6 | # modify it under the terms of the GNU Lesser General Public
 7 | # License as published by the Free Software Foundation; either
 8 | # version 2.1 of the License, or (at your option) any later version.
 9 | # 
10 | # This library is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # Lesser General Public License for more details.
14 | # 
15 | # You should have received a copy of the GNU Lesser General Public
16 | # License along with this library; if not, write to the Free Software
17 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 | # ============================================================================
19 | # 
20 | # File          : Makefile
21 | # Revision      : $Revision: 1.3 $
22 | # Revision_date : $Date: 2001/10/04 15:09:28 $
23 | # Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
24 | # 
25 | # ============================================================================
26 | 
27 | # ----------------------------------------------------------------------------
28 | # adapt these settings to your need:
29 | # add '-DGZSTREAM_NAMESPACE=name' to CPPFLAGS to place the classes
30 | # in its own namespace. Note, this macro needs to be set while creating
31 | # the library as well while compiling applications based on it.
32 | # As an alternative, gzstream.C and gzstream.h can be edited.
33 | # ----------------------------------------------------------------------------
34 | 
# CXX      = CC -n32 -LANG:std   # for SGI Irix 6.5, MIPSpro CC version 7.30
CXX      = g++   # for Linux RedHat 6.1, g++ version 2.95.2

CPPFLAGS = -I. -O
LDFLAGS  = -L. -lgzstream -lz
AR       = ar cr

# ----------------------------------------------------------------------------
# plain simple rules to make and cleanup the library:
# make default;   compiles the library
# make test;      compiles and executes test. O.K. message marks success.
# make clean;     removes temporary files
# make cleanall;  removes temporary files, the library, and programs
# ----------------------------------------------------------------------------

# These targets do not produce files of the same name; declaring them phony
# keeps them working even if a file called e.g. "test" or "clean" exists.
.PHONY: default test clean cleanall

default: libgzstream.a

# Round-trip test: compress with test_gzip and decompress with gunzip, then
# compress with gzip and decompress with test_gunzip; both results must be
# identical to the input file.
test:    test_gzip test_gunzip
	./test_gzip COPYING.LIB gz.tmp.gz
	gunzip gz.tmp.gz
	diff COPYING.LIB gz.tmp
	gzip gz.tmp
	./test_gunzip gz.tmp.gz gz.tmp
	diff COPYING.LIB gz.tmp
	rm gz.tmp.gz gz.tmp
	# *** O.K. Test finished successfully. ***

gzstream.o : gzstream.C gzstream.h
	${CXX} ${CPPFLAGS} -c -o gzstream.o gzstream.C

test_gzip.o : test_gzip.C gzstream.h
	${CXX} ${CPPFLAGS} -c -o test_gzip.o test_gzip.C

test_gunzip.o : test_gunzip.C gzstream.h
	${CXX} ${CPPFLAGS} -c -o test_gunzip.o test_gunzip.C

libgzstream.a : gzstream.o
	${AR} libgzstream.a gzstream.o

test_gzip : test_gzip.o libgzstream.a
	${CXX} -o test_gzip test_gzip.o ${LDFLAGS}

test_gunzip : test_gunzip.o libgzstream.a
	${CXX} -o test_gunzip test_gunzip.o ${LDFLAGS}

# -f: succeed even when there is nothing to remove (e.g. on a fresh checkout)
clean :
	rm -f *.o

cleanall :
	rm -f *.o libgzstream.a test_gzip test_gunzip
85 | 
86 | # ============================================================================
87 | # EOF
88 | 
89 | 


--------------------------------------------------------------------------------
/src/StateSequence.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef STATESEQUENCE_HPP
  2 | #define STATESEQUENCE_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | #include "Emissions.hpp"
  6 | #include "Theta.hpp"
  7 | #include "Transitions.hpp"
  8 | #include "Initial.hpp"
  9 | #include"Blocks.hpp"
 10 | #include "Statistics.hpp"
 11 | #include "KahanAggregator.hpp"
 12 | #include "Trellis.hpp"
 13 | #include "Records.hpp"
 14 | 
 15 | 
 16 | 
 17 | 
 18 | template <typename Type>
 19 | class StateSequence {
 20 | 
 21 | 		vector<marginal_t> mStates;
 22 | 		rng_t& mRNG;
 23 | 		vector<size_t> mPrevStateSequence;	// for direct Gibbs
 24 | 		Trellis mTrellis;	// implementation as member avoids frequent allocations
 25 | 
 26 | 	public:
 27 | 
 28 | 		// delete copy constructor
 29 | 		StateSequence( const StateSequence& that ) = delete;
 30 | 
 31 | 		StateSequence( rng_t& RNG ) : mTrellis( RNG ), mRNG( RNG ) {};
 32 | 
 33 | // 		template<typename EmissionsType, typename ThetaType, typename TransitionsType, typename InitialType>
 34 | // 		void sample(
 35 | // 		    EmissionsType& Y,	// TODO cannot be const due to next()
 36 | // 		    const ThetaType& theta,
 37 | // 		    const TransitionsType& A,
 38 | // 		    const InitialType& pi,
 39 | // 		    const bool useSelfTransitions = true
 40 | // 		);
 41 | 
 42 | 
 43 | 
 44 | 		template <
 45 | 		typename StatsStructure,
 46 | 		         typename StatsType,
 47 | 		         typename BlocksType,
 48 | 		         typename ThetaType,
 49 | 		         typename TauThetaType,
 50 | 		         typename TransitionsType,
 51 | 		         typename TauAType,
 52 | 		         typename InitialType,
 53 | 		         typename TauPiType >
 54 | 		void sample(
 55 | 		    Emissions<Statistics<StatsStructure, StatsType>, Blocks<BlocksType>>& y,
 56 | 		    const ThetaType& theta,
 57 | 		    TauThetaType& tau_theta,
 58 | 		    const TransitionsType& A,
 59 | 		    TauAType& tau_A,
 60 | 		    const InitialType& pi,
 61 | 		    TauPiType& tau_pi,
 62 | 		    const Mapping& mapping,
 63 | 		    Records& records,
 64 | 		    const bool doRecord,
 65 | 		    const bool useSelfTransitions );
 66 | 
 67 | 
 68 | 		size_t size() const {
 69 | 			return mStates.size();
 70 | 		}
 71 | 
 72 | 		const vector<marginal_t>& states() const  {
 73 | 			return mStates;
 74 | 		}
 75 | 
 76 | 		const marginal_t& operator[](
 77 | 		    const size_t s	) const  {
 78 | 			if ( s >= mStates.size() ) {
 79 | 				throw runtime_error( "State sequence index " + to_string( s ) + " out of bounds!" );
 80 | 			}
 81 | 			return mStates[s];
 82 | 		}
 83 | 
 84 | 		marginal_t operator[](
 85 | 		    const size_t s	) {
 86 | 			if ( s >= mStates.size() ) {
 87 | 				throw runtime_error( "State sequence index " + to_string( s ) + " out of bounds!" );
 88 | 			}
 89 | 			return mStates[s];
 90 | 		}
 91 | 
 92 | 		string str() const {
 93 | 			stringstream ss;
 94 | 			copy( mStates.begin(), mStates.end(), ostream_iterator<marginal_t>( ss, " " ) );
 95 | 			string s = ss.str();
 96 | 			s = s.substr( 0, s.length() - 1 );	// TODO inefficient, copies all but the last character
 97 | 			return s;
 98 | 		};
 99 | 
100 | 		void clear() {
101 | 			deleteVector( mStates );
102 | 			deleteVector( mPrevStateSequence );
103 | 			mTrellis.clear();
104 | 		}
105 | 
106 | };
107 | 
108 | 
109 | 
110 | 
111 | 
112 | ////////////////////////////////////////////////// TEMPLATE SPECIALIZATIONS //////////////////////////////////////////////////
113 | 
114 | 
115 | #include "StateSequence/ForwardBackward.hpp"
116 | #include "StateSequence/Mixture.hpp"
117 | // #include "StateSequence/DirectGibbs.hpp"
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | #endif
137 | 
138 | 
139 | 
140 | 


--------------------------------------------------------------------------------
/src/AutoPriors.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef AUTOPRIORS_HPP
  2 | #define AUTOPRIORS_HPP
  3 | 
  4 | #include <stdexcept>
  5 | using std::runtime_error;
  6 | 
  7 | #include <vector>
  8 | using std::vector;
  9 | 
 10 | #include "includes.hpp"
 11 | 
 12 | #include "Emissions.hpp"
 13 | #include "SufficientStatistics.hpp"
 14 | #include "Tags.hpp"
 15 | 
 16 | 
 17 | // Normal, breakpoint array
 18 | vector<real_t> NormalInverseGammaAutoPrior(
 19 |     real_t s2 ,	// desired variance
 20 |     real_t p ,	// desired probability of sampling a variance below s^2
 21 |     real_t dataMean,
 22 |     real_t dataVar	// in wavelet tree autopriors, this is max(sample variance of data, variance of block means)
 23 | ) {
 24 | 
 25 | 	if ( p < 0 || p > 1 ) {
 26 | 		throw runtime_error( "Parameter p for automatic priors is a probability and must be in [0,1]!" );
 27 | 	}
 28 | 
 29 | 	if ( s2 <= 0 ) {
 30 | 		throw runtime_error( "Parameter s2  for automatic priors is a variance and must be positive!" );
 31 | 	}
 32 | 
 33 | 	if ( dataVar <= 0 ) {
 34 | 		throw runtime_error( "Data variance provided to autoprior must be positive!" );
 35 | 	}
 36 | 
 37 | 
 38 | 	const real_t M1 = 0.3361;
 39 | 	const real_t M2 = -0.0042;
 40 | 	const real_t M3 = -0.0201;
 41 | 
 42 | 	const real_t b = -log( p );
 43 | 
 44 | 	const real_t alpha = 2.0;
 45 | 	const real_t beta = s2 * ( ( 2.0 * sqrt( b ) ) / ( M1 * sqrt( b ) + sqrt( 2.0 ) * ( M2 * b * exp( M3 * sqrt( b ) ) + 1 ) ) + b );
 46 | 	const real_t mu0 = dataMean;
 47 | 	const real_t nu = beta / dataVar;
 48 | 
 49 | 	if ( alpha <= 0 ) {
 50 | 		throw runtime_error( "Autoprior yields non-positive alpha!" );
 51 | 	}
 52 | 
 53 | 	if ( beta <= 0 ) {
 54 | 		throw runtime_error( "Autoprior yields non-positive beta!" );
 55 | 	}
 56 | 
 57 | 	if ( nu <= 0 ) {
 58 | 		throw runtime_error( "Autoprior yields non-positive nu!" );
 59 | 	}
 60 | 
 61 | 	if ( !isfinite( alpha ) ) {
 62 | 		throw runtime_error( "Autoprior yields non-finite alpha!" );
 63 | 	}
 64 | 
 65 | 	if ( !isfinite( beta ) ) {
 66 | 		throw runtime_error( "Autoprior yields non-finite beta!" );
 67 | 	}
 68 | 
 69 | 	if ( !isfinite( mu0 ) ) {
 70 | 		throw runtime_error( "Autoprior yields non-finite mu0!" );
 71 | 	}
 72 | 
 73 | 	if ( !isfinite( nu ) ) {
 74 | 		throw runtime_error( "Autoprior yields non-finite nu!" );
 75 | 	}
 76 | 
 77 | 	vector<real_t> v {alpha, beta, mu0, nu};
 78 | 	return v;
 79 | 
 80 | }
 81 | 
 82 | 
 83 | // auto prior for Gaussian statistics
 84 | // template<template <typename> class Stats, typename Blocks>
 85 | template<typename Stats, typename Blocks>
 86 | vector<real_t> autoPrior(
 87 |     real_t s2 ,	// desired variance
 88 |     real_t p ,	// desired probability of sampling variance below s2
 89 |     Emissions<Statistics<Stats, Normal>, Blocks>& y,
 90 |     const double noiseStdev	// an estimate of the standard deviation of the noise in the data
 91 | ) {
 92 | 
 93 | 	
 94 | 
 95 | 	y.initForward();
 96 | 	y.createBlocks( sqrt( 2 * log( ( double )y.blocks().size() ))*noiseStdev );
 97 | 	SufficientStatistics<Normal> muStats;	// sufficient statistics of observed block means
 98 | 
 99 | 	while ( y.next() ) {
100 | 		for ( size_t dim = 0; dim < y.nrDim(); ++dim ) {
101 | 			muStats.addObs( y.suffStat( dim ).sum() / y.blockSize() );	// NOTE sometimes, computations involve dim sum, without increasing the average weight
102 | 		}
103 | 	}
104 | 	size_t N = y.nrBlocks() * y.nrDim();
105 | 	double blocksMean = sampleMean( muStats, N );
106 | 	double blocksVariance = sampleVariance( muStats, N );
107 | 	return NormalInverseGammaAutoPrior( s2, p, blocksMean, blocksVariance );
108 | 	
109 | 	// TODO we're calculating block mean and block variance here
110 | }
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | #endif
119 | 


--------------------------------------------------------------------------------
/src/HMM.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef HMM_HPP
  2 | #define HMM_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | #include "Tags.hpp"
  6 | #include "ThetaHyperParam.hpp"
  7 | #include "Mapping.hpp"
  8 | #include "Emissions.hpp"
  9 | #include "Theta.hpp"
 10 | #include "Transitions.hpp"
 11 | #include "Initial.hpp"
 12 | #include "StateSequence.hpp"
 13 | #include "InitialHyperParam.hpp"
 14 | #include "TransitionHyperParam.hpp"
 15 | #include "StateMarginals.hpp"
 16 | #include "Records.hpp"
 17 | 
 18 | // template < typename StateSequenceType,
 19 | //          typename EmissionsType, // e.g. Normal
 20 | //          typename ThetaType,	// e.g. NormalInverseGammaVector
 21 | //          typename ThetaParamType,	// e.g. NormalInverseGammaParamVector
 22 | //          typename TransitionType, // e.g. DirichletVector
 23 | //          typename TransitionParamType,	// e.g. DirichletParamVector
 24 | //          typename InitialType,	// e.g. Dirichlet
 25 | //          typename InitialParamType
 26 | //          >
 27 | // void sampleHMM(
 28 | //     EmissionsType& y,
 29 | //     StateSequenceType& q,
 30 | //     ThetaType& theta,
 31 | //     ThetaParamType& tau_theta,
 32 | //     TransitionType& A,
 33 | //     TransitionParamType& tau_A,
 34 | //     InitialType& pi,
 35 | //     InitialParamType& tau_pi,
 36 | //     const Mapping& mapping,
 37 | //     Records& records,
 38 | //     const bool doRecord,
 39 | //     const bool dynamic = true,
 40 | //     const bool useSelfTransitions = true
 41 | // ) {
 42 | //
 43 | // }
 44 | //
 45 | //
 46 | //
 47 | 
 48 | 
 49 | 
 50 | 
 51 | template < typename StateSequenceType,
 52 |          typename EmissionsType, // e.g. Normal
 53 |          typename ThetaType,	// e.g. NormalInverseGammaVector
 54 |          typename ThetaParamType,	// e.g. NormalInverseGammaParamVector
 55 |          typename TransitionType, // e.g. DirichletVector
 56 |          typename TransitionParamType,	// e.g. DirichletParamVector
 57 |          typename InitialType,	// e.g. Dirichlet
 58 |          typename InitialParamType
 59 |          >
 60 | void sampleHMM(
 61 |     EmissionsType& y,
 62 |     StateSequenceType& q,
 63 |     ThetaType& theta,
 64 |     ThetaParamType& tau_theta,
 65 |     TransitionType& A,
 66 |     TransitionParamType& tau_A,
 67 |     InitialType& pi,
 68 |     InitialParamType& tau_pi,
 69 |     const Mapping& mapping,
 70 |     // insert records for parameters etc.
 71 |     const size_t iterations,
 72 |     const size_t thinning,
 73 |     Records& records,
 74 |     const bool dynamic = true,
 75 |     const bool useSelfTransitions = true
 76 |                                     // TODO RNG
 77 | ) {
 78 | 
 79 | 
 80 | 	if ( iterations < 0 ) {
 81 | 		throw runtime_error( "Number of iterations must not be negative!" );
 82 | 	}
 83 | 
 84 | 	if ( thinning > iterations ) {
 85 | 		cout << "[WARNING] Thinning parameter is larger than number of iterations. No data will be recorded!" << endl;
 86 | 	}
 87 | 
 88 | 	const size_t nrStates = mapping.nrStates();
 89 | 	const size_t nrParams = mapping.nrParams();
 90 | 	const size_t nrDataDim = mapping.nrDataDims();
 91 | 
 92 | 
 93 | 	// TODO size checks go here
 94 | 
 95 | 	// sample priors
 96 | 
 97 | 
 98 | 	bool doRecord;
 99 | 	for ( auto i = 0; i <  iterations; ++i ) {
100 | 		if ( dynamic ) {
101 | 			y.createBlocks( theta );
102 | 		}
103 | 
104 | 		doRecord = false;
105 | 		if ( thinning > 0 ) {
106 | 			doRecord = ( ( i + 1 ) % thinning == 0 );
107 | 		}
108 | 		
109 | 		
110 | 		q.sample( y, theta, tau_theta, A, tau_A, pi, tau_pi, mapping, records, doRecord, useSelfTransitions );
111 | 		theta.sample( tau_theta );
112 | 		
113 | 		pi.sample( tau_pi ); //, records, doRecord );
114 | 		
115 | 		A.sample( tau_A ); //, records, doRecord );
116 | 		
117 | 		
118 | 		if ( doRecord ) {
119 | 			records.record( theta );
120 | 		}
121 | 	}
122 | 
123 | 
124 | 
125 | }
126 | 
127 | 
128 | 
129 | 
130 | #endif
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 


--------------------------------------------------------------------------------
/src/uintmath.hpp:
--------------------------------------------------------------------------------
  1 | ////////// Mathematical operations and bit manipulations for unsigned integer types //////////
  2 | 
  3 | 
  4 | #ifndef UINTMATH_HPP
  5 | #define UINTMATH_HPP
  6 | 
  7 | #include "includes.hpp"
  8 | 
  9 | // keep lowest bit:
 10 | // set all except the rightmost 1-bit (LSB) to 0.
 11 | // this is 2^ctz(x, nrDigits)
 12 | inline size_t klb( size_t x ) {
 13 | 	if ( x > 0 ) {
 14 | 		return ( x & ( ( ~x ) + 1 ) );
 15 | 	} else {
 16 | 		throw runtime_error( "klb(0) is undefined!" );
 17 | 	}
 18 | }
 19 | 
 20 | 
 21 | // Calculate the number of trailing zeros.
 22 | // Given an integral type, this is log2(klb(x)).
 23 | inline size_t ctz( size_t x ) {
 24 | 	if ( x > 0 ) {
 25 | 		size_t c = 0;
 26 | 		x = ( x ^ ( x - 1 ) ) >> 1;
 27 | 		for ( c = 0; x; c++ ) {
 28 | 			x >>= 1;
 29 | 		}
 30 | 		return c;
 31 | 	} else {
 32 | 		throw runtime_error( "ctz(0) is undefined" );
 33 | 	}
 34 | }
 35 | 
 36 | 
 37 | // Set the lowest 1-bit to 0.
 38 | inline size_t ulb( size_t x ) {
 39 | 	return x & ( x - 1 );
 40 | }
 41 | 
 42 | 
 43 | // set trailing zero bits to 1
 44 | inline size_t stb( size_t x ) {
 45 | 	return x | ( x - 1 );
 46 | }
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | // check if a is a multiple of b
 53 | template <typename T>
 54 | inline bool divides( T a, T b ) {
 55 | 	return a == b * ( a / b );
 56 | }
 57 | 
 58 | // divide a/b if it is divisible, throw error otherwise
 59 | template <typename T>
 60 | inline T divide( T a, T b ) {
 61 | 	T d = a / b;
 62 | 	if ( b * d == a ) {
 63 | 		return d;
 64 | 	} else {
 65 | 		throw runtime_error( "Truncated integer division: " + to_string( b ) + " does not divide " + to_string( a ) + "!" );
 66 | 	}
 67 | }
 68 | 
 69 | 
 70 | // for unsigned integer types, round division a/b to the closest integer (.5 round up) without casting to float. round(a/b) = floor((a+ floor(b/2))/b)
 71 | template<typename T>
 72 | inline T rounddiv( T a, T b ) {
 73 | 	return ( a + ( b / 2 ) ) / b;
 74 | }
 75 | 
 76 | 
 77 | 
 78 | 
 79 | 
 80 | // integer 2**x for non-negative x
 81 | inline size_t iexp2( size_t x ) {
 82 | 	if ( x > numeric_limits<size_t>::digits ) {
 83 | 		throw runtime_error( "Exponent too large for iexp2()!" );
 84 | 	}
 85 | 	return ( 1 << x );
 86 | }
 87 | 
 88 | 
 89 | 
 90 | 
 91 | 
 92 | // round up to the next multiple of m
 93 | // Returns the lowest multiple of m greater-equal n
 94 | inline size_t ceil_mult( size_t n, size_t m ) {
 95 | 	return ( ( ( n + m ) - 1 ) / m ) * m;
 96 | }
 97 | 
 98 | // Return the lowest multiple of m strictly greater than n.
 99 | inline size_t higher_mult( size_t n, size_t m ) {
100 | 	return ( ( n + m ) / m ) * m;
101 | }
102 | 
103 | 
// Round n down to a multiple of m: returns the highest multiple of m that is
// less than or equal to n.
inline size_t floor_mult( size_t n, size_t m ) {
	const size_t remainder = n % m;
	return n - remainder;
}
108 | 
109 | 
// Returns the highest multiple of m that is strictly smaller than n.
// NOTE not defined if n==0; a runtime_error is thrown in that case.
inline size_t lower_mult( size_t n, size_t m ) {
	if ( n == 0 ) {
		throw runtime_error( "lower_mult(0) is undefined!" );
	}
	const size_t below = n - 1;
	return below - below % m;
}
119 | 
120 | 
121 | // test wether a number is a power of 2
122 | inline bool isPow2( size_t x ) {
123 | 	return ( x > 0 ) && ( ulb( x ) == 0 );
124 | }
125 | 
126 | 
// Round up to a power of two: returns the lowest power of two that is
// greater than or equal to n. Powers of two are returned unchanged, and
// n == 0 yields 1.
// Throws runtime_error if the result is not representable in size_t, i.e. if n
// is larger than the highest power of two that fits.
inline size_t ceilPow2( size_t n ) {
	const size_t highest = ( ~size_t( 0 ) >> 1 ) + 1;	// only the top bit set
	size_t p = 1;
	while ( p < n ) {
		// doubling the highest power of two would wrap around to 0 and the
		// loop would never terminate
		if ( p == highest ) {
			throw runtime_error( "ceilPow2() result does not fit into size_t!" );
		}
		p <<= 1;
	}
	return p;
}
139 | 
140 | 
141 | 
// round to the smaller power of two
// essentially: keep highest bit
// Returns the highest power of two that is less than or equal to n; powers of
// two are returned unchanged and n == 0 yields 0.
inline size_t floorPow2( size_t n ) {
	if ( n == 0 ) {
		return 0;
	}
	// Shift n down instead of shifting a candidate up: this cannot overflow,
	// even when the top bit of n is set.
	size_t p = 1;
	while ( n >>= 1 ) {
		p <<= 1;
	}
	return p;
}
154 | 
155 | 
// Returns true iff x is divisible by two.
inline bool isEven( const size_t& x ) {
	return x % 2 == 0;
}
160 | 
161 | 
162 | 
163 | #endif
164 | 


--------------------------------------------------------------------------------
/src/Mapping.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef MAPPING_HPP
  2 | #define MAPPING_HPP
  3 | 
  4 | #include "Tags.hpp"
  5 | #include "includes.hpp"
  6 | #include "Parser.hpp"
  7 | 
  8 | // Mapping[s][d] is the parameter dimension for state s and data dimension d
  9 | // TODO this must only be a many-to-one mapping; there should be one from data dimensions to parameter dimensions, one from parameter to hyperparameter, and also a method to create a compound mapping to update hyperparameters from data directly. States are just indices for a vector of parameter mappings. The mapping to hyperparameters makes it easy to have one prior per parameter or a joint prior such as a Dirichlet Process Prior.
 10 | 
 11 | // for Parser
 12 | template<>
 13 | MappingType convertType(
 14 |     const string& s ) {
 15 | 
 16 | 	if ( s == "combinations" || s == "C" ) {
 17 | 		return combinations;
 18 | 	} else {
 19 | 		throw runtime_error( "Unknown mapping type " + s + "!" );
 20 | 	}
 21 | }
 22 | 
 23 | 
 24 | 
 25 | // helper function to get the number of states, to be used in the initializer list of the constructor so the member can be const
 26 | size_t nrOfStates(
 27 |     size_t nrDataDim,
 28 |     size_t nrParam,
 29 |     MappingType mappingType ) {
 30 | 	size_t result;
 31 | 
 32 | 	switch ( mappingType ) {
 33 | 		case combinations:
 34 | 			result = pow( nrParam, nrDataDim );
 35 | 			break;
 36 | 
 37 | 		default:
 38 | 			throw runtime_error( "Mapping type not implemented!" );
 39 | 	}
 40 | 
 41 | 	if ( result <= 1 ) {
 42 | 		throw runtime_error( "Requested parameters would yield an HMM with less than 2 states!" );
 43 | 	}
 44 | 
 45 | 	return result;
 46 | }
 47 | 
 48 | // maps a state s to the indices in theta and tau_theta
 49 | 
 50 | 
 51 | 
 52 | 
 53 | class Mapping {
 54 | 		vector<vector<size_t>> mValue;
 55 | 
 56 | 		const size_t mNrDataDim;
 57 | 		const size_t mNrParams;
 58 | 		const size_t mNrStates;
 59 | 
 60 | 	public:
 61 | 
 62 | 		// delete copy constructor
 63 | 		Mapping( const Mapping& that ) = delete;
 64 | 
 65 | 		Mapping(
 66 | 		    const size_t nrdatadim,
 67 | 		    const size_t nrparams,	// number of parameters
 68 | 		    const MappingType mappingType )
 69 | 			:
 70 | 			mNrDataDim( nrdatadim ),
 71 | 			mNrParams( nrparams ),
 72 | 			mNrStates( nrOfStates( nrdatadim, nrparams, mappingType ) ) {
 73 | 
 74 | 
 75 | 			if ( mNrDataDim <= 0 ) {
 76 | 				throw runtime_error( "Number of data dimensions must be positive!" );
 77 | 			}
 78 | 
 79 | 			if ( mNrParams <= 0 ) {
 80 | 				throw runtime_error( "Number of parameters must be positive!" );
 81 | 			}
 82 | 
 83 | 			if ( mNrStates <= 0 ) {
 84 | 				throw runtime_error( "Number of states must be positive!" );
 85 | 			}
 86 | 
 87 | 
 88 | 			// setup the mapping using vectors of reference wrappers
 89 | 			switch ( mappingType ) {
 90 | 
 91 | 				case combinations:	// all combinations of shared parameters
 92 | 
 93 | 					// states are assigned by generating reversed nrParam-ary numbers of nrDataDim digits
 94 | 					for ( size_t x = 0; x < mNrStates; ++x ) {
 95 | 						size_t n = x;
 96 | 						vector<size_t> mapping;
 97 | 						mapping.reserve( nrdatadim );
 98 | 
 99 | 						for ( size_t d = 0; d < nrdatadim; ++d ) {
100 | 							auto residue = n % mNrParams;
101 | 							n /= mNrParams;
102 | 							mapping.push_back( residue );
103 | 						}
104 | 
105 | 						mValue.push_back( mapping );
106 | 					}
107 | 
108 | 					break;
109 | 
110 | 				case independent:
111 | 					throw runtime_error( "Mapping type \"independent\" not implemented yet!" );
112 | 					break;
113 | 
114 | 				default:
115 | 					throw runtime_error( "Unknown mapping type!" );
116 | 					break;
117 | 			}
118 | 		};
119 | 
120 | 
121 | 		const vector<size_t>& operator[](
122 | 		    size_t state ) const {
123 | 			return mValue[state];
124 | 		}
125 | 
126 | 		size_t nrStates() const {
127 | 			return mNrStates;
128 | 		}
129 | 
130 | 		size_t nrParams() const {
131 | 			return mNrParams;
132 | 		}
133 | 
134 | 		size_t nrDataDims() const {
135 | 			return mNrDataDim;
136 | 		}
137 | };
138 | 
139 | #endif
140 | 
141 | 
142 | 


--------------------------------------------------------------------------------
/src/StateMarginalsIterator.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef STATEMARGINALITERATOR_HPP
  2 | #define STATEMARGINALITERATOR_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | 
  6 | 
  7 | // Iterates over marginal state counts stored in a flat, compressed container of marginal_t values.
  8 | // Encoding as parsed by next(): a negative entry -c is a count c; a positive entry is the label of the state the following
  9 | // count belongs to; a zero entry after the first one starts the next position. Counts directly following each other belong to
 10 | // consecutive states. T must provide size() and operator[]; only a reference is kept, so the container must outlive the iterator.
 11 | template<typename T>
 12 | class StateMarginalIterator {
 13 | 
 14 | 		// internal states of a finite state machine parsing the marginals
 15 | 		enum fsa_state {start, afterPos, afterState, afterCount};
 16 | 		fsa_state mInternal;
 17 | 
 18 | 		size_t mIndex;	// index of the entry that was consumed last
 19 | 		size_t mState;	// state the current count belongs to
 20 | 		size_t mCount;	// current count
 21 | 		size_t mPos;	// current position
 22 | 		const T& mArray;	// container being parsed (not owned)
 23 | 
 24 | 	public:
 25 | 
 26 | 		// Create an iterator over array, ready for the first call to next().
 27 | 		// NOTE the initializers are listed in declaration order, which is the order in which they are executed.
 28 | 		StateMarginalIterator( const T& array ):
 29 | 			mInternal( start ),
 30 | 			mIndex( 0 ),
 31 | 			mState( 0 ),
 32 | 			mCount( 0 ),
 33 | 			mPos( 0 ),
 34 | 			mArray( array ) {
 35 | // 			if ( array[0] != 0 ) {
 36 | // 				throw runtime_error( "Container holding compressed marginal counts must contain 0 at the first position!" );
 37 | // 			}
 38 | 			initForward();
 39 | 		}
 40 | 
 41 | 		// Restart parsing: the next call to next() begins again at the first entry.
 42 | 		void initForward() {
 43 | 			mInternal = start;
 44 | 		}
 45 | 
 46 | 		// Advance to the next count. Returns true if one was found (query it via pos(), state() and count()),
 47 | 		// and false once the container is exhausted. Throws runtime_error if the entries do not follow the encoding.
 48 | 		// TODO we do not check the sum of counts for each position. Technically speaking, it would be possible for the array to contain runs of zeros.
 49 | 		bool next() {
 50 | 			// The first entry is handled outside of the loop, so that a container holding only a single entry is parsed as well.
 51 | 			if ( mInternal == start ) {
 52 | 				if ( mArray.size() == 0 ) {
 53 | 					return false;
 54 | 				}
 55 | 				// first position, must be 0, TODO implementation of MarginalRecords does this slightly differently, with 0 added to the end
 56 | 				mIndex = 0;
 57 | 				mPos = 0;
 58 | 				const marginal_t first = mArray[0];
 59 | 				if ( first < 0 ) {
 60 | 					// counts for state 0
 61 | 					mState = 0;
 62 | 					mCount = -first;
 63 | 					mInternal = afterCount;
 64 | 					return true;
 65 | 				}
 66 | 				mState = first;
 67 | 				mInternal = afterState;
 68 | 			}
 69 | 
 70 | 			while ( mIndex + 1 < mArray.size() ) {
 71 | 				mIndex++;
 72 | 				const marginal_t value = mArray[mIndex];
 73 | 				switch ( mInternal ) {
 74 | 
 75 | 					case start:
 76 | 						// cannot occur, the start state has been left above
 77 | 						break;
 78 | 
 79 | 					case afterPos:
 80 | 						// counts for state 0
 81 | 						if ( value < 0 ) {
 82 | 							mCount = -value;
 83 | 							mInternal = afterCount;
 84 | 							return true;
 85 | 						}
 86 | 
 87 | 						// new state
 88 | 						if ( value > 0 ) {
 89 | 							mState = value;
 90 | 							mInternal = afterState;
 91 | 							break;
 92 | 						}
 93 | 
 94 | 						throw runtime_error( "Malformed marginals, expected count or state label!" );
 95 | 
 96 | 					case afterCount:
 97 | 						// another count, for next state
 98 | 						if ( value < 0 ) {
 99 | 							mState++;
100 | 							mCount = -value;
101 | 							return true;
102 | 						}
103 | 
104 | 						// a new state
105 | 						if ( value > 0 ) {
106 | 							if ( value <= mState ) {
107 | 								throw runtime_error( "Malformed marginals, new state label must be larger than previous one!" );
108 | 							}
109 | 							mState = value;
110 | 							mInternal = afterState;
111 | 							break;
112 | 						}
113 | 
114 | 						// a new pos (value is 0 here); NOTE the explicit break keeps this case from running into afterState
115 | 						mPos++;
116 | 						mState = 0;
117 | 						mInternal = afterPos;
118 | 						break;
119 | 
120 | 					case afterState:
121 | 						// a new state count
122 | 						if ( value < 0 ) {
123 | 							mCount = - value;
124 | 							mInternal = afterCount;
125 | 							return true;
126 | 						}
127 | 						throw runtime_error( "Malformed marginals, expected state count." );
128 | 				}
129 | 			}
130 | 
131 | 			// TODO change MarginalRecords
132 | // 			if ( mInternal != afterCount ) {
133 | // 				throw runtime_error( "Malformed marginals, last element must be a state count!" );
134 | // 			}
135 | 			return false;
136 | 		}
137 | 
138 | 		size_t count() const {	// count found by the last successful call to next()
139 | 			return mCount;
140 | 		}
141 | 
142 | 		size_t state() const {	// state the current count belongs to
143 | 			return mState;
144 | 		}
145 | 
146 | 		size_t pos() const {	// position the current count belongs to
147 | 			return mPos;
148 | 		}
149 | 
150 | 		void print() const {	// writes position, state and count to cout, separated by " \t"
151 | 			cout << pos() << " \t" << state() << " \t" << count() << endl;
152 | 		}
153 | };
154 | 
155 | 
156 | 
157 | #endif
158 | 
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/bin/pyhammlet/io.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding:utf-8 -*-
  3 | 
  4 | """Basic I/O capabilities for HaMMLET, such as parsing compressed output etc."""
  5 | 
  6 | import numpy as np
  7 | import copy
  8 | from scipy import stats
  9 | import itertools
 10 | from bisect import bisect_right, bisect_left
 11 | from RLE import *
 12 | 
 13 | 
 14 | # TODO this reads marginals by iteration, not by class, which would be smaller	
 15 | def readMarginals(filename):
 16 | 	M = np.loadtxt(filename, dtype=int, ndmin=2)
 17 | 	segSizes = M[:,0]
 18 | 	counts = M[:,1:]
 19 | 	result = RunLengthArray(sizes=segSizes, array=counts)
 20 | 	return result
 21 | 
 22 | 
 23 | 
 24 | #TODO start and end
 25 | 
 26 | def readCompressedStateSequences(seqFileName):	
 27 | 	"""Reads compressed state sequence samples into a list of RLE'ed state sequences."""
 28 | 	result=[]
 29 | 	for line in file(seqFileName, "r"):
 30 | 		if line.startswith("#"):
 31 | 			continue
 32 | 		line = line.split()
 33 | 		a, b = zip(*[x.split(":") for x  in line])
 34 | 		assert len(a)==len(b)
 35 | 		result.append(RunLengthArray(sizes=np.array(a, dtype=int), array=np.array(b, dtype=int)))
 36 | 	return result
 37 | 	
 38 | 	#seqFile = file(seqFileName, "r")
 39 | 	#splitLines = [line.split() for line in filter(lambda x: not x.startswith("#"), seqFile.readlines())]
 40 | 	#nrIterations = len(splitLines)
 41 | 	#result = [[] for x in xrange(nrIterations)]
 42 | 	#for i in xrange(nrIterations):
 43 | 		#a, b = zip(*[x.split(":") for x  in splitLines[i]])
 44 | 		#assert len(a)==len(b)
 45 | 		#result[i] = RunLengthArray(sizes=np.array(a, dtype=int), array=np.array(b, dtype=int))
 46 | 	#return result
 47 | 	
 48 | 
 49 | 
 50 | def readBlockSizes(filename):
 51 | 	# read block sizes from file and create a compressed matrix in which each position corresponds to  (log of) the size of the block the value at this position is contained in. This yields a plot of the local compression.
 52 | 	
 53 | 		
 54 | 	# get end positions for each line
 55 | 	
 56 | 	# split a block of size N into two blocks [1, N-1], to mimick a block boundary while plotting
 57 | 	def processLine(line):
 58 | 		line = np.array(line.split(), dtype=int)
 59 | 		l = len(line)-1
 60 | 		newSize = 2*len(line)-line[line==1].sum()
 61 | 		r = newSize-1
 62 | 		line=np.resize(line, newSize)
 63 | 		while l >= 0:
 64 | 			if line[l] == 1:
 65 | 				line[r] = 1
 66 | 			else:
 67 | 				line[r] = line[l]-1
 68 | 				r -= 1
 69 | 				line[r] = 1
 70 | 			l -= 1
 71 | 			r -= 1
 72 | 		return line		
 73 | 	
 74 | 	lines = [processLine(line).cumsum() for line in file(filename).readlines()]
 75 | 	T = lines[0][-1]
 76 | 	for i in xrange(1, len(lines)):
 77 | 		assert lines[i][-1] == T, "Block structure in input line %d does not match the previous ones in total size!" % (i+2)
 78 | 	iterations = len(lines)
 79 | 	
 80 | 	# get all endpoints across all iterations
 81 | 	ends = set()
 82 | 	ends.update(*[set(L) for L in lines])
 83 | 	ends = np.array(sorted(ends), dtype=int)
 84 | 	
 85 | 	# change end points to match size indices for run-length encoding
 86 | 	# line[i] counts how many subblocks are covered at the end of the i-th block in line
 87 | 	for line in lines:
 88 | 		e = 0
 89 | 		for i in xrange(len(line)):
 90 | 			while line[i] > ends[e]:
 91 | 				e += 1
 92 | 			e += 1
 93 | 			line[i] = e
 94 | 	for i in xrange(len(lines)-1):
 95 | 		assert lines[i][-1] == lines[i+1][-1]
 96 | 		
 97 | 	data = np.zeros((len(ends), iterations), dtype=int)
 98 | 	# determine block labels
 99 | 	iteration=0
100 | 	
101 | 	# make ends contain the subblock sizes
102 | 	ends = subdiff(ends)
103 | 	
104 | 	for line in lines:
105 | 		t=0
106 | 		for i in xrange(len(line)):
107 | 			
108 | 			if i==0:
109 | 				start = 0
110 | 				end = line[0]
111 | 			else:
112 | 				start = line[i-1]
113 | 				end= line[i]
114 | 				
115 | 				
116 | 			label = sum(ends[start:end])
117 | 			data[start:end, iteration] = label
118 | 			t += label
119 | 			assert label>0
120 | 		assert t==T, "The sum of block sizes does not match the total data size!"
121 | 		iteration += 1
122 | 		
123 | 
124 | 	return RunLengthArray(sizes=np.array(ends), array=data)
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/src/StateSequence/Mixture.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef STATESEQUENCEMIXTURE_HPP
  2 | #define STATESEQUENCEMIXTURE_HPP
  3 | 
  4 | #include "../Statistics.hpp"
  5 | #include "../Tags.hpp"
  6 | #include "../SufficientStatistics.hpp"
  7 | #include "../Emissions.hpp"
  8 | #include "../Theta.hpp"
  9 | #include "../Transitions.hpp"
 10 | #include "../Initial.hpp"
 11 | #include "../KahanAggregator.hpp"
 12 | #include "../Statistics.hpp"
 13 | 
 14 | #include <vector>
 15 | using std::vector;
 16 | 
 17 | 
 18 | // Mixture sampling of the state sequence: each block of y is assigned one state, drawn with weights that are computed
 19 | // from theta only (pi is not read at all, A is only queried for the number of states). State counts, transition counts and
 20 | // per-parameter sufficient statistics are added to tau_pi, tau_A and tau_theta; if doRecord is set, each state is passed to records.
 21 | template<> template <
 22 | typename StatsStructure,
 23 |          typename StatsType,
 24 |          typename BlocksType,
 25 |          typename ThetaType,
 26 |          typename TauThetaType,
 27 |          typename TransitionsType,
 28 |          typename TauAType,
 29 |          typename InitialType,
 30 |          typename TauPiType >
 31 | void StateSequence<Mixture>::sample(
 32 |     Emissions<Statistics<StatsStructure, StatsType>, Blocks<BlocksType>>& y,	// TODO this cannot be const due to the use of next(), work around that somehow
 33 |     const ThetaType& theta,
 34 |     TauThetaType& tau_theta,
 35 |     const TransitionsType& A,
 36 |     TauAType& tau_A,
 37 |     const InitialType& pi,
 38 |     TauPiType& tau_pi,
 39 |     const Mapping& mapping,
 40 |     Records& records,
 41 |     const bool doRecord,
 42 |     const bool useSelfTransitions	// NOTE this has no effect for mixtures
 43 | ) {
 44 | 	size_t nrStates = A.nrStates();
 45 | 	const size_t nrDim = y.nrDim();
 46 | 	const size_t nrParams = tau_theta.nrParams();
 47 | 	// TODO size checks go here
 48 | 
 49 | 
 50 | 	// NOTE initial state distribution and transitions have no effect in mixture sampling
 51 | 
 52 | 	//TODO precompute carrier measure upon implementation of non-normal distributions
 53 | 	// log-normalizer of every state, computed once and reused for each block below
 54 | 	vector<real_t> logNormalizers;
 55 | 	logNormalizers.reserve( nrStates );
 56 | 
 57 | 
 58 | 
 59 | 	for ( auto s = 0; s < nrStates; ++s ) {
 60 | 		logNormalizers.push_back( theta.logNormalizer( s ) );
 61 | 	}
 62 | 	// unnormalized weight of each state for the current block (overwritten for every block)
 63 | 	vector<real_t> weights( nrStates, 0 );
 64 | 
 65 | 
 66 | 	// one aggregator of sufficient statistics per parameter
 67 | 	vector<KahanAggregator<SufficientStatistics<StatsType>>> stats;
 68 | 	stats.resize( nrParams );
 69 | 
 70 | 
 71 | 
 72 | 	// count transitions
 73 | 	SufficientStatistics<CategoricalVector> transitions( nrStates );
 74 | 
 75 | 	// count states
 76 | 	SufficientStatistics<Categorical> stateCounts( nrStates );
 77 | 
 78 | 	// forward variables
 79 | 	y.initForward();
 80 | 
 81 | 	// TODO initial
 82 | 	size_t prevState = 0;
 83 | // 	const auto stateProbs = pi.valueVector();
 84 | 
 85 | // 	auto logStateProbs = pi.valueVector();
 86 | // 	for ( size_t d = 0; d < nrStates; ++d ) {
 87 | // 		logStateProbs[d] = log( logStateProbs[d] );
 88 | // 	}
 89 | 	// visit the blocks of y one after another; each block is assigned a single state
 90 | 	while ( y.next() ) {
 91 | 		real_t maxE = numeric_limits<real_t>::lowest();
 92 | 
 93 | 		const size_t N = y.blockSize();
 94 | 		// log-weight of a state: inner product of the block with the state's parameters, minus N times its log-normalizer
 95 | 
 96 | 		// TODO assertions like in StateSequenceDirectGibbs
 97 | 		for ( auto s = 0; s < nrStates; ++s ) {
 98 | 			auto E = innerProduct( y, theta.value(), theta.mapping( s ) )  - N * logNormalizers[s]; // + N * logStateProbs[s];	// TODO carrier measure for the general EFD case TODO this is mixture sampling for burn-in, it does not take state probabilities into account
 99 | 			weights[s] = E ;
100 | 			maxE = max( E, maxE );
101 | 		}
102 | 		// subtract the largest log-weight before exponentiating, so that the largest weight becomes exp( 0 )
103 | 
104 | 		for ( auto s = 0; s < nrStates; ++s ) {
105 | 			
106 | // 			weights[s] = pow( (double)exp( weights[s] - maxE ), 1.0/(double)N);
107 | 			weights[s] = exp( weights[s] - maxE );
108 | 
109 | 		}
110 | 		// draw the state of this block from the discrete distribution defined by the weights
111 | 		discrete_distribution<size_t> dist( weights.begin(), weights.end() );
112 | 		const size_t state = dist( mRNG );
113 | 		// the block adds N to the count of its state, N - 1 self-transitions, and one transition from the previous block's state (0 for the first block)
114 | 		stateCounts[state] += N;
115 | 		transitions[state][state] += N - 1;
116 | 		transitions[prevState][state] += 1;	// TODO Initial
117 | 		// add the block's sufficient statistics of each dimension to the parameter that mapping assigns to ( state, d )
118 | 		for ( auto d = 0; d < nrDim; ++d ) {
119 | 			// TODO assert range
120 | 			stats[mapping[state][d]].add( y.suffStat( d ), N );
121 | 		}
122 | 
123 | 
124 | 		if ( doRecord ) {
125 | 			records.record( state, N );
126 | 		}
127 | 
128 | 		prevState = state;
129 | 	}
130 | 	// hand the aggregated statistics of every parameter that received at least one term on to tau_theta
131 | 	for ( auto p = 0; p < nrParams; ++p ) {
132 | 		const size_t N = stats[p].nrTerms();
133 | 		if ( N > 0 ) {
134 | 			tau_theta.addObservation( stats[p].sum(), N,  p );
135 | 		}
136 | 	}
137 | 
138 | 	tau_A.addObservation( transitions );
139 | 	tau_pi.addObservation( stateCounts );
140 | 
141 | 
142 | 
143 | 	// NOTE no state sequence is recorded
144 | }
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | #endif
152 | 
153 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | ###     This file is part of HaMMLET.
  2 | ### 
  3 | ###     HaMMLET is free software: you can redistribute it and/or modify
  4 | ###     it under the terms of the GNU General Public License as published by
  5 | ###     the Free Software Foundation, either version 3 of the License, or
  6 | ###     (at your option) any later version.
  7 | ### 
  8 | ###     HaMMLET is distributed in the hope that it will be useful,
  9 | ###     but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | ###     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | ###     GNU General Public License for more details.
 12 | ### 
 13 | ###     You should have received a copy of the GNU General Public License
 14 | ###     along with HaMMLET.  If not, see <http://www.gnu.org/licenses/>.
 15 | 
 16 | # $@ name of target
 17 | # $< first dependency
 18 | # $+ all dependencies
 19 | 
 20 | 
 21 | 
 22 | COMPILER=g++
 23 | CFLAGS=-Werror  --std=c++11   -fmax-errors=1 -Wreturn-type
 24 | SRC=./src
 25 | TLS=$(SRC)/tools
 26 | BIN=./bin
 27 | LIB=./lib
 28 | DOC=./doc
 29 | LOGO=./logo
 30 | 
 31 | TOOLS=mapLinesToGenome combineCounts avg maxSegmentation
 32 | TOOLSLIST=$(addprefix $(BIN)/, $(TOOLS))
 33 | 
 34 | all: CFLAGS +=  -O3
 35 | all: $(SRC)/hammlet-manpage.hpp tools hammlet
 36 | 	chmod ug+x $(BIN)/*
 37 | 
 38 | debug: CFLAGS += -g
 39 | debug:  $(SRC)/hammlet-manpage.hpp  tools hammlet
 40 | 	chmod ug+x $(BIN)/*
 41 | 
 42 | hammlet: $(SRC)/hammlet-manpage.hpp
 43 | 	$(COMPILER) $(CFLAGS) $(SRC)/main.cpp  -o $(BIN)/hammlet
 44 | 
 45 | # None of these targets creates a file of its own name; declaring them phony keeps a stray file called e.g. "tools" from making them appear up to date.
 46 | .PHONY: all debug hammlet tools
 47 | tools: $(TOOLSLIST)
 48 | 
 49 | $(BIN)/%:  $(TLS)/%.cpp $(LIB)/gzstream/libgzstream.a
 50 | 	$(COMPILER) $(CFLAGS)  $< -I$(LIB)/gzstream -L$(LIB)/gzstream -lgzstream -lz -o $@ 
 51 | 
 52 | 	
 53 | # make gzip stream library; $(MAKE) instead of a literal "make" uses the same make program and passes its options on to the sub-make
 54 | $(LIB)/gzstream/libgzstream.a:
 55 | 	$(MAKE) -C $(LIB)/gzstream
 56 | .PHONY: clean # clean creates no file called "clean", it has to run whenever it is requested
 57 | clean: 
 58 | 	rm -vf $(BIN)/hammlet
 59 | 	rm -vf $(TOOLSLIST)
 60 | 	rm -vf $(BIN)/pyhammlet/*.pyc
 61 | 	rm -vf $(LIB)/gzstream/libgzstream.a
 62 | 	rm -vf $(LIB)/gzstream/gzstream.o
 63 | 
 64 | 
 65 | 	
 66 | 	
 67 | # Create manuals in different formats from $(DOC)/manpage.md. It also produces a header file to hard-code the manpage into the executable for platform-independence. 
 68 | # This would typically not be called by the end-user, since it requires that the system has pandoc, xxd, pdflatex, awk, base64 and man installed. It is provided here for convenience of the developer.
 69 | man: manclean  $(SRC)/hammlet-manpage.hpp $(DOC)/hammlet.man $(DOC)/hammlet-manpage.txt $(DOC)/hammlet-manpage.html $(DOC)/hammlet-manpage-a4.pdf $(DOC)/hammlet-manpage-letter.pdf $(LOGO)/logo-round.base64
 70 | 
 71 | .PHONY: man manclean # neither target creates a file of its own name
 72 | manclean:
 73 | 	rm -f $(DOC)/hammlet-manpage-a4.pdf
 74 | 	rm -f $(DOC)/hammlet-manpage-letter.pdf
 75 | 	rm -f $(DOC)/hammlet-manpage.html
 76 | 	rm -f $(DOC)/hammlet-manpage.txt
 77 | 	rm -f $(DOC)/hammlet.man
 78 | 	rm -f $(SRC)/hammlet-manpage.hpp
 79 | 
 80 | $(LOGO)/logo-round.base64:	# "data:image/png;base64," followed by the base64 encoding of logo-round.png on a single line
 81 | 	echo -n "data:image/png;base64," > $(LOGO)/logo-round.base64
 82 | 	base64 -w 0 $(LOGO)/logo-round.png >> $(LOGO)/logo-round.base64
 83 | # PDF versions of the manual for a4 and letter paper, typeset by pandoc via LaTeX in landscape, two-column layout
 84 | $(DOC)/hammlet-manpage-a4.pdf:
 85 | 	pandoc $(DOC)/hammlet-manpage.md -V lang=en -V papersize=a4 -H $(DOC)/man-preamble.tex -s -t latex  --variable classoption=landscape,twocolumn --variable geometry={margin=1in}  -o $(DOC)/hammlet-manpage-a4.pdf
 86 | 
 87 | $(DOC)/hammlet-manpage-letter.pdf:
 88 | 	pandoc $(DOC)/hammlet-manpage.md -V lang=en -V papersize=letter -H $(DOC)/man-preamble.tex -s -t latex  --variable classoption=landscape,twocolumn --variable geometry={margin=1in}  -o $(DOC)/hammlet-manpage-letter.pdf
 89 | # HTML version of the manual; awk replaces the reference to the logo image by the data URI read from the prerequisite $(LOGO)/logo-round.base64
 90 | $(DOC)/hammlet-manpage.html: $(LOGO)/logo-round.base64
 91 | 	pandoc $(DOC)/hammlet-manpage.md -V lang=en -H $(DOC)/pandoc.css  -s -t html | awk 'BEGIN{getline l < "$(LOGO)/logo-round.base64"}/\"..\/logo\/logo-round.png\"/{gsub("../logo/logo-round.png",l)}1' > $(DOC)/hammlet-manpage.html
 92 | 	
 93 | $(DOC)/hammlet.man:	# man page generated by pandoc, with the groff box-drawing logo appended; $(DOC)/hm is a temporary file
 94 | 	pandoc $(DOC)/hammlet-manpage.md -s -t man --variable adjusting=l > $(DOC)/hm
 95 | 	cat $(DOC)/hm $(LOGO)/logo-boxdrawing-centered.groff > $(DOC)/hammlet.man
 96 | 	rm $(DOC)/hm
 97 | 	
 98 | # Plain-text rendering of the man page, formatted by man with MANWIDTH set to 80
 99 | $(DOC)/hammlet-manpage.txt: $(DOC)/hammlet.man
100 | 	MANWIDTH=80 man  $(DOC)/hammlet.man > $(DOC)/hammlet-manpage.txt
101 | 	
102 | # Create .txt version of manpage, as well as a hexdump with C++ declarations, so we can #include the txt version of the manpage into the source at compile time:
103 | $(SRC)/hammlet-manpage.hpp:	$(DOC)/hammlet-manpage.txt
104 | 	xxd -i $(DOC)/hammlet-manpage.txt > $(SRC)/hammlet-manpage.hpp
105 | 	
106 | 
107 | 


--------------------------------------------------------------------------------
/lib/gzstream/gzstream.h:
--------------------------------------------------------------------------------
  1 | // ============================================================================
  2 | // gzstream, C++ iostream classes wrapping the zlib compression library.
  3 | // Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
  4 | //
  5 | // This library is free software; you can redistribute it and/or
  6 | // modify it under the terms of the GNU Lesser General Public
  7 | // License as published by the Free Software Foundation; either
  8 | // version 2.1 of the License, or (at your option) any later version.
  9 | //
 10 | // This library is distributed in the hope that it will be useful,
 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 | // Lesser General Public License for more details.
 14 | //
 15 | // You should have received a copy of the GNU Lesser General Public
 16 | // License along with this library; if not, write to the Free Software
 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 18 | // ============================================================================
 19 | //
 20 | // File          : gzstream.h
 21 | // Revision      : $Revision: 1.5 $
 22 | // Revision_date : $Date: 2002/04/26 23:30:15 $
 23 | // Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
 24 | // 
 25 | // Standard streambuf implementation following Nicolai Josuttis, "The 
 26 | // Standard C++ Library".
 27 | // ============================================================================
 28 | 
 29 | #ifndef GZSTREAM_H
 30 | #define GZSTREAM_H 1
 31 | 
 32 | // standard C++ with new header file names and std:: namespace
 33 | #include <iostream>
 34 | #include <fstream>
 35 | #include <zlib.h>
 36 | 
 37 | #ifdef GZSTREAM_NAMESPACE
 38 | namespace GZSTREAM_NAMESPACE {
 39 | #endif
 40 | 
 41 | // ----------------------------------------------------------------------------
 42 | // Internal classes to implement gzstream. See below for user classes.
 43 | // ----------------------------------------------------------------------------
 44 | 
 45 | class gzstreambuf : public std::streambuf {   // stream buffer reading from / writing to a gz file via zlib
 46 | private:
 47 |     static const int bufferSize = 47+256;    // size of data buff
 48 |     // totals 512 bytes under g++ for igzstream at the end.
 49 | 
 50 |     gzFile           file;               // file handle for compressed file
 51 |     char             buffer[bufferSize]; // data buffer
 52 |     char             opened;             // open/close state of stream
 53 |     int              mode;               // I/O mode
 54 | 
 55 |     int flush_buffer();
 56 | public:
 57 |     gzstreambuf() : file(0), opened(0), mode(0) {   // file and mode used to stay uninitialized until open(), although underflow()/overflow() test mode first
 58 |         setp( buffer, buffer + (bufferSize-1));
 59 |         setg( buffer + 4,     // beginning of putback area
 60 |               buffer + 4,     // read position
 61 |               buffer + 4);    // end position      
 62 |         // ASSERT: both input & output capabilities will not be used together
 63 |     }
 64 |     int is_open() { return opened; }
 65 |     gzstreambuf* open( const char* name, int open_mode);   // returns this on success, 0 on failure
 66 |     gzstreambuf* close();                // returns this on success, 0 on failure or if not open
 67 |     ~gzstreambuf() { close(); }
 68 |     
 69 |     virtual int     overflow( int c = EOF);
 70 |     virtual int     underflow();
 71 |     virtual int     sync();
 72 | };
 73 | 
 74 | class gzstreambase : virtual public std::ios {   // common base of igzstream and ogzstream, holds the gzstreambuf
 75 | protected:
 76 |     gzstreambuf buf;                     // stream buffer, registered with std::ios through init(&buf)
 77 | public:
 78 |     gzstreambase() { init(&buf); }
 79 |     gzstreambase( const char* name, int open_mode);   // as above, and additionally calls open( name, open_mode)
 80 |     ~gzstreambase();                     // closes buf
 81 |     void open( const char* name, int open_mode);   // sets badbit if buf cannot be opened
 82 |     void close();
 83 |     gzstreambuf* rdbuf() { return &buf; }
 84 | };
 85 | 
 86 | // ----------------------------------------------------------------------------
 87 | // User classes. Use igzstream and ogzstream analogously to ifstream and
 88 | // ofstream respectively. They read and write files based on the gz* 
 89 | // function interface of the zlib. Files are compatible with gzip compression.
 90 | // ----------------------------------------------------------------------------
 91 | 
 92 | class igzstream : public gzstreambase, public std::istream {   // input stream on top of the gzstreambuf held by gzstreambase
 93 | public:
 94 |     igzstream() : std::istream( &buf) {} 
 95 |     igzstream( const char* name, int open_mode = std::ios::in)   // opens name on construction
 96 |         : gzstreambase( name, open_mode), std::istream( &buf) {}  
 97 |     gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }   // picks the rdbuf() of gzstreambase
 98 |     void open( const char* name, int open_mode = std::ios::in) {
 99 |         gzstreambase::open( name, open_mode);
100 |     }
101 | };
102 | 
103 | class ogzstream : public gzstreambase, public std::ostream {   // output stream on top of the gzstreambuf held by gzstreambase
104 | public:
105 |     ogzstream() : std::ostream( &buf) {}
106 |     ogzstream( const char* name, int mode = std::ios::out)   // opens name on construction
107 |         : gzstreambase( name, mode), std::ostream( &buf) {}  
108 |     gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }   // picks the rdbuf() of gzstreambase
109 |     void open( const char* name, int open_mode = std::ios::out) {
110 |         gzstreambase::open( name, open_mode);
111 |     }
112 | };
113 | 
114 | #ifdef GZSTREAM_NAMESPACE
115 | } // namespace GZSTREAM_NAMESPACE
116 | #endif
117 | 
118 | #endif // GZSTREAM_H
119 | // ============================================================================
120 | // EOF //
121 | 
122 | 


--------------------------------------------------------------------------------
/src/utils.hpp:
--------------------------------------------------------------------------------
  1 | ////////// This file contains all those small utility functions we need. //////////
  2 | 
  3 | #ifndef UTILS_HPP
  4 | #define UTILS_HPP
  5 | 
  6 | #include "includes.hpp"
  7 | 
  8 | 
  9 | 
 10 | enum Direction {forward, backward, unset};
 11 | 
 12 | // Replace the affected entries of x by their running sum, using the compensated summation of Kahan (1965).
 13 | // Affected are the positions left, left + stepSize, left + 2 * stepSize, ... below right, where right is
 14 | // exclusive and clipped to x.size(). All other entries of x are left as they are.
 15 | // Throws runtime_error if left is not a valid index of x, if stepSize is zero, or if left is not smaller
 16 | // than right.
 17 | template <typename T>
 18 | void KahanCumulativeSum(
 19 |     vector<T>& x,
 20 |     const size_t left = 0,
 21 |     size_t right = numeric_limits<size_t>::max(),
 22 |     const size_t stepSize = 1,	// distance between two consecutive affected positions
 23 |     bool reverse = false // if true, the sum runs from the last affected position down to left, otherwise from left upwards. NOTE The affected indices are exactly the same in both cases!
 24 | ) {
 25 | 
 26 | 	if ( x.size() <= left ) {
 27 | 		throw runtime_error( "Start index for Kahan summation out of bounds!" );
 28 | 	}
 29 | 	if ( stepSize <= 0 ) {
 30 | 		throw runtime_error( "Increment for Kahan addition must be positive!" );
 31 | 	}
 32 | 	if ( right <= left ) {
 33 | 		throw runtime_error( "Start position in Kahan summation must be smaller than end position!" );
 34 | 	}
 35 | 
 36 | 	// clip the exclusive end to the size of the vector
 37 | 	const size_t end = ( right > x.size() ) ? x.size() : right;
 38 | 
 39 | 	// last affected position: the largest left + k * stepSize below end
 40 | 	const size_t last = left + ( ( end - 1 - left ) / stepSize ) * stepSize;
 41 | 
 42 | 	// with a single affected position there is nothing to accumulate
 43 | 	if ( last <= left ) {
 44 | 		return;
 45 | 	}
 46 | 
 47 | 	// c is the running compensation term, s the running sum seeded with the first summand
 48 | 	T c( 0 );
 49 | 	T s = reverse ? x[last] : x[left];
 50 | 
 51 | 	// one step of the compensated summation: add x[i] to the running sum and store the result in x[i]
 52 | 	const auto accumulate = [&]( const size_t i ) {
 53 | 		T y = x[i] - c;
 54 | 		T temp = s + y;
 55 | 		// ( temp - s ) is what actually arrived in the sum, the difference to y is carried over to the next step
 56 | 		c = ( temp - s ) - y;
 57 | 		s = temp;
 58 | 		x[i] = s; // TODO add error term here?
 59 | 	};
 60 | 
 61 | 	if ( reverse ) {
 62 | 		// walk down to left; left + stepSize <= last holds here, so neither side of the comparison can wrap around
 63 | 		size_t i = last;
 64 | 		while ( i >= left + stepSize ) {
 65 | 			i -= stepSize;
 66 | 			accumulate( i );
 67 | 		}
 68 | 	} else {
 69 | 		// walk up to the last affected position
 70 | 		size_t i = left;
 71 | 		while ( i < last ) {
 72 | 			i += stepSize;
 73 | 			accumulate( i );
 74 | 		}
 75 | 	}
 76 | }
 77 | 
 78 | 
 79 | 
 80 | 
 81 | // Partially sort vec in place and return a reference to the element at index size() / 2 of the sorted order. NOTE for an even number of elements this is the upper of the two middle elements. Throws runtime_error if vec is empty.
 82 | template<typename T>
 83 | T& lowerMedian( vector<T>& vec ) {
 84 | 	if ( vec.empty() ) { throw runtime_error( "Cannot compute the median of an empty vector!" ); }	// vec[0] would not exist
 85 | 	nth_element( vec.begin(), vec.begin() + ( vec.size() / 2 ), vec.end() );
 86 | 	return vec[vec.size() / 2];
 87 | }
 88 | 
 89 | // Empty vec and release its memory: the swap moves the old buffer into a local vector, which is destroyed on return.
 90 | template <typename T>
 91 | void deleteVector( vector<T>& vec ) {
 92 | 	vector<T> released;
 93 | 	released.swap( vec );
 94 | }
 95 | 
 96 | 
 97 | 
 98 | 
 99 | // Concatenate the elements of a vector into a string. If vec holds fewer than minSize elements, defaultValue is appended until
100 | // minSize entries are written. sep is written between two entries only (also between padding entries), finalSep once at the end.
101 | template <typename T>
102 | string concat(
103 |     const vector<T>& vec, 	// vector of elements
104 |     const string sep = "\t", 	// inner separator
105 |     const string finalSep = "",	// outer separator, e.g. "\n"
106 |     const size_t minSize = 0,	// minimum size, if vector is smaller, defaultValue will be appended
107 |     const T defaultValue = T()
108 | ) {
109 | 	stringstream ss;
110 | 	const size_t size = max( vec.size(), minSize );
111 | 	for ( size_t i = 0; i < size; ++i ) {
112 | 		if ( i > 0 ) {	// separator in front of every entry but the first, so that none trails the last entry
113 | 			ss << sep;
114 | 		}
115 | 		if ( i < vec.size() ) {
116 | 			ss << vec[i];
117 | 		} else {
118 | 			ss << defaultValue;
119 | 		}
120 | 	}
121 | 	ss << finalSep;
122 | 	return ss.str();
123 | }
124 | 
125 | 
126 | // Check whether a file exists, or more precisely, whether it can be opened for reading.
127 | // NOTE inline, since a non-inline function defined in a header is defined once per translation unit that includes it.
128 | inline bool fileExists( const string& path ) {
129 | 	ifstream f( path.c_str() );	// closed again by its destructor
130 | 	return f.good();
131 | }
132 | 
133 | 
134 | // Count the number of lines in a stream based on the occurrence of newline, then rewind the stream to its beginning.
135 | // The characters are read unformatted from the stream buffer, so the formatting flags of infile (e.g. skipws) stay as the caller set them.
136 | inline size_t nrLinesInFile( istream& infile ) {
137 | 	const size_t line_count = count( istreambuf_iterator<char> ( infile ), istreambuf_iterator<char>(), '\n' );
138 | 	infile.clear();	// reset the state flags before seeking
139 | 	infile.seekg( 0, ios::beg );
140 | 	return line_count;
141 | }
142 | 
143 | 
144 | 
145 | 
146 | 
147 | // Append the values obtained from an input stream to a vector.
148 | // If ignoreInvalid is true, input that cannot be converted to T is skipped one character at a time; otherwise a runtime_error is thrown when such input is encountered.
149 | template<class T>
150 | void istreamToVector(
151 |     istream& stream,
152 |     vector<T>& vec,
153 |     const bool ignoreInvalid = false ) {
154 | 	T val;
155 | 	if ( ignoreInvalid ) {
156 | 		for ( ;; ) {
157 | 			if ( stream >> val ) {
158 | 				// NOTE success is tested first: extracting the last value may set eofbit although the extraction succeeded
159 | 				vec.push_back( val );
160 | 			} else if ( stream.eof() || stream.bad() ) {
161 | 				break;
162 | 			} else {
163 | 				stream.clear(); // unset failbit
164 | 				stream.ignore( 1 ); // skip next char
165 | 			}
166 | 		}
167 | 	} else {
168 | 		while ( stream >> val ) {
169 | 			vec.push_back( val );
170 | 		}
171 | 		if ( !( stream.eof() || stream.bad() ) ) {
172 | 			throw runtime_error( "Invalid input encountered!" );
173 | 		}
174 | 	}
175 | }
176 | 
177 | 
178 | #endif
179 | 


--------------------------------------------------------------------------------
/lib/gzstream/gzstream.C:
--------------------------------------------------------------------------------
  1 | // ============================================================================
  2 | // gzstream, C++ iostream classes wrapping the zlib compression library.
  3 | // Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
  4 | //
  5 | // This library is free software; you can redistribute it and/or
  6 | // modify it under the terms of the GNU Lesser General Public
  7 | // License as published by the Free Software Foundation; either
  8 | // version 2.1 of the License, or (at your option) any later version.
  9 | //
 10 | // This library is distributed in the hope that it will be useful,
 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 | // Lesser General Public License for more details.
 14 | //
 15 | // You should have received a copy of the GNU Lesser General Public
 16 | // License along with this library; if not, write to the Free Software
 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 18 | // ============================================================================
 19 | //
 20 | // File          : gzstream.C
 21 | // Revision      : $Revision: 1.7 $
 22 | // Revision_date : $Date: 2003/01/08 14:41:27 $
 23 | // Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
 24 | // 
 25 | // Standard streambuf implementation following Nicolai Josuttis, "The 
 26 | // Standard C++ Library".
 27 | // ============================================================================
 28 | 
 29 | #include <gzstream.h>
 30 | #include <iostream>
 31 | #include <string.h>  // for memcpy
 32 | 
 33 | #ifdef GZSTREAM_NAMESPACE
 34 | namespace GZSTREAM_NAMESPACE {
 35 | #endif
 36 | 
 37 | // ----------------------------------------------------------------------------
 38 | // Internal classes to implement gzstream. See header file for user classes.
 39 | // ----------------------------------------------------------------------------
 40 | 
 41 | // --------------------------------------
 42 | // class gzstreambuf:
 43 | // --------------------------------------
 44 | 
 45 | gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {   // opens name via gzopen; returns this on success, 0 on failure
 46 |     if ( is_open())                      // a buffer that is already open is not opened again
 47 |         return (gzstreambuf*)0;
 48 |     mode = open_mode;
 49 |     // no append nor read/write mode
 50 |     if ((mode & std::ios::ate) || (mode & std::ios::app)
 51 |         || ((mode & std::ios::in) && (mode & std::ios::out)))
 52 |         return (gzstreambuf*)0;
 53 |     char  fmode[10];                     // mode string for gzopen: "rb" for input, "wb" for output
 54 |     char* fmodeptr = fmode;
 55 |     if ( mode & std::ios::in)
 56 |         *fmodeptr++ = 'r';
 57 |     else if ( mode & std::ios::out)
 58 |         *fmodeptr++ = 'w';
 59 |     *fmodeptr++ = 'b';
 60 |     *fmodeptr = '\0';
 61 |     file = gzopen( name, fmode);
 62 |     if (file == 0)
 63 |         return (gzstreambuf*)0;
 64 |     opened = 1;
 65 |     return this;
 66 | }
 67 | 
 68 | gzstreambuf * gzstreambuf::close() {   // returns this if an open file was closed with Z_OK, 0 otherwise
 69 |     if ( is_open()) {
 70 |         sync();                          // write out pending output first
 71 |         opened = 0;                      // marked as closed even if gzclose reports an error
 72 |         if ( gzclose( file) == Z_OK)
 73 |             return this;
 74 |     }
 75 |     return (gzstreambuf*)0;
 76 | }
 77 | 
 78 | int gzstreambuf::underflow() { // used for input buffer only
 79 |     if ( gptr() && ( gptr() < egptr()))  // unread characters are still available
 80 |         return * reinterpret_cast<unsigned char *>( gptr());
 81 | 
 82 |     if ( ! (mode & std::ios::in) || ! opened)
 83 |         return EOF;
 84 |     // Josuttis' implementation of inbuf
 85 |     int n_putback = gptr() - eback();    // keep up to 4 already read characters for putback
 86 |     if ( n_putback > 4)
 87 |         n_putback = 4;
 88 |     memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback);
 89 | 
 90 |     int num = gzread( file, buffer+4, bufferSize-4);   // new data is stored behind the 4 byte putback area
 91 |     if (num <= 0) // ERROR or EOF
 92 |         return EOF;
 93 | 
 94 |     // reset buffer pointers
 95 |     setg( buffer + (4 - n_putback),   // beginning of putback area
 96 |           buffer + 4,                 // read position
 97 |           buffer + 4 + num);          // end of buffer
 98 | 
 99 |     // return next character
100 |     return * reinterpret_cast<unsigned char *>( gptr());    
101 | }
102 | 
103 | int gzstreambuf::flush_buffer() {
104 |     // Hands the pending put-area bytes to gzwrite(); shared by overflow()
105 |     // and sync(). Returns the byte count, or EOF if not all were written.
106 |     const int pending = pptr() - pbase();
107 |     const int written = gzwrite( file, pbase(), pending);
108 |     if ( written == pending)
109 |         pbump( -pending);   // rewind the put pointer to the buffer start
110 |     return ( written == pending) ? pending : EOF;
111 | }
112 | 
113 | int gzstreambuf::overflow( int c) { // used for output buffer only
114 |     if ( ! ( mode & std::ios::out) || ! opened)
115 |         return EOF;
116 |     if (c != EOF) {
117 |         *pptr() = c;
118 |         pbump(1);
119 |     }
120 |     // EOF signals failure to the caller, so a successful flush must report
121 |     // a non-EOF value even for overflow( EOF): hence not_eof().
122 |     return ( flush_buffer() == EOF) ? EOF : traits_type::not_eof( c);
123 | }
124 | 
125 | int gzstreambuf::sync() {
126 |     // Changed to use flush_buffer() instead of overflow( EOF)
127 |     // which caused improper behavior with std::endl and flush(),
128 |     // bug reported by Vincent Ricard.
129 |     const bool pending = pptr() && pptr() > pbase();
130 |     // Nothing buffered counts as success; a failed flush yields -1.
131 |     if ( pending && flush_buffer() == EOF)
132 |         return -1;
133 |     return 0;
134 | }
135 | 
136 | // --------------------------------------
137 | // class gzstreambase:
138 | // --------------------------------------
139 | 
140 | gzstreambase::gzstreambase( const char* name, int mode) {
141 |     this->init( &buf);          // register buf as this stream's buffer
142 |     this->open( name, mode);    // a failed open is recorded as badbit
143 | }
144 | 
145 | gzstreambase::~gzstreambase() {
146 |     (void) buf.close();   // result dropped: a destructor has no way to report it
147 | }
148 | 
149 | void gzstreambase::open( const char* name, int open_mode) {
150 |     if ( buf.open( name, open_mode) == 0)   // null result: the buffer could not be opened
151 |         setstate( std::ios::badbit);
152 | }
153 | 
154 | void gzstreambase::close() {
155 |     // Closing a stream that is not open is a no-op; a failed close sets badbit.
156 |     if ( buf.is_open() && buf.close() == 0)
157 |         setstate( std::ios::badbit);
158 | }
159 | 
160 | #ifdef GZSTREAM_NAMESPACE
161 | } // namespace GZSTREAM_NAMESPACE
162 | #endif
163 | 
164 | // ============================================================================
165 | // EOF //
166 | 


--------------------------------------------------------------------------------
/logo/logo-round.base64:
--------------------------------------------------------------------------------
1 | data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAfQAAAH0CAYAAADL1t+KAAAABmJLR0QA/wD/AP+gvaeTAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAB3RJTUUH4QgPFigWyP39KQAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAABGaSURBVHja7d1rqF1lfsfxX5ZHQlGGo0hURvHyxpAcRYp4qamKYKEqNmNlpoI5wxQcZXDQ0AidekOj00IcExkZdISWOQoyg3UyQS2MIKON9UIooscQ33hBB/UgehB9EYzYF/vRHmMu57Iv69n784ENBsXs/V9772+ek7WetWz1untCT6xIclKSE5Icl+TYJEcnOSrJkUnGkxye5LAky5McmuSQJMuMDqjEl0m+SPJ5kt1JPkvyaZLZJB8l+TDJB0neS/JukreTvJlkxui6b8wIuhLu05OclmQiycokp5RgAwyzZaUjY0n+YgHfe7NJXk+yK8l0kleSvCz0gt5vZyQ5N8nZSc5McrKRACzIeJKzymOuN5K8lOSFJM8l2WFUgt5NK5NclOTCJBdYeQP0zMnl8Q9zVvJ/SvJ0kqfKih5BX5Dzk1ya5OIkq4wDYGAr+bXlkSQ7kzyZ5PEkzxiPoO/PmiRXJLk8yfHGAdA6q8pjQ5J3kjyW5NEk240mWTbiZ7mfmOSqJFdaiQNUa2eSR5I8nOStUR1CM6Kv+7Ikv0/n8omNYg5Q/cp9Y/lO/335jhf0IbY8yfVJXkvyh/z/38kAMDzWlu/418p3/nJBHx4ryp/c3k+yxWocYGRW7VvKd//G0gJBr9QxSTYl+XOSm+NyM4BRNF4a8OfShGMEvR7fSfLzdM6A3BBn8gPQacFXZ8f/vLRC0FtsQzp7Bf9MyAHYT9h/VlqxQdDb5/vpnACxKX60DsDBjZdmvFYaIugDtiqdSxR+Gye7AbC4jvy2tKTqjtQc9JvLn6xcfgbAUq0tTblZ0PtnTTp34tno/QdAl20sjVkj6L11e5L/zrdvuQcA3XJWac3tgt59p6ZzZ51bvc8A6JNbS3tOFfTuuDrJ/yY5z3sLgD47rzToakFfmvuT/DquKQdgcMZKi+4X9IVbmeR/klzjfQRAS1xT2rRS0OfnsiTPJznHeweAljmnNKp1t2htW9CvT+e2d3Z7A6Ctxkurrhf0fduUzq3uAKAGW0q7BH2OqQzZJvkAjIQNpWEjH/TlSR5Pss57AoBKrSstWz6qQT8iyR+TXOK9AEDlLilNO2LUgr4iyX/FZjEADI/zSttWjErQVyR5IvZjB2D4nFUa1/eo9zvoRyTZluQMxxyAIXVGaV1ff/zez6AvT7LVyhyAEVmpb00fT5TrZ9D/M/7OHIDRcV5p31AFfSrOZgdg9FySPl2n3o+gb4rrzAEYXevShx3leh3062MHOADYkB7v/d7LoF8We7MDwFe2pId3aetV0Fcm+Y1jBwDf8Jv06H7qvQr6v8ctUAFgb+OlkVUE/f50bgAPAHzbOaWVrQ761UmucawA4ICuKc1sZdBPTfIrxwgA5uVXpZ2tC/p9ScYcHwCYl7HSzlYF/fbY1hUAFuq80tBWBH1NklsdEwBYlFtLSwce9LsdCwAYbEuXGvSb43aoALBUZ5WmDiToq5JsdAwAoCs2lrb2Peh3mT0AdNWi27rYoH8/yVpzB4CuWlsa27eg32bmANATi2rsYoK+IUv4GT8AcECrSmt7GvTvJLnJrAGgp24qze1Z0P85bosKAL02Xprbk6Afk+RGMwaAvrixtLfrQf+nuPkKAPTLWGlvV4O+IskNZgsAfXVDaXDXgv5Tq3MAGMgq/afdCvryJNeZKQAMxHWlxUsO+rVxZjsADMp4afGSg/5jswSAgTpoiw8W9MtiVzgAGLRVpcmLDvqPzBAAWuFHiw36iXFHNQBoi7WlzQsO+lVmBwCtctVign6luQFAq1y50KCviZPhAKBtVp
VGzzvoV5gZALTSFQsJ+uXmBQCtdPl8g35+kuPNCwBa6fjS6oMG/VKzAoBWu3Q+Qb/YnACg1S4+WNBXxtntANB2q0qz9xv0i8wIAKpw0YGCfqH5AEAVvtHssb3+5QXms3TTU+sNoUITk5sNwecNn7eaXLC/FfoZ6dxEHQBov/HS7m8F/VyzAYCqnLuvoJ9tLgBQlbP3FfQzzQUAqnLm3kFfkeRkcwGAqpxcGv510E83EwCo0ulzg36aeQBAlU6bG/QJ8wCAKk3MDfpK8wCAKq2cG/RTzAMAqnTKV0FfETvEAUCtxpOsaJKcZBYAULWTmiQnmAMAVO2EJslx5gAAVTuuSXKsOQBA1Y5tkhxtDgBQtaObJEeZAwBU7agmyZHmAABVO7KJa9ABoHbjTZLDzQEAqnZ4k+QwcwCAqh3WJFluDgBQteVNkkPNAQCqdmiT5BBzAICqHdIkWWYOAFC1ZY0ZAED9BB0ABB0AEHQAoCvGjKD7JiY3GwL4nIMVOgAg6AAg6ACAoAMAgg4ACDoACDoAIOgAgKADAItnp7gemJ5abwgVsvMXvg983qzQAQBBBwAEHQAE3QgAQNABAEEHAAQdABB0ABB0AEDQAQBBBwAEHQAEHQAQdABA0AEAQQcAQQcABB0AEHQAQNABQNABAEEHAAQdABB0ABB0AEDQAQBBBwAEHQAEHQCoxJgRDI+Jyc2GAPg+sEIHAAQdABB0AEDQAUDQAQBBBwAEHQAQdABA0AFgGNgpbgRNT603hH2wsxa+D3zerNABAEEHAAQdAATdCABA0AEAQQcABB0AEHQAEHQAQNABAEEHAAQdAAQdABB0AEDQAQBBBwBBBwAEHQAQdABA0AFA0AEAQQcABB0AEHQAEHQAQNABAEEHAAQdAAQdAKjEmBHQLxOTmw0BwAodABB0ABB0AEDQAQBBBwAEHQAEHQAQdABA0AGA+bBTHK0zPbXeELCzIFihA4CgAwCCDgAIOgAg6AAg6ACAoAMAgg4ACDoACDoAIOgAgKADAIIOAAg6AAg6ACDoAICgAwCCDgCCDgAIOgAg6ACAoAOAoAMAgg4ACDoAIOgAIOgAgKADAIIOAOzfmBFAx8TkZkMArNABAEEHAAQdAAQdABB0AEDQAQBBBwAEHQAEHQBoBzvFwQJNT603hD6wcx9YoQOAoAMAgg4ACDoAIOgAIOgAgKADAIIOAAg6AAg6ACDoAICgAwCCDgAIOgAIOgAg6ACAoAMAgg4Agg4ACDoAIOgAgKADgKADAIIOAAg6ACDoACDoAICgAwCCDgDs15gRwMJMTG42BMAKHQAQdABA0AFA0AEAQQcABB0AEHQAEHQAQNABgN6yUxws0PTUekMArNABAEEHAAQdAAQdABB0AEDQAQBBBwBBBwAEHQAQdABA0AFA0AEAQQcABB0AEHQAEHQAQNABAEEHAAQdAAQdABB0AEDQAQBBBwBBBwAEHQAQdABA0AFA0AEAQQcABmnMCGBhJiY3GwJghQ4ACDoAIOgAIOgAgKADAIIOAAg6AAg6ACDoAEBv2SkOqNr01HpDACt0ABB0AEDQAQBBBwAEHQAEHQAQdABA0AEAQQcAQQcABB0AEHQAQNABQNABAEEHAAQdABB0ABB0AEDQAQBBBwAEHQAEHQAQdABA0AEAQQcAQQcABB0AEHQAQNABYNSMGUH3TUxuNgTweQMrdABA0AFA0AEAQQcABB0AEHQAEHQAQNABAEEHABbPTnE9MD213hDA5w2s0AEAQQcAQQcABB0AEHQAQNABQNABAEEHAAQdABB0ABB0AEDQAQBBBwAEHQAQdAAQdABA0AEAQQcABB0ABB0AEHQAQNABAEEHAEEHAAQdABB0AEDQAUDQAQBBBwB6ZcwIum9icrMhgM8bWKEDAIIOAIIOAAg6ACDoAICgA4CgAwCCDgAIOgCweHaK64HpqfWGAD5vHIAd/qzQAQBBBwBBBwAEHQDoZdC/NAYAqNqXTZIvzAEAqvZFk+
RzcwCAqn3eJNltDgBQtd1Nks/MAQCq9lmT5FNzAICqfdokmTUHAKjabJPkI3MAgKp91CT50BwAoGofNkk+MAcAqNoHTZL3zAEAqvZek+RdcwCAqr3bJHnbHACgam83Sd40BwCo2ptNkpm4Fh0AajWbZOar+6G/bh4AUKXXk8790JNkl3kAQJV2zQ36tHkAQJWm5wb9FfMAgCq9MjfoL5sHAFTp5blBn0nyhpkAQFXeKA3/OuhJ8pK5AEBVvm733KC/YC4AUJUX9hX058wFAKry3L6CviN2jAOAWsyWdidJlq1ed8/cf/n7JGvNCABab2uS7+1rhZ4kT5sPAFThG83eO+hPmQ8AVOGpAwV9V5KdZgQArbYze92HpdnHf/SkOQFAq32r1fsK+uPmBACt9vh8gv5MknfMCgBa6Z3S6oMGPUkeMy8AaKV9Nnp/QX/UvACglR5dSNC3x9nuANA2O0uj5x30JHnE3ACgVfbb5gMF/WFzA4BWeXgxQX8rnX1iAYDB21ravOCgJ8l/mB8AtMIBm3ywoG+Lk+MAYNB2liYvOuhJ8mtzBICBOmiL5xP0+9O5iToA0H+zpcVLDvruJPeZJwAMxH2lxUsOepL8MskeMwWAvtpTGnxQ8w36TJIt5goAfbWlNLhrQU+SX1ilA0BfV+e/mO9/vJCgv59kk/kCQF9sKu3tetCT5N/ijHcA6LXZ0tx5W2jQP0lylzkDQE/dVZrbs6Anyd2xexwA9MrO0toFaRb5m91u3gDQE4tq7GKD/ru4ExsAdNvW0ti+BT1JbjJ3AOiqRbd1KUHfmeQWsweArrglSzhHrVnib35nkhcdAwBYkhdLUxet6cKT2OA4AMBgW9qNoG9PcodjAQCLckdp6cCDniS3JXnWMQGABXm2NHTJmi4+qevi5i0AMF97Sju7optBfzXJTxwfAJiXn5R2ti7oSfJgkgccIwA4oAdKM7um6cGTvDbJ844VAOzT86WVXdX06Mn+Y9xmFQD2Nlsa2XW9CvquJD903ADgG35YGllN0JNkW5IbHDsASEoTt/Xqf970+Mnfm0Xc0xUAhszdpYk90/ThRdyY5CHHEoAR9VBpYU81fXoxk0mecEwBGDFPlAb2XNPHF/X3sT0sAKPj2dK+vuhn0HcnWRu3WwVg+L1Ymrd7GIOeJB8nuSzJDscagCG1o7Tu437+ps0AXuhMkkus1AEY0pX5JaV1fdUM6AXPJPnb+Dt1AIbHs6VtM4P4zZsBvvCPk/xNnP0OQP2eKE37eFBPoBnwAHYnuTSuUwegXg+Vlu0e5JNoWjKMydhRDoD63J0+XWdeS9CTzi469n4HoBY3pA87wNUY9KSzz+3fxa1XAWiv2dKqe9v0pJoWDmpbknPSuQE8ALTJ86VR29r2xJqWDmxXkr9K8oD3DgAt8UBp0642Prmm5cO7NsmPk+zxPgJgQPaUFl3b5ifZVDDIB5P8ZWxCA0D/PVsa9GDbn2hTyUBfTXJ+kju8twDokztKe16t4ck2lQ33tiR/HfvAA9A7L5bW3FbTk24qHPT2JGcnucV7DoAuu6U0ZnttT7ypeOh3JlmdZKv3HwBLtLU05c5aX0BT+QHYmeR7SX5Q/hkAFtqRH5SWVN2RZkgOyO/Kn6xujF3mADi42dKM1aUh1WuG7ADdneSEJP8a164D8G17SiNOyJDdFKwZwoP1SZJ/SXJ8OVjCDsCe0oTjSyM+GbYX2AzxwXs/nR+nfDedkxz8KB5g9MyWBny3NOH9YX2hzQgczJl0LkM4Jp1b3Tl5DmD47Szf+ceUBswM+wtuRujg7k7nVner07ntncvdAIbP1vIdv7p85+8elRc+NqIHfFt5nJjkqiRXJlnlcwBQ7Wr8kSQPJ3lrVIewbPW6e7wVOtYkuSLJ5emcNAFAe72T5LEkj6bCXd2s0Htre3nckM5m/JcmudjKHaBVK/Enkzye5BnjEPT5eKY8bkyyMslFSS5MckGSceMB6IvZJH9K8nSSp5LsMh
JBX4pd5fHL8uszkpybzub9ZyY52YgAuuKNJC8leSHJc0l2GImg99KO8ri3/HpFktOTnJZkoqzoT7GSBzjgyvv1sliaTvJKkpczApeWCXq7zST5Y3nMtSLJSelsL3hckmOTHJ3kqCRHluAfnuSwJMuTHJrkkCTLjBSoxJdJvkjyeTqXh32W5NMS7I+SfJjkgyTvJXk3ydtJ3hTu3vg/8VYUpRn2lqIAAAAASUVORK5CYII=


--------------------------------------------------------------------------------
/doc/pandoc.css:
--------------------------------------------------------------------------------
  1 | <style>
  2 | 
  3 | html {
  4 |   font-size: 100%;
  5 |   overflow-y: scroll;
  6 |   -webkit-text-size-adjust: 100%;
  7 |   -ms-text-size-adjust: 100%;
  8 | }
  9 | 
 10 | body {
 11 |   color: #444;
 12 |   font-family:  Palatino, 'Palatino Linotype', Georgia, 'Noto Serif', Times, 'Times New Roman', serif;
 13 |   font-size: 12pt;
 14 |   line-height: 1.7;
 15 |   padding: 1em;
 16 |   margin: auto;
 17 |   max-width: 50em;
 18 |   background: #fefefe;
 19 | }
 20 | 
 21 | a {
 22 |   color: #0645ad;
 23 |   text-decoration: none;
 24 | }
 25 | 
 26 | a:visited {
 27 |   color: #0b0080;
 28 | }
 29 | 
 30 | a:hover {
 31 |   color: #06e;
 32 | }
 33 | 
 34 | a:active {
 35 |   color: #faa700;
 36 | }
 37 | 
 38 | a:focus {
 39 |   outline: thin dotted;
 40 | }
 41 | 
 42 | *::-moz-selection {
 43 |   background: rgba(255, 255, 0, 0.3);
 44 |   color: #000;
 45 | }
 46 | 
 47 | *::selection {
 48 |   background: rgba(255, 255, 0, 0.3);
 49 |   color: #000;
 50 | }
 51 | 
 52 | a::-moz-selection {
 53 |   background: rgba(255, 255, 0, 0.3);
 54 |   color: #0645ad;
 55 | }
 56 | 
 57 | a::selection {
 58 |   background: rgba(255, 255, 0, 0.3);
 59 |   color: #0645ad;
 60 | }
 61 | 
 62 | p {
 63 |  lang: en;
 64 |   margin: 1em 0;
 65 |   -webkit-hyphens: auto;
 66 |   -moz-hyphens: auto;
 67 |    -ms-hyphens: auto;
 68 |     /*-o-hyphens: auto;*/
 69 |   hyphens: auto
 70 | }
 71 | 
 72 | img {
 73 |   max-width: 100%;
 74 | }
 75 | 
 76 | h1, h2, h3, h4, h5, h6 {
 77 |   color: #3A6072AA;
 78 |   line-height: 125%;
 79 |   margin-top: 2em;
 80 |   font-weight: normal;
 81 | }
 82 | 
 83 | h4, h5, h6 {
 84 |   font-weight: bold;
 85 | }
 86 | 
 87 | h1 {
 88 |   font-size: 2.5em;
 89 | }
 90 | 
 91 | h2 {
 92 |   font-size: 1.8em;
 93 | }
 94 | 
 95 | h3 {
 96 |   font-size: 1.5em;
 97 | }
 98 | 
 99 | h4 {
100 |   font-size: 1.2em;
101 | }
102 | 
103 | h5 {
104 |   font-size: 1em;
105 | }
106 | 
107 | h6 {
108 |   font-size: 0.9em;
109 | }
110 | 
111 | blockquote {
112 |   color: #666666;
113 |   margin: 0;
114 |   padding-left: 3em;
115 |   border-left: 0.5em #EEE solid;
116 | }
117 | 
118 | hr {
119 |   display: block;
120 |   height: 2px;
121 |   border: 0;
122 |   border-top: 1px solid #aaa;
123 |   border-bottom: 1px solid #eee;
124 |   margin: 1em 0;
125 |   padding: 0;
126 | }
127 | 
128 | pre, code, kbd, samp {
129 |   color: #000;
130 |   font-family: monospace, monospace;
131 |   _font-family: 'courier new', monospace;
132 |   font-size: 0.98em;
133 | }
134 | 
135 | pre {
136 |   white-space: pre;
137 |   white-space: pre-wrap;
138 |   word-wrap: break-word;
139 | }
140 | 
141 | b, strong {
142 |   font-weight: bold;
143 | }
144 | 
145 | dfn {
146 |   font-style: italic;
147 | }
148 | 
149 | ins {
150 |   background: #ff9;
151 |   color: #000;
152 |   text-decoration: none;
153 | }
154 | 
155 | mark {
156 |   background: #ff0;
157 |   color: #000;
158 |   font-style: italic;
159 |   font-weight: bold;
160 | }
161 | 
162 | sub, sup {
163 |   font-size: 75%;
164 |   line-height: 0;
165 |   position: relative;
166 |   vertical-align: baseline;
167 | }
168 | 
169 | sup {
170 |   top: -0.5em;
171 | }
172 | 
173 | sub {
174 |   bottom: -0.25em;
175 | }
176 | 
177 | ul, ol {
178 |   margin: 1em 0;
179 |   padding: 0 0 0 2em;
180 | }
181 | 
182 | li p:last-child {
183 |   margin-bottom: 0;
184 | }
185 | 
186 | ul ul, ol ol {
187 |   margin: .3em 0;
188 | }
189 | 
190 | dl {
191 |   margin-bottom: 1em;
192 | }
193 | 
194 | dt {
195 |   font-weight: bold;
196 |   margin-bottom: .8em;
197 | }
198 | 
199 | dd {
200 |   margin: 0 0 .8em 2em;
201 | }
202 | 
203 | dd:last-child {
204 |   margin-bottom: 0;
205 | }
206 | 
207 | img {
208 |   border: 0;
209 |   -ms-interpolation-mode: bicubic;
210 |   vertical-align: middle;
211 | }
212 | 
213 | figure {
214 |   display: block;
215 |   text-align: center;
216 |   margin: 1em 0;
217 | }
218 | 
219 | figure img {
220 |   border: none;
221 |   margin: 0 auto;
222 | }
223 | 
224 | figcaption {
225 |   font-size: 0.8em;
226 |   font-style: italic;
227 |   margin: 0 0 .8em;
228 | }
229 | 
230 | table {
231 |   margin-bottom: 2em;
232 |   border-bottom: 1px solid #ddd;
233 |   border-right: 1px solid #ddd;
234 |   border-spacing: 0;
235 |   border-collapse: collapse;
236 | }
237 | 
238 | table th {
239 |   padding: .2em 1em;
240 |   background-color: #eee;
241 |   border-top: 1px solid #ddd;
242 |   border-left: 1px solid #ddd;
243 | }
244 | 
245 | table td {
246 |   padding: .2em 1em;
247 |   border-top: 1px solid #ddd;
248 |   border-left: 1px solid #ddd;
249 |   vertical-align: top;
250 | }
251 | 
252 | .author {
253 |   font-size: 1.2em;
254 |   text-align: center;
255 | }
256 | 
257 | @media only screen and (min-width: 480px) {
258 |   body {
259 |     font-size: 14px;
260 |   }
261 | }
262 | @media only screen and (min-width: 768px) {
263 |   body {
264 |     font-size: 16px;
265 |   }
266 | }
267 | @media print {
268 |   * {
269 |     background: transparent !important;
270 |     color: black !important;
271 |     filter: none !important;
272 |     -ms-filter: none !important;
273 |   }
274 | 
275 |   body {
276 |     font-size: 12pt;
277 |     max-width: 100%;
278 |   }
279 | 
280 |   a, a:visited {
281 |     text-decoration: underline;
282 |   }
283 | 
284 |   hr {
285 |     height: 1px;
286 |     border: 0;
287 |     border-bottom: 1px solid black;
288 |   }
289 | 
290 |   a[href]:after {
291 |     content: " (" attr(href) ")";
292 |   }
293 | 
294 |   abbr[title]:after {
295 |     content: " (" attr(title) ")";
296 |   }
297 | 
298 |   .ir a:after, a[href^="javascript:"]:after, a[href^="#"]:after {
299 |     content: "";
300 |   }
301 | 
302 |   pre, blockquote {
303 |     border: 1px solid #999;
304 |     padding-right: 1em;
305 |     page-break-inside: avoid;
306 |   }
307 | 
308 |   tr, img {
309 |     page-break-inside: avoid;
310 |   }
311 | 
312 |   img {
313 |     max-width: 100% !important;
314 |   }
315 | 
316 |   @page :left {
317 |     margin: 15mm 20mm 15mm 10mm;
318 | }
319 | 
320 |   @page :right {
321 |     margin: 15mm 10mm 15mm 20mm;
322 | }
323 | 
324 |   p, h2, h3 {
325 |     orphans: 3;
326 |     widows: 3;
327 |   }
328 | 
329 |   h2, h3 {
330 |     page-break-after: avoid;
331 |   }
332 | }
333 | 
334 | </style>


--------------------------------------------------------------------------------
/src/Distribution.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef DISTRIBUTION_HPP
  2 | #define DISTRIBUTION_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | #include "Tags.hpp"
  6 | 
  7 | 
  8 | #include <random>
  9 | using std::default_random_engine;
 10 | using std::mt19937;
 11 | using std::normal_distribution;
 12 | using std::gamma_distribution;
 13 | using std::discrete_distribution;
 14 | 
 15 | typedef mt19937 rng_t;
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | template <typename DistType>
 22 | class Distribution {
 23 | 
 24 | 		// Generator used for every draw; held by reference, never copied.
 25 | 		rng_t& mRNG;
 26 | 
 27 | 	public:
 28 | 
 29 | 		// Bind this sampler to the generator passed in.
 30 | 		Distribution( rng_t& RNG ) : mRNG( RNG ) {}
 31 | 
 32 | 
 33 | 		// Overwrite obs with a fresh draw parameterized by param. Only declared
 34 | 		// here; the supported DistType/ParamType pairs are specialized below.
 35 | 		template < typename ParamType>
 36 | 		void resample( Observation<DistType>& obs, const Observation<ParamType>& param );
 37 | 
 38 | 
 39 | 		// Draw into a default-constructed observation and return it by value.
 40 | 		template < typename ParamType>
 41 | 		Observation<DistType> sample( const Observation<ParamType>& param ) {
 42 | 			Observation<DistType> drawn;
 43 | 			resample( drawn, param );
 44 | 			return drawn;
 45 | 		}
 46 | 
 47 | };
 48 | 
 49 | 
 50 | 
 51 | 
 52 | 
 53 | ////////////////////////////////////////////////// TEMPLATE SPECIALIZATIONS //////////////////////////////////////////////////
 54 | #include "SufficientStatistics.hpp"	// required implementation
 55 | #include "Observation.hpp"	// required implementation
 56 | 
 57 | 
 58 | 
 59 | //////////////////// NORMAL ////////////////////
 60 | 
 61 | // Draws obs from a normal distribution with param.mean() and param.stdev().
 62 | // `inline`: a full specialization defined in a header is otherwise emitted once per including file.
 63 | template<>	template<>
 64 | inline void Distribution<Normal>::resample(
 65 |     Observation<Normal>& obs,
 66 |     const Observation<NormalParam>& param ) {
 67 | 	normal_distribution<real_t> dist( param.mean(),  param.stdev() );
 68 | 	obs.setValue( dist( mRNG ) );
 69 | }
 70 | 
 71 | 
 72 | 
 73 | //////////////////// NORMAL INVERSE GAMMA ////////////////////
 74 | 
 75 | 
 76 | // Draws (mean, var): var is the reciprocal of a Gamma( alpha, scale 1/beta )
 77 | // variate, then mean is drawn from a normal with mean mu0 and stdev sqrt( var/nu ).
 78 | // `inline` keeps this header-defined full specialization usable from several translation units.
 79 | template<> template<>
 80 | inline void Distribution<NormalInverseGamma>::resample(
 81 |     Observation<NormalInverseGamma>& obs,
 82 |     const Observation<NormalInverseGammaParam>& param ) {
 83 | 	gamma_distribution<real_t> gamma( param.alpha(), 1.0 / param.beta() );
 84 | 	const real_t var = 1.0 / gamma( mRNG );	// variance first: the mean's spread depends on it
 85 | 	normal_distribution<real_t> normal( param.mu0(),  sqrt( var / param.nu() ) );
 86 | 	obs.setValue( normal( mRNG ), var );
 87 | }
 88 | 
 89 | 
 90 | 
 91 | //////////////////// BETA ////////////////////
 92 | 
 93 | // Sample Beta distribution using two independent Gamma RV:
 94 | // a ~ Gamma( alpha, 1 ), b ~ Gamma( beta, 1 ), result a / ( a + b ).
 95 | // Declared inline because a full specialization defined in a header would
 96 | // otherwise yield one definition per translation unit that includes it.
 97 | template<>	template<>
 98 | inline void Distribution<Beta>::resample(
 99 |     Observation<Beta>& obs,
100 |     const Observation<BetaParam>& param ) {
101 | 
102 | 	gamma_distribution<real_t> distA( param.alpha(),  1 );
103 | 	gamma_distribution<real_t> distB( param.beta(),  1 );
104 | 	const real_t a = distA( mRNG );	// a is drawn before b, as before
105 | 	const real_t b = distB( mRNG );
106 | 	obs.setValue( a / ( a + b ) );
107 | }
108 | 
109 | 
110 | 
111 | 
112 | //////////////////// DIRICHLET ////////////////////
113 | 
114 | 
115 | // Sample Dirichlet distribution from normalized vector of Gamma RVs:
116 | // probs[i] = g_i / sum_j g_j, with g_i drawn from Gamma( alphas[i], 1 ).
117 | //
118 | // probs   output, overwritten in place; must already have alphas.size() entries
119 | // alphas  Gamma shape parameters, one per entry of probs
120 | // RNG     generator the Gamma variates are drawn from
121 | //
122 | // Throws runtime_error if the two vectors differ in size.
123 | // Declared inline: a non-inline function defined in a header violates the
124 | // one-definition rule as soon as two translation units include it.
125 | inline void dirichlet_sample(
126 |     vector<real_t>& probs,
127 |     const vector<real_t>& alphas,
128 |     rng_t& RNG ) {
129 | 	if ( probs.size() != alphas.size() ) {
130 | 		throw runtime_error( "Number of parameters must match the domain size of the Dirichlet RV!" );
131 | 	}
132 | 	real_t sum = 0;
133 | 	for ( size_t d = 0; d < alphas.size(); ++d ) {
134 | 		gamma_distribution<real_t> dist( alphas[d], 1.0 );
135 | 		probs[d] = dist( RNG );
136 | 		sum += probs[d];
137 | 	}
138 | 	for ( auto & p : probs ) { p /= sum; }	// normalize so the entries sum to one
139 | }
140 | 
141 | 
142 | // Redraws obs.probs() from the Dirichlet distribution with parameters param.alphas().
143 | // Throws runtime_error when obs and param have different domain sizes.
144 | // `inline`: header-defined full specialization, otherwise defined once per including file.
145 | template<> template<>
146 | inline void Distribution<Dirichlet>::resample(
147 |     Observation<Dirichlet>& obs,
148 |     const Observation<DirichletParam>& param ) {
149 | 
150 | 	if ( obs.domainSize() != param.domainSize() ) {
151 | 		throw runtime_error( "Domain sizes of Dirichlet random variable (" + to_string( obs.domainSize() ) + ") and the parameters requested for sampling (" + to_string( param.domainSize() ) + ") do not match!" );
152 | 	}
153 | 	// Dirichlet can be sampled using gamma(alpha_i, 1) distributions, with subsequent normalization
154 | 	dirichlet_sample( obs.probs(), param.alphas(), mRNG );
155 | }
156 | 
157 | 
158 | 
159 | //////////////////// DIRICHLET VECTOR ////////////////////
160 | 
161 | // Redraws every dimension of obs from its own Dirichlet, after checking that obs and param agree in shape.
162 | // `inline`: header-defined full specialization, otherwise defined once per including file.
163 | template<> template<>
164 | inline void Distribution<DirichletVector>::resample(
165 |     Observation<DirichletVector>& obs,
166 |     const Observation<DirichletParamVector>& param ) {
167 | 	// NOTE this class is declared friend of Observation<Dirichlet> to set obs.mValues directly
168 | 	if ( obs.nrDim() != param.nrDim() ) {
169 | 		throw runtime_error( "Dimensions of Dirichlet random variable (" + to_string( obs.nrDim() ) + ") and the parameters requested for sampling (" + to_string( param.nrDim() ) + ") do not match!" );
170 | 	}
171 | 
172 | 	for ( size_t d = 0; d < obs.nrDim(); ++d ) {
173 | 		if ( obs[d].domainSize() != param[d].domainSize() ) {
174 | 			throw runtime_error( "Domain sizes of Dirichlet random variable (" + to_string( obs[d].domainSize() ) + ") and the parameters requested for sampling (" + to_string( param[d].domainSize() ) + ") do not match!" );
175 | 		}
176 | 		dirichlet_sample( obs[d].probs(), param[d].alphas(), mRNG );
177 | 	}
178 | }
179 | 
180 | 
181 | 
182 | // TODO this interface differs from the others for a reason. Make this more elegant and consistent somehow.
183 | 
184 | template <>
185 | class Distribution<Categorical> {
186 | 
187 | 		rng_t& mRNG;	// generator used for every draw; held by reference
188 | 
189 | 	public:
190 | 		Distribution( rng_t& RNG ) : mRNG( RNG ) {}
191 | 
192 | 
193 | 		// Draw a category using probs[begin .. end) as weights.
194 | 		// The returned index is relative to begin, not to the start of probs.
195 | 		// Throws runtime_error if [begin, end) is not a valid range within probs.
196 | 		int sample(
197 | 		    const vector<real_t>& probs,
198 | 		    const size_t begin,
199 | 		    const size_t end ) {
200 | 			if ( begin > end || end > probs.size() ) {	// iterators past the vector would be undefined behaviour
201 | 				throw runtime_error( "Invalid range [" + to_string( begin ) + ", " + to_string( end ) + ") for categorical weights of size " + to_string( probs.size() ) + "!" );
202 | 			}
203 | 			discrete_distribution<int> dist( probs.begin() + begin, probs.begin() + end );
204 | 			return dist( mRNG );
205 | 		}
205 | };
206 | 
207 | 
208 | 
209 | #endif
210 | 
211 | 
212 | 
213 | 
214 | 
215 | 
216 | 
217 | 
218 | 
219 | 
220 | 
221 | 
222 | 
223 | 
224 | 
225 | 


--------------------------------------------------------------------------------
/lib/gzstream/index.html:
--------------------------------------------------------------------------------
  1 | <html> <head>  
  2 | <title>Gzstream Library Home Page</title>
  3 | </head>  
  4 | <body BGCOLOR="#FAF8E8" TEXT="#000000">
  5 | 
  6 | <h1>Gzstream Library Home Page</h1>
  7 | 
  8 | <hr>
  9 |     <TABLE><TR><TD ALIGN=LEFT VALIGN=TOP>
 10 |         <img border=0 src="logo.gif" align=center>
 11 |     </TD><TD ALIGN=LEFT VALIGN=TOP NOWRAP>
 12 | 	<ul>
 13 | 	<li><a href="#intro"> Introduction</a>
 14 |         <li><a href="#sys">   Supported Systems</a>
 15 | 	<li><a href="#inst">  Installation</a>
 16 | 	<li><a href="#doc">   Documentation</a>
 17 | 	<li><a href="#miss">  What's Missing</a>
 18 | 	<li><a href="#src">   Download</a>
 19 | 	<li><a href="#links"> Links</a><P>
 20 | 	</ul>
 21 |     </TD></TR></TABLE>
 22 | 
 23 | 
 24 | <hr><!-------------------------------------------------------------------->
 25 | <a name="intro"><h2> Introduction </h2></a>
 26 | 
 27 | <i>Gzstream</i> is a small C++ library, basically just a wrapper,
 28 | that provides the functionality of the 
 29 | <a href="http://www.gzip.org/zlib/">zlib C-library</a> in a C++ iostream.
 30 | It is freely available under the <a href="COPYING.LIB">LGPL license</a>.<P>
 31 | 
 32 | Gzstream has been written by 
 33 | <a href="http://www.cs.unc.edu/~debug/">Deepak Bandyopadhyay</a> and
 34 | <a href="http://www.cs.unc.edu/~kettner/">Lutz Kettner</a> at 
 35 | the <a href="http://www.cs.unc.edu/Research/compgeom/">Computational 
 36 | Geometry Group at UNC Chapel Hill</a>.<P>
 37 | 
 38 | 
 39 | <hr><!-------------------------------------------------------------------->
 40 | <a name="sys"><h2> Supported Systems </h2></a>
 41 | 
 42 | Gzstream requires a standard compliant C++ compiler (we use the new
 43 | header file conventions and the new iostream in the std:: name space)
 44 | and, of course, zlib. We used zlib 1.1.3 so far, but see the <a
 45 | href="http://www.gzip.org/zlib/">zlib home page</a> for why you should
 46 | upgrade to zlib 1.1.4. So, in theory, the provided sources could run
 47 | on many platforms. However, we used only the following few
 48 | platforms.<P>
 49 | <P>
 50 | 
 51 | <ul>
 52 |   <li> PC Linux, RedHat 6.1, g++ version 2.95.2
 53 |   <li> PC Linux, Debian, g++ version 2.95.2 and 3.1
 54 |   <li> SGI Irix 6.5, MIPSpro CC version 7.30
 55 | </ul><P>
 56 | 
 57 | 
 58 | <hr><!-------------------------------------------------------------------->
 59 | <a name="inst"><h2> Installation </h2></a>
 60 | 
 61 | Either compile <tt>gzstream.C</tt> by hand, place it in some library,
 62 | and move <tt>gzstream.h</tt> into the include search path of your
 63 | compiler. Or use the provided <tt>Makefile</tt>, adapt its 
 64 | variables, and follow the remarks in the <tt>Makefile</tt>. Two 
 65 | test programs are provided, <tt>test_gzip.C</tt> and <tt>test_gunzip.C</tt>.
 66 | The <tt>Makefile</tt> contains a rule that performs a small test
 67 | with these programs.<P>
 68 | 
 69 | 
 70 | <hr><!-------------------------------------------------------------------->
 71 | <a name="doc"><h2> Documentation </h2></a>
 72 | 
 73 | The library provides two classes, <tt>igzstream</tt> and <tt>ogzstream</tt>,
 74 | that can be used analogously to <tt>ifstream</tt> and <tt>ofstream</tt>
 75 | respectively.<P>
 76 | 
 77 | The classes are by default in the global name space. This can 
 78 | be changed by setting the macro <tt>GZSTREAM_NAMESPACE</tt> to
 79 | the desired name space, e.g., by setting the option 
 80 | <tt>-DGZSTREAM_NAMESPACE=gz</tt> in the <tt>Makefile</tt>. 
 81 | However, this setting needs to be the same for both the library compilation
 82 | and the application that uses the library.<P>
 83 | 
 84 | 
 85 | <hr><!-------------------------------------------------------------------->
 86 | <a name="miss"><h2> What's Missing </h2></a>
 87 | 
 88 | <ul>
 89 |   <li> Seek. The zlib library provides the necessary functionality,
 90 |        but we have not realized that in the wrapper (yet? ;-).
 91 |   <li> Both streams are based on the same streambuffer. So, they 
 92 |        cannot be used to derive an iogzstream class that would allow
 93 |        simultaneous reading and writing to the same file.
 94 | </ul><P>
 95 | 
 96 | 
 97 | <hr><!-------------------------------------------------------------------->
 98 | <a name="src"><h2> Download and Release Notes</h2></a>
 99 | 
100 | <ul>
101 |   <li> Gzstream library 1.5 (08 Apr 2003):
102 |          <a href="gzstream.tgz">gzstream.tgz</a><br>
103 |        Fixed bug that did not set the state correctly on failure to open or 
104 |        close a file. <br>
105 |        Fixed bug in the indexing of the write buffer that
106 |        caused the write buffer to shrink continously and finally caused 
107 |        wrong results when writing compressed files (only observed on some
108 |        platforms). <P>
109 |   <li> Gzstream library 1.4 (27 Apr 2002):<br>
110 |        Fixed a bug that stopped stream output after calling <tt>flush()</tt>
111 |        or using <tt>std::endl</tt>.<P>
112 |   <li> Gzstream library 1.3 (06 Nov 2001):<br>
113 |        Fixed unsigned char -- signed char bug. Increased buffer size
114 |        for better performance.<P>
115 |   <li> Gzstream library 1.2 (04 Oct 2001):<br>
116 |        Initial release as gzstream, renamed from zipstream.<P>
117 |   <li> Zipstream library 1.1 (09 Sep 2001):<br>
118 |        Initial release.
119 | </ul>
120 | 
121 | <hr><!-------------------------------------------------------------------->
122 | <a name="ack"><h2> Acknowledgements </h2></a>
123 | 
124 | Credits for finding bugs and improving this software go to:
125 | Vincent Ricard, Peter Milley, Peter J. Torelli, and Ares Lagae.
126 | <P>
127 | 
128 | <hr><!-------------------------------------------------------------------->
129 | <a name="links"><h2> Links </h2></a>
130 | 
131 | <ul>
132 |     <li><a href="http://www.gzip.org/zlib/">zlib C-library</a>
133 |     <li><a href="http://www.cs.unc.edu/~debug/">Deepak Bandyopadhyay</a>
134 |     <li><a href="http://www.cs.unc.edu/~kettner/">Lutz Kettner</a>
135 |     <li><a href="http://www.cs.unc.edu/Research/compgeom/">
136 |             The Computational Geometry Group at UNC Chapel Hill</a>
137 | </ul>
138 | 
139 | <hr><!-------------------------------------------------------------------->
140 | <address> 
141 |     The Computational Geometry Group at UNC Chapel Hill, Jan. 08, 2003. 
142 | </address>
143 | </body>  </html>
144 | <!-------------------------------------------------------------------->
145 | <!EOF>
146 | 


--------------------------------------------------------------------------------
/src/tools/combineCounts.cpp:
--------------------------------------------------------------------------------
  1 | // Combine counts from different files, by adding, subtracting.
  2 | 
  3 | #include "MappedValues.hpp"
  4 | 
  5 | #include <iostream>
  6 | using std::cout;
  7 | using std::endl;
  8 | using std::flush;
  9 | 
 10 | #include <algorithm>
 11 | using std::sort;
 12 | 
 13 | #include "../Parser.hpp"
 14 | 
 15 | #include <string>
 16 | using std::string;
 17 | 
 18 | #include <fstream>
 19 | using std::ifstream;
 20 | using std::ofstream;
 21 | using std::getline;
 22 | 
 23 | #include <sstream>
 24 | using std::stringstream;
 25 | 
 26 | #include <unordered_map>
 27 | using std::unordered_map;
 28 | 
 29 | #include "gzstream.h"
 30 | 
 31 | int main( int argc, const char* argv[] ) {
 32 | 
 33 | 	Parser args( argc, argv );
 34 | 
 35 | 	// OPTIONS
 36 | 	// file containing
 37 | 	args.registerFlags( {"-i", "-input-prefices"}, "" );	// prefix list for files that should be counted positive
 38 | 	args.registerFlags( {"-p", "-pos-suffix"}, "-pos.csv.gz" );
 39 | 	args.registerFlags( {"-c", "-count-suffix"}, "-count.csv.gz" );
 40 | 	args.registerFlags( {"-s", "-size-suffix"}, "-size.csv" );
 41 | 
 42 | 	args.registerFlags( {"-n", "-normalization-prefix"}, "mappability" );
 43 | 	
 44 | 	args.registerFlags( {"-o", "-out-prefix"} );
 45 | 
 46 | 
 47 | 	args.registerFlags( {"-h", "--help", "-help"}, "" );
 48 | 	args.parseArgs();
 49 | 
 50 | 	const string posSuffix = args.parse<string>( "-pos-suffix", 0 );
 51 | 	const string countSuffix = args.parse<string>( "-count-suffix", 0 );
 52 | 	const string sizeSuffix = args.parse<string>( "-size-suffix", 0 );
 53 | 	const string outPrefix = args.parse<string>( "-out-prefix" );
 54 | 	vector<string> prefices = args.parseVector<string>( "-input-prefices" );
 55 | 
 56 | 	if ( args.isSet( "-h" ) ) {
 57 | 		cout << "Takes lists of file prefices (-i) and adds their counts (use + and - before lists of files), and adds them. Shared suffices for files can be set using -p, -c, and -s, for position, count and size. The output prefix is set using -o. If -n is provided, its argument is used as a prefix for normalization, i.e. counts are multiplied if a position exists (e.g. for mappability correction)." << endl;
 58 | 		return 0;
 59 | 	}
 60 | 
 61 | 
 62 | 	unordered_map<string, vector<MappedValueEntry<long int>> > refseqToData;
 63 | 	vector<string> observedRefSeqs;	// refseqs should be stored in the order observed in files, not by string comparison in an ordered map. Also, unordered_map is faster.
 64 | 	vector<MappedValueEntry<long int>> currentData;
 65 | 
 66 | 
 67 | 	// TODO implement with GenomeGetter
 68 | 	
 69 | 	long int sign = 1;
 70 | 	if ( prefices[0] != "+" && prefices[0] != "-" ) {
 71 | 		throw runtime_error( "First token of -i must be + or -!" );
 72 | 	}
 73 | 	for ( const auto & prefix : prefices )	{
 74 | 		if ( prefix == "+" ) {
 75 | 			sign = 1;
 76 | 			continue;
 77 | 		}
 78 | 		if ( prefix == "-" ) {
 79 | 			sign = -1;
 80 | 			continue;
 81 | 		}
 82 | 
 83 | 		if ( sign > 0 ) {
 84 | 			cout << "Adding";
 85 | 		} else {
 86 | 			cout << "Subtracting";
 87 | 		}
 88 | 		cout  << " counts for " << prefix << "*" << endl << flush;
 89 | 
 90 | 		const string sizefilename = prefix + sizeSuffix;
 91 | 		const string posfilename = prefix + posSuffix;
 92 | 		const string countfilename = prefix + countSuffix;
 93 | 
 94 | 		ifstream sizeFile( sizefilename );
 95 | 		igzstream posFile( posfilename.c_str() );
 96 | 		igzstream countFile( countfilename.c_str() );
 97 | 
 98 | 
 99 | 
100 | 		// string to hold line read from file
101 | 		string line;
102 | 
103 | 		// variable to stream unised parts of string to
104 | 		string dump;
105 | 
106 | 		// variable to hold current refseq
107 | 		string refseq;
108 | 
109 | 
110 | 		// variable for number of entries in a refseq
111 | 		size_t nrEntries;
112 | 
113 | 		// variable to hold current genome position
114 | 		size_t pos;
115 | 
116 | 		// variable to hold current count
117 | 		long int count;
118 | 
119 | 		if ( !sizeFile.good() ) {
120 | 			throw runtime_error( "Cannot open " + sizefilename + "!" );
121 | 		}
122 | 		while ( getline( sizeFile, line ) ) {
123 | 
124 | 			// string stream for line
125 | 			stringstream ss( line );
126 | 
127 | 			// parse line in sizeFile
128 | 			ss >> refseq >> nrEntries >> dump;
129 | 
130 | // 			cout << "\t" << refseq << endl;
131 | 
132 | 			// check whether refseq has been observed before
133 | 			if ( refseqToData.find( refseq ) == refseqToData.end() ) {
134 | 
135 | 				// insert new refseq into records
136 | 				observedRefSeqs.push_back( refseq );
137 | 
138 | 				// add empty vector to new refseq
139 | 				refseqToData.insert( {refseq, {}} );
140 | 			}
141 | 
142 | 
143 | 			// swap out data for refseq to currentData to be processed
144 | 			refseqToData[refseq].swap( currentData );
145 | 
146 | 			currentData.reserve( currentData.size() + nrEntries );
147 | 
148 | 			for ( size_t i = 0; i < nrEntries; ++i ) {
149 | 
150 | 				// read position
151 | 				getline( posFile, line );
152 | 				pos = atoi( line.c_str() );
153 | 
154 | 				// read count
155 | 				// TODO negative numbers
156 | 				getline( countFile, line );
157 | 				count = sign * atoi( line.c_str() );
158 | 
159 | 				currentData.push_back( MappedValueEntry<long int>( pos, count ) );
160 | 			}
161 | 
162 | 			// compress currentData
163 | 			sortAddAndCompress( currentData );
164 | 
165 | 			// put the current data into the refseq that ends here, currentData is now an empty vector
166 | 			refseqToData[refseq].swap( currentData );
167 | 
168 | 		}
169 | 		sizeFile.close();
170 | 		posFile.close();
171 | 		countFile.close();
172 | 
173 | 	}
174 | 
175 | 
176 | 	// normalize counts if necessary
177 | 	if (args.isSet("-n")){
178 | 		throw runtime_error("-n not implemented yet!");
179 | 	}
180 | 
181 | 
182 | 	// output
183 | 	cout << "Writing output to " << outPrefix << "*" << endl << flush;
184 | 	ofstream sizeFile( outPrefix + sizeSuffix );
185 | 	ogzstream posFile( (outPrefix + posSuffix).c_str() );
186 | 	ogzstream countFile( (outPrefix + countSuffix).c_str() );
187 | 	size_t totalSize = 0;
188 | 	size_t size = 0;
189 | 	for ( const string & refseq : observedRefSeqs ) {
190 | 		size = refseqToData[refseq].size();
191 | 		totalSize += size;
192 | 		sizeFile << refseq << "\t" << size << "\t" << totalSize << endl;
193 | 		for ( const auto & entry : refseqToData[refseq] ) {
194 | 			posFile << entry.pos << endl;
195 | 			countFile << entry.entry << endl;
196 | 		}
197 | 	}
198 | 	sizeFile.close();
199 | 	posFile.close();
200 | 	countFile.close();
201 | }
202 | 
203 | 
204 | 
205 | 
206 | 


--------------------------------------------------------------------------------
/bin/pyhammlet/RLE.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding:utf-8 -*-
  3 | 
  4 | import numpy as np
  5 | import copy
  6 | from scipy import stats
  7 | from bisect import bisect_right, bisect_left
  8 | from itertools import groupby
  9 | 
 10 | def find_gt(a, x):
 11 | 	'Find leftmost index if item greater than x'
 12 | 	i = bisect_right(a, x)
 13 | 	if i != len(a):
 14 | 		return i
 15 | 	raise ValueError
 16 | 
 17 | 
 18 | def find_ge(a, x):
 19 |     'Find leftmost index of item greater than or equal to x'
 20 |     i = bisect_left(a, x)
 21 |     if i != len(a):
 22 |         return i
 23 |     raise ValueError
 24 | 
 25 | 
 26 | # "subtractive difference" as inverse of "cumulative sum"
 27 | def subdiff(ends):
 28 | 	result = copy.deepcopy(ends)
 29 | 	result[1:] -= ends[:-1]
 30 | 	return result
 31 | 
 32 | 
 33 | 
 34 | 
 35 | 
 36 | 
 37 | 
 38 | # A NumPy array, with a run-length encoding of its first axis.
 39 | class RunLengthArray:
 40 | 
 41 | 	# self.ends
 42 | 	# self.array
 43 | 	
 44 | 	def __init__(self, *args, **kwargs):
 45 | 		if kwargs.has_key("array"):
 46 | 			self.array = np.array(kwargs["array"])
 47 | 			if kwargs.has_key("sizes"):
 48 | 				self.ends = np.array(kwargs["sizes"], dtype=int)
 49 | 			else:
 50 | 				self.ends = np.ones(self.array.shape[0], dtype=int)
 51 | 			self.ends = self.ends.cumsum();	# store the first position after the current block, for log-query
 52 | 			self.size = self.ends[-1]
 53 | 			
 54 | 			assert len(self.ends) == len(self.array)
 55 | 
 56 | 			assert len(self.ends) == self.array.shape[0], "Length of runlength array (%d) does not match the size of the data (%s)!" % (len(self.ends),  str(self.array.shape))
 57 | 			
 58 | 		else:
 59 | 			self.size=0
 60 | 			self.array=None
 61 | 			self.ends=None
 62 | 		
 63 | 		
 64 | 	def loadtxt(self, filename, dtype=int):
 65 | 		array = np.loadtxt(filename, dtype=dtype)	
 66 | 		self.__init__( sizes=array[:,0], array=array[:,1:])
 67 | 
 68 | 
 69 | 	# TODO does this work for multivariate data?
 70 | 	def compress(self):
 71 | 		#"""Compress the data as much as possible."""
 72 | 		
 73 | 		size = self.ends[-1]
 74 | 		rle = np.array([(x[0],  sum(1 for _ in x[1])) for x in groupby(self.array)])
 75 | 		self.ends = self.ends[rle[:,-1].cumsum()-1]
 76 | 		self.array=rle[:,0]
 77 | 		del rle
 78 | 		assert self.ends[-1]==size
 79 | 		print self.array
 80 | 		
 81 | 		#size = self.ends[-1]
 82 | 		#L = 0	# index for insertion
 83 | 		#for R in xrange(1,len(self.ends)):
 84 | 			#if not np.array_equal(self.array[L], self.array[R]):
 85 | 				#L += 1
 86 | 				#if L != R:
 87 | 					#self.array[L] = self.array[R]
 88 | 			#self.ends[L] = self.ends[R]
 89 | 		#L+=1
 90 | 		#self.ends = self.ends[0:L]
 91 | 		#self.array = self.array[0:L]
 92 | 		#assert len(self.ends)==len(self.array)
 93 | 		#assert self.ends[-1]==size
 94 | 
 95 | 	# decompressed shape
 96 | 	def shape(self):
 97 | 		result = list(self.array.shape)
 98 | 		result[0] = self.ends[-1]
 99 | 		return tuple(result)
100 | 		
101 | 		
102 | 	def __str__(self):
103 | 		result = "%s %s\n" % (self.ends[0].__str__(), self.array[0].__str__())
104 | 		for b in xrange(1, len(self.ends)):
105 | 			result += "%s %s\n" % ((self.ends[b]-self.ends[b-1]).__str__(), self.array[b].__str__())
106 | 		return result
107 | 
108 | 		
109 | 	# Returns an RLE array of indices of maxima
110 | 	def mode(self):
111 | 		res = stats.mode(self.array, axis=1)[0].astype(self.array.dtype).reshape(self.array.shape[0])
112 | 		return RunLengthArray(sizes=subdiff(self.ends), array=res)
113 | 
114 | 	# Returns an RLE array of argmax
115 | 	def argmax(self):
116 | 		res = list(np.argmax(self.array, axis=1).astype(self.array.dtype))#.reshape(self.array.shape[0])
117 | 		return RunLengthArray(sizes=subdiff(self.ends), array=res)
118 | 
119 | 
120 | 	def __getitem__(self, key):
121 | 		return self.array[find_gt(self.ends, key)]
122 | 		
123 | 	
124 | 	# Returns a RunLengthArray object corresponding to the array in the slice  [start:end] (in uncompressed coordinates)
125 | 	def __getslice__(self, start = None, end = None, step = None):
126 | 		if step is not None and step != 1:
127 | 			assert False, "RunLengthArray does not support steps in slices!"
128 | 		if end is None:
129 | 			end = start+self.size
130 | 		assert start >=0
131 | 		assert end <= self.size, "End index %d > size of compressed array %d!" % (end, self.size)
132 | 		
133 | 		L = find_gt(self.ends, start)	# get left index
134 | 		R = find_ge(self.ends, end)	# get right index
135 | 		R+=1
136 | 		segmentSizes = copy.deepcopy(self.ends[L:R])
137 | 		# transform right boundaries to block sizes
138 | 		segmentSizes[-1]=end	# set right boundary directly
139 | 		i = len(segmentSizes)-1
140 | 		while i>0:
141 | 			segmentSizes[i] -= segmentSizes[i-1]
142 | 			i-=1
143 | 		segmentSizes[0] -= start	# adjust first blocksize
144 | 		
145 | 		assert segmentSizes.sum()==end-start, "Sum of segment sizes (%d) does not match requested range (%d)!" %(segmentSizes.sum(), end-start)
146 | 		return RunLengthArray(sizes=segmentSizes, array=self.array[L:R])
147 | 		
148 | 	
149 | 	def getSegment(self, index):
150 | 		assert index < len(self.array)
151 | 		if index==0:
152 | 			return (self.ends[index], self.array[index])
153 | 		else:
154 | 			return (self.ends[index]-self.ends[index-1], self.array[index])
155 | 		
156 | 		
157 | 	# Returns an uncompressed array
158 | 	def decompress(self, start=None, end=None):
159 | 		if start is None and end is None:
160 | 			return np.repeat(self.array, subdiff(self.ends), axis=0)
161 | 		else:
162 | 			# TODO more efficiently without creating new object
163 | 			return self[start:end].decompress()
164 | 	
165 | 	def values(self):
166 | 		return self.array
167 | 	
168 | 	def blocksizes(self):
169 | 		return subdiff(self.ends)
170 | 	
171 | 	def nrSegments(self):
172 | 		return len(self.ends)
173 | 	
174 | 	def __len__(self):
175 | 		return self.ends[-1]
176 | 	
177 | 	
178 | 	
179 | 	
def shatter(A, B):
	"""Takes two RLE and joins their break points so that they share the same block structure in-place.

	Afterwards A.ends and B.ends both hold the sorted union of the original
	break points, and A.array / B.array hold the value of the original block
	that covers each of the new blocks. Both arguments must have the same size.
	"""
	assert A.size == B.size, "Cannot shatter RLE of different sizes: %d, %d." % (A.size, B.size)
	ends = np.array(sorted(set(A.ends).union(B.ends)), dtype=int)
	# for every merged end, the covering original block is the first one ending at or after it
	idxA = np.searchsorted(A.ends, ends, side="left")
	idxB = np.searchsorted(B.ends, ends, side="left")
	A.array = np.asarray(A.array)[idxA]
	B.array = np.asarray(B.array)[idxB]
	A.ends = ends
	B.ends = ends.copy()	# separate buffers, so changing one RLE cannot affect the other
203 | 	
204 | 
205 | 
206 | 


--------------------------------------------------------------------------------
/src/StateSequence/ForwardBackward.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef STATESEQUENCEFORWARDBACKWARD_HPP
  2 | #define STATESEQUENCEFORWARDBACKWARD_HPP
  3 | 
  4 | 
  5 | 
// Forward-backward Gibbs step: samples a new state sequence for the blocks of y and
// feeds the resulting counts and sufficient statistics to the hyperparameters.
//
// The function proceeds in three passes:
//   1. forward filtering  - one normalized vector of forward variables per block is
//                           appended to mTrellis (its first entry is pi.valueVector()),
//   2. backward sampling  - mStates is filled from the last block to the first,
//   3. posterior recording - y is iterated a second time to collect transition counts,
//                           state counts and per-parameter sufficient statistics, which
//                           are passed to tau_A, tau_pi and tau_theta via addObservation().
//
// y                   emissions; iterated with initForward()/next(), hence non-const
// theta               emission parameters (logNormalizer(), value(), mapping() are read)
// tau_theta           receives per-parameter sufficient statistics
// A                   transition matrix, read as A( i, j )
// tau_A               receives the transition counts
// pi                  initial distribution, read through valueVector()
// tau_pi              receives the state counts
// mapping             maps (state, dimension) to a parameter index
// records             receives ( state, block size ) per block if doRecord is set
// doRecord            whether to write to records
// useSelfTransitions  whether block-internal self-transitions enter the forward variables
//
// NOTE(review): if y yields no block, T is 0 and mStates[T - 1] below indexes out of range - confirm callers never pass empty data.
template<> template <
typename StatsStructure,
         typename StatsType,
         typename BlocksType,
         typename ThetaType,
         typename TauThetaType,
         typename TransitionsType,
         typename TauAType,
         typename InitialType,
         typename TauPiType >
void StateSequence<ForwardBackward>::sample(
    Emissions<Statistics<StatsStructure, StatsType>, Blocks<BlocksType>>& y,	// TODO this cannot be const due to the use of next(), work around that somehow
    const ThetaType& theta,
    TauThetaType& tau_theta,
    const TransitionsType& A,
    TauAType& tau_A,
    const InitialType& pi,
    TauPiType& tau_pi,
    const Mapping& mapping,
    Records& records,
    const bool doRecord,
    const bool useSelfTransitions	// NOTE this has no effect for mixtures
) {
	const size_t nrParams = tau_theta.nrParams();
	const size_t nrDim = y.nrDim();
	const size_t nrStates = A.nrStates();
	mTrellis.clear();
	mTrellis.setNrStates( nrStates );



	//TODO precompute carrier measure upon implementation of non-normal distributions

	// precompute log of self-transitions etc.
	// logA stays empty unless useSelfTransitions is set; every read of it below is guarded by the same flag
	vector<real_t> logA;
	logA.reserve( nrStates );
	vector<real_t> logNormalizers;
	logNormalizers.reserve( nrStates );



	for ( auto s = 0; s < nrStates; ++s ) {
		if ( useSelfTransitions ) {
			logA.push_back( log( A( s, s ) ) );
		}
		logNormalizers.push_back( theta.logNormalizer( s ) );
	}


	// set the first element of the trellis to the initial state distribution
	// NOTE t-th index of the state sequence is t+1 in the trellis
	mTrellis.push_back( pi.valueVector() );

	size_t t = 0;	// TODO rename to tt?


	// block size of the previous iteration; 1 for the initial distribution, so its rescaling factor below is exp( 0 )
	real_t prevN = 1;

	// FORWARD FILTERING
	y.initForward();
	vector<real_t> forward( nrStates, 0 );
	while ( y.next() ) {
		++t;
		// running maximum of the log-scale terms, subtracted before exponentiation
		real_t maxE = numeric_limits<real_t>::lowest();


		real_t N = y.blockSize();	// typecasting to avoid integer division TODO maxBlockSize should be restricted by range of real_t (data_t)

		// log-scale emission term of the current block for every state
		for ( auto s = 0; s < nrStates; ++s ) {
			auto E = innerProduct( y, theta.value(), theta.mapping( s ) )  - N * logNormalizers[s];	// TODO carrier measure for the general EFD case
			if ( useSelfTransitions ) {
				E += ( N - 1 ) * logA[s];	// include self-transitions
			}
			forward[s] = E;
			maxE = max( E, maxE );
		}
		// leave log space; after subtracting maxE the largest term is exp( 0 )
		for ( auto s = 0; s < nrStates; ++s ) {
			forward[s]  = exp( forward[ s ] - maxE );
		}


		// calculate transition term and include in forward variables
		real_t forwardSum = 0;

		for ( auto j = 0; j < nrStates; ++j ) {
			real_t transitionTerm = 0;

			// sum over all predecessor states i of the previous trellis entry
			for ( auto i = 0; i < nrStates; ++i ) {
				transitionTerm += mTrellis( t - 1, i ) * A( i, j );
			}

			forward[ j ] *= transitionTerm;
			forwardSum += forward[j];
		}

		//normalize forward variables
		if ( forwardSum != 0 ) {
			for ( auto j = 0; j < nrStates; ++j ) {
				forward[ j ] /= forwardSum;

			}
		} else {
			// all forward variables are zero: fall back to a uniform distribution over the states
			cout << "[WARNING] Uniform sampling of forward variables!" << endl;
			for ( auto j = 0; j < nrStates; ++j ) {
				forward[ j ] = 1.0 / ( ( real_t )nrStates );
			}
		}


		// this rescales the PREVIOUS trellis entry (using prevN), after it has been read for the transition term above
		if ( useSelfTransitions ) {	// we are done calculating the next forward variables. In the backward step, we have to scale them by their block size, which we can do now already.
			for ( auto s = 0; s < nrStates; ++s ) {
				mTrellis.back( s ) *= exp( ( prevN - 1 ) * logA[s] );
			}
		}

		mTrellis.push_back( forward );

		prevN = N;

	}

	size_t T = mTrellis.size() - 1;		// -1 because pi is in the trellis, but not part of the state sequence


	// BACKWARD SAMPLING
	// each forward variable is multiplied by the probability to transition into the sampled state

	mStates.resize( T );

	// the last trellis entry is sampled as is
	// NOTE(review): mTrellis.sample( idx ) presumably draws a state index from trellis entry idx - confirm in Trellis.hpp
	auto j =  mTrellis.sample( mTrellis.size() - 1 ) ;	// the sampled state
	mStates[T - 1] = j;

	real_t N = 0;

	for ( auto tt = T - 1; tt > 0; --tt ) {	// index in the trellis

		// trellis index tt belongs to position tt - 1 of the state sequence
		t = tt - 1;

		// update forward variable based on sampled state
		for ( auto i = 0; i < nrStates; ++i ) {
			mTrellis( tt, i ) = mTrellis( tt, i ) * A( i, j ) ;
			if ( mTrellis( tt, i ) < 0 ) {
				throw runtime_error( "Negative backward variable!" );
			}
		}

		// sample
		j = mTrellis.sample( tt );

		// set sampled state or initial value accordingly

		mStates[t] = j;

		// NOTE the value for the initial state distribution is NOT sampled by FB, but within pi itself, depending on the type of distribution.


	}

	// the trellis is not needed after sampling
	mTrellis.clear();


	// POSTERIOR RECORDING
	// TODO is there any way to do the second iteration without recomputing stats?

	// one aggregator per emission parameter
	vector<KahanAggregator<SufficientStatistics<StatsType>>> stats;
	stats.resize( nrParams );
	SufficientStatistics<Categorical> stateCounts( nrStates );

	// count transitions
	SufficientStatistics<CategoricalVector> transitions( nrStates );

	// prevState starts at 0, so the first block is counted as a transition out of state 0 (see the TODO below)
	size_t prevState = 0;
	t = 0;
	marginal_t state;
	y.initForward();

	while ( y.next() ) {
		N = y.blockSize();
		state = mStates[t];
		// a block of size N contributes N - 1 self-transitions and one transition into it
		transitions[state][state] += N - 1;
		transitions[prevState][state] += 1;	// TODO Initial
		stateCounts[state] += N;

		// add the block's statistics of every dimension to the parameter that dimension maps to in the sampled state
		for ( auto d = 0; d < nrDim; ++d ) {
			// TODO assert range
			stats[mapping[state][d]].add( y.suffStat( d ), N );
		}

		if ( doRecord ) {
			records.record( mStates[t], N );
		}

		prevState = state;
		t++;
	}


	// only parameters that received at least one term are passed on
	for ( auto p = 0; p < nrParams; ++p ) {
		const size_t N = stats[p].nrTerms();	// shadows the block size N above
		if ( N > 0 ) {
			tau_theta.addObservation( stats[p].sum(), stats[p].nrTerms(),  p );
		}
	}

	tau_A.addObservation( transitions );
	tau_pi.addObservation( stateCounts );
	// TODO initial
}
214 | 
215 | 
216 | 
217 | 
218 | #endif
219 | 


--------------------------------------------------------------------------------
/src/Conjugate.hpp:
--------------------------------------------------------------------------------
  1 | // Class file for conjugate parameter pairs (prior and posterior)
  2 | 
  3 | 
  4 | #ifndef CONJUGATE_HPP
  5 | #define CONJUGATE_HPP
  6 | 
  7 | #include "includes.hpp"
  8 | #include "Tags.hpp"
  9 | #include "Observation.hpp"
 10 | #include "SufficientStatistics.hpp"
 11 | 
 12 | 
 13 | 
 14 | 
 15 | // parameters of ParamType will be updated using sufficient statistics of ObsType
 16 | template <typename ParamType>
 17 | class Conjugate {
 18 | 
 19 | 		// parameters
 20 | 		Observation<ParamType> mPrior;
 21 | 		Observation<ParamType> mPosterior;
 22 | 
 23 | 	public:
 24 | 
 25 | 		////////// constructors //////////
 26 | 
 27 | 
 28 | 		// A conjugate is initialized using the same parameters as an observation of the same type
 29 | 		template<typename ... Types>
 30 | 		Conjugate( Types ... args ) : mPrior( args ... ), mPosterior( args ... ) {};
 31 | 
 32 | 
 33 | 		Conjugate( Observation<ParamType> prior ): mPrior( prior ), mPosterior( prior ) {}
 34 | 
 35 | 
 36 | 
 37 | 		const Observation<ParamType>& prior() const {
 38 | 			return mPrior;
 39 | 		}
 40 | 
 41 | 		Observation<ParamType>& prior() {
 42 | 			return mPrior;
 43 | 		}
 44 | 
 45 | 		const Observation<ParamType>& posterior() const {
 46 | 			return mPosterior;
 47 | 		}
 48 | 
 49 | 		Observation<ParamType>& posterior() {
 50 | 			return mPosterior;
 51 | 		}
 52 | 
 53 | 
 54 | 		size_t domainSize() const {
 55 | 			return mPrior.domainSize();
 56 | 		}
 57 | 
 58 | 
 59 | 		size_t nrDim() const {
 60 | 			return mPrior.nrDim();
 61 | 		}
 62 | 
 63 | 
 64 | 
 65 | 
 66 | 		template<typename ObsType>
 67 | 		inline void addObservation(
 68 | 		    const SufficientStatistics<ObsType>& obs );
 69 | 
 70 | 		template<typename ObsType>
 71 | 		inline void addObservation(
 72 | 		    const SufficientStatistics<ObsType>& obs,
 73 | 		    const size_t N	);
 74 | 
 75 | 
 76 | 		template<typename ObsType>
 77 | 		inline void addObservation(
 78 | 		    const SufficientStatistics<ObsType>& suffstat,
 79 | 		    const size_t N,
 80 | 		    const vector<int>& mapping );
 81 | 
 82 | 
 83 | 		template<typename ObsType>
 84 | 		inline void addObservation(
 85 | 		    const Observation<ObsType>& obs );
 86 | 
 87 | 
 88 | 		void reset() {
 89 | 			mPosterior = mPrior;
 90 | 		}
 91 | 
 92 | 		string str( string sep = " " ) const {	// TODO sep
 93 | 			return " " + mPosterior.str() + " (prior: " + mPrior.str() + ")";
 94 | 		}
 95 | 
 96 | 		friend ostream& operator<<(
 97 | 		    ostream& output,
 98 | 		    const Conjugate<ParamType>& D )  {
 99 | 			output << D.str();
100 | 			return output;
101 | 		};
102 | 
103 | };
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | ////////////////////////////////////////////////// TEMPLATE SPECIALIZATIONS //////////////////////////////////////////////////
112 | #include "SufficientStatistics.hpp"	// required implementation
113 | #include"Observation.hpp"	// required implementation
114 | 
115 | 
116 | 
117 | ////////// Normal-Inverse Gamma parameters //////////
118 | 
119 | 
120 | template<> template<>
121 | void Conjugate<NormalInverseGammaParam>::addObservation(
122 |     const SufficientStatistics< Normal>& obs,
123 |     const size_t counts ) {
124 | 
125 | 	const real_t sum = obs.sum();
126 | 	const real_t sumSq = obs.sumSq();
127 | 
128 | 
129 | 	if ( counts == 0 ) {
130 | 		if ( sumSq > 0 ) {
131 | 			throw runtime_error( "Sufficient statistics contain values, but no observation count!" );
132 | 		}
133 | 		cout << "[WARNING] No observation count for sufficient statistics, there might be an index error!" << endl << flush;
134 | 		// TODO this shouldn't occur anyway...
135 | 		return;
136 | 	}
137 | 	if ( sumSq < 0 ) {
138 | 		throw runtime_error( "Sum of squares is negative (" + to_string( sumSq ) + ") for " + to_string( counts ) + " observations!" );
139 | 	}
140 | 
141 | 	const double N = ( double )counts;
142 | 	const real_t xbar = sum / N;	// sample mean
143 | 
144 | 
145 | 	const real_t alpha = mPosterior.alpha( );
146 | 	const real_t beta = mPosterior.beta( );
147 | 	const real_t mu0 = mPosterior.mu0( );
148 | 	const real_t nu = mPosterior.nu( );
149 | 
150 | 
151 | 
152 | 	// NOTE sometimes (sum*sum)/N > sumSq, i.e. sample variance is negative. This is akin to the numerically unstable way to calculate the sample variance naively.
153 | 	real_t ssN = ( sum * sum ) / N;
154 | 	if ( ssN > sumSq ) {
155 | 		ssN = sumSq;
156 | 		// NOTE this is mostly some tiny value below zero
157 | // 		cerr << "[WARNING] Encountered numerical instability in sample variance!" << endl << flush;
158 | 	}
159 | 
160 | 	// TODO better numerics, especially for beta?
161 | 	mPosterior.setValue(
162 | 	    alpha + N / 2.0,	// alpha	NOTE using 2 instead of 2.0 previously caused the strange "label-switching" bug in maxBlockLen=1
163 | 	    // NOTE This involves a term which makes the naive calculation of sample variance numerically unstable (sumSq-ssN); we subtract ssN last in hopes that the term added to sumSq is large enough to alleviate catastrophic cancellation.
164 | 	    beta + ( ( sumSq   + ( N * nu / ( N + nu ) ) * ( ( xbar - mu0 ) * ( xbar - mu0 ) ) )  -  ssN ) / 2.0, 	// beta
165 | 	    ( nu * mu0 + sum ) / ( nu + N ), 	// mu0
166 | 	    nu + N	// nu	// TODO size_t?
167 | 	);
168 | }
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 
175 | ////////// Dirichlet parameters //////////
176 | 
177 | template<> template<>
178 | void Conjugate<DirichletParamVector>::addObservation(
179 |     const SufficientStatistics<CategoricalVector>& countMatrix ) {
180 | 
181 | 	if ( countMatrix.nrDim() != mPosterior.nrDim() ) {
182 | 		throw runtime_error( "Dimensions of count matrix (" + to_string( countMatrix.nrDim() ) + ") and posterior observations (" + to_string( mPosterior.nrDim() ) + ") do not match!" );
183 | 	}
184 | 
185 | 	for ( size_t d = 0; d < countMatrix.nrDim() ; ++d ) {
186 | 		for ( size_t c = 0; c < countMatrix[d].domainSize(); ++c ) {
187 | 			mPosterior[d][c] += countMatrix[d][c];
188 | 		}
189 | 	}
190 | }
191 | 
192 | 
193 | 
194 | template<> template<>
195 | void Conjugate<DirichletParam>::addObservation(
196 |     const SufficientStatistics<Categorical>& obs ) {
197 | 
198 | 	if ( obs.domainSize() != mPosterior.domainSize() ) {
199 | 		throw runtime_error( "Domain size of observations (" + to_string( obs.domainSize() ) + ") does not match that of posterior (" + to_string( mPosterior.domainSize() ) + ")!" );
200 | 	}
201 | 
202 | 	for ( size_t i = 0; i < mPosterior.domainSize(); ++i ) {
203 | 		mPosterior.mAlphas[i] += obs[i];
204 | 	}
205 | }
206 | 
207 | 
208 | 
209 | template<> template<>
210 | void Conjugate<BetaParam>::addObservation(
211 |     const SufficientStatistics<Geometric>& obs,
212 |     const size_t N
213 | ) {
214 | 	mPosterior.setValue( mPosterior.alpha() + N, mPosterior.beta() + obs.sum() );
215 | }
216 | 
217 | 
218 | 
219 | #endif
220 | 
221 | 


--------------------------------------------------------------------------------
/src/Theta.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef THETA_HPP
  2 | #define THETA_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | #include "Tags.hpp"
  6 | #include "StateSequence.hpp"
  7 | #include "ThetaHyperParam.hpp"
  8 | #include "Mapping.hpp"
  9 | #include "EFD.hpp"
 10 | #include "SufficientStatistics.hpp"
 11 | #include "KahanAggregator.hpp"
 12 | 
 13 | template <typename ParamType>
 14 | class Theta {
 15 | 
 16 | 		const size_t mNrDataDim;
 17 | 		vector<Observation<ParamType>> mParams;
 18 | 		Mapping mMapping;
 19 | 		Distribution<ParamType> mDist;
 20 | 
 21 | 		// NOTE It would be possible to have different parameters depend on the same hyperparameters.
 22 | 
 23 | 	public:
 24 | 
 25 | 		// delete copy constructor
 26 | 		Theta( const Theta& that ) = delete;
 27 | 
 28 | 
 29 | 		template< typename HyperParamType>
 30 | 		Theta(
 31 | 		    ThetaHyperParam<HyperParamType>& tau_theta,	// initializers for parameters
 32 | 		    const size_t nrdatadim,
 33 | 		    const MappingType mappingType,
 34 | 		    rng_t& RNG
 35 | 		);
 36 | 
 37 | 
 38 | 		template< typename HyperParamType>
 39 | 		Theta(
 40 | 		    ThetaHyperParam<HyperParamType>& tau_theta,	// initializers for parameters
 41 | 		    const size_t nrdatadim,
 42 | 		    const Mapping mapping,
 43 | 		    rng_t& RNG
 44 | 		);
 45 | 
 46 | 
 47 | 		// TODO currently we compute log-normalizers for each parameter multiple times in the case of shared parameters. This could be memoized.
 48 | 		real_t logNormalizer( size_t state ) const;
 49 | 
 50 | 
 51 | 		const vector<Observation<ParamType>>& value() const;
 52 | 
 53 | 
 54 | 		size_t nrDataDim() const;
 55 | 
 56 | 
 57 | 		size_t nrStates() const;
 58 | 
 59 | 
 60 | 		size_t nrParams() const;
 61 | 
 62 | 
 63 | 		const vector<size_t>& mapping(
 64 | 		    size_t state ) const ;
 65 | 
 66 | 
 67 | 		template<typename ThetaParamType>
 68 | 		void sample(
 69 | 		    ThetaHyperParam<ThetaParamType>& tau_theta ) ;
 70 | 
 71 | 
 72 | 		template<typename ThetaParamType>
 73 | 		void sample(
 74 | 		    ThetaParamType& tau_theta );
 75 | 
 76 | 
 77 | 		// returns a distribution-specific value for threshold computation, such as the minimum variance for Gaussian emissions
 78 | 		real_t thresholdValue() const;
 79 | 
 80 | 
 81 | 		string str(
 82 | 		    const string& sep = "\t",
 83 | 		    const string& finalSep = "" ) const ;
 84 | 
 85 | 
 86 | 		// TODO move outside?
 87 | 		template <typename U>
 88 | 		friend ostream& operator<<(
 89 | 		    ostream& output,
 90 | 		    const Theta<U>& D );
 91 | 
 92 | };
 93 | 
 94 | 
 95 | 
 96 | template <typename ParamType>
 97 | ostream& operator<<(
 98 |     ostream& output,
 99 |     const Theta<ParamType>& D )  {
100 | 	output << D.str( );
101 | 	return output;
102 | };
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | template <typename ParamType>
114 | template< typename HyperParamType>
115 | Theta<ParamType>::Theta(
116 |     ThetaHyperParam<HyperParamType>& tau_theta,	// initializers for parameters
117 |     const size_t nrdatadim,
118 |     const MappingType mappingType,
119 |     rng_t& RNG
120 | ) :
121 | 	mNrDataDim( nrdatadim ),
122 | 	mParams( tau_theta.nrParams() ),
123 | 	mMapping( nrdatadim, tau_theta.nrParams(), mappingType ) ,
124 | 	mDist( RNG ) {
125 | 
126 | 	// initialize by sampling from the prior
127 | 	sample( tau_theta );
128 | }
129 | 
130 | 
131 | 
132 | 
133 | template <typename ParamType>
134 | template< typename HyperParamType>
135 | Theta<ParamType>::Theta(
136 |     ThetaHyperParam<HyperParamType>& tau_theta,	// initializers for parameters
137 |     const size_t nrdatadim,
138 |     const Mapping mapping,
139 |     rng_t& RNG
140 | ) :
141 | 	mNrDataDim( nrdatadim ),
142 | 	mParams( tau_theta.nrParams() ),
143 | 	mMapping( mapping ) ,
144 | 	mDist( RNG ) {
145 | 
146 | 	// initialize by sampling from the prior
147 | 	sample( tau_theta );
148 | 
149 | }
150 | 
151 | template <typename ParamType>
152 | // TODO currently we compute log-normalizers for each parameter multiple times in the case of shared parameters. This could be memoized.
153 | // TODO adapt for multivariate
154 | real_t Theta<ParamType>::logNormalizer(
155 |     size_t state
156 | ) const {
157 | 	real_t result = 0;
158 | 
159 | 	for ( const auto & m : mMapping[state] ) {
160 | 		result += ::logNormalizer( mParams[m] );
161 | 	}
162 | 
163 | 	return result;
164 | };
165 | 
166 | ////////// accessors //////////
167 | //NOTE round parentheses access the data through the mapping, square brackets access the parameters directly
168 | 
169 | template <typename ParamType>
170 | // TODO more elegant, e.g. implicit conversion?
171 | const vector<Observation<ParamType>>& Theta<ParamType>::value() const {
172 | 	return mParams;
173 | }
174 | 
175 | 
176 | template <typename ParamType>
177 | size_t Theta<ParamType>::nrDataDim() const {
178 | 	return mNrDataDim;
179 | }
180 | 
181 | template <typename ParamType>
182 | size_t Theta<ParamType>::nrStates() const {
183 | 	return mMapping.nrStates();
184 | }
185 | 
186 | 
187 | template <typename ParamType>
188 | size_t Theta<ParamType>::nrParams() const {
189 | 	return mMapping.nrParams();
190 | }
191 | 
192 | 
193 | template <typename ParamType>
194 | const vector<size_t>& Theta<ParamType>::mapping(
195 |     size_t state ) const {
196 | 	return mMapping[state];
197 | }
198 | 
199 | 
200 | template <typename ParamType>
201 | // sample each parameter from its posterior and reset the posterior afterwards
202 | template<typename ThetaParamType>
203 | void Theta<ParamType>::sample(
204 |     ThetaHyperParam<ThetaParamType>& tau_theta ) {
205 | 
206 | 	for ( auto d = 0; d < mParams.size(); ++d ) {
207 | 		mDist.resample( mParams[d], tau_theta.posterior( d ) );
208 | 	}
209 | 	tau_theta.reset();
210 | 	
211 | }
212 | 
213 | 
214 | template <typename ParamType>
215 | string Theta<ParamType>::str(
216 |     const string& sep,
217 |     const string& finalSep ) const {
218 | 	return concat( mParams, sep, finalSep );
219 | }
220 | 
221 | 
222 | 
223 | 
224 | 
225 | 
226 | template <>
227 | real_t Theta<NormalParam>::thresholdValue() const {
228 | 
229 | 	// min
230 | 	real_t result = inf;
231 | 	for ( const auto & param : mParams ) {
232 | 		result = min( result, param.var() );
233 | 	}
234 | 	return result;
235 | 
236 | // TODO add option for averaging version for data with bad compression characteristics
237 | // 	avg
238 | // 	real_t result = 0;
239 | // 	for ( const auto & param : mParams ) {
240 | // 		result += param.var();
241 | // 	}
242 | // 	return result / mParams.size();
243 | 
244 | }
245 | 
246 | 
247 | template <>
248 | real_t Theta<Beta>::thresholdValue() const {
249 | 	real_t result = inf;
250 | 
251 | 	for ( const auto & param : mParams ) {
252 | 		real_t p = param.value();
253 | 		result = min( result, ( 1 - p ) / ( p * p ) );
254 | 	}
255 | 
256 | 	return result;
257 | }
258 | 
259 | 
260 | #endif
261 | 
262 | 
263 | 
264 | 


--------------------------------------------------------------------------------
/src/wavelet.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef WAVELET_HPP
  2 | #define WAVELET_HPP
  3 | 
  4 | #include "includes.hpp"
  5 | #include "uintmath.hpp"
  6 | 
  7 | 
  8 | 
  9 | // TODO for implementation of chunks etc.: be carefull, we don't maintain scale coefficients!
 10 | // In-place Haar transform, for arbitrary data sizes. Data is sorted by (position, dimension), and dimensions are processed in pseudo-parallel fashion. TODO acually make this parallel?
 11 | // Data is treated like a greedy concatenation of vectors of sizes that are powers of two. In place where there would be the scale coefficient, the result is infinity.
 12 | void HaarDetailCoeffs(
 13 |     vector<real_t>& y,
 14 |     size_t dim = 1 ) {
 15 | 
 16 | 
 17 | 	const size_t Tdim = y.size();	// T*dim
 18 | 	if ( !divides( Tdim, dim ) ) {
 19 | 		throw runtime_error( "Cannot compute Haar detail coefficients, array size is not a multiple of the number of dimensions!" );
 20 | 	}
 21 | 	const size_t T = Tdim / dim;	// number of input positions
 22 | 
 23 | 	real_t yL;
 24 | 	real_t yR;
 25 | 	size_t L;
 26 | 	size_t R;
 27 | 	size_t Ldim;
 28 | 	size_t Rdim;
 29 | 	const size_t Nup = ceilPow2( T );
 30 | 	for ( size_t N = 2; N <= Nup; N *= 2 ) {	// N is size of non-zero support interval
 31 | 		const real_t s = 1 / sqrt( N );
 32 | 		L = 0;
 33 | 		R = N / 2;
 34 | 		while ( L < T ) {
 35 | 			if ( R < T ) {
 36 | 				Ldim = L * dim;
 37 | 				Rdim = R * dim;
 38 | 				for ( size_t d = 0; d < dim; ++d ) {
 39 | 					yL = y[Ldim];
 40 | 					yR = y[Rdim];
 41 | 					y[Ldim] = yL + yR;
 42 | 					y[Rdim] = s * ( yL - yR );
 43 | 					Ldim ++;
 44 | 					Rdim++;
 45 | 				}
 46 | 			} else {
 47 | 				Ldim = L * dim;
 48 | 				for ( size_t d = 0; d < dim; ++d ) {
 49 | 					y[Ldim] = inf;
 50 | 					Ldim++;
 51 | 				}
 52 | 			}
 53 | 			L += N;
 54 | 			R += N;
 55 | 		}
 56 | 	}
 57 | 
 58 | 	// set scale coefficient at first position force breakpoint at first element
 59 | 	for ( size_t d = 0; d < dim; ++d ) {
 60 | 		y[d] = inf;
 61 | 	}
 62 | }
 63 | 
 64 | 
 65 | 
 66 | 
 67 | // Takes a maxlet transform, and computes the breakpoint weights, i.e. for each position t it computes the maximum absolute coefficient of all wavelets which have a discontinuity at t. Complexity is in-place in linear time.
 68 | void HaarBreakpointWeights(
 69 |     vector< real_t >& weights	// absolute Haar wavelet coefficients
 70 | ) {
 71 | 	const size_t size = weights.size();
 72 | 	if ( size <= 0 ) {
 73 | 		throw runtime_error( "Cannot compute Haar breakpoint weights, vector is empty!" );
 74 | 	}
 75 | 	size_t index;
 76 | 	size_t L;
 77 | 	size_t R;
 78 | 	for ( size_t interval = ceilPow2( size ) / 2; interval >= 1; interval = interval / 2 ) {	// size of the positive (or negative) support interval (half the support)
 79 | 		const size_t shift = 2 * interval;	// how far to advance to the next index
 80 | 		for ( size_t index = interval; index < size; index += shift ) {
 81 | 			L = index - interval;
 82 | 			R = index + interval;
 83 | 			if ( R < size ) {
 84 | 				weights[R] = max( weights[R], weights[index] );
 85 | 			} else {
 86 | 				// NOTE just a precaution we expect this to be the case in the input
 87 | 				weights[L] = inf;
 88 | 				weights[index] = inf;
 89 | 			}
 90 | 			weights[L] = max( weights[L], weights[index] );
 91 | 		}
 92 | 	}
 93 | }
 94 | 
 95 | 
 96 | // Computes the maxlet transform (absolute Haar wavelet transform  for each dimension, then maximum of corresponding values across dimensions) from streaming input (dimensions first, then position), using only space T for coefficients and nrDim*T for statistics, plus nrDim*log2(T) for a stack. Output: coeffs.size()=T, suffstats.size() = nrDim*T
 97 | template< typename T>
 98 | void MaxletTransform(
 99 |     istream& input,
100 |     vector<real_t>& coeffs,
101 |     vector< SufficientStatistics<T> >& suffstats,
102 |     const size_t nrDim = 1,
103 |     const size_t reserveT = 0	// an estimate of the number of data points to avoid reallocation
104 | ) {
105 | 
106 | 	if ( nrDim <= 0 ) {
107 | 		throw runtime_error( "Number of dimensions must be positive!" );
108 | 	}
109 | 
110 | 
111 | 	if ( coeffs.size() > 0 ) {
112 | 		throw runtime_error( "Coefficient array must be empty!" );
113 | 	}
114 | 
115 | 	if ( suffstats.size() > 0 ) {
116 | 		throw runtime_error( "Statistics array must be empty!" );
117 | 	}
118 | 
119 | 	if ( input ) {
120 | 
121 | 
122 | 		coeffs.reserve( ( reserveT + nrDim ) / nrDim + nrDim );
123 | 		suffstats.reserve( reserveT + nrDim );
124 | 
125 | // 	stack<real_t, vector<real_t> > S;	// stack never gets larger than nrDim*log2(T), so we don't expect a lot of reallocation, and save a lot of push and pop operations due to random access
126 | 		vector<real_t> S;
127 | 		size_t i = 0;
128 | 		real_t v = 0;
129 | 		size_t dim = 0;
130 | 
131 | 		while ( input >> v ) {
132 | 			S.push_back( v );
133 | 			suffstats.push_back( SufficientStatistics<T>( v ) );
134 | 			dim++;	// set dimension of next value
135 | 			if ( dim == nrDim ) {	// filled all dimensions at index i
136 | 				dim = 0;	// next value will be first dimension again
137 | 
138 | 
139 | 				coeffs.push_back( inf );
140 | 
141 | 
142 | 				size_t j = i;	// points to node indices on an upward-left path (i.e. DFS post-order)
143 | 				size_t m = 1;	// mask to determine whether j is an index of a left child
144 | 				real_t normalizer = sqrt2half;
145 | 
146 | 				while ( ( j & m ) > 0 ) {	// while j is on a left-upward path (DFS post-order)
147 | 
148 | 					real_t maxCoeff = 0;	// the maximum detail coefficient across dimensions at j; NOTE we cannot take the maximum with coeffs because it contains infinity
149 | 
150 | 					size_t L = S.size() - 2 * nrDim;	// index of left element in stack, get incremented to iterate over dimensions
151 | 					size_t R = L + nrDim;	// likewise, index of right element in stack
152 | 
153 | 
154 | 					// compute maximum of detail coefficients across dimensions
155 | 					for ( size_t d = 0; d < nrDim; ++d ) {
156 | 						maxCoeff = max( maxCoeff, normalizer * abs( S[L] - S[R] ) );
157 | 						S[L] += S[R];	// add right values to left values, so only the right values need to be popped
158 | 						L++;		// go to next dimension
159 | 						R++;
160 | 					}
161 | 					coeffs[j] = maxCoeff;
162 | 
163 | 
164 | 					// pop the right values
165 | 					for ( size_t d = 0; d < nrDim; ++d ) {
166 | 						S.pop_back();
167 | 					}
168 | 
169 | 
170 | 					j = j - m;	// move to left parent (if current position is not a right child, the loop will exit)
171 | 					m *= 2;	// move bit-mask to the left, i.e. check if i is still on a left-up path)
172 | 					normalizer *= sqrt2half;	// moving up one level changes normalization factor
173 | 				}
174 | 				i++;
175 | 			}
176 | 		}
177 | 
178 | 
179 | 		if ( dim != 0 ) {
180 | 			throw runtime_error( "Input stream did not contain enough values to fill all dimensions at last position!" );
181 | 		}
182 | 
183 | 		coeffs[0] = inf;
184 | 
185 | 	} else {
186 | 		throw runtime_error( "Cannot read input file or stream!" );
187 | 	}
188 | }
189 | 
190 | 
191 | 
192 | #endif
193 | 
194 | 
195 | 


--------------------------------------------------------------------------------
/src/Records.hpp:
--------------------------------------------------------------------------------
  1 | // Records data to file for each iteration.
  2 | 
  3 | #ifndef RECORDS_HPP
  4 | #define RECORDS_HPP
  5 | 
  6 | #include "StateMarginals.hpp"
  7 | #include "Theta.hpp"
  8 | #include "StateMarginalsIterator.hpp"
  9 | 
 10 | 
 11 | #include <string>
 12 | using std::string;
 13 | 
 14 | #include <fstream>
 15 | using std::ofstream;
 16 | 
 17 | 
 18 | 
 19 | 
 20 | class Records {
 21 | 
 22 | 		size_t mNrObservedPos;
 23 | 		size_t mNrBlocks;
 24 | 		size_t mNrSegments;
 25 | 		size_t mSegmentState;	// the state being aggregated
 26 | 		size_t mSegmentSize;	// the current aggregated segment size
 27 | 		const size_t mSize;
 28 | 		string mPrefix;
 29 | 		string mSuffix;
 30 | 
 31 | 
 32 | 
 33 | 		StateMarginals mMarginals;
 34 | 
 35 | 		bool mRecordMarginals;
 36 | 		bool mRecordBlocks;
 37 | 		bool mRecordCompression;
 38 | 		bool mRecordSequences;
 39 | 		bool mRecordTheta;
 40 | 		bool mRecordSegments;
 41 | 
 42 | 		// TODO allow individual names for each file?
 43 | 
 44 | 		ofstream mMarginalsFile;
 45 | 		ofstream mSequenceFile;
 46 | 		ofstream mBlocksFile;
 47 | 		ofstream mThetaFile;
 48 | 		ofstream mCompressionsFile;
 49 | 		ofstream mSegmentFile;
 50 | 
 51 | 		// helper method to avoid copying of code
 52 | 		void setRecordX(
 53 | 		    ofstream& mFile, 	// reference to member ofstream
 54 | 		    const string& type, 	// type of record, also used as suffix
 55 | 		    bool& mFlag,  	// reference to bool member (whether to record or not)
 56 | 		    const bool flag,	// whether to record or not
 57 | 		    const bool overwrite	// whether to allow overwriting existing files
 58 | 		) {
 59 | 			mFlag = flag;
 60 | 			if ( mFlag && !mFile.is_open() ) {
 61 | 				string filename = mPrefix +  type + mSuffix;
 62 | 				if ( fileExists( filename ) && !overwrite ) {
 63 | 					throw runtime_error( "File " + filename + " already exists! Use -w to allow overwrite!" );
 64 | 				}
 65 | 				mFile.open( ( filename ).c_str() );
 66 | 				if ( !mFile.is_open() ) {
 67 | 					throw runtime_error( "Cannot write to file " + filename + "!" );
 68 | 				}
 69 | 			}
 70 | 		}
 71 | 
 72 | 	public:
 73 | 		// delete copy constructor
 74 | 		Records( const Records& that ) = delete;
 75 | 
 76 | 
 77 | 		Records(
 78 | 		    size_t T,
 79 | 		    string prefix,
 80 | 		    string suffix,
 81 | 		    const size_t nrStates
 82 | 		) :
 83 | 			mNrBlocks( 0 ),
 84 | 			mSize( T ),
 85 | 			mNrObservedPos( 0 ),
 86 | 			mPrefix( prefix ),
 87 | 			mSuffix( suffix ),
 88 | 			mMarginals( T ),
 89 | 			mRecordMarginals( true ),
 90 | 			mRecordSegments( false ),
 91 | 			mRecordBlocks( false ),
 92 | 			mRecordCompression( false ),
 93 | 			mRecordSequences( false ),
 94 | 			mRecordTheta( false ),
 95 | 			mSegmentSize( 0 ),
 96 | 			mSegmentState( 0 ),
 97 | 			mNrSegments( 0 ) {}
 98 | 
 99 | 		~Records() {
100 | 			close();
101 | 		}
102 | 
103 | 		void close() {
104 | 			if ( mRecordMarginals ) {
105 | 				mMarginals.save( mMarginalsFile );
106 | 				mMarginalsFile.close();
107 | 			}
108 | 			if ( mRecordSequences ) {
109 | 				mSequenceFile.close();
110 | 			}
111 | 			if ( mRecordBlocks ) {
112 | 				mBlocksFile.close();
113 | 			}
114 | 			if ( mRecordTheta ) {
115 | 				mThetaFile.close();
116 | 			}
117 | 			if ( mRecordSegments ) {
118 | 				mSegmentFile.close();
119 | 			}
120 | 		}
121 | 
122 | 		void setRecordMarginals( bool b, bool overwrite = false ) {
123 | 			setRecordX( mMarginalsFile, "marginals", mRecordMarginals, b, overwrite );
124 | 		}
125 | 
126 | 		void setRecordBlocks( bool b, bool overwrite = false ) {
127 | 			setRecordX( mBlocksFile, "blocks", mRecordBlocks, b, overwrite );
128 | 		}
129 | 
130 | 		void setRecordCompression( bool b, bool overwrite = false ) {
131 | 			setRecordX( mCompressionsFile, "compression", mRecordCompression, b, overwrite );
132 | 		}
133 | 
134 | 		void setRecordStateSequence( bool b, bool overwrite = false ) {
135 | 			setRecordX( mSequenceFile, "sequences", mRecordSequences, b, overwrite );
136 | 		}
137 | 
138 | 		void setRecordTheta( bool b, bool overwrite = false ) {
139 | 			setRecordX( mThetaFile, "parameters", mRecordTheta, b, overwrite );	// TODO rename?
140 | 		}
141 | 
142 | 		void setRecordSegments( bool b, bool overwrite = false ) {
143 | 			setRecordX( mSegmentFile, "segments", mRecordSegments, b, overwrite );	// TODO rename?
144 | 		}
145 | 
146 | 		template<typename ThetaType>
147 | 		void record(
148 | 		    const Theta<ThetaType>& theta ) {
149 | 			// write parameters
150 | 			if ( mRecordTheta ) {
151 | 				mThetaFile << theta << endl;
152 | 			}
153 | 		}
154 | 
155 | 		void record(
156 | 		    const size_t state,
157 | 		    const size_t N ) {
158 | 
159 | 			string blockPreChar = "\t";
160 | 			string segmentPreChar = "";
161 | 			if (mNrSegments>0){
162 | 				segmentPreChar = "\t";
163 | 			}
164 | 			string postChar = "";
165 | 
166 | 			// is this the first block?
167 | 			if ( mNrBlocks == 0 ) {
168 | 				mSegmentState = state;
169 | 				mSegmentSize = N;
170 | 				blockPreChar="";
171 | 			} else {	// this is not the first block
172 | 				// record the previous segment if we encountered a new state
173 | 				if ( state != mSegmentState ) {
174 | 					if ( mRecordMarginals ) {
175 | 						mMarginals.addRecord( mSegmentState, mSegmentSize );
176 | 					}
177 | 					if ( mRecordSequences ) {
178 | 						mSequenceFile << segmentPreChar << mSegmentSize << ":" << mSegmentState << postChar;
179 | 					}
180 | 					mSegmentState = state;
181 | 					mSegmentSize = N;
182 | 					mNrSegments++;
183 | 					segmentPreChar = "\t";
184 | 				} else {
185 | 					// do nothing except extend segment
186 | 					mSegmentSize += N;
187 | 				}
188 | 			}
189 | 
190 | 
191 | 			mNrBlocks++;
192 | 
193 | 			mNrObservedPos += N;
194 | 			
195 | 
196 | 			// end of line? In that case we need to record everything that's left
197 | 			if ( mNrObservedPos >= mSize ) {
198 | 				if ( mNrObservedPos > mSize ) {
199 | 					throw runtime_error( "Cannot record block, exceeding data size!" );
200 | 				}
201 | 
202 | 				postChar = "\n";
203 | 
204 | 				if ( mRecordCompression ) {
205 | 					mCompressionsFile << ( ( double ) mSize ) / ( ( double ) mNrBlocks ) << endl;
206 | 				}
207 | 
208 | 				if ( mRecordSegments ) {
209 | 					mSegmentFile << mMarginals.nrSegments() << "\t" << mMarginals.internalSize() << endl;
210 | 				}
211 | 
212 | 
213 | 
214 | 				// record the last block
215 | 				if ( mRecordMarginals ) {
216 | 					mMarginals.addRecord( mSegmentState, mSegmentSize );
217 | 				}
218 | 				if ( mRecordSequences ) {
219 | 					mSequenceFile << segmentPreChar << mSegmentSize << ":" << mSegmentState << postChar;
220 | 				}
221 | 
222 | 
223 | 				mNrObservedPos = 0;
224 | 				mNrBlocks = 0;
225 | 				mSegmentSize = 0;
226 | 				mNrSegments = 0;
227 | 			}
228 | 
229 | 
230 | 			if ( mRecordBlocks ) {
231 | 				mBlocksFile << blockPreChar << N << postChar;
232 | 			}
233 | 
234 | 
235 | 		}
236 | 
237 | 
238 | 
239 | 		vector<size_t> maxMarginSegmentation() {
240 | 			return mMarginals.maxMarginSegmentation();
241 | 		}
242 | 
243 | };
244 | 
245 | #endif
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 


--------------------------------------------------------------------------------
/src/tools/mapLinesToGenome.cpp:
--------------------------------------------------------------------------------
  1 | #include "GenomeGetter.hpp"
  2 | 
  3 | #include "../Parser.hpp"
  4 | 
  5 | #include <limits>
  6 | using std::numeric_limits;
  7 | 
  8 | #include <iostream>
  9 | using std::cin;
 10 | using std::cout;
 11 | using std::endl;
 12 | using std::istream;
 13 | using std::ostream;
 14 | 
 15 | #include <fstream>
 16 | using std::ifstream;
 17 | using std::ofstream;
 18 | 
 19 | #include <string>
 20 | using std::stoi;
 21 | using std::getline;
 22 | 
 23 | #include <stdexcept>
 24 | using std::runtime_error;
 25 | 
 26 | #include "gzstream.h"
 27 | 
 28 | 
 29 | int main( int argc, const char* argv[] ) {
 30 | 
 31 | 	Parser args( argc, argv );
 32 | 
 33 | 	// OPTIONS
 34 | 	args.registerFlags( {"-g", "-genome-prefix"}, "" );
 35 | 	args.registerFlags( {"-c", "-coordinates"}, "" );
 36 | 	args.registerFlags( {"-w", "-window-size"}, "1" );
 37 | 	args.registerFlags( {"-r", "-range"} );	// print start and end column for each refseq, instead of individual lines for each position
 38 | 	args.registerFlags( {"-i", "-infile"}, "" );	// read from cin, unless -i is defined
 39 | 	args.registerFlags( {"-o", "-outfile"}, "" );	// write to cout, unless -o is defined
 40 | 	args.registerFlags( {"-b", "-blocks"}, "" );	// TODO implement: there is no size column, treat each position as block size 1
 41 | 	args.registerFlags( {"-h", "--help", "-help"}, "" );
 42 | 	args.parseArgs();
 43 | 
 44 | 	if ( args.isSet( "-h" ) ) {
 45 | 		cout << "Prepend genomic coordinates to lines, separated by tabs. If -c/-coordinates is set, coordinates are refseq:start:inclusiveend, otherwise they are separated by tabs. The input file name is set using -i, otherwise lines are read from STDIN. The PREFIX for the genomes size and position files is set using -g/-genome-prefix, and genomic coordinates are read from PREFIX-size.csv and PREFIX-pos.csv. The window size -w/-window size specifies the number of genome coordinates corresponding to one line in the data, for example if mapping counts have been averaged over adjacent, non-overlapping windows. The last data line may map to less genome positions than the window size, and no warning is issued. If -b/-blocks is set, the first entry in each input line (up to the first tab) is considered the segment size for a run-length encoding, and thus specifies that the lines should be repeated this many times; in that case, the window size is multiplied by that number. If -r/-range is specified, only the first and last genome position (in columns) is printed for each input segment per refseq, instead of mapping to all genome positions. If an INT is provided as argument to -r, this specifies the maximum distance between adjacent positions within a range, otherwise a new range is started. If -o/-outfile is specified, output is written to that path, otherwise it is written to STDOUT." << endl;
 46 | 		return 0;
 47 | 	}
 48 | 
 49 | 
 50 | 	string prefix = args.parse<string>( "-genome-prefix" );
 51 | 
 52 | 	// specify input stream (cin or -i)
 53 | 	const bool readFromFile = args.isSet( "-i" );
 54 | 	ifstream realInFile;
 55 | 	if ( readFromFile ) {
 56 | 		realInFile.open( args.parse<string>( "-i" ), std::ios::in );
 57 | 	}
 58 | 	istream& inFile = ( readFromFile ? realInFile : cin );
 59 | 
 60 | 	// specify output stream (cout or -o)
 61 | 	const bool writeToFile = args.isSet( "-o" );
 62 | 	ofstream realOutFile;
 63 | 	if ( writeToFile ) {
 64 | 		realOutFile.open( args.parse<string>( "-o" ), std::ios::out );
 65 | 	}
 66 | 	ostream& outFile = ( writeToFile ? realOutFile : cout );
 67 | 
 68 | 
 69 | 	const bool isRLE = args.isSet( "-b" );
 70 | 	const size_t windowSize = args.parse<size_t>( "-window-size" );
 71 | 	const bool outputRanges = args.isSet( "-range" );
 72 | 
 73 | 	GenomeGetter gg( prefix );
 74 | 
 75 | 	string sep1 = "\t";
 76 | 	string sep2 = "\t";
 77 | 	if ( args.isSet( "-coordinates" ) ) {
 78 | 		sep1 = ":";
 79 | 		sep2 = "-";
 80 | 	}
 81 | 
 82 | 
 83 | 	size_t start = 0;	// start position of a segment if isRLE, data position otherwise
 84 | 	size_t end = 0;
 85 | 
 86 | 	string refseq;
 87 | 	string line;
 88 | 	size_t segmentSize = 1;
 89 | 	const size_t windowsize = args.parse<size_t>( "-w" );
 90 | 	size_t maxMergeDist = numeric_limits<size_t>::max();	// maximum distance of entries to still be considered for merging into range
 91 | 	if ( outputRanges ) {
 92 | 		if ( args.nrTokens( "-range" ) > 0 ) {
 93 | 			maxMergeDist = args.parse<size_t>( "-range", 0 );
 94 | 		}
 95 | 	}
 96 | 
 97 | 	// the number of lines in the pos file corresponding to a line in the data file. This is the window size times the segment size
 98 | 	size_t nrGenomeLines = 0;
 99 | 	while ( getline( inFile, line ) ) {	// TODO check status etc.
100 | 
101 | 		// if the first input column represents the segment size, update it
102 | 		if ( isRLE ) {
103 | 			// get the size of the segment
104 | 			segmentSize = stoi( line.substr( 0, line.find_first_of( "\t" ) ) ) ;	// get segment size from string up to the first tab
105 | 			line = line.substr(line.find_first_of( "\t" )+1 ) ;
106 | 			if ( segmentSize == 0 ) {
107 | 				throw runtime_error( "Segment size must be positive!" );
108 | 			}
109 | 		}
110 | 
111 | 		// specify the number of genome lines (positions) that this data line represents
112 | 		nrGenomeLines = windowsize * segmentSize;
113 | 
114 | 		if ( outputRanges ) {
115 | 
116 | 			// get values for new segment
117 | 			if ( gg.next() ) {
118 | 				if ( gg.refseqChanged() ) {
119 | 					refseq = gg.refseq();
120 | 				}
121 | 				start = gg.pos();
122 | 				end = start;
123 | 
124 | 			} else {
125 | 				throw runtime_error( "Genome ended before all data was processed!" );
126 | 			}
127 | 			nrGenomeLines--;
128 | 
129 | 			// read as many lines as the run-length information specifies
130 | 			while ( nrGenomeLines > 0 ) {
131 | 				if ( gg.next() ) {
132 | 
133 | 					// if the current line starts a new refseq
134 | 					if ( gg.refseqChanged() || gg.pos() - end > maxMergeDist ) {	// TODO what if this happens in first line
135 | 
136 | 						// print previous refseq, start and end
137 | 						cout << refseq << sep1 << start << sep2 << end << "\t" << line << endl;
138 | 
139 | 						refseq = gg.refseq();
140 | 						start = gg.pos();
141 | 					}
142 | 					end = gg.pos();
143 | 				} else {
144 | 					break;	// silently ignore window size going past the end of the data
145 | 				}
146 | 				nrGenomeLines--;
147 | 			}
148 | 
149 | 			cout  << refseq << sep1 << start << sep2 << end << "\t" << line << endl;
150 | 
151 | 		} else { // output each line
152 | 			while ( nrGenomeLines > 0 ) {
153 | 				if ( gg.next() ) {
154 | 					cout << gg.refseq() << sep1 << gg.pos() << "\t" << line << endl;
155 | 				} else {
156 | 					break;
157 | 				}
158 | 				nrGenomeLines--;
159 | 			}
160 | 		}
161 | 		// we only allow for the last window to be incomplete, since the genome size might not be a multiple of the window size, but otherwise we enforce correct size
162 | 		if ( nrGenomeLines >= windowsize ) {
163 | 			throw runtime_error( "Data too long for genome!" );
164 | 		}
165 | 	}
166 | 
167 | 	// see if there are unprocessed parts of the genome
168 | 	if ( gg.next() ) {
169 | 		throw runtime_error( "Data ended before genome!" );
170 | 	}
171 | 
172 | 	return 0;
173 | }
174 | 
175 | 
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/src/Statistics/IntegralArray.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef INTEGRALARRAY_HPP
  2 | #define INTEGRALARRAY_HPP
  3 | 
  4 | #include "../Statistics.hpp"
  5 | #include "../Blocks/BreakpointArray.hpp"
  6 | #include "../includes.hpp"
  7 | #include "../SufficientStatistics.hpp"
  8 | #include "../Tags.hpp"
  9 | #include "../uintmath.hpp"
 10 | #include "../KahanAggregator.hpp"
 11 | 
 12 | 
 13 | #include <algorithm>
 14 | using std::rotate;
 15 | 
 16 | #include <deque>
 17 | using std::deque;
 18 | 
 19 | #include "../utils.hpp"
 20 | 
 21 | #include "../MultiVector.hpp"
 22 | 
// NOTE(review): PointerType is not referenced in the visible part of this file; the identical typedef is also declared in Blocks/BreakpointArray.hpp, which is included above — confirm whether it is still needed here
typedef uint16_t PointerType;
// Length (in positions) of one cell of the cumulative sum array; addBlockStats() steps through the array in multiples of this value
const size_t CELLSIZE = 65535;	// for numeric reasons, the cumulative sum array is divided into cells of this size, and the reverse cumulative sum is calculated within that cell
 25 | 
 26 | 
// Statistics implementation backed by an array of per-cell cumulative sums of
// sufficient statistics: setStats() aggregates the statistics of the current
// block of a Blocks object, which can then be queried per dimension via suffStat().
template<typename SuffStatType>
class Statistics<IntegralArray, SuffStatType > {

		// number of input data points (size of the input statistics vector divided by the number of dimensions)
		const size_t mSize;

		// number of input dimensions (for stats arrays) TODO current implementation only works for 1D
		const size_t mNrDim;

		// cumulative sums of the sufficient statistics, computed within cells of CELLSIZE positions and accessed as mStats( position, dimension )
		// NOTE(review): presumably mStats( i, d ) holds the sum from position i to the end of its cell — confirm against KahanCumulativeSum
		MultiVector<SufficientStatistics<SuffStatType>> mStats;


		// sufficient statistics of the current block, one entry per dimension (set by setStats)
		vector<SufficientStatistics<SuffStatType>> mCurrentSuffStat;


		// one Kahan aggregator per dimension, used to sum up the current sufficient statistics for numerical stability
		vector<KahanAggregator<SufficientStatistics<SuffStatType>>> mCurrentStats;


		// Given a start and end position, a dimension and a Kahan aggregator, add the statistics of the segment to the aggregator.
		void addBlockStats(
		    size_t start,
		    size_t end,	// TODO check bounds?
		    size_t d,
		    KahanAggregator<SufficientStatistics<SuffStatType>>& stats );


	public:

		// not copyable
		Statistics( const Statistics& that ) = delete;

		// NOTE this constructor swaps the contents of stats into the object
		Statistics(
		    vector<SufficientStatistics< SuffStatType>>& stats,
		    const size_t nrDim	);

		// compute the sufficient statistics of the current block [blocks.start(), blocks.end()) for all dimensions
		template<typename T>
		void setStats(
		    const Blocks<T>& blocks );

		// sufficient statistics of the current block in dimension dim (no bounds check)
		const SufficientStatistics<SuffStatType>& suffStat(
		    size_t dim ) const;

		// number of dimensions
		size_t nrDim() const;

		// number of input data points
		size_t size() const;

};
 76 | 
 77 | 
 78 | 
 79 | 
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 
 85 | 
 86 | 
 87 | 
 88 | 
 89 | 
 90 | 
 91 | 
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | // Given a start and end position as well as two Kahan aggregators, add the positive and negative statistics for the segment.
103 | template<typename SuffStatType>
104 | void Statistics<IntegralArray, SuffStatType >::addBlockStats(
105 |     size_t start,
106 |     size_t end,	// TODO check bounds?
107 |     size_t d,
108 |     KahanAggregator<SufficientStatistics<SuffStatType>>& stats
109 | ) {
110 | 	// we use the Kahan aggregator to record the number of positions included in the sum, which we need to set manually
111 | 	size_t N = stats.nrTerms() + end - start;
112 | 
113 | 
114 | 	stats.add( mStats( start , d ) );
115 | 	for ( start = higher_mult( start, CELLSIZE ); start < end; start += CELLSIZE ) {
116 | 		stats.add( mStats( start, d ) );
117 | 	}
118 | 	if ( end % CELLSIZE != 0 ) {
119 | 		stats.subtract( mStats( end, d ) );
120 | 	}
121 | 
122 | 	// manually set number of terms
123 | 	stats.setNrTerms( N );
124 | }
125 | 
126 | 
127 | 
128 | // delete copy constructor
129 | // template<typename SuffStatType>
130 | // Statistics<IntegralArray, SuffStatType >::Statistics( const Statistics& that ) = delete;
131 | 
132 | 
133 | // NOTE this constructor swaps its input vectors, i.e. they are empty outsize of this class
134 | // TODO Multivector?
135 | template<typename SuffStatType>
136 | Statistics<IntegralArray, SuffStatType >::Statistics(
137 |     vector<SufficientStatistics< SuffStatType>>& stats,
138 |     const size_t nrDim
139 | ) :
140 | 	mSize( stats.size() / nrDim ),
141 | 	mNrDim( nrDim ),
142 | 	mCurrentSuffStat( nrDim, 0 ),
143 | 	mStats( nrDim ) ,
144 | 	mCurrentStats( nrDim, KahanAggregator<SufficientStatistics<SuffStatType>>( ) ) {
145 | 	// TODO make parameter?
146 | 
147 | 
148 | 	
149 | 	// check that weights contain data
150 | 	if ( mSize <= 0 ) {
151 | 		throw runtime_error( "Input vector for breakpoint weights is empty!" );
152 | 	}
153 | 
154 | 	//check that stats contain data
155 | 	if ( stats.size() <= 0 ) {
156 | 		throw runtime_error( "Input vector for breakpoint weights is empty!" );
157 | 	}
158 | 
159 | 	if ( !divides( stats.size(), mSize ) ) {
160 | 		throw runtime_error( "Error constructing breakpoint array: number of sufficient statistics must be an integer multiple of the number of weights!" );
161 | 	}
162 | 
163 | 
164 | 	// check that size of stats array is integer mutliple of size of weights array
165 | 	if ( !( divides( stats.size(), mSize ) && stats.size() >= mSize ) ) {
166 | 		throw runtime_error( "Cannot infer data dimension, size of statistics vector (" + to_string( stats.size() ) + ") must be multiple of size of weight vector(" + to_string( mSize ) + ")!" );
167 | 	}
168 | 
169 | 	// move sufficient statistics to right and compute cumulative sums; Having the first entry be zero means we don't have to check for t=0 start positions and also don't worry about underflow of t
170 | 
171 | 	// TODO might cause reallocation
172 | 	SufficientStatistics<SuffStatType> zero( 0 );
173 | 	stats.reserve( stats.size() + mNrDim );
174 | 	for ( size_t d = 0; d < mNrDim; ++d ) {
175 | 		stats.push_back( zero );
176 | 	}
177 | 
178 | 
179 | 	// compute the partial cumulative sums in subarrays, for all dimensions
180 | 
181 | 	const size_t skip = mNrDim * CELLSIZE;
182 | 	for ( size_t start = 0; start < stats.size(); start += skip ) {
183 | 		for ( size_t d = 0; d < mNrDim; ++d ) {
184 | 			KahanCumulativeSum( stats, start + d, start + skip + d, mNrDim, true );
185 | 		}
186 | 	}
187 | 
188 | 
189 | 	mStats.swap( stats );
190 | 
191 | };
192 | 
193 | 
194 | 
195 | 
196 | template<typename SuffStatType>
197 | template<typename T>
198 | void Statistics<IntegralArray, SuffStatType >::setStats(
199 |     const Blocks<T>& blocks ) {
200 | 
201 | 	/*
202 | 	 Get the next block under the current threshold.
203 | 	NOTE: for a cumulative sum array A shifted one position to the right, with A[0]=0, the sum of [start, end) = [start, end-1] is A[end]-A[start]
204 | 	*/
205 | 
206 | 	for ( size_t dim = 0; dim < mNrDim; ++dim ) {
207 | 		mCurrentStats[dim].reset();
208 | 		addBlockStats( blocks.start(), blocks.end(), dim, mCurrentStats[dim] );
209 | 		mCurrentSuffStat[dim] = mCurrentStats[dim].sum();
210 | 	}
211 | 
212 | }
213 | 
214 | 
215 | 
216 | 
217 | template<typename SuffStatType>
218 | const SufficientStatistics<SuffStatType>& Statistics<IntegralArray, SuffStatType >::suffStat( size_t dim ) const {
219 | 	return mCurrentSuffStat[dim];
220 | 
221 | }
222 | 
223 | 
224 | template<typename SuffStatType>
225 | size_t Statistics<IntegralArray, SuffStatType >::nrDim() const {
226 | 	return mNrDim;
227 | }
228 | 
229 | 
230 | template<typename SuffStatType>
231 | size_t Statistics<IntegralArray, SuffStatType >::size() const {
232 | 	return mSize;
233 | }
234 | 
235 | 
236 | 
237 | 
238 | 
239 | 
240 | 
241 | #endif
242 | 
243 | 
244 | 


--------------------------------------------------------------------------------
/src/Blocks/BreakpointArray.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef BREAKPOINTARRAY_HPP
  2 | #define BREAKPOINTARRAY_HPP
  3 | 
  4 | #include "../Blocks.hpp"
  5 | 
  6 | #include "../includes.hpp"
  7 | #include "../Tags.hpp"
  8 | #include "../Theta.hpp"
  9 | #include "../uintmath.hpp"
 10 | #include "../utils.hpp"
 11 | 
 12 | #include <algorithm>
 13 | using std::rotate;
 14 | 
 15 | #include <deque>
 16 | using std::deque;
 17 | 
 18 | 
 19 | 
// element type of the jump pointer array (mPointers); its maximum value limits how far a single pointer can jump
typedef uint16_t PointerType;
 21 | 
// Block structure derived from an array of breakpoint weights: for a given
// threshold, the data is partitioned into blocks which can be iterated over.
template<>
class Blocks<BreakpointArray> {

		// number of input positions
		const size_t mSize;

		// iteration direction; initialized to unset by the constructor
		// NOTE(review): presumably set by initForward() — confirm
		Direction mDirection;

		// mWeights[i] represents the weight of breakpoint [i-1,i]. This also means that mWeights[0] is essentially ignored.
		vector<real_t> mWeights;

		// mPointers[i] means that for all j in [i+1, i+mPointers[i]-1] (inclusive), mWeights[j] < mWeights[i]
		// NOTE the pointer values are limited by the range of PointerType (pruning limit)
		vector<PointerType> mPointers;

		// threshold used for creating blocks
		real_t mThreshold;
		// NOTE(review): presumably the number of blocks produced so far — confirm
		size_t mBlockCounter;

		// the boundaries of the current block
		size_t mBlockStart;
		size_t mBlockEnd;
		size_t mBlockSize;


		// this creates a pointer target to <right> based on the top of the stacks
		inline void reduceStacks(
		    deque<size_t>& indexStack,
		    const size_t& right );

	public:

		// delete copy constructor
// 		Blocks( const Blocks& that ) = delete;


		// NOTE this constructor swaps its input vectors, i.e. they are empty outside of this class
		Blocks(
		    vector<real_t>& weights );


		// set up the block structure for an explicit threshold
		void createBlocks( real_t threshold );

		// set up the block structure for a threshold derived from the emission parameters
		template<typename ParamType>
		void createBlocks( const Theta<ParamType>& param );

		void initForward();



		// get the end of a block starting at <start> for a given threshold
		// return false if the block end is the last possible value
		inline bool next();



		// average weight of breakpoints, can be used to derive a block structure for automatic priors for instance
		real_t avgWeight() const;


		size_t start() const;

		size_t end() const;

		size_t pos() const;

		// Return the size of the current block.
		size_t blockSize() const;

		// Return the total size, i.e. the sum of all block sizes.
		size_t size() const;

		size_t nrBlocks() const;


		void printBlock() const;


};
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | // this creates a pointer target to <right> based on the top of the stacks
114 | 
115 | inline void Blocks<BreakpointArray>::reduceStacks(
116 |     deque<size_t>& indexStack,
117 |     const size_t& right ) {
118 | 
119 | 	// set pointer for stretch
120 | 	size_t left = indexStack.back();
121 | 	mPointers[left] =  right - left;
122 | 	indexStack.pop_back();
123 | }
124 | 
125 | 
126 | 
127 | 
// NOTE this constructor swaps its input vector into the object, i.e. the caller's vector is empty outside of this class afterwards
129 | 
130 | Blocks<BreakpointArray>::Blocks(
131 |     vector<real_t>& weights
132 | ) :
133 | 	mSize( weights.size() ),
134 | 	mDirection( unset ),
135 | 	mBlockCounter( 0 ) {
136 | 	// TODO make parameter?
137 | 
138 | 	mWeights.swap( weights );
139 | 
140 | 
141 | 
142 | 	// check that weights contain data
143 | 	if ( mSize <= 0 ) {
144 | 		throw runtime_error( "Input vector for breakpoint weights is empty!" );
145 | 	}
146 | 
147 | 	// calculate pointers
148 | 	// NOTE stacks are implemented without container adapters, since we require random access to indexStack[0]
149 | 	// the maximum value the pointers can take
150 | 	const PointerType maxJumpSize = min( mSize, ( size_t )numeric_limits<PointerType>::max() );
151 | 
152 | 	// initialize all pointers to their maximum allowed value
153 | 	mPointers.assign( mSize, maxJumpSize );
154 | 	deque<size_t> indexStack;
155 | 	indexStack.push_back( 0 );
156 | 	size_t left = 0;
157 | 	for ( size_t right = 1; right < mSize; ++right ) {
158 | 
159 | 		// check if the furthest element in the deque  has reached its maximum jump size, and set its pointer if necessary
160 | 		if ( !indexStack.empty() ) {
161 | 			size_t furthestIndex = indexStack.front();
162 | 			if ( right - furthestIndex == maxJumpSize ) {
163 | 				mPointers[furthestIndex] = maxJumpSize;
164 | 				indexStack.pop_front();
165 | 			}
166 | 		}
167 | 
168 | 		while ( !indexStack.empty() ) {
169 | 			left = indexStack.back();
170 | 
171 | 			if ( mWeights[left] <= mWeights[right] ) {
172 | 				reduceStacks( indexStack, right );
173 | 			} else {
174 | 				break;	// weights only get larger further down the stack
175 | 			}
176 | 		}
177 | 		indexStack.push_back( right );
178 | 	}
179 | 	// elements still on the stack all point past the end
180 | 	while ( indexStack.size() > 0 ) {
181 | 		reduceStacks( indexStack, mSize );
182 | 	}
183 | 
184 | };
185 | 
186 | 
187 | 
188 | 
189 | void Blocks<BreakpointArray>::createBlocks( real_t threshold ) {
190 | 	mThreshold = threshold;
191 | }
192 | 
193 | 
194 | // TODO find more elegant and flexible way to compute thresholds
195 | template<>
196 | void Blocks<BreakpointArray>::createBlocks(
197 |     const  Theta<NormalParam>& param ) {
198 | 	createBlocks( sqrt( 2 * log( ( real_t )mSize ) *param.thresholdValue() ) );
199 | }
200 | 
201 | 
202 | 
203 | void Blocks<BreakpointArray>::initForward() {
204 | 	mDirection = forward;
205 | 	mBlockStart = 0;
206 | 	mBlockEnd = 0;
207 | 	mBlockSize = 0;
208 | 	mBlockCounter = 0;
209 | }
210 | 
211 | 
212 | 
213 | // get the end of a block starting at <start> for a given threshold
214 | // return false if the block end is the last possible value
215 | 
216 | inline bool Blocks<BreakpointArray>::next() {
217 | 	if ( mBlockEnd >= mSize ) {
218 | 		mDirection = unset;
219 | 		return false;
220 | 	} else {
221 | 		mBlockCounter++;
222 | 		mBlockStart = mBlockEnd;
223 | 		mBlockEnd = mBlockStart + 1;
224 | 		while ( mBlockEnd < mSize ) {
225 | 			// TODO how to handle overflow. Maximum block size?
226 | 			if ( mWeights[mBlockEnd] < mThreshold ) {
227 | 				mBlockEnd += mPointers[mBlockEnd];	// NOTE this involves typecasting by necessity
228 | 			} else {
229 | 				break;
230 | 			}
231 | 		}
232 | 		mBlockSize = mBlockEnd - mBlockStart;
233 | 		return true;
234 | 	}
235 | }
236 | 
237 | 
238 | 
239 | // average weight of breakpoints, can be used to derive a block structure for automatic priors for instance
240 | 
241 | real_t Blocks<BreakpointArray>::avgWeight() const {
242 | 	real_t sum = 0;
243 | 	for ( const auto & w : mWeights ) {
244 | 		if ( isfinite( w ) ) {
245 | 			sum += w;
246 | 		}
247 | 	}
248 | 	if ( isfinite( mWeights[0] ) ) {
249 | 		sum -= mWeights[0];	// the first element isn't really a true weight, as there is always a breakpoint before the first element
250 | 	}
251 | 	return sum / ( ( double )mWeights.size() - 1 );
252 | }
253 | 
254 | 
255 | 
256 | size_t Blocks<BreakpointArray>::start() const {
257 | 	return mBlockStart;
258 | }
259 | 
260 | 
261 | 
262 | size_t Blocks<BreakpointArray>::end() const {
263 | 	return mBlockEnd;
264 | }
265 | 
266 | 
267 | 
268 | size_t Blocks<BreakpointArray>::pos() const {
269 | 	if ( mBlockCounter > 0 ) {
270 | 		return mBlockCounter - 1;
271 | 	} else {
272 | 		throw runtime_error( "No blocks created yet, position is undefined!" );
273 | 	}
274 | }
275 | 
276 | 
277 | 
278 | // Return the size of the current block.
279 | 
280 | size_t Blocks<BreakpointArray>::blockSize() const {
281 | 	return mBlockSize ;
282 | }
283 | 
284 | 
285 | 
286 | // Return the total size, i.e. the sum of all block sizes.
287 | 
288 | size_t Blocks<BreakpointArray>::size() const {
289 | 	return mSize;
290 | }
291 | 
292 | 
293 | 
294 | // Return the number of blocks induced by the current threshold. This can only be called once the iteration is complete.
295 | 
296 | size_t Blocks<BreakpointArray>::nrBlocks() const {
297 | 	if ( mDirection != unset ) {
298 | 		throw runtime_error( "Cannot determine size of block structure before all blocks have been seen!" );
299 | 	}
300 | 	return mBlockCounter;
301 | }
302 | 
303 | 
304 | 
305 | void Blocks<BreakpointArray>::printBlock() const {
306 | 	cout << "[" << mBlockStart << ":" << mBlockEnd << ") " << mBlockSize << " ";
307 | }
308 | 
309 | #endif
310 | 


--------------------------------------------------------------------------------
/bin/pyhammlet/plotting.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding:utf-8 -*-
  3 | 
  4 | """Generic plotting functions used by HaMMLET and associated scripts and programs. """
  5 | 
  6 | 
  7 | 
  8 | from __future__ import print_function, division
  9 | import matplotlib
 10 | matplotlib.use('Agg')
 11 | import numpy as np
 12 | import matplotlib.pyplot as plt
 13 | from matplotlib.ticker import MaxNLocator 
 14 | import os
 15 | from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes, inset_axes
 16 | from matplotlib.colors import ListedColormap, BoundaryNorm, LogNorm, Colormap
 17 | 
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | # like plt.imshow, but scaled down to a defined number of pixels 
 26 | def scaledImshow(matrix, cmap, norm,  maxNrPixels = 100000000, intType="uint16", *args, **kwargs):
 27 | 	ax = plt.gca()
 28 | 	height, width = matrix.shape
 29 | 	nrPixels = height*width
 30 | 	if nrPixels > maxNrPixels:
 31 | 		try:
 32 | 			from scipy.ndimage import zoom
 33 | 		except:
 34 | 			print("[ERROR] You must have SciPy installed to plot huge data!")
 35 | 		scalingFactor = np.sqrt(maxNrPixels/nrPixels)		
 36 | 		mat = zoom(matrix, scalingFactor, mode="nearest")
 37 | 		assert mat.ndim==2
 38 | 		ax.imshow(mat, cmap, norm,  *args, **kwargs)
 39 | 	else:
 40 | 		assert matrix.ndim==2
 41 | 		ax.imshow(matrix, cmap, norm, *args, **kwargs)
 42 | 	return ax
 43 | 
 44 | 
 45 | 
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | def lettercase(s):
 53 | 	return s[0].upper()+s[1:]
 54 | 
 55 | 
 56 | 
 57 | 
 58 | 
 59 | ### MATRIX PLOTTING, e.g. for marginals ###
 60 | 
 61 | def sortMatrix(
 62 | 	matrix,
 63 | 	axis=0,	# 1 for rows, 0 for columns
 64 | 	reverse=False):
 65 | 	return np.apply_along_axis(lambda x, r: np.array(sorted(x, reverse=r)), axis, matrix, not reverse)
 66 | 		
 67 | 	
 68 | def sortByFrequency(
 69 | 	a,
 70 | 	reverse=False):
 71 | 	binCounts = np.bincount(a)
 72 | 	idx=np.argsort(binCounts)
 73 | 	if reverse:
 74 | 		return np.repeat(np.array(range(max(a)+1))[idx], binCounts[idx])[::-1]
 75 | 	else:
 76 | 		return np.repeat(np.array(range(max(a)+1))[idx], binCounts[idx])
 77 | 
 78 | 
 79 | def sortMatrixByFrequency(
 80 | 	m,
 81 | 	axis = 0,
 82 | 	reverse=False):
 83 | 	"""Sorts matrix axes by the number of occurrences in each array, e.g. when sorting columns, each column will have its rarest element first and its most common entry last."""
 84 | 	return np.apply_along_axis(sortByFrequency, axis, m, not reverse)
 85 | 
 86 | 
 87 | 
 88 | 
 89 | def plotMatrix(
 90 | 	m,
 91 | 	xlabel="Position along chromosome",
 92 | 	ylabel="Marginal counts",
 93 | 	xstretch=1,
 94 | 	xmin=0,
 95 | 	normalize=False,
 96 | 	*args,
 97 | 	**kwargs):
 98 | 	
 99 | 	ax = plt.gca()
100 | 	ax.set_xlabel(xlabel)
101 | 	ax.set_ylabel(ylabel)
102 | 	ymax, xmax = m.shape
103 | 	if normalize:
104 | 		ymax=1
105 | 	ext = [xmin, xmin+xmax*xstretch, 0, ymax]	# TODO include xstretch in xmin?
106 | 	scaledImshow(m, extent = ext, aspect="auto", origin="lower", interpolation="none",  *args, **kwargs)
107 | 	return ax
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
def matrixQuantilePlot(
	data, 
	quantiles = range(5, 100, 5),
	xlabel="Iteration", 
	ylabel="F-measure (quantiles)", 
	cmap="Blues",
	mincolor = 0.1,
	maxcolor=0.9,
	ylim = None,
	insetXlim=None, 
	insetYlim=None, 
	insetXticks=None, 
	insetYticks=None, 
	insetWidth="40%", 
	insetHeight="40%", 
	insetLoc=4):
	"""Plot nested quantile bands of ``data`` on the current axes.

	For every column of ``data`` the requested percentiles are computed over the
	rows. The i-th lowest and i-th highest percentile are drawn as thin lines with
	the area between them filled; the 50th percentile is drawn as a thick black line.
	``quantiles`` is expected to be sorted and symmetric around 50, since bands pair
	entry ``i`` with entry ``-i-1``.

	Args:
		data: 2-D array; ``data.shape[1]`` is used as the number of x positions.
		quantiles: percentiles (0-100) to compute.
		xlabel, ylabel: axis labels.
		cmap: name of the colormap used for the bands.
		mincolor, maxcolor: control which part of the colormap is used.
		ylim: optional pair; see the review note at its use below.
		insetXlim: if not None, an inset showing the same curves is added.
		insetYlim, insetXticks, insetYticks: limits and ticks of the inset.
		insetWidth, insetHeight, insetLoc: size and location of the inset.

	Returns:
		None.
	"""
	from mpl_toolkits.axes_grid1.inset_locator import inset_axes
	inset = insetXlim is not None
	percentiles = np.percentile(data, quantiles, axis=0)
	iterations = data.shape[1]

	ax = plt.gca()
	if inset:
		# honor the requested location instead of always using the lower right corner
		axins = inset_axes(ax, width=insetWidth, height=insetHeight, loc=insetLoc)

	colormap = plt.cm.get_cmap(cmap)
	xvalues = range(iterations)
	topLayer = len(quantiles)

	# NOTE enumerate() instead of xrange() so that this also runs under Python 3
	for i, q in enumerate(quantiles):
		if q == 50:
			ax.plot(percentiles[i], color="black",zorder=topLayer, linewidth=2)
			if inset:
				axins.plot(percentiles[i], color="black",zorder=topLayer, linewidth=2)
		if q > 50:
			# the upper half has already been drawn as the mirror image of the lower half
			break

		# NOTE(review): this divides by the color range rather than scaling into it — confirm this is intended
		color = colormap(((q/100)/(maxcolor-mincolor)+mincolor))

		lower = percentiles[i]
		upper = percentiles[-i-1]
		ax.fill_between(xvalues, lower, upper, color=color, linewidth=1, zorder=i)
		ax.plot(lower, color="black", linewidth=0.3, zorder=topLayer+1)
		ax.plot(upper, color="black", linewidth=0.3, zorder=topLayer+1)

		if inset:
			axins.fill_between(xvalues, lower, upper, color=color, linewidth=1, zorder=i)
			axins.plot(lower, color="black", linewidth=0.3, zorder=topLayer+1)
			axins.plot(upper, color="black", linewidth=0.3, zorder=topLayer+1)

	ymin = min([min(p) for p in percentiles])
	ymax = max([max(p) for p in percentiles])
	if ylim is not None:
		# NOTE(review): the lower limit can only be extended, the upper limit only be cut — confirm this asymmetry is intended
		ymin = min(ymin, ylim[0])
		ymax = min(ymax, ylim[1])

	# leave 5% of the range free around the curves
	margin = (ymax-ymin)*0.05
	ax.set_ylim([ymin-margin,ymax+margin])
	ax.set_xlim([-iterations/20, iterations+iterations/20 ])
	ax.set_xlabel(xlabel)
	ax.set_ylabel(ylabel)

	if inset:
		axins.set_xlim(insetXlim)
		axins.set_ylim(insetYlim)
		axins.xaxis.tick_top()
		# keep matplotlib's automatic ticks if none were requested
		if insetXticks is not None:
			axins.set_xticks(insetXticks)
		if insetYticks is not None:
			axins.set_yticks(insetYticks)
185 | 	#plt.sca(ax) 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | # given data and an array of states of the same size, create a scatter plot for the start:end slice, colored according to the states
def plotData(
	data, 
	states=None,
	start=0, 
	end=None, 
	marker=".",
	linewidth=0,
	alpha=0.8,
	ylabel=None,
	cmap=None,
	norm=None,
	*args,
	**kwargs): 
	"""Scatter-plot ``data[start:end]`` against its positions on the current axes.

	Args:
		data: sequence of values.
		states: optional sequence of the same size as ``data``; if given, the
			points are colored by ``states[start:end]`` using ``cmap`` and
			``norm``, both of which are then required. Otherwise all points are black.
		start, end: slice to plot; ``end`` defaults to the end of ``data``.
		marker, linewidth, alpha: passed to ``Axes.scatter``.
		ylabel: label of the y axis.
		*args, **kwargs: forwarded to ``Axes.scatter``.

	Returns:
		The axes that were drawn on.
	"""
	ax = plt.gca()
	
	if states is not None:
		assert cmap is not None, "Require colormap to plot data if a state sequence is provided!"
		assert norm is not None, "Require color normalization to plot data if a state sequence is provided!"
			
	#TODO chunksize
	#TODO multivariate
	if end is None:
		# The slice ends with the data; start+len(data) would give more x positions
		# than data[start:end] has values whenever start > 0.
		end = len(data)
	if states is None:
		states="k"
	else:
		states = states[start:end]

	ax.scatter(range(start, end), data[start:end], c=states,  marker=marker, linewidth=linewidth, alpha=alpha, cmap=cmap, norm=norm, *args, **kwargs)
	ax.set_ylabel(ylabel)
	ax.set_xlim([start, end])
	return ax
228 | 
229 | 
230 | 
231 | #even blocks are white, odd are dark
def plotBlockSizes(
	compressedBlocks, 
	start=0, 
	end=None,  
	chunkSize=1, 
	*args,
	**kwargs):
	"""Plot the ``[start:end]`` slice of a compressed block structure as a matrix.

	The slice is decompressed and transposed, all entries equal to 1 are replaced
	by 0, and the result is drawn with ``plotMatrix``; ``chunkSize`` is used as the
	horizontal stretch. Returns the current axes.
	"""
	axes = plt.gca()
	blockMatrix = compressedBlocks[start:end].decompress().T
	# TODO more customizable way to plot boundaries (or not!)
	onesMask = (blockMatrix==1)
	blockMatrix[onesMask] = 0
	plotMatrix(blockMatrix,  xmin=start, xstretch=chunkSize,  *args, **kwargs)	# cmap="Blues", Spectral"
	return axes
245 | 	
246 | 
247 | # takes compressed marginal counts, extracts the [start:end] slice, expands the counts to a full matrix of iterations and plots it
def plotMarginals(
	compressedMarginalCounts, 
	cmap,
	norm,
	start=0, 
	end=None, 
	chunkSize=1, 
	*args,
	**kwargs):
	"""Plot the ``[start:end]`` slice of compressed marginal counts as a matrix.

	The slice is decompressed into a (segments x states) count matrix. Each row is
	expanded into a vector in which state ``s`` is repeated ``counts[i, s]`` times,
	and the transposed result is drawn with ``plotMatrix``. Every row must have the
	same total as the first one. Returns the current axes.
	"""
	ax = plt.gca()
	counts = compressedMarginalCounts[start:end].decompress()
	
	# expand the counts to full array
	nrSegments, nrStates = counts.shape
	iterations = counts[0,:].sum()
	stateLabels = np.array(range(nrStates))
	marginals = np.zeros((nrSegments, iterations), dtype=int)
	# NOTE range() instead of xrange() so that this also runs under Python 3
	for i in range(nrSegments):
		marginals[i,:] = np.repeat(stateLabels, counts[i,:])
	plotMatrix(marginals.T,  xstretch=chunkSize, xmin=start, cmap=cmap, norm=norm, *args, **kwargs)
	# release the expanded matrices before returning
	del marginals
	del counts
	return ax
270 | 
271 | 
272 | 
def plotSequences(
	sequences, # a list of RLE vectors
	cmap,
	norm,
	start=0, 
	end=None, 
	*args,
	**kwargs
	):
	"""Plot the ``[start:end]`` slice of several sequences as the rows of one matrix.

	Each sequence is sliced and decompressed into one row; the matrix is drawn
	with ``plotMatrix`` using "Iteration" as the y label. ``end`` defaults to the
	``size`` of the first sequence. Returns the current axes.
	"""
	ax = plt.gca()
	if end is None:
		end = sequences[0].size
	matrix = np.zeros([len(sequences), end-start], dtype=int)
	# NOTE enumerate() instead of xrange() so that this also runs under Python 3
	for i, sequence in enumerate(sequences):
		matrix[i] = sequence[start:end].decompress()
	#TODO vmin, vmax
	plotMatrix(matrix,  ylabel="Iteration", xmin=start, normalize=False, cmap=cmap, norm=norm, *args, **kwargs)
	return ax
292 | 	
293 | 	


--------------------------------------------------------------------------------
/src/SufficientStatistics.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SUFFICIENTSTATISTICS_HPP
  2 | #define SUFFICIENTSTATISTICS_HPP
  3 | 
  4 | #include "Tags.hpp"
  5 | #include "includes.hpp"
  6 | #include "Observation.hpp"
  7 | 
//NOTE empty constructors should create objects that are meaningful for not having any observations, e.g. all 0 for <Normal>
  9 | 
 10 | 
 11 | template <typename DistType>
 12 | class SufficientStatistics;
 13 | 
 14 | 
 15 | template<typename Type>
 16 | inline SufficientStatistics<Type> operator+(
 17 |     SufficientStatistics<Type> lhs,
 18 |     const SufficientStatistics<Type>&  rhs ) {
 19 | 	if ( lhs.nrDim() != rhs.nrDim() ) {
 20 | 		throw runtime_error( "Cannot add sufficient statistics for categorical of different dimensions!" );
 21 | 	}
 22 | 
 23 | 	lhs += rhs;
 24 | 	return lhs;
 25 | }
 26 | 
 27 | 
 28 | template<typename Type>
 29 | inline SufficientStatistics<Type> operator-(
 30 |     SufficientStatistics<Type> lhs,
 31 |     const SufficientStatistics<Type>&  rhs ) {
 32 | 	if ( lhs.nrDim() != rhs.nrDim() ) {
 33 | 		throw runtime_error( "Cannot add sufficient statistics for categorical of different dimensions!" );
 34 | 	}
 35 | 
 36 | 	lhs -= rhs;
 37 | 	return lhs;
 38 | }
 39 | 
 40 | 
 41 | 
 42 | // TODO define common methods outside?
 43 | 
 44 | 
 45 | 
 46 | //////////////////// UNIVARIATE NORMAL ////////////////////
 47 | 
 48 | template <>
 49 | class SufficientStatistics<Normal> {
 50 | 		real_t mSum;
 51 | 		real_t mSumSq;
 52 | 
 53 | 
 54 | 
 55 | 	public:
 56 | 
 57 | 		SufficientStatistics() {
 58 | 			clear();
 59 | 		};
 60 | 
 61 | 		SufficientStatistics(
 62 | 		    real_t singleValue ) {
 63 | 			mSum = singleValue;
 64 | 			mSumSq = singleValue * singleValue;
 65 | 		}
 66 | 
 67 | 		SufficientStatistics(
 68 | 		    vector<real_t> vec,
 69 | 		    size_t begin = 0,
 70 | 		    size_t end = 0 ) {
 71 | 			if ( end == 0 || end > vec.size() ) {
 72 | 				end = vec.size();
 73 | 			}
 74 | 			clear();
 75 | 			for ( auto i = begin; i < end; ++i ) {
 76 | 				addObs( vec[i] );
 77 | 			}
 78 | 		}
 79 | 
 80 | 		SufficientStatistics(
 81 | 		    const real_t sum,
 82 | 		    const real_t sumSq
 83 | 		) {
 84 | 			mSum = sum;
 85 | 			mSumSq = sumSq;
 86 | 		}
 87 | 
 88 | 		inline void addObs( const real_t x ) {
 89 | 			mSum += x;
 90 | 			mSumSq += x * x;
 91 | 		}
 92 | 
 93 | 
 94 | 		size_t nrDim() const {
 95 | 			return 1;
 96 | 		}
 97 | 
 98 | 		real_t sum() const {
 99 | 			return mSum;
100 | 		}
101 | 
102 | 		real_t sumSq() const {
103 | 			return mSumSq;
104 | 		}
105 | 
106 | 
107 | 		SufficientStatistics<Normal> operator-() const {
108 | 			SufficientStatistics<Normal> result( -mSum, -mSumSq );
109 | 			return result;
110 | 
111 | 		}
112 | 
113 | 		SufficientStatistics<Normal>&  operator+=(
114 | 		    const SufficientStatistics<Normal>&  rhs )  {
115 | 			mSum += rhs.sum();
116 | 			mSumSq += rhs.sumSq();
117 | 			return *this;
118 | 		}
119 | 
120 | 		SufficientStatistics<Normal>&  operator-=(
121 | 		    const SufficientStatistics<Normal>&  rhs )  {
122 | 			mSum -= rhs.sum();
123 | 			mSumSq -= rhs.sumSq();
124 | 			return *this;
125 | 		}
126 | 
127 | 
128 | 		void clear() {
129 | 			mSum = 0;
130 | 			mSumSq = 0;
131 | 		}
132 | 
133 | 
134 | // TODO move outside?
135 | 		friend ostream& operator<<(
136 | 		    ostream& output,
137 | 		    const SufficientStatistics<Normal>& D )  {
138 | 			output << D.mSum << "\t" << D.mSumSq;
139 | 			return output;
140 | 		};
141 | 
142 | };
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 
152 | //////////////////// Categorical ////////////////////
153 | 
154 | 
155 | template <>
156 | class SufficientStatistics<Categorical> {
157 | 		vector<size_t> mCounts;
158 | 
159 | 	public:
160 | 
161 | 		SufficientStatistics( size_t domainsize ) {
162 | 			mCounts.assign( domainsize, 0 );
163 | 		}
164 | 
165 | 		SufficientStatistics( vector<size_t>& counts ) {
166 | 			mCounts = counts;
167 | 		}
168 | 
169 | 		SufficientStatistics<Categorical>&  operator+=(
170 | 		    const SufficientStatistics<Categorical>&  rhs )  {
171 | 			if ( domainSize() != rhs.domainSize() ) {
172 | 				throw runtime_error( "Cannot add sufficient statistics for categorical of different domain sizes!" );
173 | 			}
174 | 
175 | 			for ( size_t i = 0; i < mCounts.size(); ++i ) {
176 | 				mCounts[i] += rhs[i];
177 | 			}
178 | 
179 | 			return *this;
180 | 		}
181 | 
182 | 		SufficientStatistics<Categorical>&  operator-=(
183 | 		    const SufficientStatistics<Categorical>&  rhs )  {
184 | 			if ( domainSize() != rhs.domainSize() ) {
185 | 				throw runtime_error( "Cannot add sufficient statistics for categorical of different domain sizes!" );
186 | 			}
187 | 
188 | 			for ( size_t i = 0; i < mCounts.size(); ++i ) {
189 | 				mCounts[i] -= rhs[i];
190 | 			}
191 | 
192 | 			return *this;
193 | 		}
194 | 
195 | 		size_t nrDim() const {
196 | 			return 1;	// NOTE This is correct, these are the sufficient statistics for a 1-D categorical variable.
197 | 		}
198 | 
199 | 
200 | 		// how many different values can the underlying categorical variable take?
201 | 		size_t domainSize() const {
202 | 			return mCounts.size();
203 | 		}
204 | 
205 | 		const size_t& operator[]( size_t i ) const {
206 | 			return mCounts[i];
207 | 		}
208 | 
209 | 		size_t& operator[]( size_t i ) {
210 | 			return mCounts[i];
211 | 		}
212 | 
213 | 		void clear() {
214 | 			mCounts.assign( mCounts.size(), 0 );
215 | 		}
216 | 
217 | 		string str() const {
218 | 			return concat( mCounts );
219 | 		}
220 | };
221 | 
222 | 
223 | 
224 | //////////////////// Categorical Vector ////////////////////
// NOTE if nrDim() == domainSize(), then this is a square matrix that can be used for counting transitions
226 | 
227 | 
228 | template <>
229 | class SufficientStatistics<CategoricalVector> {
230 | 		vector < SufficientStatistics<Categorical>> mCounts;
231 | 
232 | 	public:
233 | 
234 | 		SufficientStatistics(
235 | 		    size_t nrdim,
236 | 		    size_t domainsize = 0 ) {
237 | 			if ( domainsize == 0 ) {
238 | 				domainsize = nrdim;
239 | 			}
240 | 
241 | 			for ( size_t d = 0; d < nrdim; ++d ) {
242 | 				mCounts.push_back( SufficientStatistics< Categorical >( domainsize ) );
243 | 			}
244 | 		}
245 | 
246 | 		size_t nrDim() const {
247 | 			return mCounts.size();
248 | 		}
249 | 
250 | 
251 | 
252 | 		const SufficientStatistics<Categorical>& operator[]( int i ) const {
253 | 			return mCounts[i];
254 | 		}
255 | 
256 | 
257 | 		SufficientStatistics<Categorical>& operator[]( int i )  {
258 | 			return mCounts[i];
259 | 		}
260 | 
261 | 
262 | 		SufficientStatistics<CategoricalVector>&  operator+=(
263 | 		    const SufficientStatistics<CategoricalVector>&  rhs )  {
264 | 
265 | 			for ( size_t i = 0; i < nrDim(); ++i ) {
266 | 				mCounts[i] += rhs[i];
267 | 			}
268 | 
269 | 			return *this;
270 | 		}
271 | 
272 | 		SufficientStatistics<CategoricalVector>&  operator-=(
273 | 		    const SufficientStatistics<CategoricalVector>&  rhs )  {
274 | 
275 | 			for ( size_t i = 0; i < nrDim(); ++i ) {
276 | 				mCounts[i] -= rhs[i];
277 | 			}
278 | 
279 | 			return *this;
280 | 		}
281 | 
282 | 
283 | 
284 | 
285 | 		string str() const {
286 | 			string result = "";
287 | 			for ( auto & c : mCounts ) {
288 | 				result += c.str() + "\n";
289 | 			}
290 | 			return result;
291 | 		}
292 | 
293 | 
294 | 		void clear() {
295 | 			for ( auto & c : mCounts ) {
296 | 				c.clear();
297 | 			}
298 | 		}
299 | 
300 | };
301 | 
302 | 
303 | 
304 | 
305 | 
306 | //////////////////// Geometric ////////////////////
307 | 
308 | 
309 | template <>
310 | class SufficientStatistics<Geometric> {
311 | 		size_t mSum;
312 | 
313 | 	public:
314 | 
315 | 		SufficientStatistics() {
316 | 			mSum = 0;
317 | 		}
318 | 
319 | 		SufficientStatistics( real_t& sum ) {	// TODO parsing real_t for count data is not the best option
320 | 			mSum = sum;
321 | 		}
322 | 
323 | 		SufficientStatistics( size_t& sum ) {
324 | 			mSum = sum;
325 | 		}
326 | 
327 | 
328 | 		SufficientStatistics(
329 | 		    vector<real_t> vec,
330 | 		    size_t begin = 0,
331 | 		    size_t end = 0 ) {
332 | 			if ( end == 0 || end > vec.size() ) {
333 | 				end = vec.size();
334 | 			}
335 | 			clear();
336 | 			for ( auto i = begin; i < end; ++i ) {
337 | 				mSum += vec[i] ;
338 | 			}
339 | 		}
340 | 
341 | 
342 | 		SufficientStatistics<Geometric>&  operator+=(
343 | 		    const SufficientStatistics<Geometric>&  rhs )  {
344 | 			if ( domainSize() != rhs.domainSize() ) {
345 | 				throw runtime_error( "Cannot add sufficient statistics for categorical of different domain sizes!" );
346 | 			}
347 | 
348 | 			mSum += rhs.sum();
349 | 
350 | 			return *this;
351 | 		}
352 | 
353 | 
354 | 		SufficientStatistics<Geometric>&  operator-=(
355 | 		    const SufficientStatistics<Geometric>&  rhs )  {
356 | 			if ( domainSize() != rhs.domainSize() ) {
357 | 				throw runtime_error( "Cannot add sufficient statistics for categorical of different domain sizes!" );
358 | 			}
359 | 
360 | 			mSum -= rhs.sum();
361 | 
362 | 			return *this;
363 | 		}
364 | 
365 | 
366 | 		size_t nrDim() const {
367 | 			return 1;	// NOTE This is correct, these are the sufficient statistics for a 1-D categorical variable.
368 | 		}
369 | 
370 | 
371 | 		// how many different values can the underlying categorical variable take?
372 | 		size_t domainSize() const {
373 | 			return 1;
374 | 		}
375 | 
376 | 		size_t sum() const {
377 | 			return mSum;
378 | 		}
379 | 
380 | 		void clear() {
381 | 			mSum = 0;
382 | 		}
383 | 
384 | 		string str() const {
385 | 			return to_string( mSum );
386 | 		}
387 | 
388 | };
389 | 
390 | #endif
391 | 
392 | 
393 | 
394 | 
395 | 
396 | 
397 | 
398 | 
399 | 
400 | 
401 | 
402 | 
403 | 
404 | 


--------------------------------------------------------------------------------