├── images ├── baf.gof.png ├── baf.post.png ├── baf.real.png ├── cna.gof.png ├── cna.post.png ├── cna.real.png ├── snv.gof.png ├── filterHD-1.png ├── prefilter-1.png └── screenshots │ ├── cloneHD-mean.png │ ├── cloneHD-avail.png │ ├── cloneHD-posterior.png │ ├── cloneHD-stdout-1.png │ ├── cloneHD-stdout-2.png │ ├── cloneHD-subclone.png │ ├── cloneHD-summary.png │ ├── filterHD-stdout.png │ ├── filterHD-posterior-1.png │ ├── filterHD-posterior-2.png │ └── filterHD-posterior-3.png ├── .gitignore ├── src ├── log-space.h ├── common-functions.h ├── Makefile.farm.debug ├── Makefile.debug ├── Makefile.farm ├── Makefile ├── jump-diffusion.h ├── minimization.h ├── log-space.cpp ├── cloneHD-functions.h ├── cloneHD-inference.h ├── emission.h ├── clone-predict.cpp ├── common-functions.cpp ├── clone-bulk.cpp ├── clone-llh.cpp ├── clone.h ├── pre-filter.cpp ├── clone-prior.cpp ├── minimization.cpp ├── jump-diffusion.cpp ├── cloneHD.cpp └── filterHD.cpp ├── TODO.md ├── docs ├── README-pre-filter.md ├── README-filterHD.md └── README-cloneHD.md ├── run-example.sh ├── changelog.md └── README.md /images/baf.gof.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/baf.gof.png -------------------------------------------------------------------------------- /images/baf.post.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/baf.post.png -------------------------------------------------------------------------------- /images/baf.real.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/baf.real.png -------------------------------------------------------------------------------- /images/cna.gof.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/cna.gof.png -------------------------------------------------------------------------------- /images/cna.post.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/cna.post.png -------------------------------------------------------------------------------- /images/cna.real.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/cna.real.png -------------------------------------------------------------------------------- /images/snv.gof.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/snv.gof.png -------------------------------------------------------------------------------- /images/filterHD-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/filterHD-1.png -------------------------------------------------------------------------------- /images/prefilter-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/prefilter-1.png -------------------------------------------------------------------------------- /images/screenshots/cloneHD-mean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-mean.png -------------------------------------------------------------------------------- /images/screenshots/cloneHD-avail.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-avail.png -------------------------------------------------------------------------------- /images/screenshots/cloneHD-posterior.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-posterior.png -------------------------------------------------------------------------------- /images/screenshots/cloneHD-stdout-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-stdout-1.png -------------------------------------------------------------------------------- /images/screenshots/cloneHD-stdout-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-stdout-2.png -------------------------------------------------------------------------------- /images/screenshots/cloneHD-subclone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-subclone.png -------------------------------------------------------------------------------- /images/screenshots/cloneHD-summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-summary.png -------------------------------------------------------------------------------- /images/screenshots/filterHD-stdout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-stdout.png 
-------------------------------------------------------------------------------- /images/screenshots/filterHD-posterior-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-posterior-1.png -------------------------------------------------------------------------------- /images/screenshots/filterHD-posterior-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-posterior-2.png -------------------------------------------------------------------------------- /images/screenshots/filterHD-posterior-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-posterior-3.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | run-bulk* 2 | run-test* 3 | *.[oa] 4 | build/* 5 | build-*tar.gz 6 | *~ 7 | release/ 8 | release* 9 | *.bk 10 | bk/ 11 | recompile* 12 | .DS_Store 13 | test/ 14 | -------------------------------------------------------------------------------- /src/log-space.h: -------------------------------------------------------------------------------- 1 | //log_space.h 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | // GSL headers... 
17 | #include "gsl/gsl_vector.h" 18 | #include "gsl/gsl_matrix.h" 19 | #include "gsl/gsl_blas.h" 20 | 21 | double log_add(double one, double two); 22 | double log_sub(double one, double two); 23 | void log_vector_add(gsl_vector * one, gsl_vector * two); 24 | void log_matrix_add(gsl_matrix * one, gsl_matrix * two); 25 | void log_vector_invert(gsl_vector * vec); 26 | double log_vector_norm(const gsl_vector * x); 27 | double log_matrix_norm(const gsl_matrix * M); 28 | void log_vector_normalize(gsl_vector * x); 29 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # To do list for filterHD/cloneHD 2 | 3 | # Bugs/issues to be fixed 4 | 5 | ## filterHD 6 | 7 | ## cloneHD 8 | 9 | * print full if `--*jump` is (also) given, only events if `--*jumps` 10 | * check memory leaks 11 | 12 | # Features to be added in future releases 13 | 14 | ## filterHD 15 | 16 | * filter loci incompatible with emission model via 2-state HMM 17 | * do Baum-Welch 18 | * `--filter-shortSeg [int]` via posterior jump-prob (not via pmean) 19 | * Use diffusion constant that scales with size in mode 3/4. 20 | * Implement jump subtraction 21 | 22 | ## cloneHD 23 | 24 | * bias field as distribution 25 | * re-think update_snp_fixed (hashing) 26 | * different nLevels per chr via max-tcn (for memory efficiency only) 27 | 28 | ## both 29 | 30 | * string chromosome ids 31 | * re-factor emission class 32 | -------------------------------------------------------------------------------- /src/common-functions.h: -------------------------------------------------------------------------------- 1 | //common-functions.h 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | // GSL headers... 
18 | #include "gsl/gsl_vector.h" 19 | #include "gsl/gsl_matrix.h" 20 | #include "gsl/gsl_randist.h" 21 | #include "gsl/gsl_blas.h" 22 | 23 | //own headers 24 | class Emission; 25 | 26 | using namespace std; 27 | 28 | void get_dims( const char * data_fn, int& nTimes, vector& chrs, vector& nSites, int keep); 29 | void get_data( const char * data_fn, Emission * myEmit); 30 | void get_bias( const char * bias_fn, Emission * myEmit); 31 | double get_mean( gsl_vector * dist, double xmin, double xmax); 32 | double get_var( gsl_vector * dist, double xmin, double xmax, double mean); 33 | -------------------------------------------------------------------------------- /src/Makefile.farm.debug: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | GSL = /usr/lib/libgsl.so /usr/lib/libgslcblas.so 3 | CC_FLAGS = -Wall -ggdb -static-libgcc -static-libstdc++ 4 | LD_FLAGS = ${GSL} 5 | prefobj = emission.o common-functions.o 6 | filterHDobj = emission.o common-functions.o jump-diffusion.o minimization.o 7 | cloneobj = clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o 8 | cloneHDobj = $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o 9 | .PHONY: clean all 10 | all: preFilter filterHD cloneHD 11 | #rm -f ./*.o 12 | preFilter: $(prefobj) pre-filter.o 13 | $(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS) 14 | filterHD: $(filterHDobj) filterHD.o 15 | $(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS) 16 | cloneHD: $(cloneHDobj) cloneHD.o 17 | $(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS) 18 | %.o: %.cpp 19 | $(CC) $(CC_FLAGS) -c $< -o $@ 20 | $(cloneobj): clone.h $(cloneobj:.o=.cpp) 21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) 22 | clean: 23 | rm -f ./*.o 24 | -------------------------------------------------------------------------------- /src/Makefile.debug: 
-------------------------------------------------------------------------------- 1 | CC = g++-mp-4.7 2 | GSL = /opt/local/lib/libgsl.a /opt/local/lib/libgslcblas.a 3 | CC_FLAGS = -Wall -ggdb -static-libgcc -static-libstdc++ 4 | LD_FLAGS = ${GSL} 5 | prefobj = emission.o common-functions.o 6 | filterHDobj = emission.o common-functions.o jump-diffusion.o minimization.o 7 | cloneobj = clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o 8 | cloneHDobj = $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o 9 | .PHONY: clean all 10 | all: preFilter filterHD cloneHD 11 | #rm -f ./*.o 12 | preFilter: $(prefobj) pre-filter.o 13 | $(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS) 14 | filterHD: $(filterHDobj) filterHD.o 15 | $(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS) 16 | cloneHD: $(cloneHDobj) cloneHD.o 17 | $(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS) 18 | %.o: %.cpp 19 | $(CC) $(CC_FLAGS) -c $< -o $@ 20 | $(cloneobj): clone.h# $(cloneobj:.o=.cpp) 21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) 22 | clean: 23 | rm -f ./*.o 24 | -------------------------------------------------------------------------------- /src/Makefile.farm: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | GSL = /usr/lib/libgsl.so /usr/lib/libgslcblas.so 3 | CC_FLAGS = -Wall -O3 -static-libgcc -static-libstdc++ -fopenmp -DHAVE_INLINE 4 | LD_FLAGS = ${GSL} -fopenmp 5 | prefobj = emission.o common-functions.o 6 | filterHDobj = emission.o common-functions.o jump-diffusion.o minimization.o 7 | cloneobj = clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o 8 | cloneHDobj = $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o 9 | .PHONY: clean all 10 | all: preFilter filterHD cloneHD 11 | rm -f ./*.o 12 | preFilter: $(prefobj) 
pre-filter.o 13 | $(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS) 14 | filterHD: $(filterHDobj) filterHD.o 15 | $(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS) 16 | cloneHD: $(cloneHDobj) cloneHD.o 17 | $(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS) 18 | %.o: %.cpp 19 | $(CC) $(CC_FLAGS) -c $< -o $@ 20 | $(cloneobj): clone.h $(cloneobj:.o=.cpp) 21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) 22 | clean: 23 | rm -f ./*.o 24 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++-mp-4.7 2 | GSL = /opt/local/lib/libgsl.a /opt/local/lib/libgslcblas.a 3 | CC_FLAGS = -Wall -O3 -static-libgcc -static-libstdc++ -fopenmp -DHAVE_INLINE 4 | LD_FLAGS = ${GSL} -fopenmp 5 | prefobj = emission.o common-functions.o 6 | filterHDobj = emission.o common-functions.o jump-diffusion.o minimization.o 7 | cloneobj = clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o 8 | cloneHDobj = $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o 9 | .PHONY: clean all 10 | all: preFilter filterHD cloneHD 11 | rm -f ./*.o 12 | preFilter: $(prefobj) pre-filter.o 13 | $(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS) 14 | filterHD: $(filterHDobj) filterHD.o 15 | $(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS) 16 | cloneHD: $(cloneHDobj) cloneHD.o 17 | $(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS) 18 | %.o: %.cpp 19 | $(CC) $(CC_FLAGS) -c $< -o $@ 20 | $(cloneobj): clone.h# $(cloneobj:.o=.cpp) 21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) 22 | clean: 23 | rm -f ./*.o 24 | -------------------------------------------------------------------------------- /src/jump-diffusion.h: -------------------------------------------------------------------------------- 1 | //jump-diffusion.h 2 | 3 | #include 4 | 
#include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #ifdef _OPENMP 14 | #include 15 | #endif 16 | 17 | // GSL headers... 18 | #include "gsl/gsl_vector.h" 19 | #include "gsl/gsl_matrix.h" 20 | #include "gsl/gsl_randist.h" 21 | #include "gsl/gsl_blas.h" 22 | #include "gsl/gsl_multimin.h" 23 | 24 | using namespace std; 25 | 26 | class JumpDiffusion{ 27 | public: 28 | JumpDiffusion(Emission * emit, int time); 29 | ~JumpDiffusion(); 30 | Emission * myEmit; 31 | int nSamples; 32 | int time; 33 | int mode; 34 | double sigma, jump, rnd_emit; 35 | int Fwd_done, Bwd_done, wTotal, save_alpha; 36 | int gridSize; 37 | int * nSites; 38 | unsigned int ** dist; 39 | unsigned int ** loci; 40 | unsigned int ** mask; 41 | double ** pstay; 42 | double ** pjump; 43 | double ** pnojump; 44 | double ** bias; 45 | void get_EmitProb(int read, int depth, double * xgrid, gsl_vector * eprob);// emission probability 46 | gsl_vector * proposal; 47 | gsl_matrix ** alpha; 48 | gsl_matrix ** gamma; 49 | gsl_matrix ** total; 50 | void set_pstay(); 51 | int pstay_set; 52 | double do_Fwd(int sample); 53 | void do_Bwd(int sample); 54 | // 55 | gsl_matrix ** DiffProp; 56 | int set_DiffProp(gsl_matrix * propagator, double variance); 57 | void get_DiffProp(); 58 | int DiffProp_set; 59 | void reset_DiffProp(); 60 | vector is_identity; 61 | map position; 62 | // 63 | //void set_DiffProp_Log(gsl_matrix * propagator, double variance); 64 | int predict(gsl_vector * prior, gsl_vector * post, gsl_matrix*& DiffProp, gsl_matrix**& DP_pt, int sampe, int site); 65 | double total_llh; 66 | double get_total_llh(); 67 | void get_posterior(int sample); 68 | int adapt_range(); 69 | }; 70 | 71 | -------------------------------------------------------------------------------- /src/minimization.h: -------------------------------------------------------------------------------- 1 | //minimization.h 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 
| #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | // GSL headers... 18 | #include "gsl/gsl_vector.h" 19 | #include "gsl/gsl_matrix.h" 20 | #include "gsl/gsl_randist.h" 21 | #include "gsl/gsl_blas.h" 22 | #include "gsl/gsl_multimin.h" 23 | #include "gsl/gsl_linalg.h" 24 | 25 | using namespace std; 26 | 27 | 28 | // The numerical minimization routine to get the clone frequencies... 29 | double find_local_optimum( int nSimplex, 30 | gsl_vector**& simplex, 31 | gsl_vector * lower, 32 | gsl_vector * other, 33 | gsl_vector * range, 34 | void * params, 35 | double (*obj_fn)( const gsl_vector * x, void * p), 36 | double prec, 37 | int& steps, 38 | int verbose 39 | ); 40 | 41 | double find_optimum_wrestarts(int nSimplex, 42 | gsl_vector**& simplex, 43 | gsl_vector * lower, 44 | gsl_vector * other, 45 | gsl_vector * range, 46 | void * params, 47 | double (*obj_fn)( const gsl_vector * x, void * p), 48 | double prec, 49 | int restarts, 50 | int& steps, 51 | int verbose 52 | ); 53 | 54 | void spherical_random_step_uniform( double ri, double& rf, double lower, 55 | const gsl_vector * anglei, gsl_vector*& anglef, 56 | double eps); 57 | void simplex_random_step_uniform(const gsl_vector*simplexi, gsl_vector*& simplexf, 58 | double lower, double eps); 59 | 60 | void arg_map( int nSimplex, gsl_vector**& simplex, gsl_vector * lower, const gsl_vector * other, const gsl_vector * range, gsl_vector ** x); 61 | int arg_unmap( const gsl_vector * x, int nSimplex, gsl_vector**& simplex, gsl_vector * lower, gsl_vector * other, const gsl_vector * range); 62 | 63 | double spherical_to_simplex( double radial, const gsl_vector * angle, gsl_vector *& simplex, int getLJD); 64 | 65 | void simplex_to_spherical( const gsl_vector * simplex, double& radial, gsl_vector*& angle); 66 | 67 | double logify( double x, double R); 68 | double delogify(double y, double R); 69 | 70 | /* 71 | double simulated_annealing( 72 | gsl_matrix * freqs, 
// set of points in or on simplex 73 | gsl_vector * other, // other arguments 74 | gsl_vector * range,// range of other arguments 75 | void * params, 76 | double (*obj_fn)( const gsl_vector * x, void * p), 77 | int& steps 78 | ); 79 | int accepted(double dE, double T); 80 | */ 81 | -------------------------------------------------------------------------------- /src/log-space.cpp: -------------------------------------------------------------------------------- 1 | //log_space.cpp 2 | 3 | #define PI 3.1415926 4 | 5 | //own headers... 6 | #include "log-space.h" 7 | 8 | // adding two vectors in log space... 9 | void log_vector_add(gsl_vector * one, gsl_vector * two){ 10 | for (int i=0; i<(int) one->size; i++){ 11 | gsl_vector_set( one, i, log_add( one->data[i], two->data[i])); 12 | } 13 | } 14 | 15 | // adding two matrices in log space... 16 | void log_matrix_add(gsl_matrix * one, gsl_matrix * two){ 17 | for (int i=0; i<(int) one->size1; i++){ 18 | gsl_vector_view r1=gsl_matrix_row(one,i); 19 | gsl_vector_view r2=gsl_matrix_row(two,i); 20 | log_vector_add(&r1.vector,&r2.vector); 21 | } 22 | } 23 | 24 | // adding two scalars in log space... 
25 | double log_add(double one, double two){ 26 | if (one > two){ 27 | if (one-two > 10.0){ 28 | return( one + exp(two - one)); 29 | } 30 | else{ 31 | return( one + log(1.0 + exp(two - one))); 32 | } 33 | } 34 | else if (one < two){ 35 | if (two-one > 10.0){ 36 | return( two + exp(one - two)); 37 | } 38 | else{ 39 | return( two + log(1.0 + exp(one - two))); 40 | } 41 | } 42 | else{ 43 | return(one + log(2.0)); 44 | } 45 | } 46 | 47 | double log_sub(double one, double two){ 48 | if (two > one){ 49 | printf("ERROR in log_sub(%e,%e)\n",one,two); 50 | } 51 | if (one-two > 10.0){ 52 | return( one - exp(two - one)); 53 | } 54 | else{ 55 | return( one + log(1.0 - exp(two - one))); 56 | } 57 | } 58 | 59 | 60 | void log_vector_invert(gsl_vector * vec){ 61 | double val; 62 | for (int i=0; i<(int) vec->size; i++){ 63 | val = vec->data[i]; 64 | vec->data[i] = log(1.0 - exp(val)); 65 | } 66 | } 67 | 68 | 69 | // 1-norm of a vector in log-space 70 | double log_vector_norm(const gsl_vector * x){ 71 | double norm = 0; 72 | double max = gsl_vector_max(x); 73 | double crit = max - 10.0; 74 | for (int i=0; i<(int) x->size; i++){ 75 | if ( x->data[i] > crit) 76 | norm += exp(x->data[i] - max); 77 | } 78 | if (norm>1.1){ 79 | norm = log(norm) + max; 80 | } 81 | else{//Taylor-expansion of log(1+x) = x - 0.5*x*x 82 | norm = norm - 1.0; 83 | norm = norm*(1.0 - 0.5*norm) + max; 84 | } 85 | return(norm); 86 | } 87 | 88 | void log_vector_normalize( gsl_vector * x){ 89 | double norm = log_vector_norm(x); 90 | gsl_vector_add_constant(x,-norm); 91 | } 92 | 93 | // 1-norm of a vector in log-space 94 | double log_matrix_norm(const gsl_matrix * M){ 95 | double norm=0; 96 | double max = gsl_matrix_max(M); 97 | for (int i=0; i<(int) M->size1; i++){ 98 | for (int j=0; j<(int) M->size2; j++){ 99 | norm += exp(gsl_matrix_get(M,i,j) - max); 100 | } 101 | } 102 | norm = log(norm) + max; 103 | return(norm); 104 | } 105 | -------------------------------------------------------------------------------- 
/docs/README-pre-filter.md: -------------------------------------------------------------------------------- 1 | # pre-filter command line arguments 2 | 3 | The program `pre-filter` can be used to remove loci based on the observed read depth. It includes two heuristic filtering methods: loci are removed based on (i) their local variability and (ii) their being an outlier (see below). 4 | 5 | ![pref](/images/prefilter-1.png "Pre-filtering of read depth via matched normal.") 6 | 7 | The effect of `pre-filter` on read depth data: (A) Centromeric regions of real chromosomes often show huge large-scale variability in their read depth. But there are also many small regions with very low read depth throughout. (B) After pre-filtering, the problematic regions are masked out. (C) Removing the same regions in the tumor data improves quality visibly while retaining biologically relevant features. 8 | 9 | 10 | ## Typical usage options 11 | 12 | * `--data [file]` Input data to be pre-filtered. 13 | 14 | The file format is the same as for cloneHD's `--cna` option. Only the first sample will be used for pre-filtering. 15 | 16 | * `--pre [string:"./out"]` Set prefix for all output files. 17 | 18 | The pre-filtered loci and data are printed to a file named `pre.pref.txt`. 19 | 20 | * `--print-tracks [0/1:0]` Print the window average and window variability. 21 | 22 | The windowed tracks are used for pre-filtering. They are printed for all loci to a file named `pre.track.txt`. Use this to inspect and tune the pre-filter thresholds. 23 | 24 | * `--pick-from [file]` Pre-filter data in this file by picking loci present in `match-to`. 25 | 26 | Only loci are selected which fall into a bin also present in `match-to`. Bins in `match-to` are assumed to be of constant width with the given coordinate being the right bin end inclusive, e.g. 27 | 28 | 1000 = 1-1000 29 | 2000 = 1001-2000 30 | 4000 = 3001-4000 31 | etc. 
32 | 33 | * `--match-to [file]` Use this file as reference to pick loci in `pick-from`. 34 | 35 | Loci in this file are assumed to be equidistant (e.g. per 1 kb, not all bins need be present). The bin width is decided automatically by majority. 36 | 37 | ## Parameter options 38 | 39 | * `--window-size [int:100]` Set the window scale for smoothing (centered, +-size). 40 | 41 | * `--remove-outlier [double:3.0]` Set the outlier threshold. 42 | 43 | All loci are removed, where the observed read depth is further than this value away from the local window-average (in units of sqrt(window-average), assuming Poisson distributed read depths). If set to `0.0`, filter is not applied. 44 | 45 | * `--remove-variable [double:2.0]` Set the variability threshold. 46 | 47 | All loci are removed, where the local window-variability exceeds this multiple of the global variability. Global (local) variability is defined as median (mean) of the absolute distance of observed read depths to the global median read depth. If set to `0.0`, filter is not applied. 48 | -------------------------------------------------------------------------------- /src/cloneHD-functions.h: -------------------------------------------------------------------------------- 1 | //cloneHD-functions.h 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | // GSL headers... 
18 | #include "gsl/gsl_vector.h" 19 | #include "gsl/gsl_matrix.h" 20 | #include "gsl/gsl_randist.h" 21 | #include "gsl/gsl_blas.h" 22 | 23 | //own headers 24 | class Clone; 25 | class Emission; 26 | 27 | using namespace std; 28 | 29 | 30 | struct cmdl_opts{ 31 | const char * cna_fn; 32 | const char * baf_fn; 33 | const char * snv_fn; 34 | const char * pre; 35 | const char * bias_fn; 36 | const char * mntcn_fn; 37 | const char * maxtcn_fn; 38 | const char * avcn_fn; 39 | const char * chr_fn; 40 | const char * bulk_fn; 41 | const char * clones_fn; 42 | const char * purity_fn; 43 | const char * cna_jumps_fn; 44 | const char * baf_jumps_fn; 45 | const char * snv_jumps_fn; 46 | // 47 | int force, trials, restarts, nmax, seed, maxtcn, print_all, learn_priors; 48 | int mass_gauging; 49 | double cna_jump, baf_jump, snv_jump; 50 | double cna_shape, baf_shape, snv_shape; 51 | double cna_rnd, baf_rnd, snv_rnd; 52 | double cna_pen_zero, cna_pen_norm, cna_pen_diff; 53 | double baf_pen_comp; 54 | double snv_pen_high, snv_pen_mult; 55 | double snv_fpr, snv_fpf; 56 | double bulk_fix, bulk_sigma, bulk_rnd; 57 | double min_occ,min_jump; 58 | int bulk_mean, bulk_prior, bulk_updates; 59 | int cnaGrid, bafGrid, snvGrid, bulkGrid; 60 | }; 61 | 62 | 63 | void get_opts( int argc, const char ** argv, cmdl_opts& opts); 64 | void read_opts( const char * opts_fn, cmdl_opts& opts); 65 | void default_opts(cmdl_opts& opts); 66 | void test_opts(cmdl_opts& opts); 67 | void print_usage(); 68 | 69 | void print_all_results( Clone * myClone, cmdl_opts& opts); 70 | void print_posterior_header( FILE * fp, Clone * myClone, Emission * myEmit, cmdl_opts& opts); 71 | void print_posterior( FILE * fp, Clone * myClone, Emission * myEmit, int s, cmdl_opts& opts); 72 | void print_perclone_header( FILE * fp, Clone * myClone, Emission * myEmit, cmdl_opts& opts); 73 | void print_perclone_posterior( FILE ** fp, Clone * myClone, Emission * myEmit, int s, cmdl_opts& opts); 74 | void print_mean_tcn( FILE * mntcn_fp, 
Clone * myClone, Emission * cnaEmit, int s, cmdl_opts& opts); 75 | void print_avail_cn( FILE * avcn_fp, Clone * myClone, Emission * cnaEmit, int s, cmdl_opts& opts); 76 | void print_gof( Clone * myClone, Emission * myEmit, cmdl_opts& opts); 77 | 78 | void get_cna_data(Emission * cnaEmit, cmdl_opts& opts, int& nTimes); 79 | void get_baf_data(Emission * bafEmit, cmdl_opts& opts, int& nTimes, int& nT); 80 | void get_snv_data(Emission * snvEmit, cmdl_opts& opts, int& nTimes, int& nT); 81 | void get_snv_bulk_prior( Clone * myClone, cmdl_opts& opts); 82 | void get_track(const char * fn, gsl_matrix **& dist, double **& mn, double **& var, Emission * myEmit); 83 | void match_jumps(const char * jumps_fn, Emission * myEmit); 84 | void get_maxtcn_input(const char * maxtcn_fn, int maxtcn_gw, Clone * myClone); 85 | void get_mean_tcn( const char * mtcn_fn, Clone * myClone, Emission * myEmit); 86 | void get_avail_cn( const char * avcn_fn, Clone * myClone, Emission * myEmit); 87 | void get_purity( const char * purity_fn, gsl_vector *& purity); 88 | void get_fixed_clones( gsl_matrix *& clones, gsl_vector *& mass, const char * clones_fn, int nTimes); 89 | void get_jump_probability( Clone * myClone, cmdl_opts& opts); 90 | void get_bias_field( Clone * myClone, cmdl_opts& opts); 91 | void print_llh_for_set(gsl_matrix * clones, gsl_vector * mass, Clone * myClone, cmdl_opts& opts); 92 | -------------------------------------------------------------------------------- /src/cloneHD-inference.h: -------------------------------------------------------------------------------- 1 | //cloneHD-inference.h 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | // GSL headers... 
18 | #include "gsl/gsl_vector.h" 19 | #include "gsl/gsl_matrix.h" 20 | #include "gsl/gsl_randist.h" 21 | #include "gsl/gsl_blas.h" 22 | 23 | class Clone; 24 | class Emission; 25 | 26 | int infer_clones( gsl_matrix * Clones, gsl_vector * Mass, Clone * myClone, cmdl_opts& opts); 27 | 28 | double get_clones( gsl_matrix *& clones, 29 | gsl_matrix *& Clones, 30 | gsl_vector *& mass, 31 | gsl_vector *& Mass, 32 | gsl_matrix *& priors, 33 | Clone * myClone, 34 | cmdl_opts& opts, 35 | double& cl, 36 | double& bl, 37 | double& sl); 38 | 39 | double get_clones_cna( gsl_matrix *& clones, 40 | gsl_matrix *& Clones, 41 | gsl_vector *& mass, 42 | gsl_vector *& Mass, 43 | Clone * myClone, 44 | cmdl_opts& opts, 45 | double& cl, 46 | double& bl, 47 | double& sl); 48 | 49 | double get_clones_baf( gsl_matrix *& clones, 50 | gsl_matrix *& Clones, 51 | Clone * myClone, 52 | cmdl_opts& opts 53 | ); 54 | 55 | double get_clones_snv_ncorr( gsl_matrix *& clones, 56 | gsl_matrix *& Clones, 57 | gsl_matrix *& priors, 58 | Clone * myClone, 59 | cmdl_opts& opts 60 | ); 61 | 62 | double get_clones_snv_wcorr( gsl_matrix *& clones, 63 | gsl_matrix *& Clones, 64 | Clone * myClone, 65 | cmdl_opts& opts 66 | ); 67 | 68 | 69 | double cna_only_mass_noclones( gsl_vector *& mass, Clone * myClone, int restarts, int& steps); 70 | //double cna_only_clones_mass( gsl_matrix*& clones, gsl_vector*& mass, Clone * myClone, int& steps); 71 | double cna_clones_fixed_mass( gsl_matrix*& clones, Clone * myClone, int restarts, 72 | int& steps, double& cl, double& bl, double& sl); 73 | double cna_mass_fixed_clones(gsl_vector*& mass, Clone * myClone, int restarts, 74 | int& steps, double& cl, double& bl, double& sl); 75 | 76 | double cna_clones_mass( gsl_matrix*& clones, gsl_vector*& mass, Clone * myClone, int restarts, 77 | int& steps, double& cl, double& bl, double& sl); 78 | 79 | 80 | void get_candidate_masses( gsl_matrix * clones, 81 | gsl_vector * mass, 82 | Clone * myClone, 83 | gsl_matrix*& candidate_masses, 84 | 
gsl_vector*& levels, 85 | double min_occ); 86 | 87 | double cna_llh_all_fixed(Clone * myClone); 88 | 89 | double baf_clones( gsl_matrix*& clones, Clone* myClone, int restarts, int& steps); 90 | 91 | double snv_clones_fixed_priors( gsl_matrix*& clones, Clone * myClone, int restarts, int& steps); 92 | void snv_iterative_bulk_update( double& llh, gsl_matrix*& clones, Clone * myClone, int iter); 93 | double snv_priors_fixed_clones( gsl_matrix*& priors, Clone * myClone, int restarts, int& steps); 94 | double snv_clones_priors( gsl_matrix*& clones, gsl_matrix*& priors, Clone * myClone, int restarts, int& steps); 95 | void snv_bulk_update(Clone * myClone); 96 | 97 | void set_random_start_freq(gsl_vector *& freq, double lower); 98 | void report_results( double cl, double bl, double sl, int steps, gsl_vector * mass, gsl_matrix * freq); 99 | 100 | double Q( const gsl_vector * x, void * p); 101 | struct Q_par{ 102 | Clone * myClone; 103 | int nSimplex; 104 | vector simplexD; 105 | int clones_fixed; 106 | int mass_fixed; 107 | int prior_fixed; 108 | int cna,baf,snv; 109 | }; 110 | -------------------------------------------------------------------------------- /run-example.sh: -------------------------------------------------------------------------------- 1 | # RUN filterHD & cloneHD FOR A SIMULATED EXAMPLE DATA SET 2 | 3 | # fix the number of threads 4 | export OMP_NUM_THREADS=4; 5 | 6 | part=$1 7 | 8 | # input data 9 | data="./test/data/" 10 | results="./test/results/" 11 | filterHD="./build/filterHD" 12 | cloneHD="./build/cloneHD" 13 | 14 | normalCNA="${data}/normal.cna.txt" 15 | tumorCNA="${data}/tumor.cna.txt" 16 | tumorBAF="${data}/tumor.baf.txt" 17 | tumorSNV="${data}/tumor.snv.txt" 18 | bias="${results}/normal.cna.posterior-1.txt" 19 | tumorCNAjumps="${results}/tumor.cna.bias.jumps.txt" 20 | tumorBAFjumps="${results}/tumor.baf.jumps.txt" 21 | 22 | ### filterHD ### 23 | if [ -z $part ] || [ $part -eq 1 ] 24 | then 25 | echo "*** filterHD ***" 26 | echo 27 | 28 | 
#emission modes: 29 | # 1: Binomial 30 | # 2: Beta-Binomial 31 | # 3: Poisson 32 | # 4: Negative Binomial 33 | 34 | # The normal read depth is analysed to estimate the technical read depth modulation. This will later be used to account 35 | # for the bias field in cloneHD. In principle, jumps are not expected (so could set --jump 0). The simulations do not have 36 | # random emissions. 37 | cmd="$filterHD --data $normalCNA --mode 3 --pre ${results}/normal.cna" 38 | echo $cmd 39 | $cmd 40 | echo 41 | 42 | # The tumor read depth is first analysed without bias to get a benchmark for the LLH value. The result will not be used later. 43 | # In the tumor data, we do expect jumps, but we actually would like to learn the jumps only accounting for the bias field (below). 44 | cmd="$filterHD --data $tumorCNA --mode 3 --pre ${results}/tumor.cna" 45 | echo $cmd 46 | $cmd 47 | echo 48 | 49 | # The tumor read depth is now analysed with the bias field from the matched normal. The diffusion constant is set to zero. 50 | # If left free, it should converge to a very small value. The jump rate could be slightly higher. The LLH should be higher than 51 | # for the run above indicating the presence of the bias field. Now we are interested in the jumps. 52 | cmd="$filterHD --data $tumorCNA --mode 3 --pre ${results}/tumor.cna.bias --bias $bias --sigma 0 --jumps 1" 53 | echo $cmd 54 | $cmd 55 | echo 56 | 57 | # The tumor BAF data is analysed, mainly to get the emission parameters (shape, rnd) and jumps. In principle, there could be jumps 58 | # visible in the BAF data, but not in the read depth (copy number neutral LOH within chromosomes). Diffusion should be switched off. 
59 | cmd="$filterHD --data $tumorBAF --mode 1 --pre ${results}/tumor.baf --sigma 0 --jumps 1 --reflect 1 --dist 1" 60 | echo $cmd 61 | $cmd 62 | echo 63 | fi 64 | 65 | if [ -z $part ] || [ $part -eq 2 ] 66 | then 67 | ### cloneHD ### 68 | echo "*** cloneHD ***" 69 | echo "True mass and cell fractions:" `cat test/data/clones.txt` 70 | echo 71 | # The CNA and BAF data is analysed for subclonality. 72 | # Try varying the --min-jump, --force and --max-tcn values and try --mass-gauging 0. 73 | # Try adding the SNV data to the mix. 74 | cmd="$cloneHD --cna $tumorCNA --baf $tumorBAF --pre ${results}/tumor --bias $bias --seed 123 --trials 2\ 75 | --nmax 3 --force --max-tcn 4 --cna-jumps $tumorCNAjumps --baf-jumps $tumorBAFjumps --min-jump 0.01 --restarts 10 --mass-gauging 1" 76 | echo $cmd 77 | $cmd 78 | echo 79 | cat ${results}/tumor.summary.txt 80 | echo 81 | 82 | # Using the information from above, the SNV data is analysed. Try what happens removing the --avail-cn and --mean-tcn options. 83 | cmd="$cloneHD --snv $tumorSNV --pre ${results}/tumorSNV --seed 123 --trials 2\ 84 | --nmax 3 --force --max-tcn 4 --restarts 10 --mean-tcn ${results}/tumor.mean-tcn.txt --avail-cn ${results}/tumor.avail-cn.txt" 85 | echo $cmd 86 | $cmd 87 | echo 88 | cat ${results}/tumorSNV.summary.txt 89 | fi -------------------------------------------------------------------------------- /src/emission.h: -------------------------------------------------------------------------------- 1 | //emission.h 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | //#include 14 | #include 15 | #include 16 | #include 17 | 18 | // GSL headers... 
19 | #include "gsl/gsl_vector.h" 20 | #include "gsl/gsl_matrix.h" 21 | #include "gsl/gsl_randist.h" 22 | #include "gsl/gsl_blas.h" 23 | #include "gsl/gsl_sf_gamma.h" 24 | #include "gsl/gsl_sf_psi.h" 25 | #include "gsl/gsl_statistics_double.h" 26 | #include "gsl/gsl_sort.h" 27 | #include "gsl/gsl_cdf.h" 28 | 29 | 30 | 31 | using namespace std; 32 | 33 | /* Emission: state for one observed data track. The members visible here hold the raw observation arrays (reads, depths, loci, mask, dist), the x and y grids with their bounds and spacing, cached emission vectors keyed by two unsigned ints (EmitProb, EmitLog), optional bias, mean_tcn and av_cn arrays, and jump and event bookkeeping. The four emission routines binomial, beta_binomial, poisson and negative_binomial are presumably selected by mode (dispatch not visible in this header). NOTE(review): several template arguments are missing in this copy, e.g. in the signature of set() and in dist_count, frequent_dist and chrs. */ class Emission{ 34 | public: 35 | Emission(); 36 | void set(int ntimes, vector& chrs, vector& nsites, int grid); 37 | ~Emission(); 38 | void clear(); 39 | void delete_old_Emit(); 40 | int is_set; 41 | void set_dist(); 42 | int dist_set; 43 | int connect; 44 | double median_dist; 45 | map dist_count; 46 | map frequent_dist; 47 | int get_log, get_der, get_mv; 48 | unsigned int nmax, Nmax; 49 | //unordered_map< unsigned int, unordered_map< unsigned int, gsl_vector*> > EmitProb; 50 | //unordered_map< unsigned int, unordered_map< unsigned int, gsl_vector*> > EmitLog; 51 | map< unsigned int, map< unsigned int, gsl_vector*> > EmitProb; 52 | map< unsigned int, map< unsigned int, gsl_vector*> > EmitLog; 53 | double shape, log_shape, rnd_emit; 54 | double minRate, maxRate; 55 | // 56 | int mode, reflect, log_space; 57 | void set_EmitProb(int time); 58 | void binomial(int N, int n); 59 | void beta_binomial(int N, int n); 60 | void poisson(int N, int n); 61 | void negative_binomial(int N, int n); 62 | double get_single_EmitLog(double x, unsigned int n, unsigned int N); 63 | void get_eprob_wBias( gsl_vector * eprob, gsl_vector * emit, double b, unsigned int n, unsigned int N, int get_log); 64 | int EmitProb_set; 65 | // 66 | int nTimes, nSamples; 67 | int gridSize; 68 | double dx,xmin,xmax; 69 | double dy,ymin,ymax; 70 | double * xgrid; 71 | double * ygrid; 72 | unsigned int *** reads; 73 | unsigned int *** depths; 74 | unsigned int ** loci; 75 | unsigned int ** mask; 76 | unsigned int ** dist; 77 | unsigned int *** nObs; 78 | void get_nObs(); 79 | int * nSites; 80 | int * chr; 81 | std::set chrs; 82 | int * idx_of; 83 | int 
maxchr; 84 | double ** bias; 85 | double ** log_bias; 86 | void allocate_bias(); 87 | void allocate_mean_tcn(); 88 | void allocate_av_cn(int maxcn); 89 | int total_loci, total_events; 90 | unsigned int total_dist; 91 | void set_grid(); 92 | void init_range(int time); 93 | int range_set; 94 | void reset_mask(); 95 | double get_pval(int time, int sample, int site, double mean); 96 | void coarse_grain_jumps( int sample, double plow, int range); 97 | double ** pjump; 98 | void set_pjump(double jump); 99 | double *** mean_tcn;//mean total copy number 100 | double **** av_cn;//copy number availability 101 | void init_events(); 102 | int * nEvents; 103 | unsigned int ** Event_of_idx;// map from idx to cnv-event 104 | void map_idx_to_Event(Emission * Emit, int sample); 105 | void map_jumps(Emission * Emit); 106 | void add_break_points_via_jumps(Emission * Emit, double pmin); 107 | void get_events_via_jumps(); 108 | unsigned int ** idx_of_event;// map from event to idx 109 | unsigned int ** event_of_idx;// map from idx to event 110 | int idx_to_Event_mapped; 111 | int coarse_grained; 112 | }; 113 | 114 | 115 | bool value_comparer(std::map::value_type &i1, std::map::value_type &i2); 116 | -------------------------------------------------------------------------------- /docs/README-filterHD.md: -------------------------------------------------------------------------------- 1 | # filterHD command line arguments 2 | 3 | ## Typical usage options 4 | 5 | * `--data [file]` Input data. 6 | 7 | The file format is the same as in cloneHD for `--cna`, `--baf` or 8 | `--snv` (see [here](./README-cloneHD.md)). Multiple samples are processed independently, one by one. 9 | 10 | * `--mode [1/2/3/4]` Emission modes. 11 | 12 | 1. Binomial (for SNV data and BAF data (use with `--reflect 1`)) 13 | 2. 
Beta-Binomial (over-dispersed Binomial) 14 | 3. Poisson (for read depth data) 15 | 4. Negative-Binomial (over-dispersed Poisson) 16 | 17 | In modes 3/4, the range of the hidden emission rate is learned 18 | automatically. For modes 1/2, it is always in [0,1]. Reflective 19 | boundary conditions are used. 20 | 21 | * `--pre [string:"./out"]` Prefix for all output files. 22 | 23 | * `--dist [0/1:0]` Whether to also print the posterior distribution. 24 | 25 | The posterior mean, std-dev and jump probability are always printed to files 26 | `pre.posterior-[int].txt`, one for each sample in the input. With 1, the 27 | whole posterior distribution is also printed, so files can be big. 28 | 29 | * `--jumps [0/1:0]` Whether to print posterior jump probability. 30 | 31 | The posterior jump probability is compounded over all samples. It 32 | can be used with `--min-jump [double]` below, to consolidate jumps. 33 | 34 | * `--reflect [0/1:0]` If 1, binomial observations `n in N` and 35 | `(N-n) in N` are assumed to be identical. Use this option for BAF data. 36 | 37 | ## Parameter options 38 | 39 | The continuous state space HMM underlying filterHD is determined by the following global 40 | parameters. They can all be fixed, otherwise they are learned from the data. 41 | 42 | * `--jump [double]` Fix the jump probability per length unit (bp). 43 | * `--sigma [double]` Fix the diffusion constant. 44 | * `--shape [double]` Fix the shape parameter for modes 2/4. If >1000, use modes 1/3. 45 | * `--rnd [double]` Fix the rate of random emissions. 46 | 47 | For all of the above parameters, initial values for the numerical 48 | optimization can be given. This might be useful if you suspect several 49 | local optima and want to start in the neighbourhood of a particular one. 
50 | 51 | * `--jumpi [double]` 52 | * `--sigmai [double]` 53 | * `--shapei [double]` 54 | * `--rndi [double]` 55 | 56 | ## Further advanced options 57 | 58 | * `--min-jump [double:0.0]` Consolidate jumps down to `--min-jump`. 59 | 60 | The posterior jump probability track will be consolidated by merging neighboring jump events into 61 | unique jumps, down to the minimum value given here. Can only be used together with 62 | `--jumps 1`. 63 | 64 | * `--filter-pVal [0/1:0]` Use p-Value filter. 65 | 66 | Filter sites where the p-Value of the 67 | observation is below `10/nSites`, where `nSites` is the total number 68 | of sites in a sample. 69 | 70 | * `--filter-shortSeg [int:0]` Use short-segment filter. 71 | 72 | Filter sites within short segments between jumps. All filtered data will be in the file ending `pre.filtered.txt`, which will be in the same format as the input file. 73 | 74 | * `--grid [int:100]` Set the grid size. 75 | 76 | The grid size for the internal representation of continuous distributions. For large ranges in 77 | mode 3/4, it can make sense to increase this resolution. 78 | 79 | # filterHD output 80 | 81 | filterHD generates a few output files automatically. Here, we provide annotated screenshots for them for the simulated example data set. 82 | 83 | ## STDOUT 84 | 85 | ![stdout](/images/screenshots/filterHD-stdout.png "filterHD stdout") 86 | 87 | ## Output file 88 | 89 | ![posterior1](/images/screenshots/filterHD-posterior-1.png "filterHD posterior") 90 | 91 | The posterior mean value of the hidden emission rate and jump probabilities 92 | 93 | ![posterior2](/images/screenshots/filterHD-posterior-2.png "filterHD posterior") 94 | 95 | The same as above, but here a bias (normal) was used, so the rate is scaled accordingly. Note: in filterHD, the bias field is not scaled to have mean 1! 
96 | 97 | 98 | ![posterior3](/images/screenshots/filterHD-posterior-3.png "filterHD posterior") 99 | 100 | The same as above, but here the whole posterior distribution was requested with `--dist 1` -------------------------------------------------------------------------------- /src/clone-predict.cpp: -------------------------------------------------------------------------------- 1 | //clone-predict.cpp 2 | 3 | //own headers... 4 | #include "emission.h" 5 | #include "log-space.h" 6 | #include "clone.h" 7 | 8 | using namespace std; 9 | 10 | 11 | /* Rebuilds the CNA transition matrices: the pointer array is allocated with cnaEmit->nSamples entries on first use, then for every sample any existing matrix is freed, an nLevels x nLevels matrix is allocated and filled by the two-argument overload using cnaEmit->chr[s]. NOTE(review): the loop conditions are damaged in this copy. */ void Clone::set_TransMat_cna(){ 12 | if (TransMat_cna==NULL){ 13 | TransMat_cna = new gsl_matrix * [cnaEmit->nSamples]; 14 | for (int s=0; snSamples;s++) TransMat_cna[s] = NULL; 15 | } 16 | for (int s=0; snSamples;s++){ 17 | if (TransMat_cna[s]!=NULL) gsl_matrix_free(TransMat_cna[s]); 18 | TransMat_cna[s] = gsl_matrix_alloc(nLevels,nLevels); 19 | set_TransMat_cna( TransMat_cna[s], cnaEmit->chr[s]); 20 | } 21 | } 22 | 23 | // only one clone can change its state, 24 | // or clones in the same state can change in parallel 25 | /* Fills Trans for one chromosome. For a pair of states (i,j) the clones whose copy number differs are scanned: the first difference counts as one jump and records cni and cnf, a further difference counts again only when its change differs from cni - cnf. The entry is p when at most one jump was counted and 0.0 otherwise. Every row is then divided by its absolute sum, with abort() if that sum is not positive. NOTE(review): the loop headers and the assignment of p are missing in this copy. */ void Clone::set_TransMat_cna( gsl_matrix * Trans, int chr){ 26 | double norm,p; 27 | gsl_vector_view row; 28 | int jumps,cni,cnf; 29 | for (int i=0; i maxtcn_per_clone[chr][k]){ 35 | jumps = 2; 36 | break; 37 | }*/ 38 | if( copynumber[i][k] != copynumber[j][k]){ 39 | if ( jumps==0 ){ 40 | cni = copynumber[i][k]; 41 | cnf = copynumber[j][k]; 42 | jumps++; 43 | } 44 | //else if (cni != copynumber[i][k] || cnf != copynumber[j][k]){ 45 | else if (cni - cnf != copynumber[i][k] - copynumber[j][k]){ 46 | jumps++; 47 | } 48 | //if (copynumber[j][k] == 0) p*= 0.01; 49 | } 50 | } 51 | if (jumps <= 1){ 52 | gsl_matrix_set( Trans, i, j, p); 53 | } 54 | else{ 55 | gsl_matrix_set( Trans, i, j, 0.0); 56 | } 57 | } 58 | row = gsl_matrix_row(Trans,i); 59 | norm = gsl_blas_dasum(&row.vector); 60 | if (norm <= 0) abort(); 61 | gsl_vector_scale(&row.vector,1.0/norm); 62 | } 63 | } 64 | 65 | 66 | /* Rebuilds the SNV transition matrices: the pointer array is allocated with snvEmit->nSamples entries on first use, then for every sample any existing matrix is freed, an nLevels x nLevels matrix is allocated and filled by the two-argument overload using snvEmit->chr[s]. NOTE(review): the loop conditions are damaged in this copy. */ void 
Clone::set_TransMat_snv(){ 67 | if (TransMat_snv==NULL){ 68 | TransMat_snv = new gsl_matrix * [snvEmit->nSamples]; 69 | for (int s=0; snSamples;s++) TransMat_snv[s] = NULL; 70 | } 71 | for (int s=0; snSamples;s++){ 72 | if (TransMat_snv[s]!=NULL) gsl_matrix_free(TransMat_snv[s]); 73 | TransMat_snv[s] = gsl_matrix_alloc(nLevels,nLevels); 74 | set_TransMat_snv( TransMat_snv[s], snvEmit->chr[s]); 75 | } 76 | } 77 | 78 | 79 | // only one clone can change its state, 80 | /* Fills Trans for one chromosome. A comparison against maxtcn_per_clone[chr][k] forces jumps to 2 and stops the scan; otherwise a clone adds a jump when its copy number differs between states i and j and copynumber[i][k] does not exceed maxtcn_per_clone[chr][k]. The entry is 1.0 when at most one jump was counted and 0.0 otherwise. Rows are divided by their absolute sum; a non-positive sum prints ERROR and aborts. NOTE(review): the loop headers and the left side of the first comparison are missing in this copy. */ void Clone::set_TransMat_snv(gsl_matrix * Trans, int chr){ 81 | double norm; 82 | gsl_vector_view row; 83 | int jumps; 84 | for (int i=0; i maxtcn_per_clone[chr][k]){ 89 | jumps = 2; 90 | break; 91 | } 92 | if( copynumber[i][k] != copynumber[j][k] 93 | && copynumber[i][k] <= maxtcn_per_clone[chr][k]){ 94 | jumps++; 95 | } 96 | } 97 | if (jumps <= 1){ 98 | gsl_matrix_set(Trans,i, j, 1.0); 99 | } 100 | else{ 101 | gsl_matrix_set(Trans,i, j, 0.0); 102 | } 103 | } 104 | row = gsl_matrix_row(Trans,i); 105 | norm = gsl_blas_dasum(&row.vector); 106 | if (norm <= 0){ 107 | cout<<"ERROR\n"; 108 | abort(); 109 | } 110 | gsl_vector_scale(&row.vector,1.0/norm); 111 | } 112 | } 113 | 114 | 115 | // predict step with transition matrix... 116 | /* Transition-matrix predict step. With pj == 0.0 prior is a copy of post. Otherwise prior becomes pj * T^T * post + (1 - pj) * post via gsl_blas_dgemv. When myEmit->log_space is set, values are exponentiated before the product and prior is converted back with log(), using logzero for entries that are not positive. NOTE(review): the target of the exp() loop is damaged in this copy; if it is post, the caller's vector is left in linear space. */ void Clone::predict( gsl_vector * prior, gsl_vector * post, Emission * myEmit, double pj, gsl_matrix * T){ 117 | if (pj == 0.0){ 118 | gsl_vector_memcpy( prior, post);//no jump possible 119 | } 120 | else{ 121 | if (myEmit->log_space) for (int l=0;ldata[l] = exp(post->data[l]); 122 | gsl_vector_memcpy( prior, post); 123 | gsl_blas_dgemv( CblasTrans, pj, T, post, 1.0-pj, prior); 124 | if (myEmit->log_space){ 125 | for (int l=0;ldata[l] = prior->data[l]>0.0 ? log(prior->data[l]) : logzero; 127 | } 128 | } 129 | } 130 | } 131 | 132 | 133 | 134 | // predict step with convex mixing... 
135 | /* Convex-mixing predict step. With pj == 0.0 prior is a copy of post. Otherwise prior starts as a copy of flat and, for pj below 1.0, becomes pj * flat + (1 - pj) * post, computed with log_vector_add on log-shifted vectors when myEmit->log_space is set. Note that post is rescaled in place in that branch. */ void Clone::predict( gsl_vector * prior, gsl_vector * post, Emission * myEmit, double pj, gsl_vector * flat){ 136 | if (pj==0.0){ 137 | gsl_vector_memcpy(prior,post); 138 | } 139 | else{ 140 | gsl_vector_memcpy( prior, flat); 141 | if(pj<1.0){//convex combination 142 | if(myEmit->log_space){ 143 | gsl_vector_add_constant(prior,log(pj)); 144 | gsl_vector_add_constant(post,log(1.0-pj)); 145 | log_vector_add(prior,post); 146 | } 147 | else{ 148 | gsl_vector_scale(prior,pj); 149 | gsl_vector_scale(post,1.0-pj); 150 | gsl_vector_add(prior,post); 151 | } 152 | } 153 | } 154 | } 155 | 156 | /* Sets prior entries to 0.0 for states that fail a comparison against maxtcn_per_clone[chr][j], renormalises prior by its absolute sum (abort() if not positive) and, when log_space is non-zero, converts the result to logs with logzero for entries that are not positive. NOTE(review): the loop headers and the left side of the comparison are missing in this copy, so it cannot be seen here whether a log-space input is exponentiated first. */ void Clone::apply_maxtcn_mask(gsl_vector * prior, int chr, int log_space){ 157 | for (int l=0;l maxtcn_per_clone[chr][j] ){ 160 | prior->data[l] = 0.0; 161 | break; 162 | } 163 | } 164 | } 165 | double norm = gsl_blas_dasum(prior); 166 | if (norm <= 0.0) abort(); 167 | gsl_vector_scale(prior,1.0/norm); 168 | if(log_space){ 169 | for (int l=0;ldata[l] = prior->data[l]>0.0 ? log(prior->data[l]) : logzero; 171 | } 172 | } 173 | } 174 | 175 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | # changelog for cloneHD/filterHD 2 | 3 | ## v1.17.9 / to come 4 | 5 | * bug fix for sparse data with singletons in a chr (bug-001) 6 | 7 | ## v1.17.8 / 29.05.2014 8 | 9 | * added checks whether files are open for writing 10 | * changed to new defaults: `--(cna/baf/snv)-rnd [double:1.0e-6]` (nan) 11 | * allowed `--cna-jump -1` and `--baf-jump -1` (no jumps) 12 | * `--cna-jumps [baf-jumps-file]` and vice versa enabled (useful for exome data) 13 | * jumps read and integrated with new function match_jumps() (not get_track()). 14 | * fixed bug when chromosomes have no non-zero observations. 15 | 16 | ## v1.17.7 / 25.04.2014 17 | 18 | * fixed range error in `pre-filter` in pick-from/match-to mode. 
19 | 20 | ## v1.17.6 / 24.04.2014 21 | 22 | * fixed nan bug in GOF, when N==0 (missing data). 23 | * fixed bugs in `pre-filter`, when `--window-size` is greater than length 24 | * fixed bug in `pre-filter` in pick-from/match-to mode 25 | 26 | ## v1.17.5 / 22.04.2014 27 | 28 | * fixed memory alloc bug in pre-filter 29 | * abandon ftp site for releases, used only for backup and beta 30 | 31 | ## v1.17.4 / 10.04.2014 32 | 33 | * fixed fatal bug in snv-mode with correlations 34 | 35 | ## v1.17.3 / 04.04.2014 36 | 37 | * new program `pre-filter` 38 | * `--snv-pen` to `--snv-pen-high` and `--snv-pen-mult` 39 | * `--baf-pen` to `--baf-pen-compl` 40 | * `--cna-pen` to `--cna-pen-zero`, `--cna-pen-diff` and `--cna-pen-norm` 41 | * split README 42 | * fixed bug in SNV transition matrix in combination with `--max-tcn [file]` 43 | 44 | ## v1.17.2 / 27.03.2014 45 | 46 | * new output: posterior per subclone, goodness of fit (GOF) per 47 | segment 48 | * changed file name `*clonal.txt` -> `*summary.txt` 49 | * filterHD STDOUT now includes GOF per sample 50 | * cloneHD `*summary.txt` now includes GOF per sample 51 | * changed `_` to `-` in all file names 52 | * fixed bug: BAF now symmetrized only in per-subclone-posterior 53 | * new CNA prior to penalize homozygous deletions `--cna-pen [double:0.9]` 54 | 55 | ## v1.17.1 / 01.03.2014 56 | 57 | * BAF posterior symmetrized for output 58 | * CNA transition matrix penalizes clones with zero copies of a segment 59 | * fixed bug in SNV prior computation 60 | * added pre-processor directives for conditional openMP compilation 61 | 62 | ## v1.17.0 / 25.02.2014 major release 63 | 64 | ### changed the way SNV priors are computed: 65 | 66 | * if CNA given: SNV prior informed by CNA posterior 67 | * if CNA+BAF given, SNV prior informed by BAF+CNA posterior 68 | * if SNV only and `--max-tcn` not given, assumes all chr to be 69 | all-normal, mean total c.n. to be normal; SNV prior parameters can 70 | be learned with `--learn-priors 1`. 
71 | * if SNV only and `--max-tcn [int/file]` is given, this data is used 72 | to fix the total c.n. per chr and subclone; mean total c.n. is 73 | calculated on the fly; SNV prior parameters can be learned with 74 | `--learn-priors 1`. 75 | * if SNV only and `--max-tcn [int/file]` and `--avail-cn [file]` are 76 | given, SNV prior is calculated according to c.n. availability. 77 | 78 | ### more changes 79 | 80 | * changed option `--copynumber [file]` to `--mean-tcn [file]` 81 | * new option `--avail-cn [file]` 82 | * changed option `--maxcn [int:4]` to `--max-tcn [file/int]` 83 | * changed option `--snv-err [double]` to `--snv-fpfreq [double]` 84 | * changed option `--snv-fpr [double]` to `--snv-fprate [double]` 85 | * output file `*used-tcn.txt` to `*used_mean_tcn.txt` 86 | * output file `*copynumber.txt` to `*mean_tcn.txt` 87 | * new output file `*available_cn.txt` 88 | * changed `sample` to `chr` in cloneHD output files 89 | * slimmed down output of `--print-options`. 90 | * split clone.cpp into components clone-*.cpp 91 | * split off cloneHD-inference.cpp 92 | * new Makefile 93 | 94 | ## v1.16.7 / 19.02.2014 95 | 96 | * fixed bug in SNV w/ corr mode when --bulk-fix is used 97 | * introduced different grid sizes for CNA, BAF and SNV 98 | * fixed bug in Clone::get_interpolation(), at the boundaries 99 | * fixed bug in Clone::trapezoidal() (affected --bulk-prior vs --bulk-mean consistency) 100 | 101 | ## v1.16.6 / 12.02.2014 102 | 103 | * fixed major bug for SNV false positive emission rate and prior 104 | * introduced new functions: Clone::update_snv_site_ncorr/fixed/nfixed() 105 | * fixed bug in SNV prior from CNA/BAF posterior computation (BAF normalization) 106 | * false positive SNV prior now includes P(c=all-zero) 107 | * fixed bug in used cn output 108 | * all-zero "observations" in SNV input (w/o corr) are ignored (and not printed!) 109 | * fixed bug in filterHD: all-zero observations are always retained. 
110 | 111 | ## v1.16.5 / 07.04.2014 112 | 113 | * fixed major bug when CNA, BAF and SNV data used with males (X,Y with only one copy) 114 | * fixed bug in Clone::snv_prior_from_cna_baf_post() 115 | * fixed bug in posterior output for BAF and SNV 116 | * introduced prior masking for all update functions 117 | * introduced `--maxcn_mask [file]` option to limit total c.n. per chromosome 118 | * static linking of both libgcc and libstdc++ for increased portability 119 | 120 | ## v1.16.4 / 30.01.2014 121 | 122 | * filterHD: if `--reflect 1`, use only posterior in [0,0.5] for mean/std-dev 123 | * fixed bug with `--bulk-fix 0.0` 124 | 125 | ## v1.16.3 / 13.01.2014 126 | 127 | * introduced the option `--mass-gauging [0/1:1]` to switch off the mass gauging for cna data. 128 | 129 | ## v1.16.2 / 12.01.2014 130 | 131 | * snp -> snv and cnv -> cna in all code 132 | * introduced `--chr [file]`, candidate masses are computed via majority normal copy number 133 | 134 | ## v1.16.1 / 10.01.2014 135 | 136 | * cnv to cna for all command line options 137 | * cnv to cna in all output file names and content 138 | * filterHD stdout modified 139 | 140 | ## v1.16 / 03.01.2014 141 | 142 | * first stable release of cloneHD 143 | -------------------------------------------------------------------------------- /src/common-functions.cpp: -------------------------------------------------------------------------------- 1 | //common-functions.cpp 2 | 3 | #include "common-functions.h" 4 | #include "emission.h" 5 | 6 | #define PI 3.1415926 7 | #define LOG2 0.693147 8 | 9 | using namespace std; 10 | 11 | 12 | //get general dimensions of a data set for cloneHD... 
13 | /* get_dims: first pass over a data file. Exits with status 1 if the file cannot be opened. Reading stops at the first empty line and lines starting with '#' are skipped. The first data line is scanned for value pairs after the chromosome and locus columns to obtain nT, which is returned in nTimes. For every run of lines sharing a chromosome id the number of counted lines is appended to nSites and the id to chrs; a line is counted when keep is non-zero or when the value left in r by the per-line scan is positive. Runs with a zero count are not recorded. NOTE(review): the header of the per-line scan loop is damaged in this copy. */ void get_dims( const char * data_fn, 14 | int& nTimes, 15 | vector& chrs, 16 | vector& nSites, 17 | int keep 18 | ){ 19 | ifstream data_ifs; 20 | string line; 21 | stringstream line_ss; 22 | data_ifs.open( data_fn, ios::in); 23 | if (data_ifs.fail()){ 24 | printf("ERROR: file %s cannot be opened.\n", data_fn); 25 | exit(1); 26 | } 27 | nSites.clear(); 28 | chrs.clear(); 29 | int ct=0,l,r,d; 30 | int chr=0,old=-1,nT=0; 31 | while( data_ifs.good()){ 32 | line.clear(); 33 | getline( data_ifs, line); 34 | if (line.empty()) break; 35 | if (line[0] == '#') continue; 36 | line_ss.clear(); 37 | line_ss.str(line); 38 | //check first entry for nTimes 39 | if (old == -1 && ct == 0){ 40 | line_ss >> chr >> l; 41 | while(line_ss >> r >> d){ 42 | nT++; 43 | } 44 | line_ss.clear(); 45 | line_ss.str(line); 46 | } 47 | line_ss >> chr >> l; 48 | if (chr != old ){//new chromosome encounter 49 | if (ct>0){ 50 | nSites.push_back(ct); 51 | chrs.push_back(old); 52 | } 53 | ct=0; 54 | } 55 | old=chr; 56 | r = 0; 57 | for( int t=0;t> r >> d; 59 | if (r>0) break; 60 | } 61 | if (keep || r>0) ct++; 62 | } 63 | if (ct>0){ 64 | nSites.push_back(ct); 65 | chrs.push_back(old); 66 | } 67 | nTimes = nT; 68 | data_ifs.close(); 69 | } 70 | 71 | 72 | // read in data: expects columns to be "chr location (depth reads)^x" 73 | /* get_data: second pass that stores the observations in myEmit. Exits with status 1 if the file cannot be opened. Chromosomes not contained in myEmit->chrs trigger a warning and their lines are skipped; lines beyond nSites[sample] are ignored. For each kept line the locus goes to loci[sample][ct] and each pair is stored in reads[t][...][ct] and depths[t][...][ct]; ct advances when any r is positive or when myEmit->connect is set, so an uncounted line is overwritten by the next one. A pair with d == 0 and r > 0 is reported as an error. set_dist() is called once the file is closed. NOTE(review): the comment above says (depth reads) but the code extracts r before d; confirm the column order. Parts of this function are damaged in this copy. */ void get_data( const char * data_fn, Emission * myEmit){ 74 | ifstream data_ifs; 75 | string line; 76 | stringstream line_ss; 77 | data_ifs.open( data_fn, ios::in); 78 | if (data_ifs.fail()){ 79 | printf("ERROR: file %s cannot be opened.\n", data_fn); 80 | exit(1); 81 | } 82 | int ct=0,l; 83 | int chr=0,old=-1, sample=0; 84 | int d,r, keep=0, wait=0; 85 | //now collect all data... 
86 | while( data_ifs.good()){ 87 | line.clear(); 88 | getline( data_ifs, line); 89 | if (line.empty()) break; 90 | if (line[0] == '#') continue; 91 | line_ss.clear(); 92 | line_ss.str(line); 93 | line_ss >> chr >> l;//chromosome and locus 94 | if (chr != old){ 95 | if (myEmit->chrs.count(chr) == 0){ 96 | printf("WARNING: chr %2i in file %s will be ignored.\n", chr, data_fn); 97 | wait = 1; 98 | } 99 | else{ 100 | sample = myEmit->idx_of[chr]; 101 | ct = 0; 102 | wait = 0; 103 | } 104 | old = chr; 105 | } 106 | if (wait) continue; 107 | if (ct >= myEmit->nSites[sample]) continue; 108 | keep = 0; 109 | for (int t=0; tnTimes; t++){ 110 | myEmit->loci[sample][ct] = l;//set locus 111 | line_ss >> r >> d;//get read and depth 112 | if (d == 0 && r > 0){ 113 | printf("ERROR: depth = 0 in chr %i locus %i\n", chr, l); 114 | cout<reads[t][ myEmit->idx_of[chr] ][ct] = r; 119 | myEmit->depths[t][ myEmit->idx_of[chr] ][ct] = d; 120 | if (r>0) keep=1; 121 | } 122 | if (keep || myEmit->connect) ct++; 123 | } 124 | data_ifs.close(); 125 | // set the distances between loci 126 | for (int t=0; tnTimes; t++){ 127 | myEmit->set_dist(); 128 | } 129 | } 130 | 131 | 132 | /* get_bias: reads (chr, locus, bias) lines and fills myEmit->bias at the loci already stored in myEmit. Exits with status 1 if the file cannot be opened. Data loci left of the first bias locus of a chromosome receive that first value, exact locus matches copy the value, and data loci lying between two bias loci are linearly interpolated between (l1, b1) and the current line. When the chromosome changes, the remaining sites of the previous chromosome are filled with the last value read. NOTE(review): b1 and l1 are only refreshed when blocus is not larger than next_locus, so after an interpolation step the following interpolation may still start from an older bias point; and no trailing fill is visible for the last chromosome in the file. Confirm both against the intended behaviour. Part of the chromosome-change handling is damaged in this copy. */ void get_bias(const char * bias_fn, Emission * myEmit){ 133 | ifstream ifs; 134 | string line; 135 | stringstream line_ss; 136 | ifs.open( bias_fn, ios::in); 137 | if (ifs.fail()){ 138 | printf("ERROR: file %s cannot be opened.\n", bias_fn); 139 | exit(1); 140 | } 141 | int chr = 0, old_chr = -1, idx=0, blocus=0, next_locus=0,l1=0; 142 | double b=0,b1=0,nu=0; 143 | while( ifs.good() ){ 144 | line.clear(); 145 | getline( ifs, line); 146 | if (line.empty()) break; 147 | if (line[0] == '#') continue; 148 | line_ss.clear(); 149 | line_ss.str(line); 150 | line_ss >> chr >> blocus; 151 | if (chr != old_chr && (chr > (int) myEmit->maxchr || myEmit->idx_of[chr] < 0)){ 152 | printf("ERROR 1 in get_bias()\n"); 153 | cout<=0){ 158 | while ( idx < myEmit->nSites[myEmit->idx_of[old_chr]] ){ 159 | 
myEmit->bias[myEmit->idx_of[old_chr]][idx] = b; 160 | idx++; 161 | } 162 | } 163 | idx=0; 164 | next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx]; 165 | old_chr = chr; 166 | } 167 | if (idx >= myEmit->nSites[ myEmit->idx_of[chr] ]) continue; 168 | line_ss >> b; 169 | if (idx==0){ 170 | while ( next_locus < blocus ){//left overhang 171 | myEmit->bias[myEmit->idx_of[chr]][idx] = b; 172 | idx++; 173 | if (idx < myEmit->nSites[myEmit->idx_of[chr]]){ 174 | next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx]; 175 | } 176 | if (idx >= myEmit->nSites[myEmit->idx_of[chr]]) break; 177 | } 178 | } 179 | if ( blocus <= next_locus ){ 180 | b1 = b; 181 | l1 = blocus; 182 | } 183 | if ( blocus < next_locus ){ 184 | continue; 185 | } 186 | else if (blocus==next_locus){ 187 | myEmit->bias[myEmit->idx_of[chr]][idx] = b; 188 | idx++; 189 | if (idx < myEmit->nSites[myEmit->idx_of[chr]]){ 190 | next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx]; 191 | } 192 | } 193 | else if (blocus > next_locus){ 194 | while ( next_locus <= blocus ){ 195 | nu = double(next_locus-l1)/double(blocus-l1); 196 | myEmit->bias[myEmit->idx_of[chr]][idx] = b1*(1.0-nu) + b*nu; 197 | idx++; 198 | if (idx < myEmit->nSites[myEmit->idx_of[chr]]){ 199 | next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx]; 200 | } 201 | if (idx >= myEmit->nSites[myEmit->idx_of[chr]]) break; 202 | } 203 | } 204 | } 205 | ifs.close(); 206 | } 207 | 208 | 209 | 210 | //***Mean and Variance function*** 211 | /* get_mean: integrates x times dist over [xmin, xmax], treating dist as piecewise linear on n equally spaced grid points; each interval contributes dx/6 * (3*(P1+P2)*x_i + (P1+2*P2)*dx). No normalisation is applied. NOTE(review): dx divides by n-1, so a single-point vector would divide by zero. */ double get_mean(gsl_vector * dist, double xmin, double xmax){ 212 | double mean=0.0,P1,P2; 213 | int n = (int) dist->size; 214 | double dx = (xmax - xmin) / double(n-1); 215 | for (int i=0; i < n-1; i++){ 216 | P1 = gsl_vector_get(dist,i); 217 | P2 = gsl_vector_get(dist,i+1); 218 | mean += 3.0*(P1+P2)*(xmin+double(i)*dx) + (P1+2.0*P2)*dx; 219 | } 220 | mean = mean * dx / 6.0; 221 | return(mean); 222 | } 223 | 224 | double get_var(gsl_vector * dist, double xmin, double xmax, double mean){ 225 | 
double var=0.0, P1, P2,dev; 226 | int n = (int) dist->size; 227 | double dx = (xmax - xmin) / double(n-1); 228 | for (int i=0; i