├── images
    ├── baf.gof.png
    ├── baf.post.png
    ├── baf.real.png
    ├── cna.gof.png
    ├── cna.post.png
    ├── cna.real.png
    ├── snv.gof.png
    ├── filterHD-1.png
    ├── prefilter-1.png
    └── screenshots
    │   ├── cloneHD-mean.png
    │   ├── cloneHD-avail.png
    │   ├── cloneHD-posterior.png
    │   ├── cloneHD-stdout-1.png
    │   ├── cloneHD-stdout-2.png
    │   ├── cloneHD-subclone.png
    │   ├── cloneHD-summary.png
    │   ├── filterHD-stdout.png
    │   ├── filterHD-posterior-1.png
    │   ├── filterHD-posterior-2.png
    │   └── filterHD-posterior-3.png
├── .gitignore
├── src
    ├── log-space.h
    ├── common-functions.h
    ├── Makefile.farm.debug
    ├── Makefile.debug
    ├── Makefile.farm
    ├── Makefile
    ├── jump-diffusion.h
    ├── minimization.h
    ├── log-space.cpp
    ├── cloneHD-functions.h
    ├── cloneHD-inference.h
    ├── emission.h
    ├── clone-predict.cpp
    ├── common-functions.cpp
    ├── clone-bulk.cpp
    ├── clone-llh.cpp
    ├── clone.h
    ├── pre-filter.cpp
    ├── clone-prior.cpp
    ├── minimization.cpp
    ├── jump-diffusion.cpp
    ├── cloneHD.cpp
    └── filterHD.cpp
├── TODO.md
├── docs
    ├── README-pre-filter.md
    ├── README-filterHD.md
    └── README-cloneHD.md
├── run-example.sh
├── changelog.md
└── README.md


/images/baf.gof.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/baf.gof.png


--------------------------------------------------------------------------------
/images/baf.post.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/baf.post.png


--------------------------------------------------------------------------------
/images/baf.real.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/baf.real.png


--------------------------------------------------------------------------------
/images/cna.gof.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/cna.gof.png


--------------------------------------------------------------------------------
/images/cna.post.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/cna.post.png


--------------------------------------------------------------------------------
/images/cna.real.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/cna.real.png


--------------------------------------------------------------------------------
/images/snv.gof.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/snv.gof.png


--------------------------------------------------------------------------------
/images/filterHD-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/filterHD-1.png


--------------------------------------------------------------------------------
/images/prefilter-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/prefilter-1.png


--------------------------------------------------------------------------------
/images/screenshots/cloneHD-mean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-mean.png


--------------------------------------------------------------------------------
/images/screenshots/cloneHD-avail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-avail.png


--------------------------------------------------------------------------------
/images/screenshots/cloneHD-posterior.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-posterior.png


--------------------------------------------------------------------------------
/images/screenshots/cloneHD-stdout-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-stdout-1.png


--------------------------------------------------------------------------------
/images/screenshots/cloneHD-stdout-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-stdout-2.png


--------------------------------------------------------------------------------
/images/screenshots/cloneHD-subclone.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-subclone.png


--------------------------------------------------------------------------------
/images/screenshots/cloneHD-summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/cloneHD-summary.png


--------------------------------------------------------------------------------
/images/screenshots/filterHD-stdout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-stdout.png


--------------------------------------------------------------------------------
/images/screenshots/filterHD-posterior-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-posterior-1.png


--------------------------------------------------------------------------------
/images/screenshots/filterHD-posterior-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-posterior-2.png


--------------------------------------------------------------------------------
/images/screenshots/filterHD-posterior-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrej-fischer/cloneHD/HEAD/images/screenshots/filterHD-posterior-3.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | run-bulk*
 2 | run-test*
 3 | *.[oa]
 4 | build/*
 5 | build-*tar.gz
 6 | *~
 7 | release/
 8 | release*
 9 | *.bk
10 | bk/
11 | recompile*
12 | .DS_Store
13 | test/
14 | 


--------------------------------------------------------------------------------
/src/log-space.h:
--------------------------------------------------------------------------------
 1 | //log-space.h
 2 | 
 3 | #include <stdio.h>
 4 | #include <iostream>
 5 | #include <fstream>
 6 | #include <sstream>
 7 | #include <time.h>
 8 | #include <math.h>
 9 | #include <ctype.h> 
10 | #include <string>
11 | #include <map>
12 | #include <vector>
13 | #include <algorithm>
14 | 
15 | 
16 | // GSL headers...
17 | #include "gsl/gsl_vector.h"
18 | #include "gsl/gsl_matrix.h"
19 | #include "gsl/gsl_blas.h"
20 | 
21 | double log_add(double one, double two);
22 | double log_sub(double one, double two);
23 | void log_vector_add(gsl_vector * one, gsl_vector * two);
24 | void log_matrix_add(gsl_matrix * one, gsl_matrix * two);
25 | void log_vector_invert(gsl_vector * vec);
26 | double log_vector_norm(const gsl_vector * x);
27 | double log_matrix_norm(const gsl_matrix * M);
28 | void log_vector_normalize(gsl_vector * x);
29 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | # To do list for filterHD/cloneHD
 2 | 
 3 | # Bugs/issues to be fixed
 4 | 
 5 | ## filterHD
 6 | 
 7 | ## cloneHD
 8 | 
 9 | *  print full if `--*jump` is (also) given, only events if `--*jumps`
10 | *  check memory leaks
11 | 
12 | # Features to be added in future releases
13 | 
14 | ## filterHD
15 | 
16 | *  filter loci incompatible with emission model via 2-state HMM
17 | *  do Baum-Welch
18 | *  `--filter-shortSeg [int]` via posterior jump-prob (not via pmean)
19 | *  Use diffusion constant that scales with size in mode 3/4.
20 | *  Implement jump subtraction
21 | 
22 | ## cloneHD
23 | 
24 | *  bias field as distribution
25 | *  re-think update_snp_fixed (hashing)
26 | *  different nLevels per chr via max-tcn (for memory efficiency only)
27 | 
28 | ## both
29 | 
30 | *  string chromosome ids
31 | *  re-factor emission class
32 | 


--------------------------------------------------------------------------------
/src/common-functions.h:
--------------------------------------------------------------------------------
 1 | //common-functions.h
 2 | 
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <iostream>
 6 | #include <fstream>
 7 | #include <sstream>
 8 | #include <time.h>
 9 | #include <math.h>
10 | #include <ctype.h> 
11 | #include <string>
12 | #include <map>
13 | #include <vector>
14 | #include <list>
15 | 
16 | 
17 | // GSL headers...
18 | #include "gsl/gsl_vector.h"
19 | #include "gsl/gsl_matrix.h"
20 | #include "gsl/gsl_randist.h"
21 | #include "gsl/gsl_blas.h"
22 | 
23 | //own headers
24 | class Emission;
25 | 
26 | using namespace std;
27 | 
28 | void get_dims( const char * data_fn, int& nTimes, vector<int>& chrs, vector<int>& nSites, int keep);
29 | void get_data( const char * data_fn, Emission * myEmit);
30 | void get_bias( const char * bias_fn, Emission * myEmit);
31 | double get_mean( gsl_vector * dist, double xmin, double xmax);
32 | double get_var(  gsl_vector * dist, double xmin, double xmax, double mean);
33 | 


--------------------------------------------------------------------------------
/src/Makefile.farm.debug:
--------------------------------------------------------------------------------
 1 | CC		= g++
 2 | GSL		= /usr/lib/libgsl.so /usr/lib/libgslcblas.so
 3 | CC_FLAGS	= -Wall -ggdb -static-libgcc -static-libstdc++
 4 | LD_FLAGS	= ${GSL}
 5 | prefobj		= emission.o common-functions.o
 6 | filterHDobj	= emission.o common-functions.o jump-diffusion.o minimization.o
 7 | cloneobj	= clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o
 8 | cloneHDobj	= $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o
 9 | .PHONY: clean all
10 | all: preFilter filterHD cloneHD
11 | 	#rm -f ./*.o
12 | preFilter: $(prefobj) pre-filter.o
13 | 	$(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS)
14 | filterHD: $(filterHDobj) filterHD.o
15 | 	$(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS)
16 | cloneHD: $(cloneHDobj) cloneHD.o
17 | 	$(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS)	
18 | %.o: %.cpp
19 | 	$(CC) $(CC_FLAGS) -c $< -o $@
20 | $(cloneobj): clone.h $(cloneobj:.o=.cpp)
21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) # NOTE(review): .TARGET is a BSD-make name; under GNU make this reference presumably expands to nothing, so no header prerequisites would be declared here -- confirm which make flavour is intended
22 | clean:
23 | 	rm -f ./*.o
24 | 


--------------------------------------------------------------------------------
/src/Makefile.debug:
--------------------------------------------------------------------------------
 1 | CC		= g++-mp-4.7
 2 | GSL		= /opt/local/lib/libgsl.a /opt/local/lib/libgslcblas.a
 3 | CC_FLAGS	= -Wall -ggdb -static-libgcc -static-libstdc++
 4 | LD_FLAGS	= ${GSL}
 5 | prefobj		= emission.o common-functions.o
 6 | filterHDobj	= emission.o common-functions.o jump-diffusion.o minimization.o
 7 | cloneobj	= clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o
 8 | cloneHDobj	= $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o
 9 | .PHONY: clean all
10 | all: preFilter filterHD cloneHD
11 | 	#rm -f ./*.o
12 | preFilter: $(prefobj) pre-filter.o
13 | 	$(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS)
14 | filterHD: $(filterHDobj) filterHD.o
15 | 	$(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS)
16 | cloneHD: $(cloneHDobj) cloneHD.o
17 | 	$(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS)	
18 | %.o: %.cpp
19 | 	$(CC) $(CC_FLAGS) -c $< -o $@
20 | $(cloneobj): clone.h# $(cloneobj:.o=.cpp)
21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) # NOTE(review): .TARGET is a BSD-make name; under GNU make this reference presumably expands to nothing, so no header prerequisites would be declared here -- confirm which make flavour is intended
22 | clean:
23 | 	rm -f ./*.o
24 | 


--------------------------------------------------------------------------------
/src/Makefile.farm:
--------------------------------------------------------------------------------
 1 | CC		= g++
 2 | GSL		= /usr/lib/libgsl.so /usr/lib/libgslcblas.so
 3 | CC_FLAGS	= -Wall -O3 -static-libgcc -static-libstdc++ -fopenmp -DHAVE_INLINE
 4 | LD_FLAGS	= ${GSL} -fopenmp
 5 | prefobj		= emission.o common-functions.o
 6 | filterHDobj	= emission.o common-functions.o jump-diffusion.o minimization.o
 7 | cloneobj	= clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o
 8 | cloneHDobj	= $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o
 9 | .PHONY: clean all
10 | all: preFilter filterHD cloneHD
11 | 	rm -f ./*.o
12 | preFilter: $(prefobj) pre-filter.o
13 | 	$(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS)
14 | filterHD: $(filterHDobj) filterHD.o
15 | 	$(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS)
16 | cloneHD: $(cloneHDobj) cloneHD.o
17 | 	$(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS)	
18 | %.o: %.cpp
19 | 	$(CC) $(CC_FLAGS) -c $< -o $@
20 | $(cloneobj): clone.h $(cloneobj:.o=.cpp)
21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) # NOTE(review): .TARGET is a BSD-make name; under GNU make this reference presumably expands to nothing, so no header prerequisites would be declared here -- confirm which make flavour is intended
22 | clean:
23 | 	rm -f ./*.o
24 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | CC		= g++-mp-4.7
 2 | GSL		= /opt/local/lib/libgsl.a /opt/local/lib/libgslcblas.a
 3 | CC_FLAGS	= -Wall -O3  -static-libgcc -static-libstdc++ -fopenmp -DHAVE_INLINE
 4 | LD_FLAGS	= ${GSL} -fopenmp
 5 | prefobj		= emission.o common-functions.o
 6 | filterHDobj	= emission.o common-functions.o jump-diffusion.o minimization.o
 7 | cloneobj	= clone.o clone-bulk.o clone-prior.o clone-llh.o clone-predict.o clone-update.o clone-fwd-bwd.o
 8 | cloneHDobj	= $(cloneobj) emission.o common-functions.o minimization.o log-space.o cloneHD-functions.o cloneHD-inference.o
 9 | .PHONY: clean all
10 | all: preFilter filterHD cloneHD
11 | 	rm -f ./*.o
12 | preFilter: $(prefobj) pre-filter.o
13 | 	$(CC) $(CC_FLAGS) $^ -o ../build/pre-filter $(LD_FLAGS)
14 | filterHD: $(filterHDobj) filterHD.o
15 | 	$(CC) $(CC_FLAGS) $^ -o ../build/filterHD $(LD_FLAGS)
16 | cloneHD: $(cloneHDobj) cloneHD.o
17 | 	$(CC) $(CC_FLAGS) $^ -o ../build/cloneHD $(LD_FLAGS)	
18 | %.o: %.cpp
19 | 	$(CC) $(CC_FLAGS) -c $< -o $@
20 | $(cloneobj): clone.h# $(cloneobj:.o=.cpp)
21 | $(filterHDobj) $(cloneHDobj) $(prefobj): $(.TARGET:.o=.h) # NOTE(review): .TARGET is a BSD-make name; under GNU make this reference presumably expands to nothing, so no header prerequisites would be declared here -- confirm which make flavour is intended
22 | clean:
23 | 	rm -f ./*.o
24 | 


--------------------------------------------------------------------------------
/src/jump-diffusion.h:
--------------------------------------------------------------------------------
 1 | //jump-diffusion.h
 2 | 
 3 | #include <stdio.h>
 4 | #include <iostream>
 5 | #include <fstream>
 6 | #include <sstream>
 7 | #include <time.h>
 8 | #include <math.h>
 9 | #include <ctype.h> 
10 | #include <string>
11 | #include <map>
12 | #include <vector>
13 | #ifdef _OPENMP
14 | #include <omp.h>
15 | #endif
16 | 
17 | // GSL headers...
18 | #include "gsl/gsl_vector.h"
19 | #include "gsl/gsl_matrix.h"
20 | #include "gsl/gsl_randist.h"
21 | #include "gsl/gsl_blas.h"
22 | #include "gsl/gsl_multimin.h"
23 | 
24 | using namespace std;
25 | 
26 | class JumpDiffusion{ // NOTE(review): Emission is used below but this header neither includes nor forward-declares it -- it must already be declared by whoever includes this file
27 | public: // no private section: every member below is publicly accessible
28 |   JumpDiffusion(Emission * emit, int time);
29 |   ~JumpDiffusion();
30 |   Emission * myEmit; 
31 |   int nSamples;
32 |   int time;
33 |   int mode;
34 |   double sigma, jump, rnd_emit;
35 |   int Fwd_done, Bwd_done, wTotal, save_alpha;
36 |   int gridSize;
37 |   int * nSites;
38 |   unsigned int ** dist;
39 |   unsigned int ** loci;
40 |   unsigned int ** mask;  
41 |   double ** pstay;
42 |   double ** pjump;
43 |   double ** pnojump;
44 |   double ** bias;
45 |   void get_EmitProb(int read, int depth, double * xgrid, gsl_vector * eprob);// emission probability
46 |   gsl_vector * proposal;
47 |   gsl_matrix ** alpha;
48 |   gsl_matrix ** gamma;
49 |   gsl_matrix ** total;
50 |   void set_pstay();
51 |   int pstay_set;
52 |   double do_Fwd(int sample); // presumably the forward pass for one sample (the Fwd_done flag above suggests so) -- TODO confirm against jump-diffusion.cpp
53 |   void do_Bwd(int sample); // presumably the matching backward pass -- TODO confirm
54 |   //
55 |   gsl_matrix ** DiffProp;
56 |   int set_DiffProp(gsl_matrix * propagator, double variance);
57 |   void get_DiffProp();
58 |   int DiffProp_set;
59 |   void reset_DiffProp();
60 |   vector<int> is_identity;
61 |   map<unsigned int,int> position;
62 |   //
63 |   //void set_DiffProp_Log(gsl_matrix * propagator, double variance);
64 |   int predict(gsl_vector * prior, gsl_vector * post, gsl_matrix*& DiffProp, gsl_matrix**& DP_pt, int sampe, int site); // NOTE(review): "sampe" looks like a typo for "sample"; the DiffProp parameter also shadows the member of the same name
65 |   double total_llh;
66 |   double get_total_llh();
67 |   void get_posterior(int sample);
68 |   int adapt_range();
69 | };
70 | 
71 | 


--------------------------------------------------------------------------------
/src/minimization.h:
--------------------------------------------------------------------------------
 1 | //minimization.h
 2 | 
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <iostream>
 6 | #include <fstream>
 7 | #include <sstream>
 8 | #include <time.h>
 9 | #include <math.h>
10 | #include <ctype.h> 
11 | #include <string>
12 | #include <map>
13 | #include <vector>
14 | #include <list>
15 | 
16 | 
17 | // GSL headers...
18 | #include "gsl/gsl_vector.h"
19 | #include "gsl/gsl_matrix.h"
20 | #include "gsl/gsl_randist.h"
21 | #include "gsl/gsl_blas.h"
22 | #include "gsl/gsl_multimin.h"
23 | #include "gsl/gsl_linalg.h"
24 | 
25 | using namespace std;
26 | 
27 | 
28 | // The  numerical minimization routine to get the clone frequencies...
29 | double find_local_optimum( int nSimplex,
30 | 			   gsl_vector**& simplex, 
31 | 			   gsl_vector * lower,
32 | 			   gsl_vector * other,
33 | 			   gsl_vector * range,
34 | 			   void * params,
35 | 			   double (*obj_fn)( const gsl_vector * x, void * p),
36 | 			   double prec,
37 | 			   int& steps,
38 | 			   int verbose
39 | 			   );
40 | 
41 | double find_optimum_wrestarts(int nSimplex,
42 | 			      gsl_vector**& simplex,
43 | 			      gsl_vector * lower,
44 | 			      gsl_vector * other,
45 | 			      gsl_vector * range,
46 | 			      void * params,
47 | 			      double (*obj_fn)( const gsl_vector * x, void * p),
48 | 			      double prec,
49 | 			      int restarts,
50 | 			      int& steps,
51 | 			      int verbose
52 | 			      );
53 | 
54 | void spherical_random_step_uniform( double ri, double& rf, double lower,
55 | 				    const gsl_vector * anglei, gsl_vector*& anglef,
56 | 				    double eps);
57 | void simplex_random_step_uniform(const gsl_vector*simplexi, gsl_vector*& simplexf,
58 | 				 double lower, double eps);
59 | 
60 | void arg_map( int nSimplex, gsl_vector**& simplex, gsl_vector * lower, const gsl_vector * other, const gsl_vector * range, gsl_vector ** x);
61 | int arg_unmap( const gsl_vector * x, int nSimplex, gsl_vector**& simplex, gsl_vector * lower, gsl_vector * other, const gsl_vector * range);
62 | 
63 | double spherical_to_simplex( double radial, const gsl_vector * angle, gsl_vector *& simplex, int getLJD);
64 | 
65 | void simplex_to_spherical( const gsl_vector * simplex, double& radial, gsl_vector*& angle);
66 | 
67 | double logify( double x,  double R);
68 | double delogify(double y, double R);
69 | 
70 | /*
71 | double simulated_annealing(
72 | 			   gsl_matrix * freqs, // set of points in or on simplex
73 | 			   gsl_vector * other, // other arguments
74 | 			   gsl_vector * range,// range of other arguments
75 | 			   void * params,
76 | 			   double (*obj_fn)( const gsl_vector * x, void * p),
77 | 			   int& steps
78 | 			   );
79 | int accepted(double dE, double T);
80 | */
81 | 


--------------------------------------------------------------------------------
/src/log-space.cpp:
--------------------------------------------------------------------------------
  1 | //log-space.cpp
  2 | 
  3 | #define PI 3.1415926
  4 | 
  5 | //own headers...
  6 | #include "log-space.h"
  7 | 
  8 | // In-place element-wise addition in log space: one[i] <- log_add(one[i], two[i]); `two` is only read.
  9 | void log_vector_add(gsl_vector * one, gsl_vector * two){
 10 |   for (int i=0; i<(int) one->size; i++){ // bound comes from `one` only; two->size is never checked
 11 |     gsl_vector_set( one, i, log_add( one->data[i], two->data[i])); // NOTE(review): the raw ->data[i] reads ignore the gsl_vector stride while gsl_vector_set honours it -- assumes stride 1, confirm for all callers
 12 |   }
 13 | }
 14 | 
 15 | // In-place addition of two matrices in log space, done row by row: each row of `one` is combined with the same row of `two` via log_vector_add.
 16 | void log_matrix_add(gsl_matrix * one, gsl_matrix * two){
 17 |   for (int i=0; i<(int) one->size1; i++){ // row count is taken from `one`; the dimensions of `two` are not checked
 18 |     gsl_vector_view r1=gsl_matrix_row(one,i);
 19 |     gsl_vector_view r2=gsl_matrix_row(two,i);
 20 |     log_vector_add(&r1.vector,&r2.vector); // the sum is stored via the row view r1 of `one`
 21 |   }
 22 | }
 23 | 
 24 | // Adds two scalars in log space: returns log(exp(one) + exp(two)), with the larger argument factored out.
 25 | double log_add(double one, double two){
 26 |   if (one > two){
 27 |     if (one-two > 10.0){
 28 |       return( one + exp(two - one)); // gap > 10: log(1+x) is replaced by its first-order term x = exp(two-one)
 29 |     }
 30 |     else{
 31 |       return( one + log(1.0 + exp(two - one)));
 32 |     }
 33 |   }
 34 |   else if (one < two){ // mirror image of the branch above with the roles of one and two swapped
 35 |     if (two-one > 10.0){
 36 |       return( two + exp(one - two)); 
 37 |     }
 38 |     else{
 39 |       return( two + log(1.0 + exp(one - two))); 
 40 |     }
 41 |   }
 42 |   else{ // reached when neither comparison holds: equal inputs (and also when either input is NaN)
 43 |     return(one + log(2.0)); // log(2*exp(one))
 44 |   }
 45 | }
 46 | 
 47 | double log_sub(double one, double two){ // subtraction in log space: log(exp(one) - exp(two)); only meaningful for one >= two
 48 |   if (two > one){
 49 |     printf("ERROR in log_sub(%e,%e)\n",one,two); // the problem is only reported on stdout; execution continues below
 50 |   }
 51 |   if (one-two > 10.0){
 52 |     return( one - exp(two - one)); // gap > 10: log(1-x) is replaced by its first-order term -x, x = exp(two-one)
 53 |   }
 54 |   else{
 55 |     return( one + log(1.0 - exp(two - one))); // for two > one the argument of log is negative
 56 |   }
 57 | }
 58 | 
 59 | 
 60 | void log_vector_invert(gsl_vector * vec){ // in place: every entry v is replaced by log(1 - exp(v))
 61 |   double val;
 62 |   for (int i=0; i<(int) vec->size; i++){
 63 |     val = vec->data[i]; // NOTE(review): direct ->data[i] access assumes a stride of 1 -- confirm for all callers
 64 |     vec->data[i] = log(1.0 - exp(val));
 65 |   }
 66 | }
 67 | 
 68 | 
 69 | // Log of the 1-norm of a vector stored in log space: log( sum_i exp(x[i]) ), evaluated relative to the largest entry.
 70 | double log_vector_norm(const gsl_vector * x){
 71 |   double norm = 0;
 72 |   double max  = gsl_vector_max(x);
 73 |   double crit = max - 10.0; // entries at or below max-10 are left out of the sum
 74 |   for (int i=0; i<(int) x->size; i++){
 75 |     if ( x->data[i] >  crit) 
 76 |       norm += exp(x->data[i] - max);
 77 |   }
 78 |   if (norm>1.1){
 79 |     norm = log(norm) + max;
 80 |   }
 81 |   else{//Taylor-expansion of log(1+y) = y - 0.5*y*y, with y = norm - 1
 82 |     norm = norm - 1.0;
 83 |     norm = norm*(1.0 - 0.5*norm) + max;
 84 |   }
 85 |   return(norm);
 86 | }
 87 | 
 88 | void log_vector_normalize( gsl_vector * x){ // in place: shifts every entry of x by -log_vector_norm(x)
 89 |   double norm = log_vector_norm(x);
 90 |   gsl_vector_add_constant(x,-norm);
 91 | }
 92 | 
 93 | // Log of the 1-norm of a matrix stored in log space: log( sum_ij exp(M_ij) ), evaluated relative to the largest entry.
 94 | double log_matrix_norm(const gsl_matrix * M){
 95 |   double norm=0;
 96 |   double max = gsl_matrix_max(M);
 97 |   for (int i=0; i<(int) M->size1; i++){
 98 |     for (int j=0; j<(int) M->size2; j++){
 99 |       norm += exp(gsl_matrix_get(M,i,j) - max); // every entry contributes; there is no max-10 cutoff here
100 |     }
101 |   }
102 |   norm = log(norm) + max;
103 |   return(norm);
104 | }
105 | 


--------------------------------------------------------------------------------
/docs/README-pre-filter.md:
--------------------------------------------------------------------------------
 1 | # pre-filter command line arguments
 2 | 
 3 | The program `pre-filter` can be used to remove loci based on the observed read depth. It includes two heuristic filtering methods: loci are removed based on (i) their local variability and (ii) their being an outlier (see below). 
 4 | 
 5 | ![pref](/images/prefilter-1.png "Pre-filtering of read depth via matched normal.")
 6 | 
 7 | The effect of `pre-filter` on read depth data: (A) Centromeric regions of real chromosomes often show huge, large-scale variability in their read depth. But there are also many small regions with very low read depth throughout. (B) After pre-filtering, the problematic regions are masked out. (C) Removing the same regions in the tumor data improves quality visibly while retaining biologically relevant features.
 8 | 
 9 | 
10 | ## Typical usage options
11 | 
12 | *  `--data [file]`  Input data to be pre-filtered. 
13 |     
14 |     The file format is the same as for cloneHD's `--cna` option. Only the first sample will be used for pre-filtering.
15 | 
16 | *  `--pre [string:"./out"]`  Set prefix for all output files. 
17 | 
18 |     The pre-filtered loci and data are printed to a file named `pre.pref.txt`.
19 | 
20 | *  `--print-tracks [0/1:0]`  Print the window average and window variability. 
21 |      
22 |      The windowed tracks are used for pre-filtering. They are printed for all loci to a file named `pre.track.txt`. Use this to inspect and tune the pre-filter thresholds.
23 | 
24 | *  `--pick-from [file]`  Pre-filter data in this file by picking loci present in `match-to`. 
25 | 
26 |      Only loci that fall into a bin also present in `match-to` are selected. Bins in `match-to` are assumed to be of constant width, with the given coordinate being the right (inclusive) end of the bin, e.g.
27 | 
28 |         1000  =  1-1000
29 |         2000  =  1001-2000
30 |         4000  =  3001-4000
31 |         etc.
32 | 
33 | *  `--match-to [file]`  Use this file as reference to pick loci in `pick-from`. 
34 | 
35 |      Loci in this file are assumed to be equidistant (e.g. per 1 kb, not all bins need be present). The bin width is decided automatically by majority.
36 | 
37 | ## Parameter options
38 | 
39 | *  `--window-size [int:100]` Set the window scale for smoothing (centered, +-size).
40 | 
41 | *  `--remove-outlier [double:3.0]`  Set the outlier threshold.
42 | 
43 |      All loci are removed where the observed read depth is further than this value away from the local window-average (in units of sqrt(window-average), assuming Poisson-distributed read depths). If set to `0.0`, the filter is not applied. 
44 | 
45 | *  `--remove-variable [double:2.0]`  Set the variability threshold.
46 | 
47 |      All loci are removed where the local window-variability exceeds this multiple of the global variability. Global (local) variability is defined as the median (mean) of the absolute distance of the observed read depths to the global median read depth. If set to `0.0`, the filter is not applied. 
48 | 


--------------------------------------------------------------------------------
/src/cloneHD-functions.h:
--------------------------------------------------------------------------------
 1 | //cloneHD-functions.h
 2 | 
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | #include <iostream>
 6 | #include <fstream>
 7 | #include <sstream>
 8 | #include <time.h>
 9 | #include <math.h>
10 | #include <ctype.h> 
11 | #include <string>
12 | #include <map>
13 | #include <vector>
14 | #include <list>
15 | 
16 | 
17 | // GSL headers...
18 | #include "gsl/gsl_vector.h"
19 | #include "gsl/gsl_matrix.h"
20 | #include "gsl/gsl_randist.h"
21 | #include "gsl/gsl_blas.h"
22 | 
23 | //own headers
24 | class Clone;
25 | class Emission;
26 | 
27 | using namespace std;
28 | 
29 | 
30 | struct cmdl_opts{ // option record for cloneHD; taken by reference by get_opts/read_opts/default_opts/test_opts and by many of the print_*/get_* helpers declared after it
31 |   const char * cna_fn; // the *_fn members are C strings; presumably file names given on the command line -- TODO confirm
32 |   const char * baf_fn;
33 |   const char * snv_fn;
34 |   const char * pre;
35 |   const char * bias_fn;
36 |   const char * mntcn_fn;
37 |   const char * maxtcn_fn;
38 |   const char * avcn_fn; 
39 |   const char * chr_fn;
40 |   const char * bulk_fn;
41 |   const char * clones_fn;
42 |   const char * purity_fn;
43 |   const char * cna_jumps_fn;
44 |   const char * baf_jumps_fn;
45 |   const char * snv_jumps_fn;
46 |   // numeric settings (int / double) follow
47 |   int force, trials, restarts, nmax, seed, maxtcn, print_all, learn_priors;
48 |   int mass_gauging;
49 |   double cna_jump, baf_jump, snv_jump;
50 |   double cna_shape, baf_shape, snv_shape;
51 |   double cna_rnd, baf_rnd, snv_rnd;
52 |   double cna_pen_zero, cna_pen_norm, cna_pen_diff;
53 |   double baf_pen_comp;
54 |   double snv_pen_high, snv_pen_mult;
55 |   double snv_fpr, snv_fpf;
56 |   double bulk_fix, bulk_sigma, bulk_rnd;
57 |   double min_occ,min_jump;
58 |   int bulk_mean, bulk_prior, bulk_updates;
59 |   int cnaGrid, bafGrid, snvGrid, bulkGrid;
60 | };
61 | 
62 | 
63 | void get_opts( int argc, const char ** argv, cmdl_opts& opts);
64 | void read_opts( const char * opts_fn, cmdl_opts& opts);
65 | void default_opts(cmdl_opts& opts);
66 | void test_opts(cmdl_opts& opts);
67 | void print_usage();
68 | 
69 | void print_all_results( Clone * myClone, cmdl_opts& opts);
70 | void print_posterior_header( FILE * fp,  Clone * myClone, Emission * myEmit, cmdl_opts& opts);
71 | void print_posterior( FILE * fp,  Clone * myClone, Emission * myEmit,  int s, cmdl_opts& opts);
72 | void print_perclone_header( FILE * fp,  Clone * myClone, Emission * myEmit, cmdl_opts& opts);
73 | void print_perclone_posterior( FILE ** fp,  Clone * myClone, Emission * myEmit,  int s, cmdl_opts& opts);
74 | void print_mean_tcn( FILE * mntcn_fp, Clone * myClone, Emission * cnaEmit, int s, cmdl_opts& opts);
75 | void print_avail_cn( FILE * avcn_fp,  Clone * myClone, Emission * cnaEmit, int s, cmdl_opts& opts);
76 | void print_gof( Clone * myClone, Emission * myEmit, cmdl_opts& opts);
77 | 
78 | void get_cna_data(Emission * cnaEmit, cmdl_opts& opts, int& nTimes);
79 | void get_baf_data(Emission * bafEmit, cmdl_opts& opts, int& nTimes, int& nT);
80 | void get_snv_data(Emission * snvEmit, cmdl_opts& opts, int& nTimes, int& nT);
81 | void get_snv_bulk_prior( Clone * myClone, cmdl_opts& opts);
82 | void get_track(const char * fn, gsl_matrix **& dist, double **& mn, double **& var, Emission * myEmit);
83 | void match_jumps(const char * jumps_fn, Emission * myEmit);
84 | void get_maxtcn_input(const char * maxtcn_fn, int maxtcn_gw, Clone * myClone);
85 | void get_mean_tcn( const char * mtcn_fn, Clone * myClone, Emission * myEmit);
86 | void get_avail_cn( const char * avcn_fn, Clone * myClone, Emission * myEmit);
87 | void get_purity( const char * purity_fn, gsl_vector *& purity);
88 | void get_fixed_clones( gsl_matrix *& clones, gsl_vector *& mass, const char * clones_fn, int nTimes);
89 | void get_jump_probability(  Clone * myClone, cmdl_opts& opts);
90 | void get_bias_field( Clone * myClone, cmdl_opts& opts);
91 | void print_llh_for_set(gsl_matrix * clones, gsl_vector * mass, Clone * myClone, cmdl_opts& opts);
92 | 


--------------------------------------------------------------------------------
/src/cloneHD-inference.h:
--------------------------------------------------------------------------------
  1 | //cloneHD-inference.h
  2 | 
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <iostream>
  6 | #include <fstream>
  7 | #include <sstream>
  8 | #include <time.h>
  9 | #include <math.h>
 10 | #include <ctype.h> 
 11 | #include <string>
 12 | #include <map>
 13 | #include <vector>
 14 | #include <list>
 15 | 
 16 | 
 17 | // GSL headers...
 18 | #include "gsl/gsl_vector.h"
 19 | #include "gsl/gsl_matrix.h"
 20 | #include "gsl/gsl_randist.h"
 21 | #include "gsl/gsl_blas.h"
 22 | 
 23 | class Clone;
 24 | class Emission;
 25 | // NOTE(review): cmdl_opts is used below but is neither declared nor included by this header; the includer must declare it first.
 26 | int infer_clones( gsl_matrix * Clones, gsl_vector * Mass, Clone * myClone, cmdl_opts& opts);
 27 | // cl/bl/sl are non-const double& parameters; presumably the CNA/BAF/SNV log-likelihood parts (cf. report_results) -- confirm.
 28 | double get_clones( gsl_matrix *& clones, 
 29 | 		   gsl_matrix *& Clones, 
 30 | 		   gsl_vector *& mass, 
 31 | 		   gsl_vector *& Mass, 
 32 | 		   gsl_matrix *& priors, 
 33 | 		   Clone * myClone,
 34 | 		   cmdl_opts& opts,
 35 | 		   double& cl,
 36 | 		   double& bl,
 37 | 		   double& sl);
 38 | 
 39 | double get_clones_cna( gsl_matrix *& clones, 
 40 | 		       gsl_matrix *& Clones, 
 41 | 		       gsl_vector *& mass, 
 42 | 		       gsl_vector *& Mass, 
 43 | 		       Clone * myClone,
 44 | 		       cmdl_opts& opts,
 45 | 		       double& cl,
 46 | 		       double& bl,
 47 | 		       double& sl);
 48 | 
 49 | double get_clones_baf( gsl_matrix *& clones, 
 50 | 		       gsl_matrix *& Clones, 
 51 | 		       Clone * myClone,
 52 | 		       cmdl_opts& opts
 53 | 		       );
 54 | 
 55 | double get_clones_snv_ncorr( gsl_matrix *& clones, 
 56 | 			     gsl_matrix *& Clones, 
 57 | 			     gsl_matrix *& priors, 
 58 | 			     Clone * myClone,
 59 | 			     cmdl_opts& opts
 60 | 			     );
 61 | 
 62 | double get_clones_snv_wcorr( gsl_matrix *& clones, 
 63 | 			     gsl_matrix *& Clones, 
 64 | 			     Clone * myClone,
 65 | 			     cmdl_opts& opts
 66 | 			     );
 67 | 
 68 | // CNA optimisation variants; the names suggest which of clones/mass are held fixed -- confirm against the definitions.
 69 | double cna_only_mass_noclones( gsl_vector *& mass, Clone * myClone, int restarts, int& steps);
 70 | //double cna_only_clones_mass( gsl_matrix*& clones, gsl_vector*& mass, Clone * myClone, int& steps);
 71 | double cna_clones_fixed_mass( gsl_matrix*& clones, Clone * myClone, int restarts, 
 72 | 			      int& steps, double& cl, double& bl, double& sl);
 73 | double cna_mass_fixed_clones(gsl_vector*& mass, Clone * myClone, int restarts, 
 74 | 			     int& steps, double& cl, double& bl, double& sl);
 75 | 
 76 | double cna_clones_mass( gsl_matrix*& clones, gsl_vector*& mass, Clone * myClone, int restarts, 
 77 | 			int& steps, double& cl, double& bl, double& sl);
 78 | 
 79 | // candidate_masses and levels are references to pointers; presumably allocated and filled by the callee -- confirm.
 80 | void get_candidate_masses( gsl_matrix * clones, 
 81 | 			   gsl_vector * mass, 
 82 | 			   Clone * myClone, 
 83 | 			   gsl_matrix*& candidate_masses,
 84 | 			   gsl_vector*& levels,
 85 | 			   double min_occ);
 86 | 
 87 | double cna_llh_all_fixed(Clone * myClone);
 88 | 
 89 | double baf_clones( gsl_matrix*& clones, Clone* myClone, int restarts, int& steps);
 90 | 
 91 | double snv_clones_fixed_priors( gsl_matrix*& clones, Clone * myClone, int restarts, int& steps);
 92 | void snv_iterative_bulk_update( double& llh, gsl_matrix*& clones, Clone * myClone, int iter);
 93 | double snv_priors_fixed_clones( gsl_matrix*& priors, Clone * myClone, int restarts, int& steps);
 94 | double snv_clones_priors( gsl_matrix*& clones, gsl_matrix*& priors, Clone * myClone, int restarts, int& steps);
 95 | void snv_bulk_update(Clone * myClone);
 96 | 
 97 | void set_random_start_freq(gsl_vector *& freq, double lower);
 98 | void report_results( double cl, double bl, double sl, int steps, gsl_vector * mass, gsl_matrix * freq);
 99 | // Q has the (const gsl_vector*, void*) callback shape; p presumably points to a Q_par (below) -- confirm.
100 | double Q( const gsl_vector * x, void * p);
101 | struct Q_par{// parameter bundle handed to Q through its void* argument (presumably)
102 |   Clone * myClone;
103 |   int nSimplex;
104 |   std::vector<int> simplexD;// std:: qualified: this header has no using-directive of its own
105 |   int clones_fixed;
106 |   int mass_fixed;
107 |   int prior_fixed;
108 |   int cna,baf,snv;
109 | };
110 | 


--------------------------------------------------------------------------------
/run-example.sh:
--------------------------------------------------------------------------------
 1 | # RUN filterHD & cloneHD FOR A SIMULATED EXAMPLE DATA SET
 2 | # usage: run-example.sh [part]   (part: 1 = filterHD steps only, 2 = cloneHD steps only, omitted = both)
 3 | # fix the number of threads
 4 | export OMP_NUM_THREADS=4;
 5 | 
 6 | part=$1
 7 | 
 8 | # input data directory, results directory and executables
 9 | data="./test/data/"
10 | results="./test/results/"
11 | filterHD="./build/filterHD"
12 | cloneHD="./build/cloneHD"
13 | # input files, followed by filterHD outputs that later steps read back in
14 | normalCNA="${data}/normal.cna.txt"
15 | tumorCNA="${data}/tumor.cna.txt"
16 | tumorBAF="${data}/tumor.baf.txt"
17 | tumorSNV="${data}/tumor.snv.txt"
18 | bias="${results}/normal.cna.posterior-1.txt"
19 | tumorCNAjumps="${results}/tumor.cna.bias.jumps.txt"
20 | tumorBAFjumps="${results}/tumor.baf.jumps.txt"
21 | 
22 | ### filterHD ### (runs when no part is given or part is 1)
23 | if [ -z "$part" ] || [ "$part" -eq 1 ]
24 | then
25 |     echo "*** filterHD ***"
26 |     echo    
27 | 
28 | #emission modes: 
29 | # 1: Binomial
30 | # 2: Beta-Binomial
31 | # 3: Poisson
32 | # 4: Negative Binomial
33 |  
34 | # The normal read depth is analysed to estimate the technical read depth modulation. This will be later used to account
35 | # for the bias field in cloneHD. In principle, jumps are not expected (so could set --jump 0). The simulations do not have 
36 | # random emissions. 
37 |     cmd="$filterHD --data $normalCNA --mode 3 --pre ${results}/normal.cna"
38 |     echo $cmd
39 |     $cmd
40 |     echo
41 | 
42 | # The tumor read depth is first analysed without bias to get a benchmark for the LLH value. The result will not be used later. 
43 | # In the tumor data, we do expect jumps, but we actually would like to learn the jumps only accounting for the bias field (below).
44 |     cmd="$filterHD --data $tumorCNA --mode 3 --pre ${results}/tumor.cna"
45 |     echo $cmd
46 |     $cmd
47 |     echo
48 | 
49 | # The tumor read depth is now analysed with the bias field from the matched normal. The diffusion constant is set to zero. 
50 | # If left free, it should converge to a very small value. The jump rate could be slightly higher. The LLH should be higher than
51 | # for the run above indicating the presence of the bias field. Now we are interested in the jumps.
52 |     cmd="$filterHD --data $tumorCNA --mode 3 --pre ${results}/tumor.cna.bias --bias $bias --sigma 0 --jumps 1"
53 |     echo $cmd
54 |     $cmd
55 |     echo
56 | 
57 | # The tumor BAF data is analysed, mainly to get the emission parameters (shape, rnd) and jumps. In principle, there could be jumps
58 | # visible in the BAF data, but not in the read depth (copy number neutral LOH within chromosomes). Diffusion should be switched off.
59 |     cmd="$filterHD --data $tumorBAF --mode 1 --pre ${results}/tumor.baf --sigma 0 --jumps 1 --reflect 1 --dist 1"
60 |     echo $cmd
61 |     $cmd
62 |     echo
63 | fi
64 | 
65 | if [ -z "$part" ] || [ "$part" -eq 2 ] 
66 | then
67 | ### cloneHD ### (runs when no part is given or part is 2)
68 |     echo "*** cloneHD ***"
69 |     echo "True mass and cell fractions:" `cat ${data}/clones.txt` 
70 |     echo
71 |     # The CNA and BAF data is analysed for subclonality.
72 |     # Try varying the --min-jump, --force and --max-tcn values and try --mass-gauging 0. 
73 |     # Try adding the SNV data to the mix.
74 |     cmd="$cloneHD --cna $tumorCNA --baf $tumorBAF --pre ${results}/tumor --bias $bias --seed 123 --trials 2\
75 |  --nmax 3 --force --max-tcn 4 --cna-jumps $tumorCNAjumps --baf-jumps $tumorBAFjumps --min-jump 0.01 --restarts 10 --mass-gauging 1" 
76 |     echo $cmd
77 |     $cmd
78 |     echo
79 |     cat ${results}/tumor.summary.txt
80 |     echo
81 | 
82 |     # Using the information from above, the SNV data is analysed. Try what happens removing the --avail-cn and --mean-tcn options.
83 |     cmd="$cloneHD --snv $tumorSNV --pre ${results}/tumorSNV --seed 123 --trials 2\
84 |  --nmax 3 --force --max-tcn 4 --restarts 10 --mean-tcn ${results}/tumor.mean-tcn.txt --avail-cn ${results}/tumor.avail-cn.txt"    
85 |     echo $cmd
86 |     $cmd
87 |     echo
88 |     cat ${results}/tumorSNV.summary.txt
89 | fi


--------------------------------------------------------------------------------
/src/emission.h:
--------------------------------------------------------------------------------
  1 | //emission.h
  2 | 
  3 | #include <stdio.h>
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <sstream>
  7 | #include <time.h>
  8 | #include <math.h>
  9 | #include <ctype.h> 
 10 | #include <string>
 11 | #include <map>
 12 | #include <set>
 13 | //#include <unordered_map>
 14 | #include <vector>
 15 | #include <list>
 16 | #include <algorithm>
 17 | 
 18 | // GSL headers...
 19 | #include "gsl/gsl_vector.h"
 20 | #include "gsl/gsl_matrix.h"
 21 | #include "gsl/gsl_randist.h"
 22 | #include "gsl/gsl_blas.h"
 23 | #include "gsl/gsl_sf_gamma.h"
 24 | #include "gsl/gsl_sf_psi.h"
 25 | #include "gsl/gsl_statistics_double.h"
 26 | #include "gsl/gsl_sort.h"
 27 | #include "gsl/gsl_cdf.h"
 28 | 
 29 | 
 30 | 
 31 | using namespace std;
 32 | 
 33 | class Emission{// observed data of one data set plus its emission-probability tables
 34 | public:
 35 |   Emission();
 36 |   void set(int ntimes, vector<int>& chrs, vector<int>& nsites, int grid);
 37 |   ~Emission();
 38 |   void clear();
 39 |   void delete_old_Emit();
 40 |   int is_set;
 41 |   void set_dist();
 42 |   int dist_set;
 43 |   int connect;// if non-zero, get_data() keeps rows whose reads are zero at all time points
 44 |   double median_dist;
 45 |   map<unsigned int, int> dist_count;
 46 |   map<unsigned int, int> frequent_dist;
 47 |   int get_log, get_der, get_mv;
 48 |   unsigned int nmax, Nmax;
 49 |   //unordered_map< unsigned int, unordered_map< unsigned int, gsl_vector*> > EmitProb;
 50 |   //unordered_map< unsigned int, unordered_map< unsigned int, gsl_vector*> > EmitLog;
 51 |   map< unsigned int, map< unsigned int, gsl_vector*> > EmitProb;// presumably keyed by the two observation counts (n, N) -- confirm key order
 52 |   map< unsigned int, map< unsigned int, gsl_vector*> > EmitLog;
 53 |   double shape, log_shape, rnd_emit;
 54 |   double minRate, maxRate;
 55 |   // --- emission model ---
 56 |   int mode, reflect, log_space;// mode: presumably the --mode code (1 Binomial, 2 Beta-Binomial, 3 Poisson, 4 Negative Binomial) -- confirm
 57 |   void set_EmitProb(int time);
 58 |   void binomial(int N, int n);
 59 |   void beta_binomial(int N, int n);
 60 |   void poisson(int N, int n);
 61 |   void negative_binomial(int N, int n);
 62 |   double get_single_EmitLog(double x, unsigned int n, unsigned int N);
 63 |   void get_eprob_wBias( gsl_vector * eprob, gsl_vector * emit, double b, unsigned int n, unsigned int N, int get_log);
 64 |   int EmitProb_set;
 65 |   // --- data dimensions, grid and raw observations ---
 66 |   int nTimes, nSamples;// a "sample" is one chromosome: get_data() uses idx_of[chr] as the sample index
 67 |   int gridSize;
 68 |   double dx,xmin,xmax;
 69 |   double dy,ymin,ymax;
 70 |   double * xgrid;
 71 |   double * ygrid;
 72 |   unsigned int *** reads;// indexed [time][sample][site] (see get_data())
 73 |   unsigned int *** depths;// indexed [time][sample][site] (see get_data())
 74 |   unsigned int ** loci;// indexed [sample][site]: genomic coordinate of each site
 75 |   unsigned int ** mask;
 76 |   unsigned int ** dist;
 77 |   unsigned int *** nObs;
 78 |   void get_nObs();
 79 |   int * nSites;// number of sites per sample
 80 |   int * chr;// chromosome id of each sample
 81 |   std::set<int> chrs;// chromosome ids present in this data set
 82 |   int * idx_of;// chromosome id -> sample index; get_bias() treats a negative entry as "not present"
 83 |   int maxchr;
 84 |   double ** bias;// indexed [sample][site] (see get_bias())
 85 |   double ** log_bias;
 86 |   void allocate_bias();
 87 |   void allocate_mean_tcn();
 88 |   void allocate_av_cn(int maxcn);
 89 |   int total_loci, total_events;
 90 |   unsigned int total_dist;
 91 |   void set_grid();
 92 |   void init_range(int time);
 93 |   int range_set;
 94 |   void reset_mask();
 95 |   double get_pval(int time, int sample, int site, double mean);
 96 |   void coarse_grain_jumps( int sample, double plow, int range);
 97 |   double ** pjump;
 98 |   void set_pjump(double jump);
 99 |   double *** mean_tcn;//mean total copy number
100 |   double **** av_cn;//copy number availability
101 |   void init_events();
102 |   int * nEvents;
103 |   unsigned int ** Event_of_idx;// map from idx to cnv-event 
104 |   void map_idx_to_Event(Emission * Emit, int sample);
105 |   void map_jumps(Emission * Emit);
106 |   void add_break_points_via_jumps(Emission * Emit, double pmin);
107 |   void get_events_via_jumps();
108 |   unsigned int ** idx_of_event;// map from event to idx
109 |   unsigned int ** event_of_idx;// map from idx to event
110 |   int idx_to_Event_mapped;
111 |   int coarse_grained;
112 | };
113 | 
114 | // Binary predicate over map<int,double> entries; presumably orders them by mapped value -- confirm in the definition.
115 | bool value_comparer(std::map<int,double>::value_type &i1, std::map<int,double>::value_type &i2);
116 | 


--------------------------------------------------------------------------------
/docs/README-filterHD.md:
--------------------------------------------------------------------------------
  1 | # filterHD command line arguments
  2 | 
  3 | ## Typical usage options
  4 | 
  5 | *    `--data [file]`  Input data. 
  6 | 
  7 |      The file format is the same as in cloneHD for `--cna`, `--baf` or
  8 |      `--snv` (see  [here](./README-cloneHD.md)). Multiple samples are processed independently, one by one.
  9 | 
 10 | *    `--mode [1/2/3/4]`  Emission modes.
 11 | 
 12 |         1. Binomial (for SNV data and BAF data (use with `--reflect 1`))
 13 |         2. Beta-Binomial (over-dispersed Binomial)
 14 |         3. Poisson (for read depth data)
 15 |         4. Negative-Binomial (over-dispersed Poisson)
 16 | 
 17 |     In modes 3/4, the range of the hidden emission rate is learned
 18 |     automatically. For modes 1/2, it is always in [0,1]. Reflective
 19 |     boundary conditions are used.
 20 | 
 21 | *    `--pre [string:"./out"]`  Prefix for all output files.
 22 | 
 23 | *    `--dist [0/1:0]`  Whether to also print the posterior distribution. 
 24 |      
 25 |      The posterior mean, std-dev and jump probability are always printed  to files
 26 |      `pre.posterior-[int].txt`, one for each sample in the input. With 1, the
 27 |      whole posterior distribution is also printed, so files can be big. 
 28 | 
 29 | *    `--jumps [0/1:0]`  Whether to print posterior jump probability. 
 30 | 
 31 |      The posterior jump probability is compounded over all samples. It
 32 |      can be used with `--min-jump [double]` below, to consolidate jumps.
 33 | 
 34 | *    `--reflect [0/1:0]`  If 1, binomial observations `n in N` and
 35 |      `(N-n) in N` are assumed to be identical. Use this option for BAF data.
 36 | 
 37 | ## Parameter options
 38 | 
 39 | The continuous state space HMM underlying filterHD is determined by the following global
 40 | parameters. They can all be fixed, otherwise they are learned from the data.
 41 | 
 42 | *    `--jump [double]`   Fix the jump probability per length unit (bp).
 43 | *    `--sigma [double]`  Fix the diffusion constant. 
 44 | *    `--shape [double]`  Fix the shape parameter for modes 2/4. If >1000, use modes 1/3.
 45 | *    `--rnd [double]`    Fix the rate of random emissions.
 46 | 
 47 | For all of the above parameters, initial values for the numerical
 48 | optimization can be given. This might be useful if you suspect several
 49 | local optima and want to start in the neighbourhood of a particular one.
 50 | 
 51 | *    `--jumpi [double]`
 52 | *    `--sigmai [double]`
 53 | *    `--shapei [double]`
 54 | *    `--rndi [double]`
 55 | 
 56 | ## Further advanced options
 57 | 
 58 | *    `--min-jump [double:0.0]`  Consolidate jumps down to `--min-jump`.
 59 | 
 60 |      The posterior jump probability track will be consolidated by merging neighboring jump events into
 61 |      unique jumps, down to the minimum value given here. Can only be used together with
 62 |      `--jumps 1`. 
 63 | 
 64 | *    `--filter-pVal [0/1:0]`  Use p-Value filter.
 65 | 
 66 |      Filter sites where the p-Value of the
 67 |      observation is below `10/nSites`, where `nSites` is the total number
 68 |      of sites in a sample.
 69 | 
 70 | *    `--filter-shortSeg [int:0]` Use short-segment filter.
 71 | 
 72 |      Filter sites within short segments between jumps. All filtered data will be in the file ending `pre.filtered.txt`, which will be in the same format as the input file.
 73 | 
 74 | *    `--grid [int:100]`  Set the grid size.
 75 | 
 76 |      The grid size for the internal representation of continuous distributions. For large ranges in
 77 |      mode 3/4, it can make sense to increase this resolution.
 78 | 
 79 | # filterHD output  
 80 | 
 81 | filterHD generates a few output files automatically. Here, we provide annotated screenshots for them for the simulated example data set.
 82 | 
 83 | ## STDOUT
 84 | 
 85 | ![stdout](/images/screenshots/filterHD-stdout.png "filterHD stdout")
 86 | 
 87 | ## Output file
 88 | 
 89 | ![posterior1](/images/screenshots/filterHD-posterior-1.png "filterHD posterior")
 90 | 
 91 | The posterior mean value of the hidden emission rate and jump probabilities
 92 | 
 93 | ![posterior2](/images/screenshots/filterHD-posterior-2.png "filterHD posterior")
 94 | 
 95 | The same as above, but here a bias (normal) was used, so the rate is scaled accordingly. Note: in filterHD, the bias field is not scaled to have mean 1!
 96 | 
 97 | 
 98 | ![posterior3](/images/screenshots/filterHD-posterior-3.png "filterHD posterior")
 99 | 
100 | The same as above, but here the whole posterior distribution was requested with `--dist 1`


--------------------------------------------------------------------------------
/src/clone-predict.cpp:
--------------------------------------------------------------------------------
  1 | //clone-predict.cpp
  2 | 
  3 | //own headers...
  4 | #include "emission.h"
  5 | #include "log-space.h"
  6 | #include "clone.h"
  7 | 
  8 | using namespace std;
  9 | 
 10 | // (Re)build TransMat_cna: one freshly allocated nLevels x nLevels matrix per CNA sample (chromosome).
 11 | void Clone::set_TransMat_cna(){
 12 |   const int nChr = cnaEmit->nSamples;
 13 |   if (TransMat_cna == NULL){//first call: create the array with all slots empty
 14 |     TransMat_cna = new gsl_matrix * [nChr]();
 15 |   }
 16 |   for (int c=0; c<nChr; c++){
 17 |     if (TransMat_cna[c] != NULL) gsl_matrix_free(TransMat_cna[c]);//discard the previous matrix
 18 |     TransMat_cna[c] = gsl_matrix_alloc(nLevels,nLevels);
 19 |     set_TransMat_cna( TransMat_cna[c], cnaEmit->chr[c]);
 20 |   }
 21 | }
 22 | 
 23 | // Fill Trans (nLevels x nLevels) with the CNA transition matrix.
 24 | // Entry (i,j) is non-zero iff every clone whose copy number differs between
 25 | // levels i and j changes by the same amount (cn_i - cn_j), i.e. either a single
 26 | // clone changes, or several clones change in parallel by an identical step.
 27 | // All allowed transitions get equal weight; each row is then normalised to sum to 1.
 28 | // The chr argument is not used here. Aborts if a row has no allowed transition.
 29 | void Clone::set_TransMat_cna( gsl_matrix * Trans, int chr){
 30 |   double norm;
 31 |   gsl_vector_view row;
 32 |   int jumps,cni=0,cnf=0;//initialised: they are only read after the first change has set them
 33 |   for (int i=0; i<nLevels; i++){
 34 |     for (int j=0; j<nLevels; j++){
 35 |       jumps=0;
 36 |       for(int k=0; k < nClones; k++){
 37 | 	// NOTE: the per-chromosome max-tcn restriction is switched off for CNA:
 38 | 	/*if (copynumber[j][k] > maxtcn_per_clone[chr][k]){
 39 | 	  jumps = 2;
 40 | 	  break;
 41 | 	  }*/
 42 | 	if( copynumber[i][k] != copynumber[j][k]){
 43 | 	  if ( jumps==0 ){//the first changing clone defines the reference step
 44 | 	    cni = copynumber[i][k];
 45 | 	    cnf = copynumber[j][k];
 46 | 	    jumps++;
 47 | 	  }
 48 | 	  else if (cni - cnf != copynumber[i][k] - copynumber[j][k]){
 49 | 	    jumps++;//a different step counts as an additional jump
 50 | 	  }
 51 | 	}
 52 |       }
 53 |       gsl_matrix_set( Trans, i, j, jumps <= 1 ? 1.0 : 0.0);
 54 |     }
 55 |     row  = gsl_matrix_row(Trans,i);
 56 |     norm = gsl_blas_dasum(&row.vector);
 57 |     if (norm <= 0){
 58 |       cout<<"ERROR in Clone::set_TransMat_cna(): row without allowed transition\n";
 59 |       abort();
 60 |     }
 61 |     gsl_vector_scale(&row.vector,1.0/norm);
 62 |   }
 63 | }
 64 | 
 65 | // (Re)build TransMat_snv: one freshly allocated nLevels x nLevels matrix per SNV sample (chromosome).
 66 | void Clone::set_TransMat_snv(){
 67 |   const int nChr = snvEmit->nSamples;
 68 |   if (TransMat_snv == NULL){//first call: create the array with all slots empty
 69 |     TransMat_snv = new gsl_matrix * [nChr]();
 70 |   }
 71 |   for (int c=0; c<nChr; c++){
 72 |     if (TransMat_snv[c] != NULL) gsl_matrix_free(TransMat_snv[c]);//discard the previous matrix
 73 |     TransMat_snv[c] = gsl_matrix_alloc(nLevels,nLevels);
 74 |     set_TransMat_snv( TransMat_snv[c], snvEmit->chr[c]);
 75 |   }
 76 | }
 77 | 
 78 | 
 79 | // Fill Trans (nLevels x nLevels) with the SNV transition matrix for chromosome chr.
 80 | // A transition from->to is forbidden if, in level 'to', any clone exceeds
 81 | // maxtcn_per_clone[chr]. Otherwise it is allowed when at most one clone changes;
 82 | // changes of clones that are already above their maximum in level 'from' are
 83 | // not counted. Allowed entries are 1 before each row is normalised to sum 1.
 84 | // Prints "ERROR" and aborts if a row has no allowed transition.
 85 | void Clone::set_TransMat_snv(gsl_matrix * Trans, int chr){
 86 |   for (int from=0; from<nLevels; from++){
 87 |     for (int to=0; to<nLevels; to++){
 88 |       bool target_ok = true;
 89 |       int changed = 0;
 90 |       // stop scanning the clones as soon as the target level is ruled out
 91 |       for(int c=0; c < nClones && target_ok; c++){
 92 | 	if (copynumber[to][c] > maxtcn_per_clone[chr][c]){
 93 | 	  target_ok = false;
 94 | 	}
 95 | 	else if (copynumber[from][c] != copynumber[to][c]
 96 | 		 && copynumber[from][c] <= maxtcn_per_clone[chr][c]){
 97 | 	  changed++;
 98 | 	}
 99 |       }
100 |       const double weight = (target_ok && changed <= 1) ? 1.0 : 0.0;
101 |       gsl_matrix_set(Trans, from, to, weight);
102 |     }
103 |     // normalise the row just filled
104 |     gsl_vector_view row = gsl_matrix_row(Trans,from);
105 |     const double norm = gsl_blas_dasum(&row.vector);
106 |     if (norm <= 0){
107 |       cout<<"ERROR\n";
108 |       abort();
109 |     }
110 |     gsl_vector_scale(&row.vector,1.0/norm);
111 |   }
112 | }
113 | 
114 | 
115 | // Predict step with a transition matrix: prior = (1-pj)*post + pj*T^T*post.
116 | // With myEmit->log_space set and pj!=0, post is exponentiated in place first.
117 | void Clone::predict( gsl_vector * prior, gsl_vector * post, Emission * myEmit, double pj, gsl_matrix * T){
118 |   const bool in_log = myEmit->log_space;
119 |   if (pj != 0.0 && in_log){//leave log space for the linear update
120 |     for (int l=0;l<nLevels;l++) post->data[l] = exp(post->data[l]);
121 |   }
122 |   gsl_vector_memcpy( prior, post);
123 |   if (pj == 0.0) return;//no jump possible
124 |   gsl_blas_dgemv( CblasTrans, pj, T, post, 1.0-pj, prior);
125 |   if (!in_log) return;
126 |   for (int l=0;l<nLevels;l++){//back to log space
127 |     const double v = prior->data[l];
128 |     prior->data[l] = v>0.0 ? log(v) : logzero;
129 |   }
130 | }
131 | 
132 | 
133 | 
134 | // Predict step with convex mixing of post and flat, weighted (1-pj) and pj.
135 | // pj==0 copies post into prior; pj not below 1 copies flat into prior.
136 | // In between, the weights are applied by scaling (or, if myEmit->log_space
137 | // is set, by adding log-weights and combining with log_vector_add()).
138 | // Side effect: in the mixing case post is rescaled in place.
139 | void Clone::predict( gsl_vector * prior, gsl_vector * post, Emission * myEmit, double pj, gsl_vector * flat){
140 |   if (pj==0.0){//no jump possible
141 |     gsl_vector_memcpy(prior,post);
142 |     return;
143 |   }
144 |   gsl_vector_memcpy( prior, flat);
145 |   if (!(pj<1.0)) return;//no contribution from post
146 |   if (myEmit->log_space){
147 |     gsl_vector_add_constant(prior,log(pj));
148 |     gsl_vector_add_constant(post,log(1.0-pj));
149 |     log_vector_add(prior,post);
150 |     return;
151 |   }
152 |   gsl_vector_scale(prior,pj);
153 |   gsl_vector_scale(post,1.0-pj);
154 |   gsl_vector_add(prior,post);}
155 | // Mask prior by the per-clone maximum copy numbers of chromosome chr:
156 | // every level in which some clone exceeds maxtcn_per_clone[chr] is set to 0.0,
157 | // then prior is rescaled to unit sum (abort if nothing is left). If log_space
158 | // is non-zero the result is finally converted to logs (logzero for zeros).
159 | void Clone::apply_maxtcn_mask(gsl_vector * prior, int chr, int log_space){
160 |   for (int lev=0; lev<nLevels; lev++){
161 |     int c = 0;//advance to the first clone above its maximum, if any
162 |     while (c < nClones && !(copynumber[lev][c] > maxtcn_per_clone[chr][c])) c++;
163 |     if (c < nClones) prior->data[lev] = 0.0;
164 |   }
165 |   const double total = gsl_blas_dasum(prior);
166 |   if (total <= 0.0) abort();
167 |   gsl_vector_scale(prior,1.0/total);
168 |   if (!log_space) return;
169 |   for (int lev=0; lev<nLevels; lev++){
170 |     const double v = prior->data[lev];
171 |     prior->data[lev] = v>0.0 ? log(v) : logzero;
172 |   }
173 | }
174 | 
175 | 


--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
  1 | # changelog for cloneHD/filterHD
  2 | 
  3 | ## v1.17.9 / to come
  4 | 
  5 | * bug fix for sparse data with singletons in a chr (bug-001)
  6 | 	
  7 | ## v1.17.8 / 29.05.2014
  8 | 
  9 | *  added checks whether files are open for writing
 10 | *  changed to new defaults: `--(cna/baf/snv)-rnd [double:1.0e-6]` (nan)
 11 | *  allowed `--cna-jump -1` and `--baf-jump -1` (no jumps)
 12 | *  `--cna-jumps [baf-jumps-file]` and vice versa enabled (useful for exome data)
 13 | *  jumps read and integrated with new function match_jumps() (not get_track()).
 14 | * fixed bug when chromosomes have no non-zero observations.
 15 | 
 16 | ## v1.17.7 / 25.04.2014
 17 | 
 18 | *  fixed range error in `pre-filter` in pick-from/match-to mode.
 19 | 
 20 | ## v1.17.6 / 24.04.2014
 21 | 
 22 | *  fixed nan bug in GOF, when N==0 (missing data).
 23 | *  fixed bugs in `pre-filter`, when `--window-size` is greater than length
 24 | *  fixed bug in `pre-filter` in pick-from-match-to mode
 25 | 
 26 | ## v1.17.5 / 22.04.2014
 27 | 
 28 | *  fixed memory alloc bug in pre-filter
 29 | *  abandon ftp site for releases, used only for backup and beta
 30 | 
 31 | ## v1.17.4 / 10.04.2014
 32 | 
 33 | *  fixed fatal bug in snv-mode with correlations
 34 | 
 35 | ## v1.17.3 / 04.04.2014
 36 | 
 37 | *  new program `pre-filter`
 38 | *  `--snv-pen` to `--snv-pen-high` and `--snv-pen-mult`
 39 | *  `--baf-pen` to `--baf-pen-compl`
 40 | *  `--cna-pen` to `--cna-pen-zero`, `--cna-pen-diff` and `--cna-pen-norm`
 41 | *  split README
 42 | *  fixed bug in SNV transition matrix in combination with `--max-tcn [file]`
 43 | 
 44 | ## v1.17.2 / 27.03.2014
 45 | 
 46 | *  new output: posterior per subclone, goodness of fit (GOF) per
 47 |    segment
 48 | *  changed file name `*clonal.txt` -> `*summary.txt`
 49 | *  filterHD STDOUT includes now GOF per sample
 50 | *  cloneHD `*summary.txt` includes now GOF per sample
 51 | *  changed `_` to `-` in all file names
 52 | *  fixed bug: BAF now symmetrized only in per-subclone-posterior
 53 | *  new CNA prior to penalize homozygous deletions `--cna-pen [double:0.9]`
 54 | 
 55 | ## v1.17.1 / 01.03.2014
 56 | 
 57 | *  BAF posterior symmetrized for output
 58 | *  CNA transition matrix penalizes clones with zero copies of a segment
 59 | *  fixed bug in SNV prior computation
 60 | *  added pre-processor directives for conditional openMP compilation
 61 | 
 62 | ## v1.17.0 / 25.02.2014 major release
 63 | 
 64 | ### changed the way SNV priors are computed:
 65 | 
 66 | *  if CNA given: SNV prior informed by CNA posterior
 67 | *  if CNA+BAF given, SNV prior informed by BAF+CNA posterior
 68 | *  if SNV only and `--max-tcn` not given, assumes all chr to be
 69 |    all-normal, mean total c.n. to be normal; SNV prior parameters can
 70 |    be learned with `--learn-priors 1`.
 71 | *  if SNV only and `--max-tcn [int/file]` is given, this data is used
 72 |    to fix the total c.n. per chr and subclone; mean total c.n. is
 73 |    calculated on the fly; SNV prior parameters can be learned with
 74 |    `--learn-priors 1`.
 75 | *  if SNV only and `--max-tcn [int/file]` and `--avail-cn [file]` are
 76 |    given, SNV prior is calculated according to c.n. availability.
 77 | 
 78 | ### more changes
 79 | 
 80 | *  changed option `--copynumber [file]` to  `--mean-tcn [file]`
 81 | *  new option  `--avail-cn [file]`
 82 | *  changed option `--maxcn [int:4]` to `--max-tcn [file/int]`
 83 | *  changed option `--snv-err [double]` to `--snv-fpfreq [double]`
 84 | *  changed option `--snv-fpr [double]` to `--snv-fprate [double]`
 85 | *  output file `*used-tcn.txt` to `*used_mean_tcn.txt`
 86 | *  output file `*copynumber.txt` to `*mean_tcn.txt`
 87 | *  new output file `*available_cn.txt`
 88 | *  changed `sample` to `chr` in cloneHD output files
 89 | *  slimmed down output of `--print-options`.
 90 | *  split clone.cpp into components clone-*.cpp
 91 | *  split off cloneHD-inference.cpp
 92 | *  new Makefile
 93 | 
 94 | ## v1.16.7 / 19.02.2014
 95 | 
 96 | *  fixed bug in SNV w/ corr mode when --bulk-fix is used
 97 | *  introduced different grid sizes for CNA, BAF and SNV
 98 | *  fixed bug in Clone::get_interpolation(), at the boundaries
 99 | *  fixed bug in Clone::trapezoidal() (affected --bulk-prior vs --bulk-mean consistency)
100 | 
101 | ## v1.16.6 / 12.02.2014
102 | 
103 | *  fixed major bug for SNV false positive emission rate and prior
104 | *  introduced new functions:  Clone::update_snv_site_ncorr/fixed/nfixed()
105 | *  fixed bug in SNV prior from CNA/BAF posterior computation (BAF normalization)
106 | *  false positive SNV prior now includes P(c=all-zero)
107 | *  fixed bug in used cn output
108 | *  all-zero "observations" in SNV input (w/o corr) are ignored (and not printed!)
109 | *  fixed bug in filterHD: all-zero observations are always retained.
110 | 
111 | ## v1.16.5 / 07.02.2014
112 | 
113 | *  fixed major bug when CNA, BAF and SNV data used with males (X,Y with only one copy)
114 | *  fixed bug in Clone::snv_prior_from_cna_baf_post()
115 | *  fixed bug in posterior output for BAF and SNV
116 | *  introduced prior masking for all update functions
117 | *  introduced `--maxcn_mask [file]` option to limit total c.n. per chromosome
118 | *  static linking of both libgcc and libstdc++ for increased portability
119 | 
120 | ## v1.16.4 / 30.01.2014
121 | 
122 | *  filterHD: if `--reflect 1`, use only posterior in [0,0.5] for mean/std-dev
123 | *  fixed bug with `--bulk-fix 0.0`
124 | 
125 | ## v1.16.3 / 13.01.2014
126 | 
127 | *  introduced the option `--mass-gauging [0/1:1]` to switch off the mass gauging for cna data.
128 | 
129 | ## v1.16.2 / 12.01.2014
130 | 
131 | *  snp -> snv and cnv -> cna in all code
132 | *  introduced `--chr [file]`, candidate masses are computed via majority normal copy number
133 | 
134 | ## v1.16.1 / 10.01.2014
135 | 
136 | *  cnv to cna  for all command line options
137 | *  cnv to cna in all output file names and content
138 | *  filterHD stdout modified
139 | 
140 | ## v1.16 / 03.01.2014
141 | 
142 | *  first stable release of cloneHD
143 | 


--------------------------------------------------------------------------------
/src/common-functions.cpp:
--------------------------------------------------------------------------------
  1 | //common-functions.cpp
  2 | 
  3 | #include "common-functions.h"
  4 | #include "emission.h"
  5 | 
  6 | #define PI 3.1415926
  7 | #define LOG2 0.693147
  8 | 
  9 | using namespace std;
 10 | 
 11 | // Scan data_fn once and report its dimensions (exits with status 1 if the file cannot be opened):
 12 | // nTimes = number of (reads,depth) pairs on the first data line; chrs/nSites = one entry per run of equal chr ids, in file order.
 13 | void get_dims( const char * data_fn, 
 14 | 	       int& nTimes,
 15 | 	       vector<int>& chrs,
 16 | 	       vector<int>& nSites,
 17 | 	       int keep
 18 | 	       ){
 19 |   ifstream data_ifs;
 20 |   string line;
 21 |   stringstream line_ss;
 22 |   data_ifs.open( data_fn, ios::in);
 23 |   if (data_ifs.fail()){
 24 |     printf("ERROR: file %s cannot be opened.\n", data_fn);
 25 |     exit(1);
 26 |   }
 27 |   nSites.clear();
 28 |   chrs.clear();
 29 |   int ct=0,l,r,d;
 30 |   int chr=0,old=-1,nT=0;
 31 |   while( data_ifs.good()){
 32 |     line.clear();
 33 |     getline( data_ifs, line);
 34 |     if (line.empty()) break;// the first empty line ends the scan
 35 |     if (line[0] == '#') continue;// comment line
 36 |     line_ss.clear();
 37 |     line_ss.str(line);
 38 |     //first data line only: count the (reads,depth) pairs to get nTimes
 39 |     if (old == -1 && ct == 0){
 40 |       line_ss >> chr >> l; 
 41 |       while(line_ss >> r >> d){
 42 | 	nT++;
 43 |       }
 44 |       line_ss.clear();// rewind so that the line is parsed again below
 45 |       line_ss.str(line);      
 46 |     }
 47 |     line_ss >> chr >> l;
 48 |     if (chr != old ){//new chromosome encounter     
 49 |       if (ct>0){// runs without any counted row are not reported
 50 | 	nSites.push_back(ct);
 51 | 	chrs.push_back(old);
 52 |       }
 53 |       ct=0;
 54 |     }
 55 |     old=chr;
 56 |     r = 0;
 57 |     for( int t=0;t<nT; t++){// stop at the first time point with reads > 0
 58 |       line_ss >> r >> d;
 59 |       if (r>0) break;
 60 |     }
 61 |     if (keep || r>0) ct++;// count the row if keep is set or any time point has reads > 0
 62 |   }
 63 |   if (ct>0){// flush the last run
 64 |     nSites.push_back(ct);
 65 |     chrs.push_back(old);
 66 |   }
 67 |   nTimes = nT;
 68 |   data_ifs.close();
 69 | }
 70 | 
 71 | // Fill myEmit->loci, reads and depths from data_fn (exits with status 1 if the file cannot be opened or a row has depth 0 with reads > 0).
 72 | // read in data: expects columns to be "chr location (reads depth)^x"
 73 | void get_data( const char * data_fn, Emission * myEmit){
 74 |   ifstream data_ifs;
 75 |   string line;
 76 |   stringstream line_ss;
 77 |   data_ifs.open( data_fn, ios::in);
 78 |   if (data_ifs.fail()){
 79 |     printf("ERROR: file %s cannot be opened.\n", data_fn);
 80 |     exit(1);
 81 |   }
 82 |   int ct=0,l;
 83 |   int chr=0,old=-1, sample=0;
 84 |   int d,r, keep=0, wait=0;
 85 |   //now collect all data...
 86 |   while( data_ifs.good()){
 87 |     line.clear();
 88 |     getline( data_ifs, line);
 89 |     if (line.empty()) break;// the first empty line ends the input
 90 |     if (line[0] == '#') continue;// comment line
 91 |     line_ss.clear();
 92 |     line_ss.str(line);
 93 |     line_ss >> chr >> l;//chromosome and locus
 94 |     if (chr != old){
 95 |       if (myEmit->chrs.count(chr) == 0){// chromosome unknown to myEmit: skip all of its rows
 96 | 	printf("WARNING: chr %2i in file %s will be ignored.\n", chr, data_fn);
 97 | 	wait = 1;
 98 |       }
 99 |       else{
100 | 	sample = myEmit->idx_of[chr];
101 | 	ct  = 0;// restart at site 0 (a chromosome that re-appears later overwrites its earlier rows)
102 | 	wait = 0;
103 |       }
104 |       old = chr;
105 |     }
106 |     if (wait) continue;
107 |     if (ct >= myEmit->nSites[sample]) continue;// never write beyond the allocated number of sites
108 |     keep = 0;
109 |     for (int t=0; t<myEmit->nTimes; t++){
110 |       myEmit->loci[sample][ct] = l;//set locus
111 |       line_ss >> r >> d;//get read and depth
112 |       if (d == 0 && r > 0){
113 | 	printf("ERROR: depth = 0 in chr %i locus %i\n", chr, l);
114 | 	cout<<line<<endl;
115 | 	exit(1);
116 |       }
117 |       //set read and depth
118 |       myEmit->reads[t][  myEmit->idx_of[chr] ][ct] = r;
119 |       myEmit->depths[t][ myEmit->idx_of[chr] ][ct] = d;
120 |       if (r>0) keep=1;
121 |     }
122 |     if (keep || myEmit->connect) ct++;// an all-zero row is overwritten by the next row unless connect is set
123 |   }  
124 |   data_ifs.close();
125 |   // set the distances between loci; NOTE(review): set_dist() takes no time argument yet is called nTimes times -- confirm whether one call suffices
126 |   for (int t=0; t<myEmit->nTimes; t++){
127 |     myEmit->set_dist();
128 |   }
129 | }
130 | 
131 | 
132 | void get_bias(const char * bias_fn, Emission * myEmit){
133 |   ifstream ifs;
134 |   string line;
135 |   stringstream line_ss;
136 |   ifs.open( bias_fn, ios::in);
137 |   if (ifs.fail()){
138 |     printf("ERROR: file %s cannot be opened.\n", bias_fn);
139 |     exit(1);
140 |   }
141 |   int  chr = 0, old_chr = -1, idx=0, blocus=0, next_locus=0,l1=0;
142 |   double b=0,b1=0,nu=0;
143 |   while( ifs.good() ){
144 |     line.clear();
145 |     getline( ifs, line);
146 |     if (line.empty()) break;
147 |     if (line[0] == '#') continue;
148 |     line_ss.clear();
149 |     line_ss.str(line);
150 |     line_ss >> chr >> blocus; 
151 |     if (chr != old_chr && (chr > (int) myEmit->maxchr || myEmit->idx_of[chr] < 0)){
152 |       printf("ERROR 1 in get_bias()\n");
153 |       cout<<line<<endl;
154 |       exit(1);
155 |     }
156 |     if (chr != old_chr){ 
157 |       if (old_chr >=0){
158 | 	while ( idx < myEmit->nSites[myEmit->idx_of[old_chr]] ){
159 | 	  myEmit->bias[myEmit->idx_of[old_chr]][idx] = b;
160 | 	  idx++;
161 | 	}
162 |       }
163 |       idx=0;
164 |       next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx];
165 |       old_chr    = chr;
166 |     }
167 |     if (idx >= myEmit->nSites[ myEmit->idx_of[chr] ]) continue;    
168 |     line_ss >> b;
169 |     if (idx==0){
170 |       while ( next_locus < blocus ){//left overhang
171 | 	myEmit->bias[myEmit->idx_of[chr]][idx] = b;
172 | 	idx++;
173 | 	if (idx < myEmit->nSites[myEmit->idx_of[chr]]){
174 | 	  next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx];
175 | 	}
176 | 	if (idx >= myEmit->nSites[myEmit->idx_of[chr]]) break;    
177 |       }
178 |     }
179 |     if ( blocus <= next_locus ){
180 |       b1 = b;
181 |       l1 = blocus;
182 |     }
183 |     if ( blocus < next_locus ){
184 |       continue;
185 |     }
186 |     else if (blocus==next_locus){
187 |       myEmit->bias[myEmit->idx_of[chr]][idx] = b;
188 |       idx++;
189 |       if (idx < myEmit->nSites[myEmit->idx_of[chr]]){
190 | 	next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx];
191 |       }
192 |     }
193 |     else if (blocus > next_locus){
194 |       while ( next_locus <= blocus ){
195 | 	nu = double(next_locus-l1)/double(blocus-l1);
196 | 	myEmit->bias[myEmit->idx_of[chr]][idx] = b1*(1.0-nu) + b*nu;
197 | 	idx++;
198 | 	if (idx < myEmit->nSites[myEmit->idx_of[chr]]){
199 | 	  next_locus = (int) myEmit->loci[ myEmit->idx_of[chr] ][idx];
200 | 	}
201 | 	if (idx >= myEmit->nSites[myEmit->idx_of[chr]]) break;    
202 |       } 
203 |     }
204 |   }
205 |   ifs.close();
206 | }
207 | 
208 | 
209 | 
210 | //***Mean and Variance function***
211 | double get_mean(gsl_vector * dist, double xmin, double xmax){
212 |   double mean=0.0,P1,P2;
213 |   int n = (int) dist->size;
214 |   double dx = (xmax - xmin) / double(n-1);
215 |   for (int i=0; i < n-1; i++){
216 |     P1 = gsl_vector_get(dist,i);
217 |     P2 = gsl_vector_get(dist,i+1);
218 |     mean += 3.0*(P1+P2)*(xmin+double(i)*dx) + (P1+2.0*P2)*dx;
219 |   }
220 |   mean = mean * dx / 6.0;
221 |   return(mean);
222 | }
223 | 
224 | double get_var(gsl_vector * dist, double xmin, double xmax, double mean){
225 |   double var=0.0, P1, P2,dev;
226 |   int n = (int) dist->size;
227 |   double dx = (xmax - xmin) / double(n-1);
228 |   for (int i=0; i<n-1; i++){
229 |     P1 = gsl_vector_get(dist,i);
230 |     P2 = gsl_vector_get(dist,i+1);
231 |     dev = xmin + double(i)*dx - mean;
232 |     var += (P1+3.0*P2)*dx*dx + 4.0*(P1+2.0*P2)*dev*dx + 6.0*(P1+P2)*pow(dev,2);
233 |   }
234 |   var = var*dx/12.0;
235 |   return(var);
236 | }
237 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # How to get cloneHD and filterHD?
  2 | 
  3 | The current stable release, as well as pre-compiled executable binaries 
  4 | for Mac OS X and GNU Linux (64bit), can be found [here](https://github.com/andrej-fischer/cloneHD/releases). The cloneHD software is undergoing rapid development. Watch/Star this repo to receive updates.
  5 | 
  6 | # Run a test with simulated data
  7 | 
  8 | After downloading cloneHD from the release site, you can test both filterHD and cloneHD by running
  9 | 
 10 | `$ sh run-example.sh`
 11 | 
 12 | where you can see a typical workflow of analysing read depth and BAF
 13 | data with a matched normal. All command line arguments are explained below.
 14 | 
 15 | # Compilation  
 16 | 
 17 | For Mac OS X and GNU Linux (64bit), pre-compiled binaries are available [here](https://github.com/andrej-fischer/cloneHD/releases). To compile cloneHD yourself, you need the GNU scientific library ([GSL](http://www.gnu.org/software/gsl/)) v1.15 or later. Change the paths in the Makefile to point to your local GSL installation (if non-standard). Then type 
 18 | 
 19 | `$ make`
 20 | 
 21 | in the `src` directory. The executables will be in `build`. For debugging with gdb, use `make -f Makefile.debug`.
 22 | 
 23 | # Report bugs
 24 | 
 25 | To report bugs, use the [issue](https://github.com/andrej-fischer/cloneHD/issues) interface of github.
 26 | 
 27 | # Full documentation
 28 | 
 29 | The full documentation can be found in the `/docs/` subfolder. Click below.
 30 | 
 31 | *  [pre-filter](/docs/README-pre-filter.md)
 32 | *  [filterHD](/docs/README-filterHD.md)
 33 | *  [cloneHD](/docs/README-cloneHD.md)
 34 | 
 35 | # What are cloneHD and filterHD for?
 36 | 
 37 | cloneHD is a software for reconstructing the subclonal structure of a
 38 | population from short-read sequencing data. Read depth
 39 | data, B-allele count data and somatic nucleotide variant (SNV) data can be
 40 | used for the inference. cloneHD can estimate the number of subclonal
 41 | populations, their fractions in the sample, their individual total copy number profiles, 
 42 | their B-allele status and all the SNV genotypes with high resolution.
 43 | 
 44 | filterHD is a general purpose probabilistic filtering algorithm for one-dimensional
 45 | discrete data, similar in spirit to a Kalman filter. It is a continuous state
 46 | space Hidden Markov model with Poisson or Binomial emissions and a
 47 | jump-diffusion propagator. It can be used for scale-free smoothing, 
 48 | fuzzy data segmentation and data filtering. 
 49 | 
 50 | ![cna gof](/images/cna.gof.png "CNA goodness of fit")
 51 | ![baf gof](/images/baf.gof.png "BAF goodness of fit")
 52 | ![cna post](/images/cna.post.png "CNA posterior")
 53 | ![cna real](/images/cna.real.png "CNA real profile")
 54 | ![baf post](/images/baf.post.png "BAF posterior")
 55 | ![baf real](/images/baf.real.png "BAF real profile")
 56 | ![snv gof](/images/snv.gof.png "SNV goodness of fit")
 57 | 
 58 | Visualization of the cloneHD output for the simulated data set. From
 59 | top to bottom: 
 60 | (i) The bias corrected read depth data and the cloneHD
 61 | prediction (red).
 62 | (ii) The BAF (B-allele frequency), reflected at 0.5 and the cloneHD prediction (red).
 63 | (iii) The total copy number posterior.
 64 | (iv) The real total copy number profile.
 65 | (v) The minor copy number posterior.
 66 | (vi) The real minor copy number profile.
 67 | (vii) The observed SNV frequencies, corrected for local ploidy, and per genotype (SNVs are assigned randomly according to the cloneHD SNV posterior).
 68 | (All plots are created with Wolfram [Mathematica](http://www.wolfram.com/mathematica/).)
 69 | 
 70 | # Tips and tricks
 71 | 
 72 | * The read depth input files for filterHD and cloneHD can be generated from a bam-file with samtools. Given a bed file of non-overlapping 1kb windows, e.g.
 73 | 
 74 |         human-genome.1kb-grid.bed :
 75 | 
 76 |         1	9000	10000
 77 |         1	10000	11000
 78 |         ...
 79 | 
 80 |     `$ samtools bedcov human-genome.1kb-grid.bed sample.bam > read-depth.sample.txt`
 81 | 
 82 |         read-depth.sample.txt :
 83 |  
 84 |         1	9000	10000	12009
 85 |         1	10000	11000	213557
 86 |         ...
 87 | 
 88 |     `$ awk '{print $1,$3,int(0.5+$4/1000.0),1}'  read-depth.sample.txt > read-depth.sample.cloneHD.txt`
 89 | 
 90 |         read-depth.sample.cloneHD.txt :
 91 | 
 92 |         1 10000 12 1
 93 |         1 11000 214 1
 94 |         ...
 95 | 
 96 |   For 10 kb windows, you would use `$ awk '{print $1,$3,int(0.5+$4/10000.0),10}'` etc. For windows smaller than 1kb, the observations might not be approximately independent.
 97 | 
 98 | *  All input files are assumed to be sorted by chromosome and genomic coordinate. With Unix, this can be achieved with `sort -k1n,1 -k2n,2 file.txt > sorted-file.txt`.
 99 | 
100 | *  Pre-filtering of data is very important. If filterHD predicts
101 |    many more jumps than you would expect, it might be necessary to
102 |    pre-filter the data, removing variable regions, outliers or very short 
103 |    segments (use programs `pre-filter` and `filterHD`).
104 | 
105 | *  Make sure that the bias field for the tumor CNA data is
106 |    meaningful. If a matched normal sample was sequenced with the same
107 |    pipeline, its read depth profile, as predicted by filterHD, can be used as a
108 |    bias field for the tumor CNA data. Follow the logic of the example data
109 |    given here.
110 | 
111 | *  If the matched-normal sample was sequenced at lower coverage than the tumor sample, 
112 |    it might be necessary to run filterHD with a higher-than-optimal diffusion constant 
113 |    (set with `--sigma [double]`) to obtain a more faithful bias field. Otherwise, the 
114 |    filterHD solution is too stiff and you lose bias detail.
115 | 
116 | *  filterHD can sometimes run into local optima. In this case, it might be useful to
117 |    set initial values for the parameters via `--jumpi [double]` etc.
118 | 
119 | *  By default, cloneHD runs with mass-gauging enabled. This seems wasteful,
120 |    but is actually quite useful because you can see some alternative explanations
121 |    during the course of the analysis.
122 | 
123 | *  Don't put too much weight on the BIC criterion. It was calibrated
124 |    using simulated data. For real data, it should be supplemented with
125 |    common sense and biological knowledge. Use `--force [int]` to use a
126 |    fixed number of subclones and `--max-tcn [int]` to set the maximum possible total
127 |    copy number.
128 | 
129 | *  If high copy numbers are expected only in a few chromosomes, you can increase performance
130 |    by using the `--max-tcn [file]` option to specify per-chromosome upper limits.
131 | 
132 | *  For exome sequencing data, the read depth bias can be enormous. The filterHD estimate of the bias field might not be very useful, especially in segmenting the tumor data.
133 |    Use rather, if available, the jumps seen in the BAF data for both CNA and BAF data
134 |    (give the BAF jumps file to both `--cna-jumps` and `--baf-jumps`).
135 | 
136 | # How to cite
137 | 
138 | The cloneHD and filterHD software is free under the GNU General Public License v3.
139 | If you use this software in your work, please cite the accompanying publication:
140 | 
141 | Andrej Fischer, Ignacio Vazquez-Garcia, Christopher J.R. Illingworth and Ville Mustonen. High-definition reconstruction of subclonal composition in cancer. Cell Reports (2014), http://dx.doi.org/10.1016/j.celrep.2014.04.055
142 | 


--------------------------------------------------------------------------------
/src/clone-bulk.cpp:
--------------------------------------------------------------------------------
  1 | //clone-bulk.cpp
  2 | 
  3 | //own headers...
  4 | #include "emission.h"
  5 | #include "log-space.h"
  6 | #include "common-functions.h"
  7 | #include "clone.h"
  8 | 
  9 | using namespace std;
 10 | 
 11 | 
 12 | 
 13 | void Clone::allocate_bulk_mean(){//mean only...
 14 |   if (snvEmit->is_set==0) abort();
 15 |   //allocate prior...
 16 |   bulk_prior_mean = new double * [snvEmit->nSamples];
 17 |   for (int s=0; s<snvEmit->nSamples; s++){
 18 |     bulk_prior_mean[s] = new double [ snvEmit->nSites[s] ];
 19 |   }
 20 |   //allocate posterior...
 21 |   bulk_post_mean  = new double ** [nTimes];
 22 |   for (int t=0; t<nTimes; t++){
 23 |     bulk_post_mean[t] = new double * [snvEmit->nSamples];   
 24 |     for (int s=0; s<snvEmit->nSamples; s++){
 25 |       bulk_post_mean[t][s] = new double [snvEmit->nSites[s]];
 26 |     }
 27 |   }
 28 |   //set pointers to prior...
 29 |   bulk_mean = new double ** [nTimes];
 30 |   for (int t=0; t<nTimes; t++){
 31 |     bulk_mean[t] = new double * [snvEmit->nSamples];   
 32 |     for (int s=0; s<snvEmit->nSamples; s++){
 33 |       bulk_mean[t][s] = bulk_prior_mean[s];
 34 |     }
 35 |   }
 36 | }
 37 | 
 38 | void Clone::allocate_bulk_dist(){//distribution and mean...
 39 |   if (snvEmit->is_set==0) abort();
 40 |   //allocate prior...
 41 |   bulk_prior      = new gsl_matrix * [snvEmit->nSamples];
 42 |   bulk_prior_mean = new double * [snvEmit->nSamples];
 43 |   for (int s=0; s<snvEmit->nSamples; s++){
 44 |     bulk_prior[s] = gsl_matrix_calloc( snvEmit->nSites[s], bulkGrid + 1);
 45 |     bulk_prior_mean[s] = new double [ snvEmit->nSites[s] ];
 46 |   }
 47 |   //allocate posterior...
 48 |   bulk_post       = new gsl_matrix ** [nTimes];
 49 |   bulk_post_mean  = new double ** [nTimes];
 50 |   for (int t=0; t<nTimes; t++){
 51 |     bulk_post[t] = new gsl_matrix * [snvEmit->nSamples];   
 52 |     bulk_post_mean[t] = new double * [snvEmit->nSamples];   
 53 |     for (int s=0; s<snvEmit->nSamples; s++){
 54 |       bulk_post[t][s] = gsl_matrix_calloc( snvEmit->nSites[s], bulkGrid + 1);
 55 |       bulk_post_mean[t][s] = new double [snvEmit->nSites[s]];
 56 |     }
 57 |   }
 58 |   //set pointers to prior...
 59 |   bulk_dist = new gsl_matrix ** [nTimes];
 60 |   bulk_mean = new double ** [nTimes];
 61 |   for (int t=0; t<nTimes; t++){
 62 |     bulk_dist[t] = new gsl_matrix * [snvEmit->nSamples];   
 63 |     bulk_mean[t] = new double * [snvEmit->nSamples];   
 64 |     for (int s=0; s<snvEmit->nSamples; s++){
 65 |       bulk_dist[t][s] = bulk_prior[s];
 66 |       bulk_mean[t][s] = bulk_prior_mean[s];
 67 |     }
 68 |   }
 69 | }
 70 | 
 71 | void Clone::set_bulk_to_post(){
 72 |   for (int t=0; t<nTimes; t++){
 73 |     for (int s=0; s<snvEmit->nSamples; s++){
 74 |       bulk_mean[t][s] = bulk_post_mean[t][s];
 75 |       if (bulk_post != NULL)  bulk_dist[t][s] = bulk_post[t][s];
 76 |     }
 77 |   }
 78 | }
 79 | 
 80 | void Clone::set_bulk_to_prior(){
 81 |   for (int t=0; t<nTimes; t++){
 82 |     for (int s=0; s<snvEmit->nSamples; s++){
 83 |       bulk_mean[t][s] = bulk_prior_mean[s];
 84 |       if (bulk_prior != NULL)  bulk_dist[t][s] = bulk_prior[s];
 85 |     }
 86 |   }
 87 | }
 88 | 
 89 | // Bayesian update of the SNV bulk
 90 | void Clone::update_bulk(int sample){
 91 |   if (!snvEmit->is_set) abort();
 92 |   if (bulk_mean == NULL) abort();
 93 |   gsl_vector * bpost=NULL, *flat=NULL, *emit=NULL;
 94 |   gsl_vector_view bprior;
 95 |   bpost = gsl_vector_alloc(bulkGrid+1);
 96 |   flat  = gsl_vector_alloc(bulkGrid+1);
 97 |   gsl_vector_set_all(flat,1.0);
 98 |   //get SNV posterior...
 99 |   alpha_snv[sample]=NULL;
100 |   gamma_snv[sample]=NULL;
101 |   Clone::get_snv_posterior(sample);
102 |   //update the bulk
103 |   for (int time=0; time<nTimes; time++){   
104 |     unsigned int n,N;
105 |     for (int idx=0; idx<snvEmit->nSites[sample]; idx++){
106 |       n = snvEmit->reads[time][sample][idx];
107 |       N = snvEmit->depths[time][sample][idx];
108 |       if (N>0){
109 | 	emit = snvEmit->EmitProb[N][n];
110 | 	if (bulk_prior == NULL){ 
111 | 	  Clone::get_bulk_post_dist( flat, bpost, emit, time, sample, idx);
112 | 	  bulk_post_mean[time][sample][idx] 
113 | 	    = 0.5*(bulk_prior_mean[sample][idx] + snvEmit->xgrid[gsl_vector_max_index(bpost)]);
114 | 	}
115 | 	else{
116 | 	  bprior = gsl_matrix_row( bulk_prior[sample], idx);
117 | 	  Clone::get_bulk_post_dist( &bprior.vector, bpost, emit, time, sample, idx);
118 | 	  gsl_matrix_set_row( bulk_post[time][sample], idx, bpost);
119 | 	  bulk_post_mean[time][sample][idx] = get_mean( bpost, 0.0, 1.0);
120 | 	}
121 |       }
122 |       else{//no observation
123 | 	bulk_post_mean[time][sample][idx] = bulk_prior_mean[sample][idx];
124 | 	if (bulk_prior != NULL){
125 | 	  bprior = gsl_matrix_row( bulk_prior[sample], idx);
126 | 	  gsl_matrix_set_row( bulk_post[time][sample], idx, &bprior.vector);
127 | 	}
128 |       }
129 |     }
130 |   }
131 |   //clean up...
132 |   gsl_matrix_free(gamma_snv[sample]);
133 |   gamma_snv[sample] = NULL;
134 |   if (bpost!=NULL) gsl_vector_free(bpost);
135 |   if (flat!=NULL)  gsl_vector_free(flat);
136 | }
137 | 
138 | // emission probability
139 | void Clone::get_bulk_post_dist( gsl_vector * bprior, gsl_vector * bpost, gsl_vector * emit, int time, int sample, int idx){
140 |   double dx = snvEmit->dx;
141 |   double prob=0, x=0, y=0, val=0 , f1=0, f2=0, nu=0;
142 |   int old=-1, i1=0, i2=0;
143 |   int chr = snvEmit->chr[sample];
144 |   double ncn = (double) normal_copy[chr];
145 |   int evt = snvEmit->event_of_idx[sample][idx];
146 |   double total_cn = 0;
147 |   if (snvEmit->mean_tcn != NULL){
148 |     total_cn = snvEmit->mean_tcn[time][sample][evt];
149 |   }
150 |   else{
151 |     total_cn = tcn[chr][time][level_of[chr]];
152 |   }
153 |   gsl_vector_set_zero(bpost);
154 |   for (int i=0; i <= bulkGrid; i++){
155 |     y = double(i)*dx * (1.0 - purity[time]) * ncn / total_cn;
156 |     prob=0.0;
157 |     old = -1;
158 |     for (int j=0; j<nLevels; j++){
159 |       x = y + clone_spectrum[time][j] / total_cn;
160 |       if (x > 1.0) continue;
161 |       i1 = int(x/dx);
162 |       nu = x/dx - double(i1);
163 |       if (i1 != old){
164 | 	f1 = emit->data[i1];
165 | 	i2 = (i1<bulkGrid) ? i1+1 : i1;
166 | 	f2 = emit->data[i2];
167 | 	old=i1;
168 |       }
169 |       val = (1.0-nu)*f1 + nu*f2;
170 |       val *= gsl_matrix_get( gamma_snv[sample], evt, j);
171 |       prob += val;
172 |     }
173 |     if (prob != prob) abort();
174 |     gsl_vector_set( bpost, i, prob);
175 |   }
176 |   gsl_vector_mul( bpost, bprior);
177 |   double norm = gsl_blas_dasum(bpost) - 0.5*(bpost->data[0] + bpost->data[bulkGrid]);
178 |   norm *= 1.0/double(bulkGrid);
179 |   if (norm<= 0.0) abort();
180 |   gsl_vector_scale( bpost, 1.0/norm);
181 | }
182 | 
183 | //get the minimum mean bulk freq for all segments
184 | void Clone::get_bulk_min(){
185 |   if (bulk_min==NULL){//allocate
186 |     bulk_min = new double ** [nTimes];
187 |     for (int t=0; t<nTimes; t++){
188 |       bulk_min[t] = new double * [snvEmit->nSamples];
189 |       for (int s=0; s<snvEmit->nSamples; s++){
190 | 	bulk_min[t][s] = new double [snvEmit->nEvents[s]];
191 |       }
192 |     }
193 |   }
194 |   for (int t=0; t<nTimes; t++){
195 |     for (int s=0; s<snvEmit->nSamples; s++){
196 |       for (int evt=0; evt < snvEmit->nEvents[s]; evt++){
197 | 	if (bulk_fix >= 0.0){
198 | 	  bulk_min[t][s][evt] = bulk_fix;
199 | 	}
200 | 	else if (bulk_mean != NULL){
201 | 	  bulk_min[t][s][evt] = 1.1;
202 | 	  int idxi = snvEmit->idx_of_event[s][evt];
203 | 	  int idxf = (evt<snvEmit->nEvents[s]-1) ? snvEmit->idx_of_event[s][evt+1]-1 : snvEmit->nSites[s]-1;
204 | 	  for (int idx=idxi; idx<=idxf; idx++){
205 | 	    bulk_min[t][s][evt] = min( bulk_min[t][s][evt], bulk_mean[t][s][idx]);
206 | 	  }
207 | 	  bulk_min[t][s][evt] = max( bulk_min[t][s][evt], 1.0e-3);
208 | 	}
209 | 	else{
210 | 	  abort();
211 | 	}
212 |       }
213 |     }
214 |   }
215 | }
216 | 


--------------------------------------------------------------------------------
/src/clone-llh.cpp:
--------------------------------------------------------------------------------
  1 | //clone-llh.cpp
  2 | 
  3 | //own headers...
  4 | #include "emission.h"
  5 | #include "log-space.h"
  6 | #include "clone.h"
  7 | 
  8 | using namespace std;
  9 | 
 10 | 
 11 | 
// Total log-likelihood of all data types: the sum of the CNA, BAF and SNV
// totals, which are also stored in cna_total_llh, baf_total_llh and
// snv_total_llh. The result is stored in total_llh and returned.
// If BAF or SNV data are set, the CNA backward pass is run as well and the
// mean total copy number is computed and mapped onto the BAF/SNV data.
// All alpha/gamma matrices allocated here are freed again before returning.
double Clone::get_all_total_llh(){
  if (bafEmit->is_set || snvEmit->is_set){
    // the CNA posterior is needed: allocate its forward-backward matrices
    alpha_cna = new gsl_matrix * [cnaEmit->nSamples];
    gamma_cna = new gsl_matrix * [cnaEmit->nSamples];
    for ( int s=0; s < cnaEmit->nSamples; s++){
      alpha_cna[s] = gsl_matrix_calloc( cnaEmit->nEvents[s], nLevels);
      gamma_cna[s] = gsl_matrix_calloc( cnaEmit->nEvents[s], nLevels);
    }
    save_cna_alpha = 1;
    // the BAF matrices are allocated only if both BAF and SNV data are set
    if (bafEmit->is_set && snvEmit->is_set){
      alpha_baf = new gsl_matrix * [bafEmit->nSamples];
      gamma_baf = new gsl_matrix * [bafEmit->nSamples];
      for ( int s=0; s < bafEmit->nSamples; s++){
	alpha_baf[s] = gsl_matrix_calloc( bafEmit->nEvents[s], nLevels);
	gamma_baf[s] = gsl_matrix_calloc( bafEmit->nEvents[s], nLevels);
      }
      save_baf_alpha = 1;
    }
    else{
      save_baf_alpha = 0;
    }
  }
  else{
    save_cna_alpha = 0;
  }
  get_gofs = 0;
  save_snv_alpha = 0;
  total_llh     = 0.0;
  cna_total_llh = 0.0;
  baf_total_llh = 0.0;
  snv_total_llh = 0.0;
  double * llhs = NULL;
  int sample;
  //
  // CNA
#ifdef _OPENMP
  int nt = min( cnaEmit->nSamples, omp_get_max_threads());
#pragma omp parallel for schedule( dynamic, 1) default(shared) num_threads(nt)
#endif
  for ( sample=0; sample < cnaEmit->nSamples; sample++){
    double llh=0,ent=-1;   
    Clone::do_cna_Fwd( sample, llh, llhs);
    // the shared total is updated by one thread at a time
#ifdef _OPENMP
#pragma omp critical
#endif
    {
      cna_total_llh += llh;
    }
    if (save_cna_alpha==1){
      int cnaChr = cnaEmit->chr[sample];
      Clone::do_cna_Bwd( sample, ent);
      Clone::get_mean_tcn(sample);
      // map the mean total c.n.: CNA -> BAF (-> SNV) if this chromosome has
      // BAF data, otherwise CNA -> SNV directly
      if ( bafEmit->is_set && bafEmit->chrs.count(cnaChr) == 1){
	Clone::map_mean_tcn( cnaEmit, sample, bafEmit);	
	if ( snvEmit->is_set && snvEmit->chrs.count(cnaChr) == 1){
	  int bafsample = bafEmit->idx_of[cnaEmit->chr[sample]];
	  Clone::map_mean_tcn( bafEmit, bafsample, snvEmit);
	}
      }
      else if ( snvEmit->is_set && snvEmit->chrs.count(cnaChr) == 1){
	Clone::map_mean_tcn( cnaEmit, sample, snvEmit);
      }
      // alpha is freed here already; gamma is kept until the cleanup below
      gsl_matrix_free(alpha_cna[sample]);
      alpha_cna[sample] = NULL;
    }
  }//END PARALLEL FOR
  //
  // BAF
  if ( bafEmit->is_set ){
#ifdef _OPENMP
    int nt = min( bafEmit->nSamples, omp_get_max_threads());
#pragma omp parallel for schedule( dynamic, 1) default(shared) num_threads(nt)
#endif
    for ( sample=0; sample < bafEmit->nSamples; sample++){//START PARALLEL FOR
      double llh=0,ent=-1;
      Clone::do_baf_Fwd( sample, llh, llhs);
#ifdef _OPENMP
#pragma omp critical
#endif
      {
	baf_total_llh += llh;
      }
      if (save_baf_alpha==1){
	Clone::do_baf_Bwd( sample, ent);
	gsl_matrix_free(alpha_baf[sample]);
	alpha_baf[sample] = NULL;
      }
    }//END PARALLEL FOR
  }
  //
  // SNV
  if( snvEmit->is_set ){
#ifdef _OPENMP
    int nt = min( snvEmit->nSamples,  omp_get_max_threads());
#pragma omp parallel for schedule( dynamic, 1) default(shared) num_threads(nt)
#endif
    for ( sample=0; sample < snvEmit->nSamples; sample++){//START PARALLEL FOR
      double llh = 0.0;
      Clone::do_snv_Fwd(sample, llh, llhs);
#ifdef _OPENMP
#pragma omp critical
#endif
      {
	snv_total_llh += llh;
      }
    }//END PARALLEL FOR
  }
  //cleanup...
  if (cnaEmit->is_set && save_cna_alpha==1){
    for ( sample=0; sample < cnaEmit->nSamples; sample++){
      gsl_matrix_free(gamma_cna[sample]);
    }
    delete [] alpha_cna;
    delete [] gamma_cna;
    alpha_cna = NULL;
    gamma_cna = NULL;
  }
  if (bafEmit->is_set && save_baf_alpha==1){
    for ( sample=0; sample < bafEmit->nSamples; sample++){
      gsl_matrix_free(gamma_baf[sample]);
    }
    delete [] alpha_baf;
    delete [] gamma_baf;
    alpha_baf = NULL;
    gamma_baf = NULL;
  }
  total_llh = cna_total_llh + baf_total_llh + snv_total_llh;
  return(total_llh);
}
139 | 
140 | 
141 | 
142 | 
143 | double Clone::get_cna_total_llh(){
144 |   int sample;
145 |   save_cna_alpha = 0;
146 |   cna_total_llh  = 0.0;
147 |   double * llhs = NULL;
148 | #ifdef _OPENMP
149 |   int nt = min( cnaEmit->nSamples,  omp_get_max_threads());
150 | #pragma omp parallel for schedule( dynamic, 1) default(shared) num_threads(nt)
151 | #endif
152 |   for ( sample=0; sample < cnaEmit->nSamples; sample++){
153 |     double llh;
154 |     Clone::do_cna_Fwd( sample, llh, llhs);
155 | #ifdef _OPENMP
156 | #pragma omp critical
157 | #endif
158 |     {
159 |       cna_total_llh += llh;
160 |     }
161 |   }//END PARALLEL FOR
162 |   return(cna_total_llh);
163 | }
164 | 
165 | 
166 | double Clone::get_baf_total_llh(){
167 |   if ( nClones > 0 && cnaEmit->is_set && gamma_cna == NULL ) abort();
168 |   save_baf_alpha = 0;
169 |   baf_total_llh  = 0.0;
170 |   double * llhs = NULL;
171 |   int sample;
172 | #ifdef _OPENMP
173 |   int nt = min( bafEmit->nSamples,  omp_get_max_threads());
174 | #pragma omp parallel for schedule( dynamic, 1) default(shared) num_threads(nt)
175 | #endif
176 |   for ( sample=0; sample< bafEmit->nSamples; sample++){
177 |     double llh;
178 |     Clone::do_baf_Fwd( sample, llh, llhs);
179 | #ifdef _OPENMP
180 | #pragma omp critical
181 | #endif
182 |     {
183 |       baf_total_llh += llh;
184 |     }
185 |   }
186 |   return(baf_total_llh);
187 | }
188 | 
189 | 
190 | 
191 | 
192 | double Clone::get_snv_total_llh(){
193 |   if ( nClones > 0 && cnaEmit->is_set && gamma_cna == NULL ) abort();
194 |   int sample;
195 |   save_snv_alpha = 0;
196 |   snv_total_llh  = 0.0;
197 |   double * llhs  = NULL;
198 | #ifdef _OPENMP
199 |   int nt = min( snvEmit->nSamples,  omp_get_max_threads());
200 | #pragma omp parallel for schedule( dynamic, 1) default(shared) num_threads(nt)
201 | #endif
202 |   for ( sample=0; sample< snvEmit->nSamples; sample++){
203 |     double llh=0;
204 |     Clone::do_snv_Fwd( sample, llh, llhs);
205 | #ifdef _OPENMP
206 | #pragma omp critical
207 | #endif
208 |     {
209 |       snv_total_llh += llh;
210 |     }
211 |   }
212 |   return(snv_total_llh);
213 | }
214 | 
215 | 
216 | 
217 | 
218 | double Clone::get_cna_posterior(int sample){
219 |   double llh=0;
220 |   double * llhs = NULL;
221 |   save_cna_alpha = 1;
222 |   //set fw-bw arrays    
223 |   if (alpha_cna[sample] == NULL) 
224 |     alpha_cna[sample] = gsl_matrix_calloc( cnaEmit->nEvents[sample], nLevels);
225 |   if (gamma_cna[sample] == NULL) 
226 |     gamma_cna[sample] = gsl_matrix_calloc( cnaEmit->nEvents[sample], nLevels);
227 |   Clone::do_cna_Fwd( sample, llh, llhs);
228 |   Clone::do_cna_Bwd( sample, cna_total_ent);
229 |   gsl_matrix_free(alpha_cna[sample]);
230 |   alpha_cna[sample] = NULL;
231 |   return(llh);
232 | }
233 | 
234 | double Clone::get_baf_posterior(int sample){
235 |   if (cnaEmit->is_set){
236 |     int cna_sample = cnaEmit->idx_of[bafEmit->chr[sample]];
237 |     if ( nClones > 0 && (gamma_cna == NULL || gamma_cna[cna_sample] == NULL)){
238 |       abort();
239 |     }
240 |   }
241 |   double llh=0;
242 |   double * llhs = NULL;
243 |   save_baf_alpha = 1;
244 |   //set fw-bw arrays    
245 |   if (alpha_baf[sample] == NULL) 
246 |     alpha_baf[sample] = gsl_matrix_calloc( bafEmit->nEvents[sample], nLevels);
247 |   if (gamma_baf[sample] == NULL) 
248 |     gamma_baf[sample] = gsl_matrix_calloc( bafEmit->nEvents[sample], nLevels);
249 |   Clone::do_baf_Fwd( sample, llh, llhs);
250 |   Clone::do_baf_Bwd( sample, baf_total_ent);
251 |   gsl_matrix_free(alpha_baf[sample]);
252 |   alpha_baf[sample] = NULL;
253 |   return(llh);
254 | }
255 | 
256 | 
257 | double Clone::get_snv_posterior(int sample){
258 |   if (cnaEmit->is_set){
259 |     int cna_sample = cnaEmit->idx_of[snvEmit->chr[sample]];
260 |     if ( nClones > 0 && (gamma_cna == NULL || gamma_cna[cna_sample] == NULL)){
261 |       abort();
262 |     }
263 |   }
264 |   double llh=0;
265 |   double * llhs = NULL;
266 |   save_snv_alpha = 1;
267 |   //set fw-bw arrays    
268 |   if (alpha_snv[sample] == NULL) 
269 |     alpha_snv[sample] = gsl_matrix_calloc( snvEmit->nEvents[sample], nLevels);
270 |   if (gamma_snv[sample] == NULL) 
271 |     gamma_snv[sample] = gsl_matrix_calloc( snvEmit->nEvents[sample], nLevels);
272 |   Clone::do_snv_Fwd( sample, llh, llhs);
273 |   Clone::do_snv_Bwd( sample, snv_total_ent);
274 |   gsl_matrix_free(alpha_snv[sample]);
275 |   alpha_snv[sample] = NULL;
276 |   return(llh);
277 | }
278 | 


--------------------------------------------------------------------------------
/src/clone.h:
--------------------------------------------------------------------------------
  1 | //clone.h
  2 | 
  3 | #include <stdio.h>
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <sstream>
  7 | #include <time.h>
  8 | #include <math.h>
  9 | #include <ctype.h> 
 10 | #include <string>
 11 | #include <map>
 12 | #include <set>
 13 | #include <vector>
 14 | #include <algorithm>
 15 | 
 16 | #ifdef _OPENMP
 17 | #include <omp.h>
 18 | #endif
 19 | 
 20 | //#include <unordered_map>
 21 | 
 22 | // GSL headers...
 23 | #include "gsl/gsl_vector.h"
 24 | #include "gsl/gsl_matrix.h"
 25 | #include "gsl/gsl_randist.h"
 26 | #include "gsl/gsl_blas.h"
 27 | 
 28 | //sort by values in array
 29 | struct SortAsc{
 30 |   double * arg; 
 31 |   bool operator() (int i, int j) {return(arg[i]<arg[j]);}
 32 | };
 33 | struct SortDesc{
 34 |   double * arg; 
 35 |   bool operator() (int i, int j) {return(arg[i]>arg[j]);}
 36 | };
 37 | 
 38 | 
 39 | using namespace std;
 40 | 
 41 | //forward class declaration
 42 | class Emission;
 43 | 
 44 | class Clone{ // declaration only: all members are public and no bodies are defined here; the section banners name the .cpp file that holds each group
 45 |  public:
 46 |   Clone();
 47 |   ~Clone();
 48 |   Emission * cnaEmit, * bafEmit, * snvEmit; // one Emission pointer per data type (CNA, BAF, SNV)
 49 |   // functions (data members are interleaved with the methods, grouped by topic)
 50 |   // *** in clone.cpp ********************************************************************************
 51 |   void allocate(Emission * cnaEmit, Emission * bafEmit, Emission * snvEmit, const char * chr_fn); // parameter names shadow the members of the same name
 52 |   void clean();
 53 |   int nTimes, nClones;
 54 |   int allocated, is_set; // int-typed flags (presumably 0/1 -- TODO confirm in the constructor)
 55 |   int total_loci;
 56 |   std::set<int> chrs;
 57 |   int maxChr;
 58 |   // copy number combination states and frequencies
 59 |   int ** copynumber;
 60 |   void set_copynumbers();
 61 |   gsl_matrix * freqs;
 62 |   double * purity;
 63 |   gsl_vector * min_purity;
 64 |   double ** clone_spectrum;
 65 |   int nLevels, nFreq;
 66 |   void set(const gsl_matrix * freq);
 67 |   void set_clone_spectrum(const gsl_matrix * freq);
 68 |   double logzero;
 69 |   //normal copy number per chr
 70 |   std::map<int,int> normal_copy; // int -> int map (presumably chr -> normal copy number, per the comment above -- confirm)
 71 |   void get_normal_copy(const char * chr_fn);
 72 |   void set_normal_copy(const char * chr_fn);
 73 |   int maj_ncn;
 74 |   // maximum total copy number per chr
 75 |   int maxtcn;
 76 |   std::map<int, vector<int> > maxtcn_input; // unqualified vector<> relies on the using-directive above the class
 77 |   std::map<int, vector<int> > maxtcn_per_clone;
 78 |   std::set<int> all_maxtcn;
 79 |   void set_maxtcn_per_clone();
 80 |   void set_all_levels();
 81 |   std::map<int,int> level_of;
 82 |   double *** tcn;
 83 |   double *** log_tcn;
 84 |   void allocate_tcn();
 85 |   void set_tcn();
 86 |   // penalties and parameters (names mirror the --cna-pen-*, --baf-pen-comp, --snv-pen-* and --snv-fprate/--snv-fpfreq options; presumably set from them -- confirm in the caller)
 87 |   double cna_pen_zero, cna_pen_norm, cna_pen_diff;
 88 |   double baf_pen_comp;
 89 |   double snv_pen_high, snv_pen_mult;
 90 |   double snv_fpr,snv_fpf;
 91 |   // pre computed variables (consider unordered_map<>)
 92 |   map< unsigned int, double> logn; // unqualified map<> relies on the using-directive above the class
 93 |   map< unsigned int, double> loggma;
 94 |   int logn_set;
 95 |   void set_logn();
 96 |   // masses
 97 |   void set_mass(gsl_vector * mass); // parameter name shadows the member `mass` declared on the next line
 98 |   gsl_vector * mass, * log_mass;
 99 |   gsl_vector * nmean;
100 |   void get_nmean();
101 |   // mass-gauging
102 |   gsl_matrix * margin_map;
103 |   void get_cna_marginals();
104 |   gsl_matrix * marginals;
105 |   gsl_matrix * mass_candidates;
106 |   void get_mass_candidates();
107 |   vector<int> levels_sorted;
108 |   gsl_matrix * copynumber_post;
109 |   gsl_vector * majcn_post;
110 |   gsl_matrix ** bafSymMap;
111 |   void set_bafSymMap();
112 |   // *** in clone-prior.cpp **************************************************************************
113 |   int learn_priors;
114 |   gsl_matrix * baf_prior_map;
115 |   gsl_matrix ** snv_prior_from_cna_baf_map;
116 |   gsl_matrix *  snv_prior_from_cna_map;
117 |   void set_margin_map();
118 |   void set_baf_prior_map();
119 |   void set_snv_prior_map();
120 |   void get_baf_prior_from_cna_post(gsl_vector * prior, gsl_vector * post);
121 |   void get_snv_prior_from_cna_post(gsl_vector * prior, gsl_vector * post);
122 |   void get_snv_prior_from_cna_baf_post(gsl_vector * prior, gsl_vector * cnapost, gsl_vector * bafpost);
123 |   void apply_snv_prpc( gsl_vector * prior, gsl_matrix * snv_prpc, double pc0);
124 |   std::map<int,gsl_vector*> snv_prior;
125 |   void set_cna_prior( gsl_vector * prior, int sample);
126 |   void set_snv_prior( gsl_matrix * prior_param );
127 |   gsl_matrix * initial_snv_prior_param;
128 |   void initialize_snv_prior_param();
129 |   // mean total c.n. and available c.n.
130 |   void get_mean_tcn(int sample);//for cna only
131 |   void map_mean_tcn( Emission * fromEmit, int from_sample,  Emission * toEmit);//from cna/baf to baf/snv
132 |   void get_avail_cn( Emission * myEmit, int sample);//for cna/baf
133 |   void get_snv_prior_from_av_cn( gsl_vector * prior, int sample, int evt);
134 |   double *** cn_usage;
135 |   void allocate_cn_usage();
136 |   void set_cn_usage();
137 |   // *** in clone-predict.cpp ************************************************************************  
138 |   gsl_matrix ** TransMat_cna;
139 |   gsl_matrix ** TransMat_snv;
140 |   void set_TransMat_cna(); // no-argument overload
141 |   void set_TransMat_snv(); // no-argument overload
142 |   void set_TransMat_cna(gsl_matrix * Trans, int chr); // overload taking an explicit matrix and a chr
143 |   void set_TransMat_snv(gsl_matrix * Trans, int chr); // overload taking an explicit matrix and a chr
144 |   void predict( gsl_vector * prior, gsl_vector * post, Emission * myEmit, double pj, gsl_matrix * T); // overload whose last argument is a matrix
145 |   void predict( gsl_vector * prior, gsl_vector * post, Emission * myEmit, double pj, gsl_vector * flat); // overload whose last argument is a vector
146 |   void apply_maxtcn_mask( gsl_vector * prior, int chr, int log_space);
147 |   // *** in clone-fwd-bwd.cpp ************************************************************************
148 |   void combine_prior(gsl_vector*& prior, gsl_vector*& mem, int n); // both vector pointers are passed by reference
149 |   void scale_prior(gsl_vector*& prior, int n);
150 |   void do_cna_Fwd( int sample, double& llh, double*& llhs); // llh and llhs are reference (in/out) parameters
151 |   void do_cna_Bwd( int sample, double& ent);
152 |   void do_baf_Fwd( int sample, double& llh, double*& llhs);
153 |   void do_baf_Bwd( int sample, double& ent);
154 |   void do_snv_Fwd( int sample, double& llh, double*& llhs);
155 |   void do_snv_Bwd( int sample, double& ent);
156 |   int got_gamma, save_cna_alpha, save_baf_alpha, save_snv_alpha;
157 |   gsl_matrix ** alpha_cna, ** alpha_baf, ** alpha_snv;
158 |   gsl_matrix ** gamma_cna, ** gamma_baf, ** gamma_snv;
159 |   double entropy(gsl_vector * x);
160 |   int get_gofs;
161 |   void allocate_all_gofs();
162 |   void get_cna_gof(gsl_vector * post, int sample, int evt);
163 |   void get_baf_gof(gsl_vector * post, int sample, int evt);
164 |   void get_snv_gof(gsl_vector * post, int sample, int evt);
165 |   double * cna_gofs, * baf_gofs, * snv_gofs;
166 |   double *** cna_all_gofs, *** baf_all_gofs, *** snv_all_gofs;
167 |   //void sym_baf( gsl_vector * bafPost, gsl_vector * cnvPost);
168 |   //gsl_matrix ** map1, ** map2;
169 |   //int symmetrize_baf;
170 |   // *** in clone-llh.cpp ****************************************************************************
171 |   double get_cna_posterior(int sample);
172 |   double get_baf_posterior(int sample);
173 |   double get_snv_posterior(int sample);
174 |   // log-likelihoods
175 |   double total_llh, total_entropy; 
176 |   double cna_total_llh, baf_total_llh, snv_total_llh;
177 |   double * cna_llhs, * baf_llhs, * snv_llhs;
178 |   double cna_total_ent, baf_total_ent, snv_total_ent;
179 |   double get_all_total_llh();
180 |   double get_cna_total_llh();
181 |   double get_baf_total_llh();
182 |   double get_snv_total_llh();
183 |   // *** in clone-update.cpp *************************************************************************
184 |   // update step CNA
185 |   double update( gsl_vector * prior, gsl_vector * post, Emission * myEmit, int sample, int site, double*& llhs); // takes the Emission object explicitly, unlike the per-data-type variants below
186 |   void update_cna( gsl_vector * prior, gsl_vector * post, int sample, int site, gsl_matrix * Post);
187 |   void update_cna_event( gsl_vector * prior, gsl_vector * post, int sample, int evt, gsl_matrix * Post);
188 |   void update_cna_site_noclone( gsl_vector * post, int sample, int site, gsl_matrix * Post); // the only update_* variant without a prior argument
189 |   void update_cna_site_wclone( gsl_vector * prior, gsl_vector * post, int sample, int site, gsl_matrix * Post);
190 |   // update step BAF
191 |   void update_baf( gsl_vector * prior, gsl_vector * post, int sample, int evt, gsl_matrix * Post);
192 |   void update_baf_event( gsl_vector * prior, gsl_vector * post, int sample, int evt, gsl_matrix * Post);
193 |   void update_baf_site( gsl_vector * prior, gsl_vector * post, int sample, int site, gsl_matrix * Post);
194 |   // update step SNV
195 |   void update_snv( gsl_vector * prior, gsl_vector * post, int sample, int evt, gsl_matrix * Post);
196 |   void update_snv_event( gsl_vector * prior, gsl_vector * post, int sample, int evt, gsl_matrix * Post);
197 |   void update_snv_site_ncorr( gsl_vector * prior, gsl_vector * post, int sample, int site, gsl_matrix * Post);
198 |   void update_snv_site_fixed( gsl_vector * prior, gsl_vector * post, int sample, int site, gsl_matrix * Post);
199 |   void update_snv_site_nfixed( gsl_vector * prior, gsl_vector * post, int sample, int site, gsl_matrix * Post);	
200 |   // 1D and 2D interpolation
201 |   double get_interpolation(double x, double xmin, double xmax, double dx, gsl_vector * emit); // 1D overload: table passed as a vector
202 |   double get_interpolation(double x, double xmin, double xmax,
203 | 			   double y, double ymin, double ymax,
204 | 			   gsl_matrix * emit); // 2D overload: table passed as a matrix
205 |   double trapezoidal( gsl_vector * blk, double a, double b, gsl_vector * emit, int get_log);
206 |   //precomputed log-emission probabilities for BAF update
207 |   gsl_matrix *** bafEmitLog;
208 |   gsl_matrix *** cnaEmitLog;
209 |   gsl_matrix **** snvEmitLog;
210 |   void get_bafEmitLog();
211 |   void get_cnaEmitLog();
212 |   void get_snvEmitLog();
213 |   double * cna_xmin;
214 |   double * cna_xmax;
215 |   int snvGrid, bafGrid, cnaGrid, bulkGrid; // presumably the grid sizes of the pre-computed tables (cf. --cna-grid/--baf-grid/--snv-grid) -- TODO confirm
216 |   // BIC complexity penalty
217 |   double complexity;
218 |   void get_complexity();
219 |   // *** in clone-bulk.cpp ***************************************************************************
220 |   double bulk_fix; // cf. the --bulk-fix option -- TODO confirm where it is set
221 |   void allocate_bulk_dist();
222 |   void allocate_bulk_mean();
223 |   void update_bulk( int time, int sample); // two-argument overload (time, sample)
224 |   gsl_matrix **  bulk_prior;
225 |   gsl_matrix *** bulk_post;
226 |   gsl_matrix *** bulk_dist;
227 |   double **  bulk_prior_mean;
228 |   double *** bulk_post_mean;
229 |   double *** bulk_mean;
230 |   double ***  bulk_min;
231 |   void set_bulk_to_prior();
232 |   void set_bulk_to_post();
233 |   void get_bulk_min();
234 |   //SNV bulk updates
235 |   void update_bulk(int sample); // one-argument overload of update_bulk(int time, int sample) above
236 |   void get_bulk_post_dist( gsl_vector * bprior, gsl_vector * bpost, gsl_vector * emit, int time, int sample, int idx);
237 | };
238 | 


--------------------------------------------------------------------------------
/docs/README-cloneHD.md:
--------------------------------------------------------------------------------
  1 | # cloneHD command line arguments
  2 | 
  3 | ## Typical usage options
  4 | 
  5 | Format of input files: the first two columns of all three input file
  6 |     types are always chromosome and coordinate of each observation. Both
  7 |     are expected to be integers (change X->23,Y->24). 
  8 | 
  9 | *   `--cna [file]` Read depth data file. 
 10 | 
 11 |     Format: For each sample, there are two additional columns with
 12 |     (i) the read depth and (ii) the number of independent observations
 13 |     that were summed to obtain it. For human NGS data, use the mean read depth
 14 |     per 1 kb as the highest resolution. 
 15 | 
 16 |         1  1000  93   1  75   1  etc.
 17 |         1  2000  101  1  81   1
 18 |         1  3000  105  1  85   1
 19 |         1  5000  197  2  156  2
 20 |         etc.
 21 | 
 22 | *    `--baf [file]` B-allele read count data file. 
 23 | 
 24 |      Format: For each sample, there are two additional columns with
 25 |      (i) the number of reads of the minor allele and (ii) the total
 26 |      read depth at originally heterozygous loci.
 27 | 
 28 |         1  1036  43  90   28  72  etc.
 29 |         1  1287  47  99   32  80
 30 |         1  2877  30  100  36  82
 31 |         etc.
 32 | 
 33 | *    `--snv [file]` Somatic nucleotide variant read count data file.
 34 | 
 35 |      Format: For each sample, there are two additional columns with (i) the number 
 36 |      of reads of the somatic variant allele and (ii) the total read depth at that locus.
 37 | 
 38 |         1  1314  12  92   28  72  etc.
 39 |         1  1287  47  99   32  80
 40 |         1  2877  30  100  36  82
 41 |         etc.
 42 | 
 43 | *    `--pre [string:"./out"]`  Prefix for all output files.
 44 | 
 45 | *    `--bias [file]`  The bias field for the read depth data. 
 46 | 
 47 |      This must be a filterHD `pre.posterior-[int].txt` file, typically from a
 48 |      filterHD run on matched-normal read depth data. This is used to take into account
 49 |      the technical read depth modulation. Make sure that this is comparable between 
 50 |      matched normal and tumor (see below).
 51 | 
 52 | *    `--max-tcn [int]`  The maximum total copy number to be available.
 53 | 
 54 |      This is used as an upper limit for the total copy number genome wide (in all chr).
 55 |      If not specified, the normal copy number is used as limit for each chromosome.
 56 |      This number should be chosen conservatively, since it increases the
 57 |      HMM dimensionality and can open up the possibility for spurious solutions. 
 58 |      It is also possible to specify upper limits per chr and subclone (see below).
 59 | 
 60 | *    `--nmax [int:2]`  The maximum number of subclones to be tried.
 61 | 
 62 |      All subclone numbers `n` from `0..nmax` will be used and the one
 63 |      with maximum BIC chosen for output. BIC is affected by model complexity, 
 64 |      which is a combination of `n` and `max-tcn` (see below).
 65 | 
 66 | *    `--force [int]`  Fix the number `n` of subclones to be used.
 67 | 
 68 | *    `--trials [int:1]`  The number of independent optimizations.
 69 | 
 70 |      Global parameters, fractions `f` and mass `M`, are found numerically by local maximization of
 71 |      the total log-likelihood. The best result out of `trials` independent,
 72 |      newly seeded, runs will be used.
 73 | 
 74 | ### snv-mode options
 75 | 
 76 | *    `--mean-tcn [file]`  Use a fixed mean total copy number for SNV data. 
 77 | 
 78 |      For a SNV data analysis, the cloneHD output file
 79 |      ending `pre.mean-tcn.txt` from a CNA(+BAF) run can be supplied here. Since
 80 |      the subclonal decomposition can be different for SNVs, this option
 81 |      ensures that a reasonable mean total copy number is used.
 82 | 
 83 | *    `--avail-cn [file]`  Use SNV copy number availability constraint.
 84 | 
 85 |      For a SNV data analysis, the cloneHD output file
 86 |      ending `pre.avail-cn.txt` from a CNA(+BAF) analysis can be supplied here. Since
 87 |      the subclonal decomposition can be different for SNVs, this option
 88 |      ensures that the SNV genotype is consistent with the fraction of
 89 |      cells in which this number of copies is available, at all.
 90 |      This can only be used together with `--mean-tcn [file]` option. In
 91 |      combination, this is usually a much stronger constraint than using
 92 |      `mean-tcn` alone.
 93 | 
 94 | ### Fuzzy segmentation options
 95 | 
 96 | For data with persistence along the genome, a fuzzy segmentation can
 97 | be used based on the filterHD posterior jump probability (must be a
 98 | `pre.jumps.txt` file). Data between potential jump sites, with a jump
 99 | probability of at least `min-jump`, is collapsed. The jump probability
100 | is used in the HMM transition.
101 | 
102 | *    `--cna-jumps [file]`
103 | *    `--baf-jumps [file]`
104 | *    `--snv-jumps [file]`
105 | *    `--min-jump [double:0.01]` 
106 | 
107 | ## Parameter options
108 | 
109 | The shape parameter for the over-dispersed emission models
110 | (Negative-Binomial or Beta-Binomial). If not specified, the normal
111 | models are used (Poisson or Binomial).
112 | 
113 | *    `--cna-shape [double:inf]`
114 | *    `--baf-shape [double:inf]`
115 | *    `--snv-shape [double:inf]`
116 | 
117 | The rate for individual random emissions per data set. 
118 | 
119 | *    `--cna-rnd [double:1.0e-6]`
120 | *    `--baf-rnd [double:1.0e-6]`
121 | *    `--snv-rnd [double:1.0e-6]`
122 | 
123 | Both can be learned with filterHD for data with persistence.
124 | 
125 | ## Advanced options
126 | 
127 | *    `--clones [file]` Use fixed mass(es) `M` and/or subclonal fractions `f`. 
128 | 
129 |      Either all mass parameters, or all subclonal fractions, or both 
130 |      can be given (for each sample in the input data). The likelihoods
131 |      and posteriors will be computed under these conditions. 
132 |      Remaining parameters will be learned.
133 |   
134 |      Format: One line per sample. The first column, if greater than
135 |      1.0, is interpreted as mass; the remaining as subclonal fractions.
136 | 
137 |         30.0 0.64 0.12
138 |         28.0 0.31 0.23
139 | 
140 |     More than one parameter set can be given (as a continued list). Then,
141 |     only the likelihoods are computed and printed to a file ending
142 |     `pre.llh-values.txt`.  Useful for mapping the log-likelihood surface
143 |     or comparing several given solutions.
144 | 
145 | *    `--purity [file]`  Use fixed purities, i.e. lower bounds for the sum of
146 |      subclonal frequencies. One line per sample.
147 | 
148 | *    `--restarts [int:10]`  The number of perturbations in local random
149 |      search mode.
150 | 
151 |      This simple random search routine is used: after finding a local
152 |      maximum of LLH, the best solution is perturbed and a new optimum
153 |      is sought. 
154 | 
155 | *    `--seed [int]`  A fixed seed to make inferences reproducible.
156 | 
157 | *    `--mass-gauging [0/1:1]`  Whether to use mass-gauging.
158 | 
159 |      The optimization in the space of masses (seq depths per haploid
160 |      DNA) and subclonal frequencies can suffer from many local
161 |      optima.  To fix the mass(es), one can, for a given solution,
162 |      assume that an occupied state is actually all-normal. All
163 |      occupied states will be proposed to fix the mass(es).
164 | 
165 | *    `--min-occ [double:0.01]`  The minimum occupancy of levels to be
166 |      used for the mass gauging.
167 | 
168 | *    `--print-all [0/1:0]`  If 1, the posterior for every observation
169 |      is printed to files ending `*[cna/baf/snv].posterior.txt`. 
170 |      If 0, only one line for each segment is printed.
171 | 
172 | *    `--max-tcn [file]`  The maximum total copy number per chr and subclone.
173 | 
174 |     This file should have the format: chr max1 max2 max3 etc., e.g.
175 | 
176 |         1  2
177 |         2  2
178 |         3  8  2
179 |         4  2
180 |         etc.
181 | 
182 |     The first column is the chromosome, the next columns are the limits to be used for subclone 1, 2 etc. For subclones not specified, the limit in the last column is used. In the example above, subclone 1 has an upper limit of 8 total copies in chr3, for all other subclones and in all other chromosomes, the upper limit is 2. If only SNV data is provided (and `--avail-cn [file]` is not given), this is used to fix the total number of copies. If `--max-tcn` is not given, cloneHD uses the normal copy number for each chr.
183 | 
184 | *    `--learn-priors [0/1:0]` For snv-mode only: if 1, then the parameters
185 |      for the multiplicative SNV genotype priors are learned.
186 | 
187 | *    `--chr [file]`  Set normal copy numbers.
188 | 
189 |      The normal copy number for every single chromosome can be specified. This is needed only for non-human DNA. If not given, human DNA is assumed and the sex is inferred from the presence or absence of chr 24 (= chr Y) in the input data.
190 | 
191 | *    `--snv-fprate [double:1.0e-4]`  Set the false positive rate for SNVs.
192 | 
193 | *    `--snv-fpfreq [double:0.01]`  The typical frequency of false positive SNVs.
194 | 
195 | *    `--snv-pen-mult [double:0.01]`  Set the penalty against multiple SNVs.
196 | 
197 | *    `--snv-pen-high [double:0.5]`  Set the penalty against higher genotypes.
198 | 
199 | *    `--cna-pen-zero [double:0.9]`  Set the penalty against zero total copies.
200 | 
201 | *    `--cna-pen-norm [double:1.0]`  Set the penalty against non-normal total c.n.
202 | 
203 | *    `--cna-pen-diff [double:1.0]`  Set the penalty against different total c.n.
204 | 
205 | *    `--baf-pen-comp [double:1.0]`  The penalty against complex minor allele status.
206 | 
207 | *    `--cna-jump [double:-1.0]`
208 | 
209 | *    `--baf-jump [double:-1.0]`
210 | 
211 | *    `--snv-jump [double:-1.0]`
212 | 
213 |     A constant jump probability per base pair can be set. If set to `-1.0`, then observations are uncorrelated along the genome (not the same as `1.0`). Can be learned with filterHD. No fuzzy data segmentation is performed.  Useful in combination with `--clones`, where very high-definition information is available. Using this option will change the posterior output file format.
214 | 
215 | ## Bulk options
216 | 
217 | These options are only needed if the sequenced cell population is a mixture of a
218 | diverse bulk, with known allele frequency profile, and a number of
219 | subclones with unknown genotypes and frequencies. Allele frequency
220 | data is input with `--snv`. Data segmentation can be used with
221 | `--snv-jumps`.  Read depth data can also be specified with `--cna`. 
222 | 
223 | *    `--bulk-mean [file]`  The bulk allele frequency profile. 
224 | 
225 |      Must be a filterHD `*posterior-[int].txt` file. Only the posterior mean is used.
226 | 
227 | *    `--bulk-prior [file]`  The bulk allele frequency profile. 
228 | 
229 |      Must be a filterHD `*posterior-[int].txt` file. The whole posterior
230 |      distribution is used (run filterHD with `--dist 1` to obtain it).
231 | 
232 | *    `--bulk-updates [int:0]`  The number of (Bayesian) updates of the
233 |      bulk allele frequency profile (if `--bulk-prior` was used).
234 | 
235 | *    `--bulk-fix [double:0.0]`  Use a flat and fixed bulk allele
236 |      frequency profile.
237 | 
238 | ## Technical options
239 | 
240 | The grid sizes for the pre-computed emission probabilities if fuzzy data segmentation is used.
241 | 
242 | *    `--cna-grid [int:300]`  
243 | *    `--baf-grid [int:100]` 
244 | *    `--snv-grid [int:100]`
245 | 
246 | # Program output  
247 | 
248 | cloneHD generates a number of output files automatically. Here, we provide annotated screenshots for the most important of them for the simulated example data set.
249 | 
250 | ## STDOUT
251 | 
252 | ![stdout1](/images/screenshots/cloneHD-stdout-1.png "cloneHD stdout 1")
253 | ![stdout2](/images/screenshots/cloneHD-stdout-2.png "cloneHD stdout 2")
254 | 
255 | ## Output files
256 | 
257 | ### The inference summary
258 | ![summary](/images/screenshots/cloneHD-summary.png "cloneHD summary")
259 | 
260 | ### The posterior distribution over all copy number/genotype states
261 | ![posterior](/images/screenshots/cloneHD-posterior.png "cloneHD posterior")
262 | 
263 | ### The posterior distribution for each subclone separately
264 | ![subclone](/images/screenshots/cloneHD-subclone.png "cloneHD subclone")
265 | 
266 | ### The mean total copy number per segment
267 | ![meantcn](/images/screenshots/cloneHD-mean.png "cloneHD mean-tcn")
268 | 
269 | ### The available genotype per segment
270 | ![avail](/images/screenshots/cloneHD-avail.png "cloneHD avail-cn")
271 | 
272 | 
273 | 
274 | 


--------------------------------------------------------------------------------
/src/pre-filter.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | ******************************************************************************
  3 | 
  4 | Copyright (c) 11/12/13  Genome Research Ltd.
  5 | 
  6 | Author: Andrej Fischer (af7[at]sanger.ac.uk)
  7 | 
  8 | This file is part of cloneHD.
  9 | 
 10 | cloneHD is free software: you can redistribute it and/or modify it under the terms of the 
 11 | GNU General Public License as published by the Free Software Foundation; either version 3 
 12 | of the License, or (at your option) any later version.
 13 | 
 14 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
 15 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
 16 | See the GNU General Public License for more details.
 17 | 
 18 | You should have received a copy of the GNU General Public License along with this program.  
 19 | If not, see <http://www.gnu.org/licenses/>.
 20 | 
 21 | ******************************************************************************
 22 | */
 23 | 
 24 | 
 25 | #include <stdio.h>
 26 | #include <stdlib.h>
 27 | #include <iostream>
 28 | #include <fstream>
 29 | #include <sstream>
 30 | #include <time.h>
 31 | #include <math.h>
 32 | #include <ctype.h> 
 33 | #include <string>
 34 | #include <map>
 35 | #include <vector>
 36 | #include <list>
 37 | #include <algorithm>
 38 | 
 39 | 
 40 | // GSL headers...
 41 | #include "gsl/gsl_vector.h"
 42 | #include "gsl/gsl_matrix.h"
 43 | #include "gsl/gsl_randist.h"
 44 | #include "gsl/gsl_blas.h"
 45 | #include "gsl/gsl_statistics_double.h"
 46 | #include "gsl/gsl_sort.h"
 47 | 
 48 | //own headers...
 49 | #include "emission.h"
 50 | #include "common-functions.h"
 51 | 
 52 | using namespace std;
 53 | 
 54 | 
 55 | struct cmdl_opts{
 56 |   const char * data_fn, * pick_fn, * match_fn;
 57 |   const char * pre;
 58 |   double remVar, remOut;
 59 |   int wSize, print_tracks;
 60 | };
 61 | 
 62 | 
 63 | //*** OWN FUNCTIONS *** (forward declarations; all are defined further down in this file)
 64 | void get_opts( int argc, const char ** argv, cmdl_opts& opts); // parses argv into opts: starts from default_opts(), ends with test_opts()
 65 | void test_opts(cmdl_opts& opts); // checks the option combination; exits the program on invalid input
 66 | void default_opts(cmdl_opts& opts); // sets every field of opts to its default value
 67 | void pre_filter( Emission& dataEmit, cmdl_opts& opts);  // --data mode: writes the loci that pass the filters to <pre>.pref.txt
 68 | void pick_match( Emission * pickEmit, Emission * matchEmit, cmdl_opts& opts); // --pick-from/--match-to mode
 69 | 
 70 | 
 71 | // *** MAIN START***
 72 | int main (int argc, const char * argv[]){
 73 |   cmdl_opts opts;
 74 |   get_opts( argc, argv, opts);
 75 |   vector<int> chrs;
 76 |   vector<int> nSites;
 77 |   int nTimes;  
 78 |   //the data and emission model object
 79 |   Emission dataEmit, pickEmit, matchEmit;
 80 |   if (opts.data_fn != NULL){
 81 |     get_dims( opts.data_fn, nTimes, chrs, nSites, 1);
 82 |     if (nTimes > 1){
 83 |       printf("WARNING: Pre-filtering will be carried out based on the first in %s sample only.\n",opts.data_fn);
 84 |     }
 85 |     dataEmit.connect = 1;//all data will be retained
 86 |     dataEmit.set( nTimes, chrs, nSites, 100);
 87 |     get_data( opts.data_fn, &dataEmit);
 88 |   }
 89 |   else{
 90 |     get_dims( opts.pick_fn, nTimes, chrs, nSites, 1);
 91 |     pickEmit.connect = 1;
 92 |     pickEmit.set( nTimes, chrs, nSites, 100);
 93 |     get_data( opts.pick_fn, &pickEmit);
 94 |     get_dims( opts.match_fn, nTimes, chrs, nSites, 1);
 95 |     matchEmit.connect = 1;
 96 |     matchEmit.set( nTimes, chrs, nSites, 100);
 97 |     get_data( opts.match_fn, &matchEmit);
 98 |   }
 99 |   if (dataEmit.is_set) pre_filter( dataEmit, opts); 
100 |   if (pickEmit.is_set) pick_match( &pickEmit, &matchEmit, opts);
101 |   //done
102 |   return (0);
103 | }
104 | // *** MAIN END ***
105 | 
106 | 
107 | void default_opts(cmdl_opts& opts){
108 |   opts.data_fn   = NULL;
109 |   opts.pick_fn   = NULL;
110 |   opts.match_fn  = NULL;
111 |   opts.pre      = "./out";
112 |   opts.wSize    = 100;
113 |   opts.remVar   = 2.0;
114 |   opts.remOut   = 3.0;
115 |   opts.print_tracks = 0;
116 | }
117 | 
118 | // get command line arguments...
119 | void get_opts( int argc, const char ** argv, cmdl_opts& opts){
120 |   default_opts(opts);
121 |   int opt_idx = 1;
122 |   string opt_switch;  
123 |   while ( opt_idx < argc && (argv[opt_idx][0] == '-')){
124 |     opt_switch = argv[opt_idx];
125 |     opt_idx++;
126 |     if (opt_idx==argc) break;
127 |     if ( argv[opt_idx][0] == '-') continue;
128 |     if ( opt_switch.compare("--data") == 0){
129 |       opts.data_fn = argv[opt_idx];
130 |     }  
131 |     else if ( opt_switch.compare("--pre") == 0){
132 |       opts.pre = argv[opt_idx];
133 |     }
134 |     else if ( opt_switch.compare("--pick-from") == 0){
135 |       opts.pick_fn = argv[opt_idx];
136 |     }
137 |     else if ( opt_switch.compare("--match-to") == 0){
138 |       opts.match_fn = argv[opt_idx];
139 |     }
140 |     else if ( opt_switch.compare("--remove-variable") == 0){
141 |       opts.remVar = atof(argv[opt_idx]);
142 |     }
143 |     else if ( opt_switch.compare("--remove-outlier") == 0){
144 |       opts.remOut = atof(argv[opt_idx]);
145 |     }
146 |     else if ( opt_switch.compare("--window-size") == 0){
147 |       opts.wSize = atoi(argv[opt_idx]);
148 |     }
149 |     else if ( opt_switch.compare("--print-tracks") == 0){
150 |       opts.print_tracks = atoi(argv[opt_idx]);
151 |     }
152 |     else {
153 |       cout << opt_switch << "?\n";
154 |       cout << "Usage: pre-filter  --pre [string:./out] (--data [file] --remove-variable [double:2.0] --remove-outlier [double:3.0] --window-size [int:100]) (--pick-from [file] --match-to [file])"<<endl;
155 |       exit(1);
156 |     }
157 |     opt_switch.clear();
158 |     opt_idx++;
159 |   }
160 |   test_opts(opts);
161 | }
162 | 
163 | 
164 | void test_opts(cmdl_opts& opts){
165 |   if (opts.data_fn == NULL && opts.pick_fn == NULL && opts.match_fn == NULL){
166 |     cout<<"ERROR: Either (--data [file]) or (--pick-from [file] and --match-to [file]) must be given.\n";
167 |     exit(1);
168 |   }
169 |   if (opts.data_fn != NULL){
170 |     if(opts.remVar == 0.0 && opts.remOut == 0.0){
171 |       printf("Nothing to be done.\n");
172 |       exit(0);
173 |     }
174 |     if (opts.pick_fn != NULL || opts.match_fn != NULL){
175 |       cout<<"ERROR: --pick-from [file] and --match-to [file] cannot be used together with --data [file].\n";
176 |       exit(1);
177 |     }
178 |     if(opts.wSize <= 1){
179 |       printf("ERROR: window-size of %i is too small.\n", opts.wSize);
180 |       exit(1);
181 |     }
182 |   }
183 |   else if (opts.pick_fn != NULL && opts.match_fn != NULL){
184 |     opts.remVar = 0.0;
185 |     opts.remOut = 0.0;
186 |   }
187 | }
188 | 
189 | void pre_filter( Emission& dataEmit, cmdl_opts& opts){   
190 |   char buff[1024];
191 |   sprintf( buff, "%s.pref.txt", opts.pre);
192 |   FILE * pref_fp  = fopen(buff,"w");
193 |   FILE * track_fp = NULL;
194 |   if (opts.print_tracks == 1){
195 |     sprintf( buff, "%s.tracks.txt", opts.pre);
196 |     track_fp = fopen(buff,"w");
197 |   }
198 |   //***Pre-filtering of each chromosome in turn***
199 |   for (int s=0; s < dataEmit.nSamples; s++){
200 |     int L = dataEmit.nSites[s];
201 |     unsigned int * rds = dataEmit.reads[0][s];
202 |     unsigned int * dps = dataEmit.depths[0][s];
203 |     if (dataEmit.nSites[s] == 0) abort();
204 |     double * wMean = new double [L];
205 |     double * wVar  = new double [L];
206 |     int * mask     = new int    [L];
207 |     for (int l=0; l<L; l++){
208 |       mask[l]  = 1;
209 |       wMean[l] = 0;
210 |       wVar[l]  = 0;
211 |     }
212 |     double medAbsDev=0, median=0;
213 |     if (opts.remVar > 0.0){//***REMOVE LOCI ACCORDING TO VARIABILITY***
214 |       //get median read depth
215 |       vector<double> allx;    
216 |       for (int l=0; l<L; l++){
217 | 	if ( dps[l] > 0){
218 | 	  allx.push_back(double(rds[l]) / double(dps[l]));
219 | 	}
220 |       }
221 |       gsl_sort( allx.data(), 1, allx.size());
222 |       median = gsl_stats_quantile_from_sorted_data ( allx.data(), 1, allx.size(), 0.5);
223 |       allx.clear();
224 |       //get median absolute deviation from median
225 |       for (int l=0; l<L; l++){
226 | 	if ( dps[l] > 0){
227 | 	  allx.push_back( fabs(double(rds[l]) / double(dps[l]) - median) );
228 | 	}
229 |       }
230 |       gsl_sort( allx.data(), 1, allx.size());
231 |       medAbsDev = gsl_stats_quantile_from_sorted_data ( allx.data(), 1, allx.size(), 0.5);
232 |       allx.clear();
233 |       //get local variablility and mask
234 |       double sum = 0.0;
235 |       int front=-1, size=0, center=-opts.wSize-1, back=-2*opts.wSize-1;
236 |       while (center < L-1){	
237 | 	while (front < L){
238 | 	  front++;
239 | 	  if (front==L || dps[front] > 0) break;
240 | 	}
241 | 	if (front < L){
242 | 	  if (size < 2*opts.wSize+1) size++;
243 | 	  sum += fabs(median - double(rds[front]) / double(dps[front]));
244 | 	}	
245 | 	if (back >= 0){
246 | 	  sum -= fabs(median - double(rds[back]) / double(dps[back]));
247 | 	  if (front==L && size>1) size--;
248 | 	}
249 | 	while (back < L){
250 | 	  back++;
251 | 	  if ( back<0 || back==L || dps[back] > 0 ) break;
252 | 	}
253 | 	while (center < L-1){
254 | 	  center++;
255 | 	  if (center < 0 || dps[center] > 0 ) break;
256 | 	}
257 | 	if (center >= 0){
258 | 	  wVar[center] = sum / double(size); 	  
259 | 	}
260 |       }
261 |       //apply filter
262 |       for (int l=0; l<L; l++){
263 | 	if (wVar[l] > opts.remVar * medAbsDev) mask[l] = 0;
264 |       }
265 |     }
266 |     if (opts.remOut > 0.0){//***REMOVE LOCI ACCORDING TO OUTLIER***    
267 |       double sum = 0.0;
268 |       int front=-1, size=0, center=-opts.wSize-1, back=-2*opts.wSize-1;
269 |       while (center < L-1){
270 | 	while (front < L){
271 | 	  front++;
272 | 	  if (front == L) break;
273 | 	  if (dps[front] > 0 && mask[front]==1)	break;
274 | 	}
275 | 	if (front < L){
276 | 	  if (size < 2*opts.wSize+1) size++;
277 | 	  sum += double(rds[front]) / double(dps[front]);
278 | 	}
279 | 	if (back >= 0){
280 | 	  sum -= double(rds[back]) / double(dps[back]);
281 | 	  if (front==L && size>1) size--;
282 | 	}
283 | 	while (back < L){
284 | 	  back++;
285 | 	  if (back==L || back<0) break;
286 | 	  if (dps[back]>0 && mask[back]==1) break;
287 | 	}
288 | 	while (center < L-1){
289 | 	  center++;
290 | 	  if (center<0 || (dps[center]>0 && mask[center]==1)) break;
291 | 	}
292 | 	if (center >= 0){
293 | 	  wMean[center] = sum / double(size);
294 | 	}
295 |       }
296 |       //apply filter
297 |       for (int l=0; l<L; l++){
298 | 	if (mask[l] == 0 || dps[l] == 0) continue;
299 | 	double xobs = double(rds[l]) / double(dps[l]);
300 | 	if ( fabs(wMean[l] - xobs) > opts.remOut * sqrt(wMean[l]) ){
301 | 	  mask[l] = 0;
302 | 	}
303 |       }
304 |     }
305 |     //print
306 |     for (int l=0; l<L; l++){
307 |       if ( mask[l] == 1){
308 | 	fprintf( pref_fp, "%-2i %12i %3i %3i\n", 
309 | 		 dataEmit.chr[s], dataEmit.loci[s][l], rds[l], dps[l]
310 | 		 );
311 |       }
312 |       if (track_fp == NULL) continue;
313 |       fprintf( track_fp, "%-2i %12i %.3f %.3f\n", 
314 | 	       dataEmit.chr[s], dataEmit.loci[s][l], wMean[l], 
315 | 	       (medAbsDev > 0) ? wVar[l] / medAbsDev : 0
316 | 	       );
317 |     }
318 |     //cleanup
319 |     delete [] wMean;
320 |     delete [] wVar;
321 |     delete [] mask;
322 |   }
323 |   fclose(pref_fp);
324 |   if (track_fp != NULL) fclose(track_fp);
325 | }
326 | 
327 | 
328 | void pick_match( Emission * pickEmit, Emission * matchEmit, cmdl_opts& opts){
329 |   char buff[1024];
330 |   sprintf( buff, "%s.pref.txt", opts.pre);
331 |   FILE * pref_fp  = fopen(buff,"w"); 
332 |   for (int s=0; s < pickEmit->nSamples; s++){
333 |     int chr=pickEmit->chr[s];
334 |     if ( matchEmit->chrs.count(chr) == 0){
335 |       printf( "ERROR: chr %i in %s could not be found in %s\n",
336 | 	      chr, opts.pick_fn, opts.match_fn
337 | 	      );
338 |       exit(1);
339 |     }
340 |     int matchSample = matchEmit->idx_of[chr];
341 |     // get all bins and determine bin width...
342 |     // !!!NOTE: assumes uniform bin width!!!
343 |     std::map<int,int> diffs;
344 |     unsigned int * mloci = matchEmit->loci[matchSample];
345 |     int binw=0;
346 |     for (int l=1; l<matchEmit->nSites[matchSample]; l++){
347 |       binw = int(mloci[l]) - int(mloci[l-1]);
348 |       if (diffs.count(binw) == 0) diffs.insert(std::pair<int,int>(binw,0));
349 |       diffs[binw] =+ 1;
350 |     }
351 |     int ct = 0;
352 |     std::map<int,int>::iterator it;
353 |     for (it = diffs.begin(); it != diffs.end(); it++){
354 |       if (it->second > ct){
355 | 	binw = it->first;
356 | 	ct   = it->second;
357 |       }
358 |     }
359 |     diffs.clear();
360 |     //now pick...
361 |     int midx = 0;
362 |     int mlocus = (int) mloci[0];
363 |     for (int idx=0; idx<pickEmit->nSites[s]; idx++){
364 |       int plocus = (int) pickEmit->loci[s][idx];
365 |       while (mlocus < plocus){
366 | 	midx++;
367 | 	if (midx == matchEmit->nSites[matchSample]) break;
368 | 	mlocus = (int) mloci[midx];	
369 |       }
370 |       if ( mlocus < plocus || plocus <= mlocus - binw ) continue;
371 |       fprintf( pref_fp, "%-2i %12i", chr, plocus);
372 |       for ( int t=0; t<pickEmit->nTimes; t++){
373 | 	fprintf( pref_fp, " %-3i %-3i", pickEmit->reads[t][s][idx], pickEmit->depths[t][s][idx]);
374 |       }
375 |       fprintf( pref_fp, "\n");
376 |     }
377 |   }
378 | }
379 | 


--------------------------------------------------------------------------------
/src/clone-prior.cpp:
--------------------------------------------------------------------------------
  1 | //clone-prior.cpp
  2 | 
  3 | //own headers...
  4 | #include "emission.h"
  5 | #include "log-space.h"
  6 | #include "clone.h"
  7 | 
  8 | using namespace std;
  9 | 
 10 | 
 11 | // CNA prior (only used for chr entry or w/o correlations)...
 12 | void Clone::set_cna_prior( gsl_vector * prior, int sample){
 13 |   // cna_pen_norm: penalty for being  different from the normal copy number
 14 |   // cna_pen_diff: penalty for having different copynumbers across clones
 15 |   // cna_pen_zero: penalty for zero total copies
 16 |   if (cnaEmit->is_set == 0) abort();
 17 |   if (nClones==0){
 18 |     gsl_vector_set_all(prior,1.0);
 19 |   }
 20 |   else{
 21 |     std::set<int> cns;
 22 |     int chr = cnaEmit->chr[sample];
 23 |     int ncn = normal_copy[chr];
 24 |     for (int i=0; i<nLevels; i++){
 25 |       cns.clear();
 26 |       double p = 1.0;
 27 |       for (int j=0; j<nClones; j++){
 28 | 	p *= pow( cna_pen_norm, abs(copynumber[i][j] - ncn));
 29 | 	if (copynumber[i][j] == 0) p *= cna_pen_zero;
 30 | 	if (copynumber[i][j] > maxtcn_per_clone[chr][j]) p = 0.0;
 31 | 	cns.insert( copynumber[i][j] );
 32 |       }
 33 |       p *= pow( cna_pen_diff, (int) cns.size());
 34 |       gsl_vector_set( prior, i, p);
 35 |     } 
 36 |     //normalize and logify...
 37 |     double norm = gsl_blas_dasum(prior);
 38 |     if (norm <= 0.0 || norm != norm) abort();
 39 |     gsl_vector_scale( prior, 1.0 / norm);
 40 |     if (cnaEmit->log_space){
 41 |       for (int l=0; l<nLevels;l++){ 
 42 | 	prior->data[l] = prior->data[l] > 0.0 ? log(prior->data[l]) : logzero;
 43 |       }
 44 |     }
 45 |   }
 46 | }
 47 | 
 48 | //used in SNV-only mode, w/o correlation and w/o cn-info...
 49 | void Clone::initialize_snv_prior_param(){// SNV prior, conditional on max-tcn
 50 |   if (initial_snv_prior_param != NULL) gsl_matrix_free(initial_snv_prior_param);
 51 |   initial_snv_prior_param = NULL;
 52 |   if (nClones == 0) return;
 53 |   initial_snv_prior_param = gsl_matrix_calloc( maxtcn+1, maxtcn+1);
 54 |   gsl_matrix_set( initial_snv_prior_param, 0, 0, snv_fpr);
 55 |   double p = snv_pen_high;// penalty for higher genotypes
 56 |   for (int cn=1; cn <= maxtcn; cn++){
 57 |     if ( all_maxtcn.count(cn) == 0 ) continue;
 58 |     gsl_vector_view subrow = gsl_matrix_subrow( initial_snv_prior_param, cn, 0, cn+1);
 59 |     gsl_vector_set( &subrow.vector, 0, p);//P0=P1>P2>... or P00=P10=P10=P11>P20...
 60 |     for (int i=1; i<=cn; i++) gsl_vector_set( &subrow.vector, i, pow( p, i));
 61 |     gsl_vector_scale( &subrow.vector, 1.0 / gsl_blas_dasum(&subrow.vector) );
 62 |   }
 63 |   //set above fixed priors
 64 |   Clone::set_snv_prior(initial_snv_prior_param);
 65 | }
 66 | 
 67 | 
 68 | //SNV-only mode, w/o cn-info...
 69 | void Clone::set_snv_prior( gsl_matrix * snv_prior_param){
 70 |   double fpr = gsl_matrix_get( snv_prior_param, 0, 0);
 71 |   snv_prior.clear();
 72 |   std::map<int, vector<int> >::iterator it;
 73 |   gsl_vector * mem = gsl_vector_calloc(nLevels);    
 74 |   for (it=maxtcn_per_clone.begin(); it != maxtcn_per_clone.end(); it++){
 75 |     int chr = it->first;
 76 |     mem->data[0] = 0;
 77 |     for (int i=1; i<nLevels; i++){
 78 |       double p=1.0;
 79 |       for (int j=0; j<nClones; j++){
 80 | 	int limit = maxtcn_per_clone[chr][j];
 81 | 	if (copynumber[i][j] <= limit){
 82 | 	  p *= limit==0 ? 1.0 : gsl_matrix_get( snv_prior_param, limit, copynumber[i][j]);
 83 | 	}
 84 | 	else{
 85 | 	  p = 0.0;
 86 | 	  break;
 87 | 	}
 88 |       }
 89 |       //gsl_vector_set( snv_prior[chr], i, p);
 90 |       mem->data[i] = p;
 91 |     }
 92 |     //normalize and logify...
 93 |     double norm = gsl_blas_dasum(mem);
 94 |     if (norm<=0) abort();
 95 |     gsl_vector_scale( mem, (1.0-fpr) / norm);
 96 |     gsl_vector_set( mem, 0, fpr);
 97 |     if (snvEmit->log_space){
 98 |       for (int l=0; l<nLevels; l++){
 99 | 	double val = gsl_vector_get( mem, l);
100 | 	gsl_vector_set( mem, l, val>0 ? log(val) : logzero);
101 |       }
102 |     }
103 |     snv_prior[chr] = gsl_vector_calloc(nLevels);
104 |     gsl_vector_memcpy(snv_prior[chr],mem);
105 |   }
106 |   gsl_vector_free(mem);
107 | }
108 | 
109 | //CNA + BAF (+SNV) mode...
110 | void Clone::set_baf_prior_map(){
111 |   if ( baf_prior_map == NULL){
112 |     baf_prior_map = gsl_matrix_alloc( maxtcn+1, maxtcn+1);
113 |   }
114 |   gsl_matrix_set_zero(baf_prior_map); 
115 |   double f = 0;
116 |   for (int cn=0; cn <= maxtcn; cn++){
117 |     for (int bcn=0; bcn <= cn; bcn++){//penalty for complex chromosome status 
118 |       f = pow( baf_pen_comp, int( fabs(double(bcn) - 0.5*double(cn))));
119 |       gsl_matrix_set( baf_prior_map, bcn, cn, f);
120 |     }
121 |     //normalize...
122 |     gsl_vector_view col = gsl_matrix_column( baf_prior_map,cn);
123 |     double norm = gsl_blas_dasum(&col.vector);
124 |     if (norm <= 0.0) abort();
125 |     gsl_vector_scale( &col.vector, 1.0 / norm);
126 |   }
127 | }
128 | 
129 | //CNA (+BAF) + SNV mode...
130 | void Clone::set_snv_prior_map(){//either via BAF or else via CNA
131 |   if (nClones == 0) abort();
132 |   if (cnaEmit->is_set == 0) abort();
133 |   //allocate
134 |   if (bafEmit->is_set){//via CNA + BAF posterior...
135 |     if ( snv_prior_from_cna_baf_map == NULL){
136 |       snv_prior_from_cna_baf_map = new gsl_matrix * [maxtcn+1];
137 |       for (int cn=0; cn <= maxtcn; cn++){ 
138 | 	snv_prior_from_cna_baf_map[cn] = gsl_matrix_alloc( maxtcn+1, maxtcn+1);
139 |       }
140 |     }
141 |     for (int cn=0; cn <= maxtcn; cn++){//cn=total cn
142 |       gsl_matrix_set_zero( snv_prior_from_cna_baf_map[cn]); 
143 |       for (int j=0; j<=cn; j++){//j=minor cn
144 | 	for (int i=0; i<= cn; i++){//penalty for multiple hit SNVs
145 | 	  double pen = pow( snv_pen_mult, max( 0, i - max(j,cn-j)) );
146 | 	  gsl_matrix_set( snv_prior_from_cna_baf_map[cn], i, j, pen);
147 | 	}
148 | 	//normalize...
149 | 	gsl_vector_view col = gsl_matrix_column( snv_prior_from_cna_baf_map[cn], j);
150 | 	double norm = gsl_blas_dasum(&col.vector);
151 | 	if (norm <=0.0) abort();
152 | 	gsl_vector_scale( &col.vector, 1.0 / norm);
153 |       }
154 |     } 
155 |   }
156 |   //via CNA posterior only...
157 |   if ( snv_prior_from_cna_map == NULL){//allocate
158 |     snv_prior_from_cna_map = gsl_matrix_alloc( maxtcn+1, maxtcn+1);
159 |   }
160 |   gsl_matrix_set_zero( snv_prior_from_cna_map);  
161 |   double pen = snvEmit->connect ? 1.0 : snv_pen_high;// penalty for high genotypes 
162 |   for (int cn=0; cn <= maxtcn; cn++){
163 |     gsl_matrix_set( snv_prior_from_cna_map, 0, cn, pen);
164 |     for (int i=1; i<=cn; i++){
165 |       gsl_matrix_set( snv_prior_from_cna_map, i, cn, pow(pen,i));
166 |     }
167 |     //normalize...
168 |     gsl_vector_view col = gsl_matrix_column( snv_prior_from_cna_map, cn);
169 |     double norm = gsl_blas_dasum(&col.vector);
170 |     if (norm <=0.0) abort();
171 |     gsl_vector_scale( &col.vector, 1.0 / norm);
172 |   }
173 | }
174 | 
175 | 
176 | 
177 | 
178 | // CNA + BAF (+SNV) mode
179 | void Clone::get_baf_prior_from_cna_post(gsl_vector * prior, gsl_vector * post){
180 |   gsl_vector * post_per_clone  = gsl_vector_calloc( nClones*(maxtcn+1) );
181 |   gsl_matrix * prior_per_clone = gsl_matrix_calloc( nClones, maxtcn+1);
182 |   gsl_blas_dgemv( CblasNoTrans, 1.0, margin_map, post, 0.0, post_per_clone);
183 |   gsl_vector_view po_pc,pr_pc;
184 |   for (int i=0; i<nClones; i++){
185 |     po_pc = gsl_vector_subvector( post_per_clone, i*(maxtcn+1), maxtcn+1);
186 |     pr_pc = gsl_matrix_row( prior_per_clone, i);
187 |     gsl_blas_dgemv( CblasNoTrans, 1.0, baf_prior_map, &po_pc.vector, 0.0, &pr_pc.vector);
188 |   }
189 |   gsl_vector_set_all( prior, 1.0);
190 |   for (int i=0; i<nLevels; i++){
191 |     for (int j=0; j<nClones; j++){
192 |       prior->data[i] *= gsl_matrix_get( prior_per_clone, j, copynumber[i][j]);
193 |     }
194 |   }
195 |   if (bafEmit->log_space){//log-transform?
196 |     for (int l=0; l<nLevels; l++){
197 |       prior->data[l] = prior->data[l] > 0.0 ? log(prior->data[l]) : logzero;
198 |     }
199 |   }
200 |   gsl_matrix_free(prior_per_clone);
201 |   gsl_vector_free(post_per_clone);
202 | }
203 | 
204 | 
205 | // CNA + SNV mode...
206 | void Clone::get_snv_prior_from_cna_post(gsl_vector * prior, gsl_vector * cnapost){
207 |   gsl_vector * cnapostpc = gsl_vector_calloc( nClones*(maxtcn+1));
208 |   gsl_matrix * snv_prpc  = gsl_matrix_calloc( nClones, maxtcn+1);
209 |   gsl_blas_dgemv( CblasNoTrans, 1.0, margin_map, cnapost, 0.0, cnapostpc);
210 |   gsl_vector_view cnappc,prpc;
211 |   for (int i=0; i<nClones; i++){
212 |     cnappc = gsl_vector_subvector( cnapostpc, i*(maxtcn+1), maxtcn+1);
213 |     prpc = gsl_matrix_row( snv_prpc, i);
214 |     gsl_blas_dgemv( CblasNoTrans, 1.0, snv_prior_from_cna_map, &cnappc.vector, 0.0, &prpc.vector);
215 |   }
216 |   Clone::apply_snv_prpc( prior, snv_prpc, cnapost->data[0]);
217 |   gsl_matrix_free(snv_prpc);
218 |   gsl_vector_free(cnapostpc);
219 | }
220 | 
221 | 
222 | 
223 | 
224 | //CNA + BAF + SNV mode...
225 | void Clone::get_snv_prior_from_cna_baf_post(gsl_vector * prior, gsl_vector * cnapost, gsl_vector * bafpost){
226 |   gsl_vector * cnapostpc  = gsl_vector_calloc( nClones*(maxtcn+1));
227 |   gsl_vector * bafpostpc  = gsl_vector_calloc( nClones*(maxtcn+1));
228 |   gsl_matrix * snv_prpc   = gsl_matrix_calloc( nClones, maxtcn+1);
229 |   gsl_blas_dgemv( CblasNoTrans, 1.0, margin_map, cnapost, 0.0, cnapostpc);
230 |   gsl_blas_dgemv( CblasNoTrans, 1.0, margin_map, bafpost, 0.0, bafpostpc);
231 |   gsl_vector_view prpc;
232 |   gsl_vector * bafppc = gsl_vector_alloc(maxtcn+1);
233 |   for (int i=0; i<nClones; i++){
234 |     prpc   = gsl_matrix_row( snv_prpc, i);
235 |     for (int cn=0; cn<=maxtcn; cn++){
236 |       gsl_vector_set_zero(bafppc); 
237 |       double norm=0;
238 |       for (int j=0; j<=cn;j++){
239 | 	bafppc->data[j] = gsl_vector_get( bafpostpc, i*(maxtcn+1)+j) + 1.0e-10;
240 | 	norm += bafppc->data[j];
241 |       }
242 |       if (norm <= 0.0) abort();
243 |       gsl_vector_scale(bafppc,1.0/norm);
244 |       double pcna = gsl_vector_get( cnapostpc, i*(maxtcn+1) + cn);
245 |       gsl_blas_dgemv( CblasNoTrans, pcna, snv_prior_from_cna_baf_map[cn], bafppc, 1.0, &prpc.vector);
246 |     }
247 |   }
248 |   Clone::apply_snv_prpc( prior, snv_prpc, cnapost->data[0]);
249 |   gsl_matrix_free(snv_prpc);
250 |   gsl_vector_free(cnapostpc);
251 |   gsl_vector_free(bafpostpc);
252 |   gsl_vector_free(bafppc);
253 | }
254 | 
255 | 
256 | //CNA (+BAF) + SNV mode...
257 | void Clone::apply_snv_prpc( gsl_vector * prior, gsl_matrix * snv_prpc,  double pzero){
258 |   gsl_vector_set_all( prior, 1.0);
259 |   for (int level=0; level<nLevels; level++){
260 |     for (int j=0; j<nClones; j++){
261 |       prior->data[level] *= gsl_matrix_get( snv_prpc, j, copynumber[level][j]);
262 |     }
263 |   }
264 |   //normalize...
265 |   if ( !snvEmit->connect ){
266 |     prior->data[0] = 0.0;
267 |     double norm = gsl_blas_dasum(prior);
268 |     if (norm > 0.0 ) gsl_vector_scale(prior, (1.0-snv_fpr)*(1.0-pzero) / norm);
269 |     prior->data[0] = 1.0 - (1.0-snv_fpr)*(1.0-pzero);//SNV false positive rate
270 |   }
271 |   else{
272 |     double norm = gsl_blas_dasum(prior);
273 |     if (norm <=0.0 || norm != norm) abort();
274 |     gsl_vector_scale(prior, 1.0 / norm);
275 |   }
276 |   //log-transform?
277 |   if (snvEmit->log_space){
278 |     for (int l=0; l<nLevels; l++){
279 |       prior->data[l] = prior->data[l] > 0.0 ? log(prior->data[l]) : logzero;
280 |     }
281 |   }
282 | }
283 | 
284 | 
285 | 
286 | 
287 | //get mean total copy number...
288 | void Clone::get_mean_tcn(int sample){//only ever used for cnaEmit
289 |   int chr = cnaEmit->chr[sample];
290 |   if (nClones == 0){   
291 |     for (int t=0; t<nTimes; t++){
292 |       for (int evt=0; evt < cnaEmit->nEvents[sample]; evt++){
293 |        cnaEmit->mean_tcn[t][sample][evt] = tcn[chr][t][0];
294 |       }
295 |     }
296 |   }
297 |   else{//nClones > 0
298 |     if (gamma_cna[sample] == NULL) abort();
299 |     double mn;
300 |     for (int t=0; t<nTimes; t++){
301 |       for (int evt=0; evt < cnaEmit->nEvents[sample]; evt++){
302 |        gsl_vector_view TCN  = gsl_vector_view_array( tcn[chr][t], nLevels);
303 |        gsl_vector_view post = gsl_matrix_row( gamma_cna[sample], evt);
304 |        gsl_blas_ddot( &TCN.vector, &post.vector, &mn);
305 |        if (mn<=0.0){
306 | 	 cout<<"ERROR\n";
307 | 	 abort();
308 |        }
309 |        cnaEmit->mean_tcn[t][sample][evt] = mn;
310 |       }
311 |     }
312 |   }
313 | }
314 | 
315 | void Clone::map_mean_tcn( Emission * fromEmit, int fromSample, Emission * toEmit){
316 |   int fromChr = fromEmit->chr[fromSample];
317 |   if (toEmit->chrs.count(fromChr) == 0) abort();
318 |   if (toEmit->mean_tcn==NULL) abort();
319 |   int toSample = toEmit->idx_of[fromChr];
320 |   for (int evt=0; evt < toEmit->nEvents[toSample]; evt++){
321 |     int idx = toEmit->idx_of_event[toSample][evt];
322 |     int fromEvt = toEmit->Event_of_idx[toSample][idx];
323 |     for (int t=0; t<nTimes; t++){//mean total copynumber
324 |       toEmit->mean_tcn[t][toSample][evt] = fromEmit->mean_tcn[t][fromSample][fromEvt];
325 |     }
326 |   }
327 | }
328 | 
329 | 
330 | 
331 | void Clone::get_avail_cn(Emission * myEmit, int sample){
332 |   int chr = myEmit->chr[sample];
333 |   if (nClones==0){
334 |     for (int t=0; t<nTimes; t++){
335 |       for (int evt=0; evt < myEmit->nEvents[sample]; evt++){
336 |        for (int cn=0; cn<=maxtcn; cn++){
337 |          if (myEmit==cnaEmit){
338 |            myEmit->av_cn[t][sample][evt][cn] = (cn <= normal_copy[chr]) ? 1.0 : 0.0;
339 |          }
340 |          else if (myEmit==bafEmit){
341 |            myEmit->av_cn[t][sample][evt][cn] = (cn <= 1) ? 1.0 : 0.0;
342 |          }
343 |        }
344 |       }
345 |     }
346 |   }
347 |   else{//nClones > 0
348 |     gsl_matrix * gamma = NULL;
349 |     if (myEmit == cnaEmit) gamma = gamma_cna[sample];
350 |     if (myEmit == bafEmit) gamma = gamma_baf[sample];
351 |     if (gamma== NULL) abort();
352 |     double val=0;
353 |     gsl_vector * post_per_clone = gsl_vector_alloc((maxtcn+1)*nClones);
354 |     int * cnest = new int [nClones];//conservative estimate
355 |     for (int t=0; t<nTimes; t++){
356 |       for (int evt=0; evt < myEmit->nEvents[sample]; evt++){
357 |         gsl_vector_view post = gsl_matrix_row(gamma,evt);
358 |         gsl_blas_dgemv(CblasNoTrans, 1.0, margin_map, &post.vector, 0.0, post_per_clone);
359 |         for (int j=0; j<nClones; j++){
360 |           val=0;
361 |           for (int cn=0; cn<=maxtcn; cn++){
362 |             val += gsl_vector_get( post_per_clone, j*(maxtcn+1) + cn);
363 |             if (val > 0.99){
364 |               cnest[j] = cn;
365 |               break;
366 |             }
367 |           }
368 |         }
369 |         for (int cn=0; cn<=maxtcn; cn++){
370 |           double av = 0.0;
371 |           if (myEmit == cnaEmit && cn <= normal_copy[chr]) av += 1.0-purity[t];
372 |           if (myEmit == bafEmit && cn <= 1) av += 1.0-purity[t];
373 |           for (int j=0;j<nClones;j++){
374 |             if (cn<=cnest[j]) av += gsl_matrix_get(freqs,t,j);
375 |           }
376 |           myEmit->av_cn[t][sample][evt][cn] = av;
377 |         }
378 |       }
379 |     }
380 |     gsl_vector_free(post_per_clone);
381 |   }
382 | }
383 | 
384 | 
385 | void Clone::get_snv_prior_from_av_cn(gsl_vector * prior, int sample, int evt){
386 |   if (snvEmit->av_cn==NULL) abort();
387 |   int snvChr = snvEmit->chr[sample];
388 |   int found=0;
389 |   prior->data[0] = 0;
390 |   for (int l=1; l<nLevels; l++){
391 |     found=0;
392 |     prior->data[l] = (snv_prior[snvChr])->data[l];
393 |     for (int t=0;t<nTimes;t++){//genotype not available?
394 |       for (int cn=0; cn<=maxtcn; cn++){
395 |        if (cn_usage[t][cn][l] > snvEmit->av_cn[t][sample][evt][cn]){
396 |          found = 1;
397 | 	 prior->data[l] *= snv_pen_mult;//multiple hit SNV
398 |          break;
399 |        }
400 |        if (found) break;
401 |       }
402 |       if (found) break;
403 |     }
404 |   }
405 |   double norm = gsl_blas_dasum(prior);
406 |   if (norm <= 0.0) abort();
407 |   gsl_vector_scale(prior, (1.0-snv_fpr)/norm);
408 |   prior->data[0] = snv_fpr;
409 | }
410 | 


--------------------------------------------------------------------------------
/src/minimization.cpp:
--------------------------------------------------------------------------------
  1 | //minimization.cpp
  2 | 
  3 | #include "minimization.h"
  4 | 
  5 | #define PI 3.1415926
  6 | #define LOG2 0.693147
  7 | 
  8 | using namespace std;
  9 | 
 10 | 
 11 | 
 12 | double find_local_optimum(
 13 | 			  int nSimplex,
 14 | 			  gsl_vector**& simplex,
 15 | 			  gsl_vector * lower,
 16 | 			  gsl_vector * other,
 17 | 			  gsl_vector * range,
 18 | 			  void * params,
 19 | 			  double (*obj_fn)( const gsl_vector * x, void * p),
 20 | 			  double prec,
 21 | 			  int& steps,
 22 | 			  int verbose
 23 | 			  ){
 24 |   // Here starts the minimizing step...
 25 |   int iter = 0, max_iter = 1.0e4; // max no. iterations
 26 |   int status=0,ct=0;
 27 |   double f1=0, f2=0;
 28 |   const gsl_multimin_fminimizer_type * T = NULL;
 29 |   gsl_multimin_fminimizer * s = NULL;
 30 |   gsl_multimin_function my_func;
 31 |   gsl_vector *  x = NULL;
 32 |   // map the arguments to a single vector
 33 |   arg_map( nSimplex, simplex, lower, other, range, &x);
 34 |   int nvar = (int) x->size;
 35 |   // Give the elements of the function-to-minimize object...
 36 |   my_func.n       = nvar;
 37 |   my_func.f       = obj_fn;
 38 |   my_func.params  = params;
 39 |   gsl_vector * dx = gsl_vector_alloc(nvar);
 40 |   // initial displacement vector...
 41 |   for (int i=0; i<nvar; i++){
 42 |     gsl_vector_set( dx, i, 0.1 * fabs( gsl_vector_get( x, i) ) + 0.001);
 43 |   }
 44 |   // Define type of minimization procedure...
 45 |   T = gsl_multimin_fminimizer_nmsimplex2;
 46 |   s = gsl_multimin_fminimizer_alloc( T, x->size);
 47 |   gsl_multimin_fminimizer_set( s, &my_func, x, dx);
 48 |   // Now iterate to find the minimum...
 49 |   do{
 50 |     iter++;
 51 |     steps++;
 52 |     status = gsl_multimin_fminimizer_iterate(s);     
 53 |     if (status) break;
 54 |     status = gsl_multimin_test_size( gsl_multimin_fminimizer_size(s), prec);
 55 |     // stop-criterion
 56 |     f2 = gsl_multimin_fminimizer_minimum(s);
 57 |     if (iter > 10 && f1 != f2){
 58 |       if (f1-f2 > 0.0 && f1-f2 < 1.0){
 59 | 	ct++;
 60 | 	if (ct==10) status = 1;
 61 |       }
 62 |       else{
 63 | 	ct=0;
 64 |       }
 65 |       f1=f2;
 66 |     }    
 67 |     //
 68 |     if (verbose==1 && iter % 10 == 0){
 69 |       printf("%4i ", iter);
 70 |       arg_unmap( s->x, nSimplex, simplex, lower, other, range);
 71 |       if (simplex != NULL){
 72 | 	for (int i=0; i< nSimplex; i++){
 73 | 	  for (int j=0; j< (int) (simplex[i])->size; j++){
 74 | 	    printf("%.5e ", gsl_vector_get( simplex[i],j) );
 75 | 	  }
 76 | 	  printf("/ ");
 77 | 	}
 78 | 	printf("/ ");
 79 |       }
 80 |       if (other != NULL){
 81 | 	for (int i=0;i<(int)other->size;i++) printf("%.5e ", other->data[i]);
 82 |       }
 83 |       printf("%.10e", s->fval);
 84 |       cout<<endl;
 85 |     }
 86 |   }
 87 |   while (status == GSL_CONTINUE && iter < max_iter);
 88 |   //back-transformation...
 89 |   int err = arg_unmap( gsl_multimin_fminimizer_x(s), nSimplex, simplex, lower, other, range);
 90 |   if (err==1) abort();
 91 |   double fmin = gsl_multimin_fminimizer_minimum(s);
 92 |   // cleanup...
 93 |   gsl_multimin_fminimizer_free(s);
 94 |   gsl_vector_free(x);
 95 |   gsl_vector_free(dx);
 96 |   return(fmin);
 97 | }
 98 | 
 99 | double find_optimum_wrestarts( int nSimplex,
100 | 			       gsl_vector**& simplex,
101 | 			       gsl_vector * lower,
102 | 			       gsl_vector * other,  
103 | 			       gsl_vector * range,  
104 | 			       void * params,
105 | 			       double (*obj_fn)( const gsl_vector * x, void * p),
106 | 			       double prec,
107 | 			       int restarts,
108 | 			       int& steps,
109 | 			       int verbose
110 | 			       ){
111 |   steps=0;
112 |   int talk=0;
113 |   double fbest = find_local_optimum( nSimplex, simplex, lower, other, range, params, obj_fn, prec, steps, verbose);
114 |   if (restarts > 0 && nSimplex>0 && simplex != NULL){
115 |     double eps = 0.99;
116 |     gsl_vector ** simplex_best = new gsl_vector * [nSimplex];
117 |     for (int i=0; i<nSimplex;i++){
118 |       simplex_best[i] = gsl_vector_alloc(simplex[i]->size);
119 |       gsl_vector_memcpy(simplex_best[i], simplex[i]);
120 |     }
121 |     gsl_vector * other_best=NULL;
122 |     if (other!=NULL){
123 |       other_best = gsl_vector_alloc(other->size);
124 |       gsl_vector_memcpy(other_best,other);
125 |     }
126 |     if (talk) printf("%-3i: %.8e\n", 0, fbest);
127 |     cout<<flush;
128 |     int ct=1;
129 |     while (restarts>0){
130 |       if (talk) printf("\r%-3i: ", ct);
131 |       cout<<flush;
132 |       for (int i=0; i<nSimplex;i++){
133 | 	simplex_random_step_uniform(simplex_best[i], simplex[i], lower->data[i], eps);
134 |       }
135 |       if (other!=NULL) gsl_vector_memcpy(other,other_best);
136 |       //find new local optimum...
137 |       double ftest = find_local_optimum( nSimplex, simplex, lower, other, range, params, obj_fn, prec, steps, verbose);
138 |       //test new value...
139 |       if (ftest < fbest){
140 | 	for (int i=0; i<nSimplex;i++) gsl_vector_memcpy(simplex_best[i], simplex[i]);
141 | 	if (other!=NULL) gsl_vector_memcpy(other_best,other);
142 | 	fbest = ftest;
143 | 	eps *= 0.95;
144 | 	if (talk) printf("\r%-3i: %.8e\n", ct, fbest);
145 |       }
146 |       restarts--;
147 |       ct++;
148 |     }
149 |     //set to best...
150 |     for (int i=0; i<nSimplex;i++) gsl_vector_memcpy(simplex[i],simplex_best[i]);
151 |     if (other!=NULL) gsl_vector_memcpy(other,other_best);
152 |     if (talk) printf("\r");
153 |     cout<<flush;
154 |     //cleanup...
155 |     for (int i=0; i<nSimplex;i++) gsl_vector_free(simplex_best[i]);
156 |     delete [] simplex_best;
157 |     if (other!=NULL) gsl_vector_free(other_best);
158 |   }
159 |   return (fbest);
160 | }
161 | 
162 | 
163 | void spherical_random_step_uniform( double ri, double& rf, double lower,
164 | 				    const gsl_vector * anglei, gsl_vector*& anglef,
165 | 				    double eps){
166 |   int n = (anglei!=NULL) ? (int) anglei->size + 1 : 1;
167 |   double p=0,na=0,r=0;
168 |   for (int i=0; i<n-1; i++){
169 |     p = double(rand()) / double(RAND_MAX);
170 |     p = (p - 0.5) * eps * 0.5*PI;
171 |     //na = anglei->data[i];
172 |     na = anglei->data[i] + p;
173 |     na = max( na, -na);
174 |     na = min( na, PI-na);
175 |     anglef->data[i] = na;
176 |   }
177 |   p = double(rand()) / double(RAND_MAX);
178 |   p = (p - 0.5)*eps*(1.0-lower);
179 |   r = ri+p;
180 |   r=max(r,2*lower-r);
181 |   rf=min(r,2-r);
182 | }
183 | 
184 | void simplex_random_step_uniform(const gsl_vector*simplexi, gsl_vector*& simplexf,
185 | 				 double lower, double eps){
186 |   int n = (int) simplexi->size;
187 |   gsl_vector * anglei = NULL,* anglef = NULL;
188 |   double ri=0,rf=0,L=0,l=0,p=0;
189 |   if (n>1){
190 |     anglei = gsl_vector_alloc(n-1);
191 |     anglef = gsl_vector_alloc(n-1);
192 |   }
193 |   simplex_to_spherical( simplexi, ri, anglei);  
194 |   L = spherical_to_simplex( ri, anglei, simplexf, 1);
195 |   int acc=0;
196 |   double low=sqrt(lower);
197 |   while (acc==0){
198 |     spherical_random_step_uniform( ri, rf, low, anglei, anglef, eps);
199 |     l = spherical_to_simplex( rf, anglef, simplexf, 1);
200 |     p = double(rand())/double(RAND_MAX);
201 |     if ( l>L || p < exp(l-L)) acc=1;
202 |   }
203 |   if (anglei!=NULL) gsl_vector_free(anglei);
204 |   if (anglef!=NULL) gsl_vector_free(anglef);
205 | }
206 | 
207 | 
208 | 
209 | 
210 | // map the points on/in simplex to spherical coordinates and then to REAL
211 | void arg_map( 
212 | 	     int nSimplex, 
213 | 	     gsl_vector**& simplex, 
214 | 	     gsl_vector * lower, 
215 | 	     const gsl_vector * other, 
216 | 	     const gsl_vector * range, 
217 | 	     gsl_vector ** x
218 | 	      ){
219 |   if (nSimplex == 0 && other == NULL) abort();
220 |   int m=0;
221 |   if (other != NULL) m = (int) other->size;
222 |   int nvar=m;
223 |   if (nSimplex>0){
224 |     if (lower==NULL || (int) lower->size != nSimplex) abort();
225 |     for (int t=0; t<nSimplex; t++){
226 |       int n = (simplex[t])->size;
227 |       nvar += (lower->data[t] < 1.0) ? n : n-1;//inside or on the simplex?
228 |     }
229 |   }
230 |   if (nvar == 0) abort();
231 |   if (*x != NULL) gsl_vector_free(*x);
232 |   *x = gsl_vector_alloc(nvar);//allocate here
233 |   gsl_vector * angle = NULL;
234 |   double radial=0;
235 |   // set transformed variables...
236 |   int ct=0;
237 |   for (int t=0; t<nSimplex; t++){    
238 |     int n = (simplex[t])->size;
239 |     if (n==1 && lower->data[t] < 1.0){//only one freq in [lower_t,1]
240 |       radial = gsl_vector_get(simplex[t],0);
241 |       gsl_vector_set( *x, ct, sqrt(radial));
242 |       ct++;
243 |     }
244 |     else if (n>1){
245 |       if (angle != NULL) gsl_vector_free(angle);
246 |       angle = gsl_vector_alloc(n-1);
247 |       simplex_to_spherical( simplex[t], radial, angle);
248 |       if (lower->data[t] < 1.0){//inside
249 | 	gsl_vector_set( *x, ct, radial);
250 | 	ct++;
251 |       }
252 |       for (int j=0; j<n-1; j++){
253 | 	gsl_vector_set( *x, ct, gsl_vector_get(angle,j));
254 | 	ct++;
255 |       }
256 |     }
257 |   }
258 |   if (angle != NULL) gsl_vector_free(angle);
259 |   if (ct != nvar - m) abort();
260 |   // remaining variables...
261 |   for (int k=0; k<m; k++){
262 |     double R = gsl_vector_get(range,k);
263 |     if (R > 0.0){
264 |       gsl_vector_set( *x, ct, logify( gsl_vector_get(other,k), R));
265 |     }
266 |     else if (R==0.0){
267 |       gsl_vector_set( *x, ct, log( gsl_vector_get(other,k) ));
268 |     }
269 |     else{
270 |       gsl_vector_set( *x, ct, gsl_vector_get(other,k) );
271 |     }
272 |     ct++;
273 |   }
274 | }
275 | 
276 | 
277 | int arg_unmap( 
278 | 	      const gsl_vector * x, 
279 | 	      int nSimplex, 
280 | 	      gsl_vector**& simplex, 
281 | 	      gsl_vector * lower,
282 | 	      gsl_vector * other,
283 | 	      const gsl_vector * range
284 | 	       ){
285 |   if ( nSimplex > 0 ){
286 |     if (lower == NULL || (int) lower->size != nSimplex) abort();
287 |   } 
288 |   // test dimensionality...
289 |   int test=0;
290 |   for (int t=0; t<nSimplex; t++){
291 |     int n = (int) (simplex[t])->size;
292 |     test += (lower->data[t] < 1.0) ? n : n-1;//inside or on simplex?
293 |   }
294 |   if (other != NULL) test += (int) other->size;
295 |   int nvar = (int) x->size;
296 |   if (test != nvar) abort();
297 |   //...passed test
298 |   double radial,val;
299 |   gsl_vector * angle = NULL;
300 |   int ct=0;
301 |   for (int t=0; t<nSimplex; t++){ 
302 |     if (lower->data[t] < 1.0){//inside simplex
303 |       radial = x->data[ct];
304 |       if (radial < sqrt(lower->data[t]) || radial > 1.0) return(1);//test radial bounds
305 |       ct++;
306 |     }
307 |     else{//or on the surface
308 |       radial = 1.0;
309 |     }
310 |     int n = (int) (simplex[t])->size;
311 |     if (n>1){
312 |       if (angle != NULL) gsl_vector_free(angle);
313 |       angle = gsl_vector_alloc(n-1);
314 |       for (int j=0; j<n-1; j++){
315 | 	gsl_vector_set( angle, j, x->data[ct]);
316 | 	if (angle->data[j] < 0.0 || angle->data[j] > 0.5*PI){//angle range test
317 | 	  gsl_vector_free(angle);
318 | 	  return(1);
319 | 	}
320 | 	ct++;
321 |       }
322 |       spherical_to_simplex( radial, angle, simplex[t], 0);
323 |     }
324 |     else{// n2==1
325 |       gsl_vector_set( simplex[t], 0, pow(radial,2));
326 |     }
327 |   }
328 |   if (angle != NULL) gsl_vector_free(angle);
329 |   if (other == NULL) return(0);
330 |   for (int k=0; k<(int) other->size; k++){
331 |     double R = gsl_vector_get(range,k);
332 |     val = x->data[ct];
333 |     if (R>0.0){
334 |       if ( val < -20.0 || val > 20.0) return(1);
335 |       other->data[k] = delogify( val, R);
336 |     }
337 |     else if ( R == 0.0){
338 |       if (val<-20.0 || val > 20.0) return(1);
339 |       other->data[k] = exp(val);
340 |     }
341 |     else{
342 |       other->data[k] = val;
343 |     }
344 |     ct++;
345 |   }
346 |   return(0);
347 | }
348 | 
349 | 
// Map x from [0,R] to the interval [-20,20].
//   x == 0      -> -20
//   x <  R/2    -> log(x) - log(R/2)      (negative branch)
//   R/2 <= x <R -> log(R/2) - log(R-x)    (non-negative branch)
//   otherwise   -> 20
double logify( double x, double R){
  if (x==0.0) return(-20.0);
  if (x<0.5*R) return( log(x) - log(0.5*R));
  if (x<R) return( log(0.5*R) - log(R-x));
  return(20.0);
}
364 | 
// Map y back into the interval (0,R):
//   y <  0 -> (R/2) * exp(y)
//   y >= 0 -> R - (R/2) * exp(-y)
double delogify(double y, double R){
  if (y<0.0) return( 0.5*R*exp(y) );
  return( R-0.5*R*exp(-y) );
}
373 | 
374 | 
375 | 
376 | double spherical_to_simplex( double radial, const gsl_vector * angle, gsl_vector *& simplex, int getLJD){
377 |   int n = simplex->size;
378 |   if (angle != NULL && n-1 != (int) angle->size) abort();
379 |   double LJD=0.0;
380 |   if (getLJD) LJD = double(n)*log(2.0) + (2.0*double(n)-1.0)*log(radial);
381 |   if (n>1){
382 |     double * sinV = new double[n-1];
383 |     double * cosV = new double[n-1];
384 |     for ( int i =0; i<n-1; i++){
385 |       sinV[i] = sin(gsl_vector_get(angle,i));
386 |       cosV[i] = cos(gsl_vector_get(angle,i));
387 |       if (getLJD){
388 | 	LJD += log(cosV[i]);
389 | 	LJD += (2.0*double(n-i)-1.0)*log(sinV[i]);
390 |       }
391 |     }
392 |     double mem=1;
393 |     for ( int i=0; i<n-1; i++){
394 |       gsl_vector_set( simplex, i, mem * cosV[i]);
395 |       mem = mem*sinV[i];
396 |     }
397 |     gsl_vector_set( simplex, n-1, mem);
398 |     for ( int i=0; i<n; i++) simplex->data[i] = pow(simplex->data[i],2);
399 |     double norm = gsl_blas_dasum(simplex);
400 |     if (norm<=0.0) abort();
401 |     gsl_vector_scale( simplex, pow(radial,2) / norm);
402 |     delete [] sinV;
403 |     delete [] cosV;
404 |   }
405 |   else{
406 |     simplex->data[0] = pow(radial,2);
407 |   }
408 |   return(LJD);
409 | }
410 | 
411 | 
412 | 
413 | 
414 | void simplex_to_spherical( const gsl_vector * simplex, double& radial, gsl_vector*& angle){
415 |   int n = simplex->size;
416 |   if (angle != NULL && n-1 != (int) angle->size) abort();
417 |   if (n>1){
418 |     double partSum = gsl_vector_get( simplex, n-1) + gsl_vector_get( simplex, n-2);
419 |     double val=0;
420 |     double x = gsl_vector_get( simplex, n-1);
421 |     if (x > 0.0){
422 |       val =  (sqrt(partSum) + sqrt(gsl_vector_get( simplex, n-2))) / sqrt(x);
423 |       gsl_vector_set( angle, n-2,  PI - 2.0*atan(val));
424 |     }
425 |     else if (x == 0.0){
426 |       gsl_vector_set( angle, n-2, 0.0);
427 |     }
428 |     else abort();
429 |     for ( int i=3; i<n+1; i++){
430 |       val = sqrt( gsl_vector_get(simplex,n-i) / partSum);
431 |       val = PI/2.0 - atan(val);
432 |       if ( val<0 || val>0.5*PI) abort();
433 |       gsl_vector_set( angle, n-i, val);
434 |       partSum += gsl_vector_get( simplex, n-i);
435 |     }
436 |     radial = sqrt(partSum);
437 |   }
438 |   else{
439 |     radial = sqrt(simplex->data[0]);
440 |   }
441 | }
442 | 
443 | 
444 | 
445 | 
446 | 
447 | /*
448 | struct rs_func{
449 |   double (*f)(gsl_vector * x, void * p);
450 |   int n;
451 |   void * params;
452 |   gsl_vector * guess;
453 |   double eps;
454 |   int steps;
455 |   int on_simplex;
456 | };
457 | 
458 | void random_search( rs_func * rsf){
459 |   gsl_vector * start = gsl_vector_alloc(rsf->n);
460 |   gsl_vector * best  = gsl_vector_alloc(rsf->n);
461 |   gsl_vector * x     = gsl_vector_alloc(rsf->n);
462 |   gsl_vector_memcpy(start,rsf->guess);
463 |   gsl_vector_memcpy(best,rsf->guess);
464 |   gsl_vector_memcpy(x,rsf->guess);
465 |   double eps = rsf->eps;
466 |   double curr_f = (*rsf->f)(start,rsf->params);
467 |   double best_f = curr_f;
468 |   double test_f = 0;
469 |   int acc=0, rej=0;
470 |   for (int t=0; t<rsf->steps; t++){
471 |     random_step(best,x,eps);
472 |     test_f = (*rsf->f)(x,rsf->params);
473 |     if (test_f < best_f){
474 |       best_f = test_f;
475 |       gsl_vector_memcpy(best,x);
476 |       acc++;
477 |     }
478 |     else{
479 |       rej++;
480 |     }
481 |   }
482 | }
483 | 
484 | 
485 | void random_step(gsl_vector * init, gsl_vector * x, double eps){
486 |   int n = x->size;
487 |   double p, val;
488 |   for( int i=0; i<n; i++){
489 |     p = (double) rand() / RAND_MAX;
490 |     p = (1.0 - 2.0*p) * eps;
491 |     val = gsl_vector_get(init,i) * (1.0+p);
492 |     gsl_vector_set( x, i, val);
493 |   }
494 | }
495 | */
496 | 


--------------------------------------------------------------------------------
/src/jump-diffusion.cpp:
--------------------------------------------------------------------------------
  1 | //jump-diffusion.cpp
  2 | 
  3 | //own headers...
  4 | #include "emission.h"
  5 | #include "log-space.h"
  6 | #include "jump-diffusion.h"
  7 | #include "common-functions.h"
  8 | 
  9 | #define PI 3.1415926
 10 | 
 11 | // Constructor
 12 | JumpDiffusion::JumpDiffusion( Emission * emit, int t){
 13 |   myEmit   = emit;
 14 |   time     = t;
 15 |   nSamples = myEmit->nSamples;
 16 |   nSites   = myEmit->nSites;
 17 |   dist     = myEmit->dist;
 18 |   loci     = myEmit->loci;
 19 |   mask     = myEmit->mask;
 20 |   mode     = myEmit->mode;
 21 |   //bias     = myEmit->bias;
 22 |   gridSize = myEmit->gridSize;
 23 |   jump      = -1.0;
 24 |   sigma     = -1.0;
 25 |   rnd_emit  = -1.0;
 26 |   wTotal    = 0;
 27 |   pstay     = new double * [nSamples];
 28 |   pjump     = new double * [nSamples];
 29 |   pnojump   = new double * [nSamples];
 30 |   // matrices...
 31 |   alpha  = new gsl_matrix * [nSamples];
 32 |   gamma  = new gsl_matrix * [nSamples];  
 33 |   proposal = gsl_vector_alloc(gridSize+1);
 34 |   gsl_vector_set_all( proposal, 1.0);// uniform proposal distribution on [0,1]
 35 |   for (int s=0; s<nSamples; s++){
 36 |     pstay[s]   = new double [nSites[s]];
 37 |     pjump[s]   = new double [nSites[s]];
 38 |     pnojump[s] = new double [nSites[s]];
 39 |     alpha[s] = NULL;
 40 |     gamma[s] = NULL;
 41 |   }
 42 |   pstay_set  = 0;
 43 |   save_alpha = 0;
 44 |   DiffProp = NULL;
 45 |   DiffProp_set = 0;
 46 | }
 47 | 
 48 | JumpDiffusion::~JumpDiffusion(){
 49 |   for (int s=0; s<nSamples; s++){
 50 |     delete [] pstay[s];
 51 |     delete [] pjump[s];
 52 |     delete [] pnojump[s];
 53 |     if (alpha[s] != NULL) gsl_matrix_free(alpha[s]);
 54 |     if (gamma[s] != NULL) gsl_matrix_free(gamma[s]);
 55 |   }
 56 |   delete [] alpha;
 57 |   delete [] pstay;
 58 |   delete [] pjump;
 59 |   delete [] pnojump;
 60 |   delete [] gamma;
 61 |   JumpDiffusion::reset_DiffProp();
 62 | }
 63 | 
 64 | double JumpDiffusion::get_total_llh(){
 65 |   save_alpha  = 0;
 66 |   if (myEmit->EmitProb_set == 0){
 67 |     myEmit->set_EmitProb(time);
 68 |   }
 69 |   gsl_vector_set_all(proposal, 1.0/(myEmit->xmax - myEmit->xmin));
 70 |   //compute the staying probabilities..
 71 |   if (pstay_set == 0)    JumpDiffusion::set_pstay();
 72 |   if (DiffProp_set == 0) JumpDiffusion::get_DiffProp();
 73 |   int sample;
 74 |   total_llh   = 0.0;
 75 |   // SAMPLES:
 76 | #ifdef _OPENMP
 77 |   int nt = min( nSamples, omp_get_max_threads());
 78 | #pragma omp parallel for schedule( dynamic, 1) default(shared) num_threads(nt)
 79 | #endif
 80 |   for (sample=0; sample<nSamples; sample++){
 81 |     double llh = JumpDiffusion::do_Fwd(sample);
 82 | #ifdef _OPENMP
 83 | #pragma omp critical
 84 | #endif
 85 |     {
 86 |       total_llh += llh;
 87 |     }
 88 |   }
 89 |   return(total_llh);
 90 | }
 91 | 
 92 | void JumpDiffusion::get_posterior(int sample){
 93 |   if (alpha[sample] != NULL) gsl_matrix_free(alpha[sample]);
 94 |   if (gamma[sample] != NULL) gsl_matrix_free(gamma[sample]);
 95 |   alpha[sample] = gsl_matrix_alloc(nSites[sample],gridSize+1);
 96 |   gamma[sample] = gsl_matrix_alloc(nSites[sample],gridSize+1);
 97 |   save_alpha  = 1;
 98 |   if (myEmit->EmitProb_set == 0){
 99 |     myEmit->set_EmitProb(time);
100 |   }
101 |   gsl_vector_set_all(proposal, 1.0/(myEmit->xmax - myEmit->xmin));
102 |   if (pstay_set == 0)    JumpDiffusion::set_pstay();
103 |   if (DiffProp_set == 0) JumpDiffusion::get_DiffProp();
104 |   JumpDiffusion::do_Fwd(sample);
105 |   JumpDiffusion::do_Bwd(sample);
106 |   gsl_matrix_free(alpha[sample]);
107 |   alpha[sample] = NULL;
108 | }
109 | 
110 | void JumpDiffusion::reset_DiffProp(){
111 |   if (DiffProp != NULL){
112 |     for (int i=0; i< (int) myEmit->frequent_dist.size(); i++){
113 |       gsl_matrix_free(DiffProp[i]);
114 |     }
115 |     delete [] DiffProp;
116 |     DiffProp = NULL;
117 |   }
118 | }
119 | 
120 | void JumpDiffusion::get_DiffProp(){
121 |   if (myEmit->dist_set == 0){
122 |     cout<<"ERROR-1 in JumpDiffusion::set_DiffProp()\n";
123 |     exit(1);
124 |   }
125 |   // allocate...
126 |   map<unsigned int,int>::iterator it;
127 |   if (DiffProp == NULL){
128 |     DiffProp = new gsl_matrix * [myEmit->frequent_dist.size()];
129 |     int i=0;
130 |     for (it = myEmit->frequent_dist.begin(); it != myEmit->frequent_dist.end(); ++it){
131 |       DiffProp[i] = gsl_matrix_alloc(gridSize+1,gridSize+1);
132 |       position.insert(pair<unsigned int,int>(it->first,i));
133 |       i++;
134 |     }
135 |   }
136 |   // now set the matrices...
137 |   is_identity.clear();
138 |   int i=0;
139 |   for (it=myEmit->frequent_dist.begin(); it != myEmit->frequent_dist.end(); it++){
140 |     int is_id = JumpDiffusion::set_DiffProp( DiffProp[i],  sigma*sqrt(double(it->first)));
141 |     is_identity.push_back(is_id);
142 |     i++;
143 |   }
144 |   DiffProp_set = 1;
145 | }
146 | 
147 | 
// Fill `propagator` with a discretised Gaussian diffusion kernel of standard
// deviation `sd` on the emission grid (spacing myEmit->dx, gridSize+1 points).
// Returns 1, leaving `propagator` untouched, if the kernel is narrower than
// the grid resolution (3*sd <= dx); returns 0 after filling the matrix.
// Each row is normalised so that its trapezoidal integral equals one.
// Exits with an error message if a row normalisation is non-positive or NaN.
int JumpDiffusion::set_DiffProp( gsl_matrix * propagator, double sd){
  double dx = myEmit->dx;
  if (3.0*sd <= dx) return(1);// with the current resolution, propagator is Dirac-Delta!
  int range = 3 * ceil(sd / dx);// this means, only fluctuations up to 3 sigma are possible
  // kernel wider than the grid: cap sd so that 3 sigma is about half the grid
  if (2*range > gridSize+1){
    sd = dx * double(gridSize) / 6.0;
    range = 3 * ceil(sd / dx);
  }
  // tabulate the Gaussian density at offsets (i-range)*dx, i = 0..2*range
  gsl_vector * gauss = gsl_vector_alloc(2*range+1);
  for (int i=0; i<2*range+1; i++){
    gsl_vector_set(gauss,i, gsl_ran_gaussian_pdf( double(i-range)*dx, sd));
  }
  gsl_matrix_set_zero(propagator);
  double val = 0, norm=0;
  gsl_vector_view row;
  for (int i=0; i<= gridSize; i++){
    norm = 0.0;
    for (int j=i-range; j<=i+range; j++){
      if ( j<0 || j>gridSize) continue;
      // direct contribution at offset (j-i)*dx
      val = gsl_vector_get( gauss, j-i+range);
      // add the mirror image of j about grid index 0: offset -(i+j)*dx
      if( i <= range && j < range - i + 1){
	val += gsl_vector_get( gauss, range - i - j);
      }
      // add the mirror image of j about grid index gridSize: offset (2*gridSize-i-j)*dx
      if( i >= gridSize - range && j >= 2*gridSize - i - range){
	val += gsl_vector_get( gauss, 2*gridSize + range - i - j);
      }
      gsl_matrix_set(propagator,i,j,val);
      // trapezoidal rule: the two end points of the grid carry half weight
      norm += (j==0 || j==gridSize) ? 0.5*val : val;
    }
    row = gsl_matrix_row(propagator,i);
    norm *= dx;
    // norm!=norm is true only for NaN
    if (norm <=0.0 ||norm!=norm){
      cout<<"ERROR in JumpDiffusion::set_DiffProp()\n";
      printf("sd=%e dx=%e\n",sd,dx);
      exit(1);
    }
    gsl_vector_scale(&row.vector,1.0/norm);
  }
  gsl_vector_free(gauss);
  return(0);
}
189 | 
190 | 
191 | //compute the staying probabilities..
192 | void JumpDiffusion::set_pstay(){
193 |   int s;
194 |   // SAMPLES:
195 | #ifdef _OPENMP
196 | #pragma omp parallel for schedule( dynamic, 1) default(shared)
197 | #endif
198 |   for (s=0; s<nSamples; s++){
199 |     pstay[s][0] = 1.0;
200 |     for (int l=1; l< nSites[s]; l++){
201 |       if (dist[s][l] > 0){
202 | 	if (jump == 1.0){
203 | 	  pstay[s][l] = 0.0;
204 | 	}
205 | 	else if (jump > 0.0){
206 | 	  pstay[s][l] = pow( 1.0-jump, dist[s][l]);
207 | 	}
208 | 	else if (jump == 0.0){
209 | 	  pstay[s][l] = 1.0;
210 | 	}
211 |       }
212 |       else{
213 | 	pstay[s][l] = 1.0;
214 |       }
215 |     }
216 |   }
217 |   pstay_set = 1;
218 | }
219 | 
// One prediction step across the transition into site l of sample s.
//   prior : in: the proposal (jump target) density; out: the predicted
//           density, (1-pstay)*proposal + pstay*diffused(post)
//   post  : in: the density at the neighbouring site; overwritten with the
//           diffused density scaled by pstay[s][l]
//   DP    : scratch matrix, filled here when the distance is not cached
//   DP_pt : out: points at the propagator matrix selected for this step
// Returns 1 if the propagator was treated as an identity (delta function),
// 0 if a propagator matrix was applied.
int JumpDiffusion::predict(gsl_vector * prior, gsl_vector * post, 
			   gsl_matrix*& DP, gsl_matrix**& DP_pt, int s, int l){
  int is_id=0;
  //check whether DiffProp is pre-computed
  if (  (myEmit->frequent_dist).count(dist[s][l]) == 0 ){//not found
    // compute the propagator for this distance on the fly into DP
    is_id = JumpDiffusion::set_DiffProp( DP, sigma*sqrt(double(dist[s][l])));
    DP_pt = &DP;
  }
  else{//exists
    if ( is_identity[ position[dist[s][l]] ] == 1 ){
      is_id = 1;
      DP_pt = &DP;
    }
    else{
      is_id = 0;
      DP_pt = &( DiffProp[ position[dist[s][l]] ] );
    }
  }
  //apply diffusion propagator
  if (is_id == 0){
    gsl_vector * mem = gsl_vector_alloc(gridSize+1);
    // trapezoidal weights: halve the two end points before integrating
    post->data[0]        *= 0.5;
    post->data[gridSize] *= 0.5;
    // mem = dx * (propagator^T) * post
    gsl_blas_dgemv(CblasTrans, myEmit->dx, *DP_pt, post, 0.0, mem);
    gsl_vector_memcpy( post, mem);
    gsl_vector_free(mem);  
  }
  else{//increase variance by hand (exact for gaussian distribution)
    double mn  = get_mean( post, myEmit->xmin, myEmit->xmax);
    double var = get_var(  post, myEmit->xmin, myEmit->xmax, mn);
    // exponent < 1 flattens the density; positive entries are raised to it
    double e = var / ( var + 100*pow(sigma,2)*double(dist[s][l]));
    for (int i=0; i<=gridSize; i++){
      double p = post->data[i];
      if (p>0.0) post->data[i] = pow(p,e);
    }
    // renormalise with the trapezoidal rule
    double norm = gsl_blas_dasum(post) - 0.5*(post->data[0] + post->data[gridSize]);
    norm *= myEmit->dx;
    gsl_vector_scale(post,1.0/norm);
  }
  //apply jump propagator
  gsl_vector_scale( prior, 1.0 - pstay[s][l]);
  gsl_vector_scale( post,  pstay[s][l]);
  gsl_vector_add( prior, post);
  return(is_id);
}
265 | 
266 | 
267 | 
// Forward pass for sample s. Walks over all unmasked sites, predicts the
// density from the previous site, multiplies in the emission probability of
// the observation and accumulates the log of each normalisation constant.
// If save_alpha == 1, the normalised forward density of every visited site is
// stored as a row of alpha[s] (which must have been allocated by the caller).
// Returns the log-likelihood of the sample. Aborts if a normalisation
// constant is non-positive or NaN.
double JumpDiffusion::do_Fwd(int s){
  // prepare fwd...
  gsl_vector * eprob = gsl_vector_alloc(gridSize+1);
  gsl_vector * prior = gsl_vector_alloc(gridSize+1);
  gsl_vector * post  = gsl_vector_alloc(gridSize+1);
  //gsl_vector * mem   = gsl_vector_alloc(gridSize+1);
  gsl_matrix * DP    = gsl_matrix_alloc(gridSize+1,gridSize+1);
  gsl_matrix ** DP_pt = NULL;
  double norm;
  double llh=0.0;
  int get_log=0;
  // Forward Pass
  for (int l=0; l<nSites[s]; l++){
    if (mask[s][l] == 0) continue;
    // N: depth, n: reads at this site for the current time point
    int N = myEmit->depths[time][s][l];
    int n = myEmit->reads[time][s][l];
    // start from the flat proposal; it stays the prior when dist is zero
    gsl_vector_memcpy( prior, proposal);
    // NOTE(review): post is read here before it is first written if the first
    // unmasked site has dist > 0 -- presumably that never happens; confirm.
    if (dist[s][l] > 0) JumpDiffusion::predict( prior, post, DP, DP_pt, s, l);
    //emission probability
    if (N>0){//if observation
      if (myEmit->bias != NULL){
	myEmit->get_eprob_wBias( eprob, myEmit->EmitLog[N][n], myEmit->bias[s][l], n, N, get_log);
      }
      else{
	gsl_vector_memcpy( eprob, myEmit->EmitProb[N][n] );
	// reflected mode: add the emission probability of the mirrored count N-n
	if (myEmit->reflect && n != N-n){
	  gsl_vector_add( eprob, myEmit->EmitProb[N][N-n]);
	  //gsl_vector_scale(eprob,0.5);
	}
      }
      //random emission channel
      // mixture: (1-rnd_emit)*eprob + rnd_emit*(constant density)
      gsl_vector_scale( eprob, 1.0-rnd_emit);
      if (mode==1 || mode==2){
	gsl_vector_add_constant( eprob, rnd_emit / double(N+1));
      }
      else if (mode==3 || mode==4){
	double rnd = double(N)*(myEmit->maxRate - myEmit->minRate);
	if (myEmit->bias != NULL) rnd *= myEmit->bias[s][l];
	rnd =  rnd_emit / (rnd+1.0);
	gsl_vector_add_constant( eprob, rnd);
      }
      gsl_vector_mul( prior, eprob);// at this time it is the posterior!
      // trapezoidal integral over the grid (end points carry half weight)
      norm  = gsl_blas_dasum(prior);
      norm -= 0.5*(prior->data[0] + prior->data[gridSize]);
      norm *= myEmit->dx;
      if (norm <=0 || norm != norm){
	cout<<"ERROR\n";
	abort();
      }
      gsl_vector_scale( prior, 1.0 / norm);
      llh += log(norm);// get part of the total log-likelihood
    }
    // the (normalised) posterior becomes the input of the next prediction
    gsl_vector_memcpy(post,prior);
    if (save_alpha == 1){
      gsl_matrix_set_row(alpha[s], l, post);// save forward variable    
    }
  }
  // clean up...
  gsl_vector_free(eprob);
  gsl_vector_free(prior);
  gsl_vector_free(post);
  gsl_matrix_free(DP);
  return(llh);
}
332 | 
333 | 
334 | // Backward Pass...
335 | void JumpDiffusion::do_Bwd(int s){
336 |   // prepare bwd...
337 |   gsl_vector * eprob = gsl_vector_alloc(gridSize+1);
338 |   gsl_vector * prior = gsl_vector_alloc(gridSize+1);
339 |   gsl_vector * post  = gsl_vector_alloc(gridSize+1);
340 |   gsl_vector * beta  = gsl_vector_alloc(gridSize+1);
341 |   gsl_vector * last_beta  = gsl_vector_alloc(gridSize+1);
342 |   gsl_vector * last_eprob = gsl_vector_alloc(gridSize+1);
343 |   gsl_vector * mem   = gsl_vector_alloc(gridSize+1);
344 |   gsl_vector * mem2  = gsl_vector_alloc(gridSize+1);
345 |   gsl_matrix * DP    = gsl_matrix_calloc(gridSize+1,gridSize+1);
346 |   gsl_matrix ** DP_pt = NULL;
347 |   double x,y,norm;
348 |   int get_log=0;
349 |   int is_id=1;
350 |   int last=-1;
351 |   for (int l = nSites[s]-1; l>=0; l--){
352 |     if (mask[s][l]==0) continue;
353 |     gsl_vector_memcpy(prior,proposal);
354 |     if (last>0) is_id = JumpDiffusion::predict( prior, post, DP, DP_pt, s, last);
355 |     gsl_vector_memcpy( beta, prior);
356 |     // get gamma, i.e. the total posterior probability vector
357 |     gsl_vector_view alph = gsl_matrix_row(alpha[s],l);
358 |     gsl_vector_memcpy( post, beta);
359 |     gsl_vector_mul( post, &alph.vector);
360 |     norm  = gsl_blas_dasum(post);
361 |     norm -= 0.5*(post->data[0] + post->data[gridSize]);
362 |     norm *= myEmit->dx;
363 |     gsl_vector_scale( post, 1.0 / norm);
364 |     // posterior on-site sojourn probability.
365 |     gsl_matrix_set_row( gamma[s], l, post);
366 |     // emission probability
367 |     int N = myEmit->depths[time][s][l];
368 |     int n = myEmit->reads[time][s][l];
369 |     if (N>0){
370 |       if (myEmit->bias != NULL){
371 | 	myEmit->get_eprob_wBias( eprob, myEmit->EmitLog[N][n], myEmit->bias[s][l], n, N, get_log);
372 |       }
373 |       else{
374 | 	gsl_vector_memcpy( eprob, myEmit->EmitProb[N][n] );
375 | 	if (myEmit->reflect && n != N-n){
376 | 	  gsl_vector_add( eprob, myEmit->EmitProb[N][N-n]);
377 | 	}
378 |       }
379 |       // random emission channel
380 |       gsl_vector_scale( eprob, 1.0-rnd_emit);
381 |       if (mode==1 || mode==2){
382 | 	gsl_vector_add_constant( eprob, rnd_emit / double(N+1));
383 |       }
384 |       else if (mode==3 || mode==4){
385 | 	double rnd = double(N)*(myEmit->maxRate-myEmit->minRate);
386 | 	if (myEmit->bias != NULL) rnd *= myEmit->bias[s][l];
387 | 	rnd =  rnd_emit / (rnd+1.0);
388 | 	gsl_vector_add_constant( eprob, rnd);
389 |       }
390 |       // get posterior update for the next step...
391 |       gsl_vector_mul( prior, eprob);// now it is the posterior!
392 |       norm = gsl_blas_dasum(prior);
393 |       norm -= 0.5*(prior->data[0] + prior->data[gridSize]);
394 |       norm *= myEmit->dx;
395 |       gsl_vector_scale(prior, 1.0 / norm);
396 |     }
397 |     gsl_vector_memcpy( post, prior);
398 |     // posterior jump-probability...
399 |     if (last>0){
400 |       if (jump > 0.0 && jump < 1.0){
401 | 	gsl_vector_memcpy(mem,last_beta);
402 | 	gsl_vector_mul(mem,last_eprob);
403 | 	mem->data[0]        *= 0.5;
404 | 	mem->data[gridSize] *= 0.5;
405 | 	x = gsl_blas_dasum(mem);
406 | 	x *= myEmit->dx * (1.0-pstay[s][last]) / (myEmit->xmax - myEmit->xmin);
407 | 	if (is_id==0){
408 | 	  gsl_blas_dgemv(CblasNoTrans, myEmit->dx, *DP_pt, mem, 0.0, mem2);
409 | 	}
410 | 	else{
411 | 	  gsl_vector_memcpy(mem2,mem);
412 | 	}
413 | 	gsl_vector_mul(mem2,&alph.vector);
414 | 	mem2->data[0]        *= 0.5;
415 | 	mem2->data[gridSize] *= 0.5;
416 | 	y = gsl_blas_dasum(mem2);
417 | 	y *= myEmit->dx * pstay[s][last];
418 | 	// log(pjump) and log(1.0-pjump) for the transition l->l+1
419 | 	pjump[s][last]   = log(x) - log(x+y);
420 | 	pnojump[s][last] = log(y) - log(x+y);
421 |       }
422 |       else if (jump == 0.0){
423 | 	pjump[s][last]   = -1.0e3;
424 | 	pnojump[s][last] = 0.0;
425 |       }
426 |       else if (jump == 1.0){
427 | 	pjump[s][last]   = 0.0;
428 | 	pnojump[s][last] = -1.0e3;
429 |       }
430 |     }
431 |     gsl_vector_memcpy(last_beta,beta);
432 |     gsl_vector_memcpy(last_eprob,eprob);
433 |     last = l;
434 |   }
435 |   pjump[s][0]   = -1.0e3;
436 |   pnojump[s][0] = 0.0;
437 |   //clean up...
438 |   gsl_vector_free(eprob);
439 |   gsl_vector_free(prior);
440 |   gsl_vector_free(post);
441 |   gsl_vector_free(beta);
442 |   gsl_vector_free(mem);
443 |   gsl_vector_free(mem2);
444 |   gsl_vector_free(last_beta);
445 |   gsl_vector_free(last_eprob);
446 |   gsl_matrix_free(DP);
447 | }
448 | 
449 | 
450 | 
451 | int JumpDiffusion::adapt_range(){
452 |   gsl_vector * cum = gsl_vector_calloc(gridSize+1);
453 |   //gsl_vector_view gma;
454 |   double mn = myEmit->xmax;
455 |   double mx = myEmit->xmin;
456 |   int low=0;
457 |   int redo=0;
458 |   double eps = 1.0e-4;
459 |   for (int s=0; s<nSamples; s++){
460 |     JumpDiffusion::get_posterior(s);
461 |     for (int l=0; l<nSites[s]; l++){
462 |       if (mask[s][l] == 0) continue;
463 |       low=0;
464 |       cum->data[0] = 0.0;
465 |       for (int i=1; i<=gridSize;i++){
466 | 	cum->data[i]  = cum->data[i-1];
467 | 	cum->data[i] += 0.5*myEmit->dx*(gsl_matrix_get(gamma[s],l,i-1) + gsl_matrix_get(gamma[s],l,i));
468 | 	if (low==0 && cum->data[i] >= eps){
469 | 	  mn = min( mn, myEmit->xgrid[i-1]);
470 | 	  low=1;
471 | 	}
472 | 	if (cum->data[i] >= 1.0-eps){
473 | 	  mx = max( mx, myEmit->xgrid[i]);
474 | 	  break;	  
475 | 	}		
476 |       }
477 |     }
478 |     gsl_matrix_free(gamma[s]);
479 |     gamma[s] = NULL;
480 |   }
481 |   gsl_vector_free(cum);
482 |   if (mx-mn < 10.0){
483 |     double xc = 0.5*(mn+mx);
484 |     mn = max(0.01,xc - 5.0);
485 |     mx = xc + 5.0;
486 |   }
487 |   if (mn > myEmit->xmin || mx < myEmit->xmax){
488 |     redo = 1;
489 |     myEmit->xmin = mn;
490 |     myEmit->xmax = mx;
491 |     if (myEmit->bias == NULL){
492 |       myEmit->ymin = mn;
493 |       myEmit->ymax = mx;
494 |     }
495 |     myEmit->set_grid();
496 |     myEmit->set_EmitProb(time);
497 |     Fwd_done=0;
498 |     Bwd_done=0;
499 |   }
500 |   return(redo);
501 | }
502 | 


--------------------------------------------------------------------------------
/src/cloneHD.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |   ******************************************************************************
  3 | 
  4 | Copyright (c) 11/12/13  Genome Research Ltd.
  5 | 
  6 | Author: Andrej Fischer (af7[at]sanger.ac.uk)
  7 | 
  8 | This file is part of cloneHD.
  9 | 
 10 | cloneHD is free software: you can redistribute it and/or modify it under the terms of the 
 11 | GNU General Public License as published by the Free Software Foundation; either version 3 
 12 | of the License, or (at your option) any later version.
 13 | 
 14 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 
 15 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
 16 | See the GNU General Public License for more details.
 17 | 
 18 | You should have received a copy of the GNU General Public License along with this program.  
 19 | If not, see <http://www.gnu.org/licenses/>.
 20 | 
 21 | 
 22 | ******************************************************************************
 23 | */
 24 | 
 25 | #include <stdio.h>
 26 | #include <stdlib.h>
 27 | #include <iostream>
 28 | #include <fstream>
 29 | #include <sstream>
 30 | #include <time.h>
 31 | #include <math.h>
 32 | #include <ctype.h> 
 33 | #include <string>
 34 | #include <map>
 35 | #include <vector>
 36 | #include <list>
 37 | 
 38 | 
 39 | // GSL headers...
 40 | #include "gsl/gsl_vector.h"
 41 | #include "gsl/gsl_matrix.h"
 42 | #include "gsl/gsl_randist.h"
 43 | #include "gsl/gsl_blas.h"
 44 | 
 45 | //own headers...
 46 | #include "cloneHD-functions.h"
 47 | #include "cloneHD-inference.h"
 48 | #include "common-functions.h"
 49 | #include "clone.h"
 50 | #include "emission.h"
 51 | 
 52 | 
 53 | using namespace std;
 54 | 
 55 | 
 56 | 
 57 | // *** MAIN START***
// Program entry point of cloneHD.
// Reads the command line, loads the CNA/BAF/SNV data that were specified,
// configures the Clone object from the options, loads the optional auxiliary
// inputs (max-tcn, bulk prior, jump tracks, mean-tcn, avail-cn, bias field,
// purity, fixed clones) and then either prints the likelihoods for a
// user-supplied set of parameter values or runs the sub-clone inference and
// prints all results. Always returns 0.
int main (int argc, const char * argv[]){
  cmdl_opts opts;
  get_opts( argc, argv, opts);
  // nTimes/nT are handed to the data readers below
  // NOTE(review): presumably filled in by reference -- confirm in get_*_data
  int nTimes=0, nT=0;
  //*** EMITTED DATA OBJECTS ***
  Emission cnaEmit, bafEmit, snvEmit;
  if (opts.cna_fn != NULL) get_cna_data( &cnaEmit, opts, nTimes);
  if (opts.baf_fn != NULL) get_baf_data( &bafEmit, opts, nTimes, nT);
  if (opts.snv_fn != NULL) get_snv_data( &snvEmit, opts, nTimes, nT);
  //*** ANNOUNCE ***
  printf("\ncloneHD: probabilistic inference of sub-clonality using...\n\n");
  if (cnaEmit.is_set){
    printf("CNA data in %s: %i sites in %i chr across %i samples\n", 
	   opts.cna_fn, cnaEmit.total_loci, cnaEmit.nSamples, nTimes);
  }
  if (bafEmit.is_set){
    printf("BAF data in %s: %i sites in %i chr across %i samples\n", 
	   opts.baf_fn, bafEmit.total_loci, bafEmit.nSamples, nTimes);
  }
  if (snvEmit.is_set){
    printf("SNV data in %s: %i sites in %i chr across %i samples\n", 
	   opts.snv_fn, snvEmit.total_loci, snvEmit.nSamples, nTimes);
  }
  cout<<endl;
  // *** ALLOCATE CLONE ***
  Clone myClone;
  myClone.allocate( &cnaEmit, &bafEmit, &snvEmit, opts.chr_fn);
  // copy the penalties, error rates and grid sizes from the command line
  myClone.cna_pen_zero = opts.cna_pen_zero;//CNA penalty for zero total copies
  myClone.cna_pen_diff = opts.cna_pen_diff;//CNA penalty for different c.n.
  myClone.cna_pen_norm = opts.cna_pen_norm;//CNA penalty for non-normal c.n.
  myClone.baf_pen_comp = opts.baf_pen_comp;//BAF penalty for complex chr status
  myClone.snv_pen_high = opts.snv_pen_high;//SNV penalty for high SNV genotypes
  myClone.snv_pen_mult = opts.snv_pen_mult;//SNV penalty for multiple hit SNVs
  myClone.snv_fpr  = opts.snv_fpr;//SNV false-positive rate
  myClone.snv_fpf  = opts.snv_fpf;//SNV frequency of false positives
  myClone.bulk_fix = opts.bulk_fix;
  myClone.cnaGrid  = opts.cnaGrid;
  myClone.bafGrid  = opts.bafGrid;
  myClone.snvGrid  = opts.snvGrid;
  myClone.bulkGrid = opts.bulkGrid;
  // prior learning is switched off when CNA data, connected SNV data or an
  // avail-cn file is present; otherwise the command line setting is used
  myClone.learn_priors = (cnaEmit.is_set || snvEmit.connect || opts.avcn_fn != NULL) ? 0 : opts.learn_priors;
  // *** GET MAX-TCN INFO ***
  get_maxtcn_input( opts.maxtcn_fn, opts.maxtcn, &myClone);
  // *** GET SNV BULK PRIOR ***
  if ( snvEmit.is_set && opts.bulk_fn != NULL ){
    printf("Using data in %s as SNV bulk prior...\n", opts.bulk_fn);
    get_snv_bulk_prior( &myClone, opts);
  }
  //*** GET JUMP PROBABILITY TRACKS and COLLAPSE TO EVENTS***
  get_jump_probability( &myClone, opts);
  //...now all segments are fixed and mean_tcn/av_cn allocated.
  if ( snvEmit.is_set && !cnaEmit.is_set ){//for SNV only
    // *** GET TOTAL MEAN COPYNUMBER TRACK ***  
    if( opts.mntcn_fn != NULL ){
      get_mean_tcn( opts.mntcn_fn, &myClone, &snvEmit);
    } 
    // *** GET AVAILABLE COPYNUMBER TRACK ***  
    if ( opts.avcn_fn != NULL ){
      get_avail_cn( opts.avcn_fn, &myClone, &snvEmit);
    }
  }
  //*** GET READ DEPTH BIAS FIELD ***
  if (cnaEmit.is_set && opts.bias_fn != NULL){
    get_bias_field( &myClone, opts);
  }
  //*** PREPARE COARSE-GRAINED DATA ***
  // each data type is collapsed to segments only if a jump track was given
  // (or, for CNA/BAF, if the corresponding jump rate is zero)
  if (cnaEmit.is_set && (opts.cna_jumps_fn != NULL || opts.cna_jump == 0.0)){
    cnaEmit.log_space      = 1;
    cnaEmit.coarse_grained = 1;
    printf( "Collapsed CNA data to %5i segments based on potential jump events.\n", 
	    cnaEmit.total_events);
    cout<<"Precomputing for CNA..."<<flush;
    myClone.get_cnaEmitLog();
    cout<<"done."<<endl;
  }
  if (bafEmit.is_set && ( opts.cna_jumps_fn != NULL || opts.baf_jumps_fn != NULL || opts.baf_jump == 0.0)){
    bafEmit.log_space      = 1;
    bafEmit.coarse_grained = 1;
    printf("Collapsed BAF data to %5i segments based on potential jump events.\n", bafEmit.total_events);
    cout<<"Precomputing for BAF..."<<flush;
    myClone.get_bafEmitLog();
    cout<<"done."<<endl;
  }
  if (snvEmit.is_set && opts.snv_jumps_fn != NULL){
    snvEmit.log_space      = 1;
    snvEmit.coarse_grained = 1;
    printf("Collapsed SNV data to %5i segments based on potential jump events.\n", snvEmit.total_events);
    cout<<"Precomputing for SNV..."<<flush;
    myClone.get_snvEmitLog();
    cout<<"done."<<endl;
  }
  cout<<endl;
  //exit(0);
  // get purities...
  if (opts.purity_fn != NULL){
    get_purity( opts.purity_fn, myClone.min_purity);
  }
  // get user pre-defined clones
  gsl_matrix * clones = NULL;
  gsl_vector * mass   = NULL;
  if (opts.clones_fn != NULL) get_fixed_clones( clones, mass, opts.clones_fn, nTimes);
  int bestn=0, rows=0;
  // more rows than samples means several parameter sets were supplied
  if (mass != NULL   && (int) mass->size > nTimes)    rows = (int) mass->size;
  if (clones != NULL && (int) clones->size1 > nTimes) rows = (int) clones->size1;
  if (rows > nTimes){//print LLH's for predefined parameter values...
    print_llh_for_set( clones, mass, &myClone, opts);
    return(0);
  }
  else{
    // ****** INFERENCE STARTS HERE ******
    bestn = infer_clones( clones, mass, &myClone, opts);
    // report which combination of data types was used
    printf("cloneHD in ");
    if (cnaEmit.is_set && bafEmit.is_set && snvEmit.is_set) cout<<"cna-baf-snv ";
    if (cnaEmit.is_set && bafEmit.is_set && !snvEmit.is_set) cout<<"cna-baf ";
    if (cnaEmit.is_set && !bafEmit.is_set && snvEmit.is_set) cout<<"cna-snv ";
    if (cnaEmit.is_set && !bafEmit.is_set && !snvEmit.is_set) cout<<"cna ";
    if (!cnaEmit.is_set && !bafEmit.is_set && snvEmit.is_set) cout<<"snv ";
    printf("mode found support for %i sub-clone(s) in the data.\n", bestn);
    // ****** INFERENCE COMPLETED ********
  }
  print_all_results( &myClone, opts);
  // all done...
  return (0);
}
182 | // *** MAIN END ***
183 | 
184 | 
185 | 
186 | 
187 | // get command line arguments...
188 | void get_opts( int argc, const char ** argv, cmdl_opts& opts){
189 |   default_opts(opts);
190 |   int opt_idx = 1;
191 |   string opt_switch;
192 |   while ( opt_idx < argc && (argv[opt_idx][0] == '-')){
193 |     opt_switch = argv[opt_idx];
194 |     if ( opt_switch.compare("--help") == 0){
195 |       print_usage();
196 |       exit(0);
197 |     }
198 |     opt_idx++;
199 |     if (opt_idx==argc) break;
200 |     if ( argv[opt_idx][0] == '-' && argv[opt_idx][1] == '-') continue;
201 |     if ( opt_switch.compare("--cna") == 0){
202 |       opts.cna_fn = argv[opt_idx];
203 |     }
204 |     else if ( opt_switch.compare("--baf") == 0){
205 |       opts.baf_fn = argv[opt_idx];
206 |     }
207 |     else if ( opt_switch.compare("--snv") == 0){
208 |       opts.snv_fn = argv[opt_idx];
209 |     }
210 |     else if ( opt_switch.compare("--clones") == 0){
211 |       opts.clones_fn = argv[opt_idx];
212 |     }
213 |     else if ( opt_switch.compare("--max-tcn") == 0){
214 |       if ( isdigit(argv[opt_idx][0]) ){
215 | 	opts.maxtcn = atoi(argv[opt_idx]);
216 |       }
217 |       else{
218 | 	opts.maxtcn_fn = argv[opt_idx];
219 |       }
220 |     }
221 |     else if ( opt_switch.compare("--mean-tcn") == 0){
222 |       opts.mntcn_fn = argv[opt_idx];
223 |     }
224 |     else if ( opt_switch.compare("--avail-cn") == 0){
225 |       opts.avcn_fn = argv[opt_idx];
226 |     }
227 |     else if ( opt_switch.compare("--bias") == 0){
228 |       opts.bias_fn = argv[opt_idx];
229 |     }
230 |     else if ( opt_switch.compare("--pre") == 0){
231 |       opts.pre = argv[opt_idx];
232 |     }
233 |     else if ( opt_switch.compare("--chr") == 0){
234 |       opts.chr_fn = argv[opt_idx];
235 |     }
236 |     else if ( opt_switch.compare("--cna-grid") == 0){
237 |       opts.cnaGrid = atoi(argv[opt_idx]);
238 |     }
239 |     else if ( opt_switch.compare("--baf-grid") == 0){
240 |       opts.bafGrid = atoi(argv[opt_idx]);
241 |     }
242 |     else if ( opt_switch.compare("--snv-grid") == 0){
243 |       opts.snvGrid = atoi(argv[opt_idx]);
244 |     }
245 |     else if ( opt_switch.compare("--bulk-grid") == 0){
246 |       opts.bulkGrid = atoi(argv[opt_idx]);
247 |     }
248 |     else if ( opt_switch.compare("--seed") == 0){
249 |       opts.seed = atoi(argv[opt_idx]);
250 |     }
251 |     else if ( opt_switch.compare("--nmax") == 0){
252 |       opts.nmax = atoi(argv[opt_idx]);
253 |     }
254 |     else if ( opt_switch.compare("--trials") == 0){
255 |       opts.trials = atoi(argv[opt_idx]);
256 |     }
257 |     else if ( opt_switch.compare("--restarts") == 0){
258 |       opts.restarts = atoi(argv[opt_idx]);
259 |     }
260 |     else if ( opt_switch.compare("--cna-rnd") == 0){
261 |       opts.cna_rnd = atof(argv[opt_idx]);
262 |     }
263 |     else if ( opt_switch.compare("--baf-rnd") == 0){
264 |       opts.baf_rnd = atof(argv[opt_idx]);
265 |     }
266 |     else if ( opt_switch.compare("--snv-rnd") == 0){
267 |       opts.snv_rnd = atof(argv[opt_idx]);
268 |     }
269 |     else if ( opt_switch.compare("--snv-fpfreq") == 0){
270 |       opts.snv_fpf = atof(argv[opt_idx]);
271 |     }
272 |     else if ( opt_switch.compare("--snv-fprate") == 0){
273 |       opts.snv_fpr = atof(argv[opt_idx]);
274 |     }
275 |     else if ( opt_switch.compare("--cna-jump") == 0){
276 |       opts.cna_jump = atof(argv[opt_idx]);
277 |     }
278 |     else if ( opt_switch.compare("--baf-jump") == 0){
279 |       opts.baf_jump = atof(argv[opt_idx]);
280 |     }
281 |     else if ( opt_switch.compare("--snv-jump") == 0){
282 |       opts.snv_jump = atof(argv[opt_idx]);
283 |     }
284 |     else if ( opt_switch.compare("--cna-shape") == 0){
285 |       opts.cna_shape = atof(argv[opt_idx]);
286 |     }
287 |     else if ( opt_switch.compare("--baf-shape") == 0){
288 |       opts.baf_shape = atof(argv[opt_idx]);
289 |     }
290 |     else if ( opt_switch.compare("--snv-shape") == 0){
291 |       opts.snv_shape = atof(argv[opt_idx]);
292 |     }
293 |     else if ( opt_switch.compare("--cna-pen-zero") == 0){
294 |       opts.cna_pen_zero = atof(argv[opt_idx]);
295 |     }
296 |     else if ( opt_switch.compare("--cna-pen-diff") == 0){
297 |       opts.cna_pen_diff = atof(argv[opt_idx]);
298 |     }
299 |     else if ( opt_switch.compare("--cna-pen-norm") == 0){
300 |       opts.cna_pen_norm = atof(argv[opt_idx]);
301 |     }
302 |     else if ( opt_switch.compare("--baf-pen-comp") == 0){
303 |       opts.baf_pen_comp = atof(argv[opt_idx]);
304 |     }
305 |     else if ( opt_switch.compare("--snv-pen-high") == 0){
306 |       opts.snv_pen_high = atof(argv[opt_idx]);
307 |     }
308 |     else if ( opt_switch.compare("--snv-pen-mult") == 0){
309 |       opts.snv_pen_mult = atof(argv[opt_idx]);
310 |     }
311 |     else if ( opt_switch.compare("--purity") == 0){
312 |       opts.purity_fn = argv[opt_idx];
313 |     }
314 |     else if ( opt_switch.compare("--cna-jumps") == 0){
315 |       opts.cna_jumps_fn = argv[opt_idx];
316 |     }
317 |     else if ( opt_switch.compare("--baf-jumps") == 0){
318 |       opts.baf_jumps_fn = argv[opt_idx];
319 |     }
320 |     else if ( opt_switch.compare("--snv-jumps") == 0){
321 |       opts.snv_jumps_fn = argv[opt_idx];
322 |     }
323 |     else if ( opt_switch.compare("--force") == 0){
324 |       opts.force = atoi(argv[opt_idx]);
325 |     }
326 |     else if ( opt_switch.compare("--print-all") == 0){
327 |       opts.print_all = atoi(argv[opt_idx]);
328 |       if (opts.print_all > 0) opts.print_all = 1;
329 |     }
330 |     else if ( opt_switch.compare("--mass-gauging") == 0){
331 |       opts.mass_gauging = atoi(argv[opt_idx]);
332 |     }
333 |     else if ( opt_switch.compare("--bulk-mean") == 0 ){
334 |       opts.bulk_fn = argv[opt_idx];
335 |       opts.bulk_mean=1;
336 |     }
337 |     else if ( opt_switch.compare("--bulk-prior") == 0 ){
338 |       opts.bulk_fn = argv[opt_idx];
339 |       opts.bulk_prior=1;
340 |     }
341 |     else if ( opt_switch.compare("--bulk-updates") == 0 ){
342 |       opts.bulk_updates = atoi(argv[opt_idx]);
343 |     }
344 |     else if ( opt_switch.compare("--bulk-fix") == 0){
345 |       opts.bulk_fix = atof(argv[opt_idx]);
346 |     }
347 |     else if ( opt_switch.compare("--bulk-sigma") == 0){
348 |       opts.bulk_sigma = atof(argv[opt_idx]);
349 |     }
350 |     else if ( opt_switch.compare("--min-occ") == 0){
351 |       opts.min_occ = atof(argv[opt_idx]);
352 |     }  
353 |     else if ( opt_switch.compare("--min-jump") == 0){
354 |       opts.min_jump = atof(argv[opt_idx]);
355 |     } 
356 |     else if ( opt_switch.compare("--learn-priors") == 0){
357 |       opts.learn_priors = atoi(argv[opt_idx]);
358 |       if (opts.learn_priors > 0) opts.learn_priors = 1;
359 |     }   
360 |     else{
361 |       cout<<"ERROR: unknown option "<<opt_switch<<" ?"<<endl;
362 |       print_usage();
363 |       exit(0);
364 |     }
365 |     opt_idx++;
366 |     opt_switch.clear();
367 |   }
368 |   test_opts(opts);
369 |   srand(opts.seed);
370 | }
371 | 
372 | void default_opts(cmdl_opts& opts){
373 |   //input files...
374 |   opts.cna_fn    = NULL;
375 |   opts.baf_fn    = NULL;
376 |   opts.snv_fn    = NULL;
377 |   opts.bulk_fn   = NULL;
378 |   opts.clones_fn = NULL;
379 |   opts.bias_fn   = NULL;
380 |   opts.mntcn_fn  = NULL;
381 |   opts.maxtcn_fn = NULL;
382 |   opts.avcn_fn   = NULL;
383 |   opts.chr_fn    = NULL;
384 |   opts.purity_fn = NULL;
385 |   //output options...
386 |   opts.pre       = "./out";
387 |   opts.print_all = 0;
388 |   //jump tracks...
389 |   opts.cna_jumps_fn = NULL;
390 |   opts.baf_jumps_fn = NULL;
391 |   opts.snv_jumps_fn = NULL;  
392 |   //optimizations switches...
393 |   opts.trials       = 1;
394 |   opts.restarts     = 10;
395 |   opts.learn_priors = 0;
396 |   opts.mass_gauging = 1;
397 |   opts.seed = 123456 * (int(time(NULL)) % 10) + (int(time(NULL)) % 1000);
398 |   //grid sizes...
399 |   opts.cnaGrid  = 300;
400 |   opts.bafGrid  = 100;
401 |   opts.snvGrid  = 100;
402 |   opts.bulkGrid = 100;
403 |   //jump rates...
404 |   opts.cna_jump = -1.0;
405 |   opts.baf_jump = -1.0;
406 |   opts.snv_jump = -1.0;
407 |   //random error rates...
408 |   opts.cna_rnd = 1.0e-6;
409 |   opts.baf_rnd = 1.0e-6;
410 |   opts.snv_rnd = 1.0e-6;
411 |   opts.snv_fpf = 0.01;
412 |   opts.snv_fpr = 0.001;
413 |   //shape parameters...
414 |   opts.cna_shape = -1.0;
415 |   opts.baf_shape = -1.0;
416 |   opts.snv_shape = -1.0;
417 |   //penalty terms...
418 |   opts.cna_pen_zero = 0.9;
419 |   opts.cna_pen_diff = 1.0;
420 |   opts.cna_pen_norm = 1.0;
421 |   opts.baf_pen_comp = 1.0;
422 |   opts.snv_pen_mult = 0.01;
423 |   opts.snv_pen_high = 0.5;
424 |   //model complexity...
425 |   opts.force    = -1;
426 |   opts.nmax     = 3;
427 |   opts.maxtcn   = -1;
428 |   opts.min_occ  = 0.01;
429 |   opts.min_jump = 0.01;
430 |   //bulk options...
431 |   opts.bulk_fix   = -1.0;
432 |   opts.bulk_sigma = -1.0;
433 |   opts.bulk_mean    = 0;
434 |   opts.bulk_prior   = 0;
435 |   opts.bulk_updates = 0;
436 | }
437 | 
438 | void test_opts(cmdl_opts& opts){
439 |   // *** CHECK COMMAND LINE ARGUMENTS ***
440 |   if ( opts.cna_fn == NULL && opts.baf_fn == NULL && opts.snv_fn == NULL){
441 |     cout<<"ERROR: One of --cna [file], --baf [file] and --snv [file] must be given.\n";
442 |     exit(1);
443 |   }
444 |   if (opts.snv_fn != NULL){
445 |     if ( opts.snv_jumps_fn != NULL || opts.snv_jump >= 0.0){//with SNV persistence
446 |       if ( opts.bulk_fn == NULL && opts.bulk_fix < 0.0){
447 | 	cout<<"ERROR: With --snv [file] with correlations, one of --bulk-(prior/mean) [file] or --bulk-fix [double] must be given.\n";
448 | 	exit(1);
449 |       }
450 |       opts.snv_fpf = 0.0;//there are no false positives
451 |     }
452 |     else{//no SNV persistence
453 |       opts.bulk_fix = 0.0;
454 |     }
455 |     if (opts.bulk_fn != NULL && opts.bulk_fix >= 0.0){
456 |       cout<<"ERROR: Only one of --bulk-(mean/prior) [file] and --bulk-fix [double] can be used.\n";
457 |       exit(1);
458 |     }
459 |     if (opts.bulk_fn != NULL && opts.bulk_mean == 1 && opts.bulk_prior == 1){
460 |       cout<<"ERROR: Only one of --bulk-mean or --bulk-prior [file] can be used.\n";
461 |       exit(1);
462 |     }
463 |   }
464 |   if (opts.cna_fn != NULL && (opts.mntcn_fn != NULL || opts.avcn_fn != NULL )){
465 |     cout<<"ERROR: --mean-tcn [file] and --avail-cn [file] cannot be used with --cna [file].\n";
466 |     exit(1);
467 |   }
468 |   if ( opts.mntcn_fn == NULL && opts.avcn_fn != NULL ){
469 |     cout<<"ERROR: --avail-cn [file] can only be used together with --mean-tcn [file].\n";
470 |     exit(1);
471 |   }
472 |   if ( opts.cna_fn != NULL && opts.cna_jump < 0.0 && opts.cna_jumps_fn == NULL ){
473 |     cout<<"ERROR: With --cna [file], --cna-jump [double] or --cna-jumps [file] must be given.\n";
474 |     exit(1);
475 |   }
476 |   if (opts.bulk_fix == 0.0 && opts.snv_rnd == 0.0){
477 |     opts.snv_rnd = 1.0e-9;
478 |   }
479 |   if (opts.force > 0){
480 |     opts.nmax = opts.force;
481 |   }
482 | }
483 | 
484 | void print_usage(){
485 |   cout<<endl<<"For all command line options, see ./docs/README-cloneHD.md\n";
486 |   cout<<endl;
487 |   exit(0);
488 | }
489 | 


--------------------------------------------------------------------------------
/src/filterHD.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |   filterHD.cpp
  3 |  
  4 |   Input: a file with four columns:
  5 |   chr locus reads depth
  6 | 
  7 |   Each chromosome is treated as an independent sample from the same process.
  8 |   The hidden emission rate state sequence is modeled as a jump-diffusion trajectory.
  9 |   Its value x at any point in time is the mean rate of an integer-valued emission process that 
 10 |   generates the observed data.
 11 | 
 12 |   Output:
 13 |   The posterior mean and standard deviation of the emission rate.
 14 |   The posterior jump probability for each transition.
 15 |   On request (--dist), the posterior distribution at each point (may be large files!).
 16 | 
 17 |   Required Arguments:
 18 |   --data        The input observed data file
 19 |   --mode        Emission model:
 20 |                 1 Binomial
 21 | 		2 Beta-Binomial
 22 | 		3 Poisson
 23 | 		4 Negative Binomial
 24 | 
 25 |   Optional Arguments:
 26 |   --pre         The prefix to put before all output files
 27 |   --grid [int]  The grid size for the distributions (partitions [0,1] into [grid] bins).
 28 |   --dist        Prints all the posterior distributions as well.
 29 |   --jump [double]  To fix the jump probability per base
 30 |   --sigma [double] To fix the diffusion constant
 31 |   --rnd [double]   To fix the random emission rate
 32 |   --shape [double] To fix the shape parameter in mode 2 and 4
 33 |   --bias [file]    A bias field modulating Poisson emissions. Use filterHD output file style 
 34 |                    (posterior distribution not needed). Bias field is assumed not to have jumps.
 35 |                    With bias field, filterHD will set --sigma 0.0.
 36 | 
 37 | */
 38 | 
 39 | #include <stdio.h>
 40 | #include <stdlib.h>
 41 | #include <iostream>
 42 | #include <fstream>
 43 | #include <sstream>
 44 | #include <time.h>
 45 | #include <math.h>
 46 | #include <ctype.h> 
 47 | #include <string>
 48 | #include <map>
 49 | #include <vector>
 50 | #include <list>
 51 | 
 52 | 
 53 | // GSL headers...
 54 | #include "gsl/gsl_vector.h"
 55 | #include "gsl/gsl_matrix.h"
 56 | #include "gsl/gsl_randist.h"
 57 | #include "gsl/gsl_blas.h"
 58 | 
 59 | //own headers...
 60 | #include "emission.h"
 61 | #include "jump-diffusion.h"
 62 | #include "minimization.h"
 63 | #include "common-functions.h"
 64 | 
 65 | using namespace std;
 66 | 
 67 | 
 68 | struct cmdl_opts{
 69 |   const char * data_fn;
 70 |   const char * bias_fn;
 71 |   const char * pre;
 72 |   int grid, dist, nojump, mode, seed, filter_pVal,filter_shortSeg, jumps, reflect;
 73 |   double sigma, jump, rnd_emit, shape,xmin, xmax, min_jump;
 74 |   double sigma_i, jump_i, rnd_emit_i, shape_i;
 75 | };
 76 | 
 77 | 
 78 | //*** OWN FUNCTIONS ***
 79 | void get_opts( int argc, const char ** argv, cmdl_opts& opts);
 80 | void test_opts(cmdl_opts& opts);
 81 | void default_opts(cmdl_opts& opts);
 82 | void print_opts();
 83 | 
 84 | double Q( const gsl_vector * x, void * p);
 85 | double find_JD_parameters(JumpDiffusion * myJD, cmdl_opts& opts);
 86 | struct fpar{
 87 |   JumpDiffusion * myJD;
 88 |   vector<int> to_opt;
 89 | };
 90 | void init_parameter(gsl_vector*& var, gsl_vector*& range, vector<int>& to_opt, JumpDiffusion * myJD, cmdl_opts& opts);
 91 | 
 92 | // *** MAIN START***
 93 | int main (int argc, const char * argv[]){
 94 |   cmdl_opts opts;
 95 |   get_opts( argc, argv, opts);
 96 |   srand(opts.seed);
 97 |   vector<int> chrs;
 98 |   vector<int> nSites;
 99 |   int nTimes;
100 |   int keepzero=1;
101 |   get_dims( opts.data_fn, nTimes, chrs, nSites, keepzero);
102 |   int nSamples = (int) chrs.size();
103 |   int total_nLoci=0;
104 |   for (int s=0; s<nSamples; s++){
105 |     total_nLoci += nSites[s];
106 |   }
107 |   //announce:
108 |   printf("\nfilterHD: Fitting jump-diffusion model to data at %i loci ", total_nLoci);
109 |   printf("in %i segment(s) in %i sample(s) with a ", nSamples, nTimes);
110 |   if (opts.mode == 1){
111 |     printf("binomial ");
112 |   }
113 |   else if (opts.mode == 2){
114 |     printf("beta-binomial ");
115 |   }
116 |   else if (opts.mode == 3){
117 |     printf("poisson ");
118 |   }
119 |   else if (opts.mode == 4){
120 |     printf("negative-binomial ");
121 |   }
122 |   printf("emission model:\n");
123 |   //the data and emission model object
124 |   Emission myEmit;
125 |   myEmit.mode    = opts.mode;
126 |   myEmit.shape   = opts.shape;
127 |   myEmit.get_log = 1;
128 |   myEmit.reflect = opts.reflect;
129 |   myEmit.connect = 1;//all data will be retained
130 |   myEmit.set( nTimes, chrs, nSites, opts.grid);
131 |   get_data( opts.data_fn, &myEmit);
132 |   // *** BIAS FIELD *** emission bias field
133 |   if (opts.bias_fn != NULL){
134 |     myEmit.allocate_bias();
135 |     get_bias( opts.bias_fn, &myEmit);
136 |   }
137 |   // *** POSTERIOR JUMP TRACK ACROSS ALL TIME POINTS ***
138 |   double ** mean = new double * [nSamples];
139 |   double ** std  = new double * [nSamples];
140 |   for (int s=0; s<nSamples; s++){
141 |     mean[s] = new double [nSites[s]];
142 |     std[s]  = new double [nSites[s]];
143 |   }
144 |   double ** jumps = NULL;
145 |   if (opts.jumps==1){//compunded jump probabilities
146 |     jumps = new double * [nSamples];
147 |     for (int s=0; s<nSamples; s++){
148 |       jumps[s] = new double [nSites[s]];
149 |       for (int l=0; l<nSites[s]; l++){
150 | 	jumps[s][l] = 1.0;
151 |       }
152 |     }
153 |   }
154 |   int ** mask = NULL;//the mask is used to filter out loci
155 |   if (opts.filter_pVal || opts.filter_shortSeg > 0){
156 |     mask = new int * [nSamples];
157 |     for (int s=0; s<nSamples; s++){
158 |       mask[s] = new int [nSites[s]];
159 |       for (int l=0; l<nSites[s]; l++){
160 | 	mask[s][l] = 1;
161 |       }
162 |     }
163 |   }
164 |   //***Jump-Diffusion filtering of each sample***
165 |   for (int t=0; t<nTimes; t++){
166 |     printf("\nFiltering sample %i of %i:\n", t+1, nTimes);
167 |     //the jump diffusion propagation object
168 |     JumpDiffusion myJD( &myEmit, t);
169 |     //find maximum-likelihood estimates of all parameters
170 |     double llh = find_JD_parameters( &myJD, opts);
171 |     //calculate means and standard deviations...
172 |     int uidx  = opts.reflect ? int(0.5*double( myJD.gridSize)) :  myJD.gridSize;
173 |     double mx = opts.reflect ? 0.5 : myJD.myEmit->xmax;
174 |     double mn = myJD.myEmit->xmin;
175 |     gsl_vector * post = gsl_vector_alloc(uidx+1);
176 |     double crit = 10.0/double(total_nLoci);
177 |     char buff[1024];
178 |     sprintf(buff,"%s.posterior-%i.txt", opts.pre, t+1);
179 |     FILE * total_fp = fopen(buff,"w");
180 |     fprintf(total_fp, "#sample site mean std-dev jump-prob");
181 |     fprintf(total_fp, " posterior %.5e %.5e\n", myJD.myEmit->xmin, myJD.myEmit->xmax);
182 |     double gof=0, xobs=0, gofNorm=0;
183 |     for (int s=0; s < myJD.nSamples; s++){//get posterior distribution with the ML parameters
184 |       myJD.get_posterior(s);
185 |       double mstd = 0.0;
186 |       for (int l=0; l < myJD.nSites[s]; l++){
187 | 	if (opts.reflect){//distribution in lower half
188 | 	  gsl_vector_view lower = gsl_matrix_subrow( myJD.gamma[s], l, 0, uidx+1);
189 | 	  gsl_vector_memcpy( post, &lower.vector);
190 | 	  double norm = gsl_blas_dasum(post);
191 | 	  norm -= 0.5*(post->data[0] + post->data[uidx]);
192 | 	  norm *= myEmit.dx;
193 | 	  if (norm <= 0.0) abort();
194 | 	  gsl_vector_scale(post,1.0/norm);
195 | 	}
196 | 	else{
197 | 	  gsl_matrix_get_row( post, myJD.gamma[s], l);
198 | 	}
199 | 	mean[s][l] = get_mean( post, mn, mx);
200 | 	std[s][l]  = sqrt(get_var(post,mn,mx,mean[s][l]));
201 | 	mstd += std[s][l];
202 | 	if (opts.jumps==1){
203 | 	  jumps[s][l] *= exp(myJD.pnojump[s][l]);
204 | 	}
205 | 	//goodness of fit...
206 | 	if ((mask==NULL || mask[s][l] == 1) && myEmit.depths[t][s][l] > 0){
207 | 	  xobs = double(myEmit.reads[t][s][l]) / double(myEmit.depths[t][s][l]);
208 | 	  if (opts.reflect) xobs = min(xobs,1.0-xobs);	 
209 | 	  double x=0,g=0,dg=0;
210 | 	  double b = (myEmit.bias == NULL) ? 1.0 : myEmit.bias[s][l];
211 | 	  for (int i=0; i<=uidx; i++){
212 | 	    x = myEmit.xgrid[i] * b;
213 | 	    dg = fabs(x - xobs) * post->data[i];
214 | 	    if (i==0||i==uidx) dg *= 0.5;
215 | 	    g += dg;
216 | 	  }
217 | 	  gof += g * myEmit.dx;
218 | 	  gofNorm += 1.0;
219 | 	}
220 |       }
221 |       mstd /= double(myJD.nSites[s]);
222 |       //filter out data points which are not compatible with the emission model
223 |       if (opts.filter_pVal){
224 | 	for (int l=0; l < myJD.nSites[s]; l++){
225 | 	  double pval = myEmit.get_pval( t, s, l, mean[s][l]);
226 | 	  if (pval < crit) mask[s][l] = 0;
227 | 	}
228 |       }
229 |       if (opts.filter_shortSeg > 0){
230 | 	int last=0;
231 | 	for (int l=1; l < myJD.nSites[s]; l++){
232 | 	  if (fabs(mean[s][l] - mean[s][l-1]) > 4.0*mstd){
233 | 	    if (l < last + opts.filter_shortSeg){
234 | 	      for (int i=last; i<l; i++) mask[s][i] = 0;
235 | 	    }
236 | 	    last = l;
237 | 	  }
238 | 	}
239 |       }
240 |       // print posterior information to file
241 |       for (int l=0; l < myJD.nSites[s]; l++){
242 | 	fprintf(total_fp, "%i %6i %.2e %.2e %.2e", 
243 | 		chrs[s], myJD.loci[s][l], mean[s][l], std[s][l], exp(myJD.pjump[s][l]));
244 | 	if(opts.dist==1){// full posterior distribution? LARGE!
245 | 	  for (int i=0; i <= myJD.gridSize; i++){
246 | 	    fprintf(total_fp, " %.2e", gsl_matrix_get( myJD.gamma[s], l, i)); 
247 | 	  }
248 | 	}
249 | 	fprintf(total_fp,"\n");
250 |       }
251 |       gsl_matrix_free(myJD.gamma[s]);
252 |       myJD.gamma[s] = NULL;
253 |     }
254 |     fclose(total_fp);
255 |     gsl_vector_free(post);
256 |     // *** PROCLAIM RESULTS ***
257 |     printf("Filtered sample %i of %i: llh = %.5e, gof = %.5e, --jump %.3e --sigma %.3e --rnd %.3e",
258 | 	   t+1, nTimes, llh,  gof/gofNorm, myJD.jump, myJD.sigma, myJD.rnd_emit);
259 |     if (opts.mode == 2 || opts.mode==4) printf(" --shape %.3e", myJD.myEmit->shape);
260 |     cout<<endl;
261 |     if ( (opts.mode == 2 || opts.mode==4) && myJD.myEmit->shape > 1.0e3){
262 |       printf("With --shape %.3e, you might consider choosing mode %i.\n", 
263 | 	     myJD.myEmit->shape, opts.mode==2 ? 1 : 3);
264 |     } 
265 |     // *** RESET ***
266 |     myEmit.delete_old_Emit();
267 |     myEmit.range_set = 0;
268 |     myEmit.reset_mask();
269 |   }
270 |   // *** PRINT JUMPS ***
271 |   if (opts.jumps == 1){
272 |     for (int s=0; s < nSamples; s++){
273 |       for (int l=0; l < nSites[s]; l++){
274 | 	myEmit.pjump[s][l] = 1.0 - jumps[s][l];
275 |       }
276 |       if (opts.min_jump > 0.0){
277 | 	myEmit.coarse_grain_jumps( s, opts.min_jump, 5);
278 |       }
279 |       delete [] jumps[s];
280 |     }
281 |     delete [] jumps;
282 |     char buff[1024];
283 |     sprintf(buff,"%s.jumps.txt", opts.pre);
284 |     FILE * jumps_fp = fopen(buff,"w");
285 |     fprintf(jumps_fp, "#sample site jump-prob\n");
286 |     for (int s=0; s < nSamples; s++){
287 |       for (int l=0; l < nSites[s]; l++){
288 | 	fprintf(jumps_fp, "%i %6i %.2e\n", chrs[s], myEmit.loci[s][l], myEmit.pjump[s][l]);
289 |       }
290 |     }
291 |     fclose(jumps_fp);
292 |   }
293 |   //print filtered
294 |   if (opts.filter_pVal || opts.filter_shortSeg > 0){
295 |     char buff[1024];  
296 |     sprintf(buff,"%s.filtered.txt", opts.pre);
297 |     FILE * filtered_fp = fopen(buff,"w");
298 |     for (int s=0; s < myEmit.nSamples; s++){
299 |       for (int l=0; l < myEmit.nSites[s]; l++){
300 | 	if( mask[s][l] == 1 ){
301 | 	  fprintf( filtered_fp, "%i %6i", chrs[s], myEmit.loci[s][l]);
302 | 	  for (int t=0; t<nTimes; t++) 
303 | 	    fprintf( filtered_fp, " %3i %3i", myEmit.reads[t][s][l], myEmit.depths[t][s][l]);
304 | 	  fprintf( filtered_fp, "\n");
305 | 	}
306 |       }
307 |       delete [] mask[s];
308 |     }
309 |     delete [] mask;
310 |     fclose(filtered_fp);
311 |   }
312 |   for (int s=0; s < myEmit.nSamples; s++){
313 |     delete [] mean[s];
314 |     delete [] std[s];
315 |   }
316 |   delete [] mean;
317 |   delete [] std;
318 |   //done
319 |   return (0);
320 | }
321 | // *** MAIN END ***
322 | 
323 | 
324 | void default_opts(cmdl_opts& opts){
325 |   opts.data_fn  = NULL;
326 |   opts.bias_fn  = NULL;
327 |   opts.pre      = "./out";
328 |   opts.grid     = 100;
329 |   opts.dist     = 0;
330 |   opts.sigma    = -1.0;
331 |   opts.jump     = -1.0;
332 |   opts.rnd_emit = -1.0;
333 |   opts.shape    = -1.0;
334 |   opts.sigma_i    = -1.0;
335 |   opts.jump_i     = -1.0;
336 |   opts.rnd_emit_i = -1.0;
337 |   opts.shape_i    = -1.0;
338 |   opts.mode     = 0;
339 |   opts.xmin = -1.0;
340 |   opts.xmax = -1.0;
341 |   opts.seed = (int) time(NULL);
342 |   opts.filter_pVal     = 0;
343 |   opts.filter_shortSeg = 0;
344 |   opts.jumps  = 0;
345 |   opts.reflect = 0;
346 |   opts.min_jump = 0.0;
347 | }
348 | 
349 | // get command line arguments...
350 | void get_opts( int argc, const char ** argv, cmdl_opts& opts){
351 |   default_opts(opts);
352 |   int opt_idx = 1;
353 |   string opt_switch;  
354 |   while ( opt_idx < argc && (argv[opt_idx][0] == '-')){
355 |     opt_switch = argv[opt_idx];
356 |     if ( opt_switch.compare("--print-options") == 0){
357 |       print_opts();
358 |       exit(0);
359 |     }
360 |     opt_idx++;
361 |     if (opt_idx==argc) break;
362 |     if ( argv[opt_idx][0] == '-') continue;
363 |     if ( opt_switch.compare("--data") == 0){//the input data
364 |       opts.data_fn = argv[opt_idx];
365 |     }
366 |     else if ( opt_switch.compare("--bias") == 0){//the input data
367 |       opts.bias_fn = argv[opt_idx];
368 |     }
369 |     else if ( opt_switch.compare("--pre") == 0){//the prefix for all output files
370 |       opts.pre = argv[opt_idx];
371 |     }
372 |     else if ( opt_switch.compare("--grid") == 0){//the size of the grid for the continuous distributions
373 |       opts.grid = atoi(argv[opt_idx]);
374 |     }
375 |     else if ( opt_switch.compare("--seed") == 0){//random seed
376 |       opts.seed = atoi(argv[opt_idx]);
377 |     }
378 |     else if ( opt_switch.compare("--mode") == 0){//emission model: see above
379 |       opts.mode = atoi(argv[opt_idx]);
380 |     }
381 |     else if ( opt_switch.compare("--sigma") == 0){//diffusion constant
382 |       opts.sigma = atof(argv[opt_idx]);
383 |     }
384 |     else if ( opt_switch.compare("--rnd") == 0){//random emission rate
385 |       opts.rnd_emit = atof(argv[opt_idx]);
386 |     }
387 |     else if ( opt_switch.compare("--jump") == 0){//jump probability per base
388 |       opts.jump = atof(argv[opt_idx]);
389 |     }
390 |     else if ( opt_switch.compare("--shape") == 0){//shape parameter for mode 2/4
391 |       opts.shape = atof(argv[opt_idx]);
392 |     }
393 |     else if ( opt_switch.compare("--sigmai") == 0){//diffusion constant
394 |       opts.sigma_i = atof(argv[opt_idx]);
395 |     }
396 |     else if ( opt_switch.compare("--rndi") == 0){//random emission rate
397 |       opts.rnd_emit_i = atof(argv[opt_idx]);
398 |     }
399 |     else if ( opt_switch.compare("--jumpi") == 0){//jump probability per base
400 |       opts.jump_i = atof(argv[opt_idx]);
401 |     }
402 |     else if ( opt_switch.compare("--shapei") == 0){//shape parameter for mode 2/4
403 |       opts.shape_i = atof(argv[opt_idx]);
404 |     }
405 |     else if ( opt_switch.compare("--xmin") == 0){//shape parameter for mode 2/4
406 |       opts.xmin = atof(argv[opt_idx]);
407 |     }
408 |     else if ( opt_switch.compare("--xmax") == 0){//shape parameter for mode 2/4
409 |       opts.xmax = atof(argv[opt_idx]);
410 |     }
411 |     else if ( opt_switch.compare("--dist") == 0){//whether to print posterior
412 |       opts.dist = atoi(argv[opt_idx]);
413 |     }
414 |     else if ( opt_switch.compare("--filter-pVal") == 0){//whether to filter out some data points
415 |       opts.filter_pVal = 1;
416 |     }
417 |     else if ( opt_switch.compare("--filter-shortSeg") == 0){//whether to filter out some data points
418 |       opts.filter_shortSeg = atoi(argv[opt_idx]);
419 |     }
420 |     else if ( opt_switch.compare("--jumps") == 0){//whether to filter out some data points
421 |       opts.jumps = atoi(argv[opt_idx]);
422 |     }
423 |     else if ( opt_switch.compare("--min-jump") == 0){//whether to filter out some data points
424 |       opts.min_jump = atof(argv[opt_idx]);
425 |     }
426 |     else if ( opt_switch.compare("--reflect") == 0){//whether to filter out some data points
427 |       opts.reflect = atoi(argv[opt_idx]);
428 |     }
429 |     else {
430 |       cout << "Usage: filterHD --print-options"<<endl;
431 |       exit(1);
432 |     }
433 |     opt_switch.clear();
434 |     opt_idx++;
435 |   }
436 |   test_opts(opts);
437 | }
438 | 
439 | 
440 | void test_opts(cmdl_opts& opts){
441 |   if (opts.mode==0){
442 |     cout<<"ERROR: choose emission mode with --mode [1,2,3,4]\n";
443 |     exit(1);
444 |   }
445 |   if (opts.bias_fn != NULL){
446 |     if( opts.filter_pVal ){
447 |       cout<<"ERROR: --bias [file] and --filter-pVal [0/1] cannot be used together\n";
448 |       exit(1);
449 |     }
450 |   }
451 |   if(opts.reflect==1 && (opts.mode==3||opts.mode==4)){
452 |     cout<<"ERROR: --reflect [0/1] can only be used in mode 1 and 2.\n";
453 |     exit(1);
454 |   }
455 |   if (opts.filter_shortSeg > 0 && opts.min_jump == 0.0 && opts.reflect == 1) opts.min_jump = 1.0e-5;
456 | }
457 | 
458 | void print_opts(){
459 |   cout<<"filterHD --data [file] --mode [1,2,3,4] --bias [file] --pre [string:./out] --grid [int:100] --jump [double] --sigma [double] --sigma [double] --rnd [double] --shape [double] --jumps [0/1:0] --dist [0/1:0] --filter-pVal [0/1:0] --filter-shortSeg [int] --jumpi [double] --sigmai [double] --rndi [double] --shapei [double] --xmin [double] --xmax [double]"<<endl;
460 |   exit(0);
461 | }
462 | 
463 | 
464 | 
465 | // *** LEARN PARAMETERS OF THE JUMP-DIFFUSION MODEL ***
466 | double find_JD_parameters(JumpDiffusion * myJD, cmdl_opts& opts){
467 |   vector<int> to_opt;
468 |   int nvar=0;
469 |   double llh=0;
470 |   if (opts.xmin >= 0.0 && opts.xmax >= 0.0){
471 |     myJD->myEmit->xmin = opts.xmin;
472 |     myJD->myEmit->xmax = opts.xmax;
473 |     myJD->myEmit->ymin = opts.xmin;
474 |     myJD->myEmit->ymax = opts.xmax;
475 |     myJD->myEmit->set_grid();
476 |   }
477 |   else{
478 |     myJD->myEmit->init_range(myJD->time);//get the initial range of rates
479 |   }
480 |   printf("Initial range is %.3e < x < %.3e", myJD->myEmit->xmin, myJD->myEmit->xmax);
481 |   if (opts.bias_fn != NULL){
482 |     printf(", %.3e < y < %.3e", myJD->myEmit->ymin, myJD->myEmit->ymax);
483 |   }
484 |   cout<<endl;
485 |   if ( opts.jump < 0.0){// jump probability
486 |     to_opt.push_back(0);
487 |     nvar++;
488 |   }
489 |   else{
490 |     myJD->jump = opts.jump;
491 |     myJD->set_pstay();
492 |   }
493 |   if (opts.sigma < 0.0){// diffusion constant
494 |     to_opt.push_back(1);
495 |     nvar++;
496 |   }
497 |   else{
498 |     myJD->sigma = opts.sigma;
499 |     myJD->get_DiffProp();
500 |   }
501 |   if (opts.rnd_emit < 0.0){//random error rate
502 |     to_opt.push_back(2);
503 |     nvar++;
504 |   }
505 |   else{
506 |     myJD->rnd_emit = opts.rnd_emit;
507 |   }
508 |   if ( opts.shape < 0.0 && (opts.mode == 2 || opts.mode == 4)){//shape parameter
509 |     to_opt.push_back(3);
510 |     nvar++;
511 |   }
512 |   else{
513 |     myJD->myEmit->shape = opts.shape;
514 |   }
515 |   if(nvar>0){
516 |     gsl_vector * var   = gsl_vector_calloc(nvar);
517 |     gsl_vector * range = gsl_vector_calloc(nvar);
518 |     //set initial values
519 |     init_parameter(var, range,  to_opt, myJD, opts);
520 |     fpar myfpar;
521 |     myfpar.myJD    = myJD;
522 |     myfpar.to_opt  = to_opt;
523 |     void * param = static_cast<void*>(&myfpar);
524 |     // get the ML estimates and ML value
525 |     int steps = 0;
526 |     gsl_vector ** simplex = NULL;
527 |     gsl_vector * lower    = NULL;
528 |     //header...
529 |     printf("%-4s ", "eval");
530 |     for (int i=0; i<nvar; i++){
531 |       if (to_opt[i] == 0){//jump probability
532 | 	printf("%-11s ", "jump");
533 |       }
534 |       else if(to_opt[i] == 1){//diffusion constant
535 | 	printf("%-11s ", "sigma");
536 |       }
537 |       else if(to_opt[i] == 2){//random rate
538 | 	printf("%-11s ", "rnd");
539 |       }
540 |       else if(to_opt[i] == 3){//shape parameter
541 | 	printf("%-11s ", "shape");
542 |       }
543 |     }
544 |     printf("-llh\n");
545 |     llh = - find_local_optimum( 0, simplex, lower, var, range,  param, &Q, 1.0e-3, steps, 1);
546 |     //adapt the range if needed
547 |     if ((opts.mode==3 || opts.mode==4) && (opts.xmin < 0.0 && opts.xmax < 0.0)  && opts.bias_fn == NULL){
548 |       int redo = myJD->adapt_range();
549 |       if (redo==1){
550 | 	printf("Adapted range to %.3e < x < %.3e\n", myJD->myEmit->xmin, myJD->myEmit->xmax);
551 | 	//init_parameter( var, range,  to_opt, myJD, opts);
552 | 	llh = - find_local_optimum( 0, simplex, lower, var, range, param, &Q, 1.0e-3, steps, 1);
553 |       }
554 |     }
555 |     //set the ML values into the objects
556 |     for (int i=0; i<nvar; i++){
557 |       if (to_opt[i] == 0){//jump probability
558 | 	myJD->jump = var->data[i];
559 | 	myJD->set_pstay();
560 |       }
561 |       else if(to_opt[i] == 1){//diffusion constant
562 | 	myJD->sigma = var->data[i];
563 | 	myJD->get_DiffProp();
564 |       }
565 |       else if(to_opt[i] == 2){//random rate
566 | 	myJD->rnd_emit = var->data[i];
567 |       }
568 |       else if(to_opt[i] == 3){//shape parameter
569 | 	myJD->myEmit->shape = var->data[i];
570 | 	myJD->myEmit->set_EmitProb(myJD->time);
571 |       }
572 |     }
573 |     gsl_vector_free(var);
574 |     gsl_vector_free(range);
575 |   }
576 |   else{
577 |     llh = myJD->get_total_llh();
578 |     //adapt the range if needed
579 |     if ( (opts.mode==3 || opts.mode==4) && (opts.xmin < 0.0 && opts.xmax < 0.0) && opts.bias_fn == NULL){
580 |       myJD->adapt_range();
581 |       printf("Adapted range to %.3e < x < %.3e\n", myJD->myEmit->xmin, myJD->myEmit->xmax);
582 |       llh = myJD->get_total_llh();
583 |     }
584 |   }
585 |   return(llh);
586 | }
587 | 
588 | 
589 | 
590 | void init_parameter(gsl_vector*& var, gsl_vector*& range, vector<int>& to_opt, JumpDiffusion * myJD, cmdl_opts& opts){
591 |   int nvar = (int) var->size;
592 |   for (int i=0; i<nvar; i++){
593 |     if (to_opt[i] == 0){//jump probability
594 |       var->data[i]   = (opts.jump_i > 0.0) ? opts.jump_i : 1.0e-5;
595 |       range->data[i] = 1.0;
596 |     }
597 |     else if(to_opt[i] == 1){//diffusion constant
598 |       if (opts.sigma_i > 0.0){
599 | 	var->data[i]   = opts.sigma_i;
600 |       }
601 |       else{
602 | 	var->data[i]   = 0.1*myJD->myEmit->dx / sqrt(myJD->myEmit->median_dist);
603 |       }
604 | 	range->data[i] = 0.0;
605 |     }
606 |     else if(to_opt[i] == 2){//random rate
607 |       var->data[i]   = (opts.rnd_emit_i > 0.0) ? opts.rnd_emit_i : 1.0e-5;
608 |       range->data[i] = 1.0;
609 |     }
610 |     else if(to_opt[i] == 3){//shape parameter
611 |       var->data[i]   = (opts.shape_i > 0.0) ? opts.shape_i : 100.0;
612 |       range->data[i] = 0.0;
613 |     }
614 |   }
615 | }
616 | 
617 | 
618 | 
619 | double Q( const gsl_vector * x, void * p){
620 |   //JumpDiffusion * myJD = static_cast<JumpDiffusion*> (p);
621 |   fpar * myfpar = static_cast<fpar*> (p);
622 |   int nvar = (int) (myfpar->to_opt).size();
623 |   gsl_vector * var   = gsl_vector_alloc(nvar);
624 |   gsl_vector * range = gsl_vector_alloc(nvar);
625 |   for (int i=0; i<nvar; i++){
626 |     if ((myfpar->to_opt)[i] == 0){//jump probability in [0,1]
627 |       range->data[i] = 1.0;
628 |     }
629 |     else if((myfpar->to_opt)[i] == 1){//diffusion constant in [0,\infty]
630 |       range->data[i] = 0.0;
631 |     }
632 |     else if((myfpar->to_opt)[i] == 2){//random rate in [0,1]
633 |       range->data[i] = 1.0;
634 |     }
635 |     else if((myfpar->to_opt)[i] == 3){//shape parameter in [0,\infty]
636 |       range->data[i] = 0.0;
637 |     }
638 |   }
639 |   gsl_vector ** simplex = NULL;
640 |   gsl_vector * lower    = NULL;
641 |   int err = arg_unmap( x, 0, simplex, lower, var, range);
642 |   if (err==1){
643 |     gsl_vector_free(var);
644 |     gsl_vector_free(range);
645 |     return(1.0e20);
646 |   }
647 |   //set the ML values into the objects
648 |   for (int i=0; i<nvar; i++){
649 |     if ((myfpar->to_opt)[i] == 0){//jump probability
650 |       myfpar->myJD->jump = var->data[i];
651 |       myfpar->myJD->set_pstay();
652 |     }
653 |     else if((myfpar->to_opt)[i] == 1){//diffusion constant
654 |       myfpar->myJD->sigma = var->data[i];
655 |       myfpar->myJD->get_DiffProp();
656 |     }
657 |     else if((myfpar->to_opt)[i] == 2){//random rate
658 |       myfpar->myJD->rnd_emit = var->data[i];
659 |     }
660 |     else if((myfpar->to_opt)[i] == 3){//shape parameter
661 |       myfpar->myJD->myEmit->shape = var->data[i];
662 |       myfpar->myJD->myEmit->set_EmitProb(myfpar->myJD->time);
663 |     }
664 |   }
665 |   // DO FWD TO GET LLH
666 |   double llh = (myfpar->myJD)->get_total_llh();
667 |   gsl_vector_free(var);
668 |   gsl_vector_free(range);
669 |   return(-llh);
670 | }
671 | 
672 | 


--------------------------------------------------------------------------------