├── example ├── example.sample └── readme ├── src ├── selection_get_position.h ├── input_line.h ├── selection_subsample.h ├── subsample.h ├── factorial.h ├── binomial.h ├── selection_class.h ├── normalize.h ├── exponentiate_matrix.h ├── sort_vertices.h ├── selection_optimize_test_func.h ├── create_states.h ├── pulses_to_ancestry.h ├── check_vertex.h ├── ancestry_pulse.h ├── nchoosek.h ├── markov_chain.h ├── Makefile ├── evaluate_vertex.h ├── selection_stochastic_traj.h ├── cmd_line.h ├── read_samples.h ├── print_usage.h ├── multichoose.h ├── ploidy_path.h ├── compute_forward.h ├── selection_split_vector.h ├── compute_backward.h ├── selection_markov_chain.h ├── selection_trajectory.h ├── distribute_alleles.h ├── inbred.h ├── selection_cmd_line.h ├── create_pulses.h ├── selection_print_usage.h ├── multipermute.h ├── forward_backward.h ├── golden_search.h ├── bootstrap.h ├── read_emissions.h ├── selection_forward.h ├── read_input.h ├── genotype_emissions.h ├── selection_read_input.h ├── transition_information.h ├── ahmms.cpp ├── create_transition_rates.h ├── read_cmd_line.h ├── viterbi.h ├── nelder_mead.h ├── ancestry_hmm.cpp ├── selection_read_cmd_line.h └── selection_fwd_iter.h └── scripts ├── readme.md └── vcf2ahmm.py /example/example.sample: -------------------------------------------------------------------------------- 1 | sample1 2 2 | sample2 2 3 | sample3 2 4 | sample4 2 5 | -------------------------------------------------------------------------------- /src/selection_get_position.h: -------------------------------------------------------------------------------- 1 | #ifndef __GET_POSITION_H 2 | #define __GET_POSITION_H 3 | 4 | int get_position(int pos, vector &positions) { 5 | for (int i = 0; i < positions.size(); i++) { 6 | if ( positions[i] >= pos ) { 7 | return i; 8 | } 9 | } 10 | return -1; 11 | } 12 | 13 | #endif -------------------------------------------------------------------------------- /example/readme: -------------------------------------------------------------------------------- 1 | cmd line: 2 | 3 | ./ancestry_hmm -i example.panel -s example.sample -a 2 0.8 0.2 -p 0 100000 0.8 -p 1 10 0.2 -g -e 1e-3 4 | 5 | This will fit a model of a single pulse at time 10 with 80% of individuals from the first population, and 20% from the second. 6 | Genotypes are used with a uniform error rate of 1e-3. 7 | All individuals are ploidy 2. 8 | -------------------------------------------------------------------------------- /src/input_line.h: -------------------------------------------------------------------------------- 1 | #ifndef __INPUT_LINE_H 2 | #define __INPUT_LINE_H 3 | 4 | /// hmm input lines 5 | class input_line { 6 | public: 7 | int pos ; 8 | string chrom ; 9 | vector > reference_counts ; 10 | vector > sample_counts ; 11 | double recombination_rate ; 12 | double error_1 ; 13 | double error_2 ; 14 | } ; 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/selection_subsample.h: -------------------------------------------------------------------------------- 1 | #ifndef __SUBSAMPLE_H 2 | #define __SUBSAMPLE_H 3 | 4 | void subsample_reads ( double &c1, double &c2 ) { 5 | while ( c1 + c2 > 170 ) { 6 | double r = ((double) rand() / (RAND_MAX)) ; 7 | if ( r < c1/(c1+c2) ) { 8 | c1 -- ; 9 | } 10 | else { 11 | c2 -- ; 12 | } 13 | } 14 | } 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/subsample.h: -------------------------------------------------------------------------------- 1 | #ifndef __SUBSAMPLE_H 2 | #define __SUBSAMPLE_H 3 | 4 | void subsample_reads ( double &c1, double &c2, int sample_max ) { 5 | while ( c1 + c2 > sample_max ) { 6 | double r = ((double) rand() / (RAND_MAX)) ; 7 | if ( r < c1/(c1+c2) ) { 8 | c1 -- ; 9 | } 10 | else { 11 | c2 -- ; 12 | } 13 | } 14 | } 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/factorial.h: -------------------------------------------------------------------------------- 1 | #ifndef __FACTORIAL_H 2 | #define __FACTORIAL_H 3 | 4 | vector create_factorial () { 5 | 6 | vector factorial(1755) ; 7 | factorial[0] = 1 ; 8 | double fact = 1 ; 9 | for ( double base = 1 ; base < 1755 ; base ++ ) { 10 | fact *= base ; 11 | factorial[base] = fact ; 12 | } 13 | return factorial ; 14 | } 15 | 16 | const vector factorial = create_factorial() ; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /src/binomial.h: -------------------------------------------------------------------------------- 1 | #ifndef __BINOMIAL_H 2 | #define __BINOMIAL_H 3 | 4 | /// binomial probs 5 | double binomial ( double n, double k, double prob ) { 6 | return nCk[n][k]*pow((1-prob),n-k)*pow(prob,k) ; 7 | } 8 | 9 | /// multinomial probs 10 | double multinomial ( double &nc, double &n, vector &k, vector &prob ) { 11 | double mult = factorial[n] ; 12 | for ( int x = 0 ; x < k.size() ; x ++ ) { 13 | mult *= pow( (double)prob.at(x)/nc, k.at(x) )/factorial[k.at(x)] ; 14 | } 15 | return mult ; 16 | } 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /src/selection_class.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_CLASS_H 2 | #define __SELECTION_CLASS_H 3 | 4 | class selection { 5 | public: 6 | int pos; 7 | double sel; 8 | double lnl; 9 | 10 | /// sort pulses by time 11 | friend bool operator < ( const selection &a, const selection &b ) { 12 | return a.lnl < b.lnl ; 13 | } 14 | } ; 15 | 16 | ostream& operator<< (ostream &out, selection const& point) { 17 | out << "Selection point. pos:" << point.pos << " sel:" << setprecision(15) << point.sel << " lnL: " << point.lnl; 18 | return out; 19 | } 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /src/normalize.h: -------------------------------------------------------------------------------- 1 | #ifndef __NORMALIZE_H 2 | #define __NORMALIZE_H 3 | 4 | /// normalize vector 5 | double normalize( vector &vec ) { 6 | double sum = 0 ; 7 | for ( int i = 0 ; i < vec.size() ; i ++ ) { 8 | sum += vec.at(i) ; 9 | } 10 | for ( int i = 0 ; i < vec.size() ; i ++ ) { 11 | vec.at(i) /= sum ; 12 | } 13 | return log(sum) ; 14 | } 15 | 16 | /// normalize vector 17 | double normalize( vec &vector ) { 18 | double sum = accu( vector ) ; 19 | for ( int i = 0 ; i < vector.size() ; i ++ ) { 20 | vector(i) /= sum ; 21 | } 22 | return log(sum) ; 23 | } 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/exponentiate_matrix.h: -------------------------------------------------------------------------------- 1 | #ifndef __EXPONENTIATE_MATRIX_H 2 | #define __EXPONENTIATE_MATRIX_H 3 | 4 | /// exponentiation by squaring until we find a better (working) solution 5 | mat exp_matrix( mat &matrix, int exp ) { 6 | if ( exp == 2 ) { 7 | return ( matrix * matrix ) ; 8 | } 9 | else if ( exp == 1 ) { 10 | return matrix ; 11 | } 12 | else if ( exp%2 == 1 ) { 13 | mat result = exp_matrix( matrix, exp-1 ) ; 14 | return ( matrix * result ) ; 15 | } 16 | else { 17 | mat result = exp_matrix( matrix, exp/2 ) ; 18 | return result * result ; 19 | } 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /src/sort_vertices.h: -------------------------------------------------------------------------------- 1 | #ifndef __SORT_VERTICES_H 2 | #define __SORT_VERTICES_H 3 | 4 | void sort_vertices( vector > &vertices, vector &lnl ) { 5 | 6 | vector > new_pulses ; 7 | vector new_lnl ; 8 | 9 | while ( lnl.size() > 0 ) { 10 | 11 | double max = lnl[0] ; 12 | int index = 0 ; 13 | 14 | for ( int i = 0 ; i < lnl.size() ; i ++ ) { 15 | if ( lnl[i] > max ) { 16 | max = lnl[i] ; 17 | index = i ; 18 | } 19 | } 20 | new_pulses.push_back( vertices[index] ) ; 21 | new_lnl.push_back( max ) ; 22 | lnl.erase( lnl.begin() + index ) ; 23 | vertices.erase( vertices.begin() + index ) ; 24 | } 25 | 26 | /// swap out for ordered sets 27 | swap ( lnl, new_lnl ) ; 28 | swap ( vertices, new_pulses ) ; 29 | 30 | } 31 | 32 | #endif 33 | 34 | -------------------------------------------------------------------------------- /src/selection_optimize_test_func.h: -------------------------------------------------------------------------------- 1 | #ifndef __OPTIMIZE_TEST_FUNC_H 2 | #define __OPTIMIZE_TEST_FUNC_H 3 | 4 | double optimize_test_func(int x, double y) { 5 | return 2000-1000*(pow(sin(x/1000), 10) + cos(10 + y * x/1000) * cos(x/1000)); 6 | //2000-1000*(np.sin(x/1000) ** 10 + np.cos(10 + y * x/1000) * np.cos(x/1000)) 7 | 8 | } 9 | double optimize_test_func2(int x, double y) { 10 | double xx = x; 11 | return 1000 - 50*(pow((xx/1000)-3,2) + pow(y-3,2)); 12 | } 13 | 14 | double selection_evaluate_point2(selection &point, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes ) { 15 | //cout << "Evaluate point1: " << point<< endl; 16 | point.lnl = optimize_test_func2(point.pos, point.sel); 17 | //cout << "Evaluate point2: " << point<< endl; 18 | return point.lnl; 19 | } 20 | 21 | #endif -------------------------------------------------------------------------------- /src/create_states.h: -------------------------------------------------------------------------------- 1 | #ifndef __CREATE_STATES_H 2 | #define __CREATE_STATES_H 3 | 4 | /// create the set of states that are permissable for a given ploidy 5 | void create_initial_states ( double &number_chromosomes, vector &ancestry_pulses, map > > &state_list ) { 6 | 7 | /// if we have already computed the state list for this sample ploidy, move on 8 | if ( state_list.find( number_chromosomes ) != state_list.end() ) { 9 | return ; 10 | } 11 | 12 | /// list of all possible states for single chromosomes 13 | vector states ; 14 | for ( int p = 0 ; p < ancestry_pulses.size() ; p ++ ) { 15 | states.push_back( p ) ; 16 | } 17 | 18 | /// now get all possible arrangements and store them in our state list 19 | vector< vector > results = multichoose( number_chromosomes, states ) ; 20 | for ( int i = 0 ; i < results.size() ; i ++ ) { 21 | vector state_vector( ancestry_pulses.size(), 0 ) ; 22 | for ( int j = 0 ; j < results[i].size() ; j ++ ) { 23 | state_vector.at(results[i][j]) ++ ; 24 | } 25 | state_list[number_chromosomes].push_back( state_vector ) ; 26 | } 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/pulses_to_ancestry.h: -------------------------------------------------------------------------------- 1 | #ifndef __PULSES_TO_ANCESTRY_H 2 | #define __PULSES_TO_ANCESTRY_H 3 | 4 | vector pulses_to_ancestry ( vector counts, vector &pulses ) { 5 | 6 | /// create ancestry types 7 | vector ancestry (2,0) ; 8 | 9 | /// now add up the counts 10 | for ( int c = 0 ; c < counts.size() ; c ++ ) { 11 | if ( pulses[c].type + 1 > ancestry.size() ) { 12 | ancestry.resize( pulses[c].type + 1 ) ; 13 | } 14 | ancestry[pulses[c].type] += counts[c] ; 15 | } 16 | 17 | return ancestry ; 18 | } 19 | 20 | /// create a map that links the states to their ancestry type counts 21 | map > > create_pulse_map ( map > > states, vector pulses, vector ploidy_list ) { 22 | 23 | map > > ploidy2pulses2ancestry ; 24 | for ( int p = 0 ; p < ploidy_list.size() ; p ++ ) { 25 | for ( int i = 0 ; i < states[ploidy_list[p]].size() ; i ++ ) { 26 | ploidy2pulses2ancestry[ploidy_list[p]][i] = pulses_to_ancestry( states[ploidy_list[p]][i], pulses ) ; 27 | } 28 | } 29 | return ploidy2pulses2ancestry ; 30 | } 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/check_vertex.h: -------------------------------------------------------------------------------- 1 | #ifndef __CHECK_VERTEX_H 2 | #define __CHECK_VERTEX_H 3 | 4 | /// check vertex to make sure it's within the parameter bounds 5 | /// then use random box method to select points within the admissable parameter space 6 | /// hope to prevent collapsing the edge by using random box, but boundary cases should be checked carefully 7 | 8 | /// rng stuff 9 | double range_term = 0.05 ; 10 | 11 | /// perform check 12 | void check_vertex( vector &vertex, cmd_line &options ) { 13 | for ( int p = 0 ; p < vertex.size() ; p ++ ) { 14 | if ( vertex[p].time_fixed == false ) { 15 | if ( vertex[p].time > options.t_max ) { 16 | vertex[p].time = options.t_max - ((double) rand() / (RAND_MAX))*range_term * ( options.t_max - options.t_min ) ; 17 | } 18 | else if ( vertex[p].time < options.t_min ) { 19 | vertex[p].time = options.t_min + ((double) rand() / (RAND_MAX))*range_term * ( options.t_max - options.t_min ) ; 20 | } 21 | } 22 | if ( vertex[p].proportion_fixed == false ) { 23 | if ( vertex[p].fraction_of_remainder > options.p_max ) { 24 | vertex[p].fraction_of_remainder = options.p_max - ((double) rand() / (RAND_MAX))*range_term * ( options.p_max - options.p_min ) ; 25 | } 26 | else if ( vertex[p].fraction_of_remainder < options.p_min ) { 27 | vertex[p].fraction_of_remainder = options.p_min + ((double) rand() / (RAND_MAX))*range_term * ( options.p_max - options.p_min ) ; 28 | } 29 | } 30 | } 31 | } 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /src/ancestry_pulse.h: -------------------------------------------------------------------------------- 1 | #ifndef __ANCESTRY_PULSE_H 2 | #define __ANCESTRY_PULSE_H 3 | 4 | /// ancestry pulses 5 | class pulse { 6 | 7 | public: 8 | 9 | /// time of pulse 10 | double time ; 11 | 12 | /// is time fixed? 13 | bool time_fixed ; 14 | 15 | /// ancestry type that pulses 16 | double type ; 17 | 18 | /// proportion of the individuals replaced with this pulse 19 | double proportion ; 20 | 21 | /// is the proportion fixed 22 | bool proportion_fixed ; 23 | 24 | /// if there are multiple pulses of the same ancestry type, proportion that remains that this pulse will take 25 | double fraction_of_remainder ; 26 | 27 | /// order pulses were entered in 28 | int entry_order ; 29 | 30 | /// print a pulse 31 | void print () ; 32 | 33 | /// sort pulses by time 34 | friend bool operator < ( const pulse &a, const pulse &b ) { 35 | return a.time > b.time ; 36 | } 37 | } ; 38 | 39 | //// sort vector by type 40 | void sort_pulse_vector ( vector &ancestry_pulses, int ancestry_types ) { 41 | vector return_pulses ; 42 | for ( int a = 0 ; a < ancestry_types ; a++ ) { 43 | for ( int p = 0 ; p < ancestry_pulses.size() ; p ++ ) { 44 | if ( ancestry_pulses[p].type == a ) { 45 | return_pulses.push_back( ancestry_pulses[p] ) ; 46 | } 47 | } 48 | } 49 | ancestry_pulses = return_pulses ; 50 | } 51 | 52 | /// print pulse information 53 | void pulse::print () { 54 | cerr << "\t\t" << time << "\t" << type << "\t" << fraction_of_remainder << endl ; 55 | } 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /src/nchoosek.h: -------------------------------------------------------------------------------- 1 | #ifndef __NCHOOSEK_H 2 | #define __NCHOOSEK_H 3 | 4 | /// this will just create a matrix through which we can look up any nck within the range that can be represented using a double 5 | const int max_n = 1001 ; 6 | 7 | vector > create_nck_table () { 8 | vector > nck(max_n,vector(max_n) ) ; 9 | for ( double n = 0 ; n < max_n ; n ++ ) { 10 | for ( double k = 0 ; k <= n ; k ++ ) { 11 | 12 | if ( n == k || k == 0 ) { 13 | nck[n][k] = 1 ; 14 | continue ; 15 | } 16 | 17 | double l = k ; 18 | if ( n-k < k ) { 19 | l = n - k ; 20 | } 21 | 22 | double lf = 2 ; 23 | double nf = n - l + 1 ; 24 | double result = 1 ; 25 | 26 | while ( lf < l + 1 || nf < n + 1 ) { 27 | 28 | /// check if >= 1 because we want to stay in the dynamic range of the double as long as possible to retain precision and make computation possible 29 | if ( ( result >= 1 || nf == n + 1 ) && lf < l + 1 ) { 30 | result /= lf ; 31 | lf ++ ; 32 | } 33 | else { 34 | result *= nf ; 35 | nf ++ ; 36 | } 37 | } 38 | nck[n][k] = result ; 39 | } 40 | } 41 | 42 | return nck ; 43 | } 44 | 45 | //// function to create the table is called automatically such that this matrix is available as long as the header is included 46 | const vector > nCk = create_nck_table() ; 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/markov_chain.h: -------------------------------------------------------------------------------- 1 | #ifndef __MARKOV_CHAIN_H 2 | #define __MARKOV_CHAIN_H 3 | 4 | /// will include all basic information for input data and functions to compute forward, backward, forward-backward, and viterbi 5 | class markov_chain { 6 | public: 7 | 8 | /// sample attributes 9 | string output_file ; 10 | double number_chromosomes ; 11 | 12 | /// file describing ploidy path across the genome 13 | string path_file ; 14 | 15 | /// data object storing ploidy paths to be looked up during emissions computation with chromosome as key 16 | vector sample_ploidy_path ; 17 | vector ploidy_switch_position ; 18 | vector ploidy_switch ; 19 | 20 | /// read from input file 21 | vector emission_probabilities ; 22 | 23 | /// create initial states to be stored 24 | double start_prob ; 25 | double end_prob ; 26 | 27 | /// forward probs 28 | vector alphas ; 29 | double compute_forward_probabilities( map > &transition_matrix, vector &interploidy_transitions ) ; 30 | 31 | /// backward probs 32 | vector betas ; 33 | void compute_backward_probabilities( map > &transition_matrix, vector &interploidy_transitions ) ; 34 | 35 | /// combine probs 36 | void combine_prob( vector &position, map > > &states, vector &chrom, bool output_pulses, vector &pulses ) ; 37 | 38 | /// output viterbi paths 39 | void viterbi( vector &position, vector &recombination_rate, map > > &states, vector &chrom, map > &transition_probabilites, vector &interploidy_transitions, bool output_pulses, vector &pulses ) ; 40 | 41 | } ; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | ## A MAKEFILE FOR The Ancestry HMM. TO USE, RUN THE COMMAND "make" VIA COMMAND LINE ## 2 | 3 | TCFLAGS = -ltcmalloc 4 | ARMAFLAGS = -larmadillo 5 | CONDAFLAGS = -fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem ${CONDA_PREFIX}/include 6 | 7 | all: 8 | $(CXX) -O3 $(CXXFLAGS) -o ancestry_hmm ancestry_hmm.cpp $(ARMAFLAGS) 9 | $(CXX) -O3 $(CXXFLAGS) -o ahmm-s ahmms.cpp $(ARMAFLAGS) 10 | 11 | conda: 12 | $(CXX) -O3 $(CONDAFLAGS) -o ahmm-s ahmms.cpp -L ${CONDA_PREFIX}/lib -I ${CONDA_PREFIX}/include $(ARMAFLAGS) 13 | $(CXX) -O3 $(CONDAFLAGS) -o ancestry_hmm ancestry_hmm.cpp -L ${CONDA_PREFIX}/lib -I ${CONDA_PREFIX}/include $(ARMAFLAGS) 14 | 15 | ahmms: 16 | $(CXX) -O3 $(CXXFLAGS) -o ahmm-s ahmms.cpp $(ARMAFLAGS) 17 | 18 | ahmm: 19 | $(CXX) -O3 $(CXXFLAGS) -o ancestry_hmm ancestry_hmm.cpp $(ARMAFLAGS) 20 | 21 | 22 | ## if you have a local install of google perftools, please add a TCFlag link. 23 | ### $(LINK.cc) -std=c++11 -O3 -o ahmm-s ahmms.cpp $(ARMAFLAGS) $(TCFLAGS) 24 | 25 | ## if you have a local armadillo installation, you will need to provide the directory during compile time and possible also link lblas and lapack 26 | ## our recommendation is to use miniconda3 to do the installation 27 | ## $ conda install -c conda-forge armadillo 28 | ## then you will have the appropriate lib and include files in your home directory under subdirectory miniconda3/ 29 | ## so, replace USERNAME with your unix id on the following line and try this 30 | 31 | ## $(CXX) -std=c++11 -O3 -o ahmm-s ahmms.cpp -L ${CONDA_PREFIX}/lib -I ${CONDA_PREFIX}/include $(ARMAFLAGS) 32 | 33 | ## if it builds correctly, you may also need to link the library during runtime 34 | ## to do this, add the following line to your ~/.bash_profile or ~/.bashrc 35 | ## export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${CONDA_PREFIX}/lib 36 | 37 | -------------------------------------------------------------------------------- /src/evaluate_vertex.h: -------------------------------------------------------------------------------- 1 | #ifndef __EVALUATE_VERTEX_H 2 | #define __EVALUATE_VERTEX_H 3 | 4 | /// evaluate likelihood for a single vertex 5 | double evaluate_vertex( vector &vertex, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes ) { 6 | 7 | /// create single chromosome transition matrix for single chromosomes 8 | mat transition_rates = create_transition_rates( vertex, options.ne, options.ancestry_proportion ) ; 9 | 10 | /// create transition matrix for all ploidies 11 | map > transition_matrix ; 12 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 13 | create_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information.at(m).number_chromosomes], recombination_rate, position, markov_chain_information.at(m).number_chromosomes, transition_rates ) ; 14 | for ( int p = 0 ; p < markov_chain_information[m].ploidy_switch.size() ; p ++ ) { 15 | create_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information[m].ploidy_switch[p]], recombination_rate, position, markov_chain_information[m].ploidy_switch[p], transition_rates ) ; 16 | } 17 | } 18 | 19 | /// compute transitions within a state 20 | vector interploidy_transitions ; 21 | interploidy_transitions = create_interploidy_transitions( state_changes, vertex, options.ancestry_proportion ) ; 22 | 23 | /// now compute the forward probabilities 24 | double lnl = 0 ; 25 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 26 | lnl += markov_chain_information[m].compute_forward_probabilities( transition_matrix, interploidy_transitions ) ; 27 | } 28 | return lnl ; 29 | } 30 | 31 | #endif 32 | 33 | -------------------------------------------------------------------------------- /src/selection_stochastic_traj.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_STOCHASTIC_TRAJECTORY_H 2 | #define __SELECTION_STOCHASTIC_TRAJECTORY_H 3 | 4 | 5 | // generates a frequency trajectory of the selected site using the stochastic method 6 | // the stochastic method will generate many random trajectories and calculate and return the average trajectory 7 | void selection_stochastic_trajectory(vector &trajectory, double s, double m, int generations, int ne, int reps) 8 | { 9 | vector traj_sum(generations+1,0); //// +1?????? 10 | default_random_engine rand_gen; 11 | 12 | double fixed_freq; 13 | double n_sel; 14 | double n_nonsel; 15 | int fix_gen = 0; 16 | double ns_fit; 17 | double nns_fit; 18 | double ns_freq; 19 | double new_ns; 20 | 21 | bool not_lost_fixed = true; 22 | int rcount = 0; 23 | 24 | 25 | while (rcount < reps) { 26 | n_sel = ne * m; 27 | n_nonsel = ne * (1 - m); 28 | 29 | fixed_freq = 0; 30 | vector traj(generations+1,0); 31 | traj[0] += m; 32 | 33 | for (int g = 1; g <= generations; g++) { //// not sure about start and end 34 | ns_fit = n_sel * (1 + s); 35 | nns_fit = n_nonsel; 36 | ns_freq = ns_fit / (ns_fit + nns_fit); 37 | 38 | if ( ns_freq == 0.0) { 39 | not_lost_fixed = false; 40 | break; 41 | } 42 | 43 | binomial_distribution<> repopulate(ne, ns_freq); 44 | new_ns = repopulate(rand_gen); 45 | traj[g] = new_ns/ne; 46 | 47 | n_sel = new_ns; 48 | n_nonsel = ne - new_ns; 49 | } 50 | 51 | if (not_lost_fixed = true) { 52 | for (int f = 0; f < traj.size(); f++) { 53 | traj_sum[f] += traj[f]; 54 | } 55 | rcount++; 56 | } 57 | not_lost_fixed = true; 58 | } 59 | 60 | for (int i = 0; i < traj_sum.size(); i++) { 61 | trajectory.push_back(traj_sum[i]/reps); 62 | } 63 | 64 | } 65 | 66 | #endif -------------------------------------------------------------------------------- /src/cmd_line.h: -------------------------------------------------------------------------------- 1 | #ifndef __CMD_LINE_H 2 | #define __CMD_LINE_H 3 | 4 | /// command line information and global parameters 5 | class cmd_line { 6 | public: 7 | 8 | /// terms to bound the optimization 9 | double t_max ; 10 | double t_min ; 11 | 12 | /// to bound proportion search 13 | double p_max ; 14 | double p_min ; 15 | 16 | /// to create intial simplex points 17 | double t_length ; 18 | double p_length ; 19 | 20 | /// number of restarts 21 | int n_restarts ; 22 | 23 | /// proportion ancestry for 0-n must sum to 1 24 | /// these are therefore the final ancestry proportion, not necessary the proportion that fluxed if additional pulses occured closer to the present 25 | vector ancestry_proportion ; 26 | 27 | /// store relevant ancestry information 28 | vector ancestry_pulses ; 29 | 30 | /// diploid effective population size ( i.e. 2n ) 31 | double ne ; 32 | 33 | /// tolerance for parameter search 34 | double tolerance ; 35 | 36 | /// minimum recombinational distance between markers 37 | double minimum_distance ; 38 | 39 | /// error rates for reads (if read based) or genotypes (if genotype based) 40 | double error_rate ; 41 | 42 | /// bool sample is expressed as genotypes, not read counts 43 | bool genotype ; 44 | 45 | /// ancestral genotype frequencies are fixed 46 | bool ancestral_fixed ; 47 | 48 | /// viterbi output 49 | /// caution: not recommended for more samples of ploidy > 1 50 | bool viterbi ; 51 | 52 | /// number of digits of precision to include 53 | int precision ; 54 | 55 | /// output actual pulses rather than ancestry states 56 | bool output_pulses ; 57 | 58 | /// error rates specifed 59 | bool error_rates ; 60 | 61 | /// input file name 62 | string input_file ; 63 | 64 | /// sample file 65 | string sample_file ; 66 | 67 | /// bootstrap 68 | int n_bootstraps ; 69 | int block_size ; 70 | 71 | /// read relevant information 72 | void read_cmd_line ( int argc, char *argv[] ) ; 73 | 74 | } ; 75 | 76 | #endif 77 | 78 | -------------------------------------------------------------------------------- /scripts/readme.md: -------------------------------------------------------------------------------- 1 | # This is a simple updated version of our vcf2ahmm script to take an input vcf file and output a file compatible ancestry_hmm. 2 | 3 | ## Disclaimer: 4 | 5 | This script is provided as a simple utility to convert vcf files to ahmm input. I do not expect that it will be suitable for all possible use cases. Please read this carefully and feel free to post questions/requests as needed. See all Assumptions below. 6 | 7 | ## Basic Usage: 8 | 9 | The script takes the following required options. 10 | 11 | python3 vcf2ahmm.py -v [vcf_file] -s [sample2population mappings] > [ahmm_input_file] 12 | 13 | Where the sample2population mapping file is a text-based table with two columns. 14 | 15 | 1. sample id exactly as it appears in the vcf file 16 | 2. the population to which that individual belongs. This can be either one of the ancestral populations indicated with an integer (0,1,2..k), or the sample is admixed in which case the population must read "admixed" 17 | 18 | e.g. 19 | 20 | sample1 0 \ 21 | sample2 0 \ 22 | sample3 1 \ 23 | sample4 1 \ 24 | sample5 admixed 25 | 26 | This file has two individuals from ancestral population 0, two individuals from ancestral population 1, and a single admixed sample. 27 | 28 | Note that ancestry_hmm will still require a sample input file indicating the ploidy of each admixed sample. 29 | 30 | ## Assumptions: 31 | 32 | 1. Sample are either diploid or haploid. The allele depth field (AD), immediately follows the genotype (GT) in the vcf format. This is typical of most VCF files. 33 | 2. A single uniform per basepair, per generation recombination rate should be used. Default: 1e-8. 34 | 35 | ## Optional arguments: 36 | 37 | 1. "-g 1" indicates that admixed sample genotypes (rather than allele counts) should be used 38 | 2. "-r [float]" sets the per site generation rate 39 | 3. "-m [int]" minimum distance in basepairs between successful SNPs to be included in this analysis 40 | 4. "-o [string]" file to print admixed sample ploidy for ahmm input. Default is ploidy.txt 41 | 5. "--min_total [int]" minimum number of samples in each ancestral population to consider a site. Default 10. 42 | 6. "--min_diff [float]" minimum allele frequency difference between any pair of ancestral populations to include a site. I.e., this selects AIMs. Default 0.1. 43 | -------------------------------------------------------------------------------- /src/read_samples.h: -------------------------------------------------------------------------------- 1 | #ifndef __READ_SAMPLES_H 2 | #define __READ_SAMPLES_H 3 | 4 | void read_samples( vector &markov_chain_information, string &input_file, bool viterbi ) { 5 | 6 | ifstream in ( input_file.c_str() ) ; 7 | while ( !in.eof() ) { 8 | markov_chain new_sample ; 9 | in >> new_sample.output_file ; 10 | 11 | if ( new_sample.output_file == "" ) { 12 | continue ; 13 | } 14 | 15 | if ( viterbi == false ) { 16 | new_sample.output_file.append( ".posterior" ) ; 17 | } 18 | else { 19 | new_sample.output_file.append( ".viterbi" ) ; 20 | } 21 | 22 | in >> new_sample.number_chromosomes ; 23 | 24 | new_sample.path_file = "null" ; 25 | 26 | /// if ploidy path is present, store file and read later 27 | if ( new_sample.number_chromosomes < 0 ) { 28 | in >> new_sample.path_file ; 29 | } 30 | 31 | /// store new samples 32 | markov_chain_information.push_back( new_sample ) ; 33 | } 34 | in.close() ; 35 | 36 | /// read or generate ploidy pahts 37 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 38 | if ( markov_chain_information[m].path_file != "null" ) { 39 | read_ploidy_file( markov_chain_information[m].path_file, markov_chain_information[m].sample_ploidy_path ) ; 40 | markov_chain_information[m].ploidy_switch_position.push_back( 0 ) ; 41 | markov_chain_information[m].ploidy_switch.push_back( markov_chain_information[m].sample_ploidy_path[0].ploidy ) ; 42 | } 43 | else { 44 | markov_chain_information[m].ploidy_switch_position.push_back( 0 ) ; 45 | markov_chain_information[m].ploidy_switch.push_back( markov_chain_information[m].number_chromosomes ) ; 46 | ploidy_entry new_entry ; 47 | new_entry.ploidy = markov_chain_information[m].number_chromosomes ; 48 | markov_chain_information[m].sample_ploidy_path.push_back( new_entry ) ; 49 | } 50 | } 51 | 52 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 53 | markov_chain_information[m].end_prob = 1 ; 54 | markov_chain_information[m].start_prob = 1 ; 55 | } 56 | 57 | return ; 58 | } 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/print_usage.h: -------------------------------------------------------------------------------- 1 | #ifndef __PRINT_USAGE_H 2 | #define __PRINT_USAGE_H 3 | 4 | void print_usage() { 5 | 6 | cerr << endl << endl << "ancestry_hmm usage:" << endl << endl ; 7 | cerr << "\trequired:" << endl ; 8 | cerr << "\t\t-i [string]\t\tinput file name" << endl ; 9 | cerr << "\t\t-s [string]\t\tsample id and ploidy file" << endl ; 10 | cerr << "\t\t-a [int] [float] [float] ..." << endl ; 11 | cerr << "\t\t\tnumber of ancestral populations and ancestry proportion attributable to each" << endl ; 12 | cerr << "\t\t-p [int] [int] [float]" << endl ; 13 | cerr << "\t\t\tancestry pulse with format, ancestral population, time," << endl ; 14 | cerr << "\t\t\tand proportion of final ancestry from this pulse" << endl ; 15 | cerr << "\t\t\tnegative time or proportions indicate that parameters are to be estimated" << endl << endl ; 16 | 17 | cerr << "\toptional:" << endl ; 18 | cerr << "\t\t--help\t\t\tprint this help statement" << endl ; 19 | cerr << "\t\t--ne [int]\t\teffective population size of the admixed population" << endl ; 20 | cerr << "\t\t-g\t\t\tsamples are specified with genotypes rather than read counts" << endl ; 21 | cerr << "\t\t--precision [int]\tmodify float and double precision to int" << endl ; 22 | cerr << "\t\t-v\t\t\tviterbi decoding" << endl ; 23 | cerr << "\t\t-b [int] [int]\t\tnumber of bootstraps and bootstrap block size in number of SNPs" << endl ; 24 | cerr << "\t\t--tmax [int]\t\tmaximum time of an admixture pulse" << endl ; 25 | cerr << "\t\t--tmin [int]\t\tminimum time of an admixture pulse" << endl ; 26 | cerr << "\t\t--tolerance [float]\tdistance in lnL units to just convergence" << endl ; 27 | cerr << "\t\t-e [float]\t\terror rates" << endl ; 28 | cerr << "\t\t-E\t\t\tsite specific error rates are included" << endl ; 29 | cerr << "\t\t--fix\t\t\tancestral allele frequencies are certain" << endl << endl ; 30 | 31 | cerr << "\toptional and relevant only for multiple pulse models:" << endl ; 32 | cerr << "\t\t--output-ancestry\toutput ancestry posteriors rather than pulses" << endl ; 33 | cerr << "\t\t-r [int]\t\tnumber of random restarts during nelder-mead optimization" << endl ; 34 | cerr << "\t\t--pmax [int]\t\tmaximum proportion ancestry in an admixture pulse" << endl ; 35 | cerr << "\t\t--pmin [int]\t\tminimum proportion ancestry in an admixture pulse" << endl ; 36 | 37 | } 38 | 39 | #endif 40 | 41 | -------------------------------------------------------------------------------- /src/multichoose.h: -------------------------------------------------------------------------------- 1 | #ifndef __MULTICHOOSE_H 2 | #define __MULTICHOOSE_H 3 | 4 | /* 5 | 6 | multichoose.h -- n multichoose k for generic vectors 7 | 8 | author: Erik Garrison 9 | last revised: 2010-04-16 10 | 11 | Copyright (c) 2010 by Erik Garrison 12 | 13 | Permission is hereby granted, free of charge, to any person 14 | obtaining a copy of this software and associated documentation 15 | files (the "Software"), to deal in the Software without 16 | restriction, including without limitation the rights to use, 17 | copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | copies of the Software, and to permit persons to whom the 19 | Software is furnished to do so, subject to the following 20 | conditions: 21 | 22 | The above copyright notice and this permission notice shall be 23 | included in all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 27 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 29 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 30 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 31 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 32 | OTHER DEALINGS IN THE SOFTWARE. 33 | 34 | */ 35 | 36 | 37 | // provides multiset combinations out of the std::vector of objects 38 | template 39 | std::vector< std::vector > multichoose(int k, std::vector& objects) { 40 | 41 | std::vector< std::vector > choices; 42 | 43 | int j,j_1,q,r; 44 | 45 | r = objects.size() - 1; 46 | 47 | // combination indexes 48 | std::vector a, b; 49 | 50 | for (int i=0;i multiset; 57 | for(int i=0;i &path ) { 13 | 14 | ifstream in ( path_file.c_str() ) ; 15 | while ( !in.eof() ) { 16 | 17 | ploidy_entry new_ploidy_entry ; 18 | 19 | in >> new_ploidy_entry.chrom >> new_ploidy_entry.start >> new_ploidy_entry.stop >> new_ploidy_entry.ploidy ; 20 | 21 | if ( new_ploidy_entry.chrom == "" ) { 22 | continue ; 23 | } 24 | 25 | path.push_back( new_ploidy_entry ) ; 26 | } 27 | } 28 | 29 | /// create the interploidy transition matrix for each model 30 | vector create_interploidy_transitions ( map > > &state_list, vector &vertex, vector &ancestry_proportion ) { 31 | 32 | // need to compute what proportion of final is this ancestry type 33 | vector a = ancestry_proportion ; 34 | for ( int p = 0 ; p < vertex.size() ; p ++ ) { 35 | vertex[p].proportion = a[vertex[p].type] * vertex[p].fraction_of_remainder ; 36 | a[vertex[p].type] -= vertex[p].proportion ; 37 | } 38 | 39 | /// create interploidy transitions from ploidy one to ploidy two 40 | vector interploidy_transition_rates ; 41 | interploidy_transition_rates.resize(2) ; 42 | interploidy_transition_rates[0].zeros( state_list[2].size(), state_list[1].size() ) ; 43 | interploidy_transition_rates[1].zeros( state_list[1].size(), state_list[2].size() ) ; 44 | 45 | //// iterate through all possible transitions into interploidy states 46 | for ( int i = 0 ; i < state_list[1].size() ; i ++ ) { 47 | for ( int j = 0 ; j < state_list[2].size() ; j ++ ) { 48 | for ( int s = 0 ; s < state_list[1][i].size() ; s ++ ) { 49 | if ( state_list[2][j][s] - state_list[1][i][s] == 1 ) { 50 | interploidy_transition_rates[0]( j, i ) = vertex[s].proportion ; 51 | if ( state_list[2][j][s] == 2 ) { 52 | interploidy_transition_rates[1]( i, j ) = 1 ; 53 | } 54 | else { 55 | interploidy_transition_rates[1]( i, j ) = 0.5 ; 56 | } 57 | } 58 | } 59 | } 60 | } 61 | 62 | return interploidy_transition_rates ; 63 | } 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/compute_forward.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMPUTE_FORWARD_H 2 | #define __COMPUTE_FORWARD_H 3 | 4 | double markov_chain::compute_forward_probabilities( map > &transition_probabilites, vector &interploidy_transitions ) { 5 | 6 | /// return log likelihood which is sum of cts 7 | double lnl = 0 ; 8 | 9 | /// clear the fw probs matrix 10 | alphas.resize( transition_probabilites[ploidy_switch[0]].size() ) ; 11 | 12 | /// ploidy index to tract where in path we are 13 | int ploidy_index = 0 ; 14 | 15 | //// set all values to zero, but mostly just reize 16 | alphas[0].resize( transition_probabilites[ploidy_switch[0]][1].n_cols ) ; 17 | 18 | /// get initial state set 19 | alphas[0] = emission_probabilities[0] * start_prob ; 20 | lnl += normalize( alphas[0] ) ; 21 | 22 | /// do all other sites 23 | for ( int i = 1 ; i < emission_probabilities.size() ; i ++ ) { 24 | 25 | /// if we're at or past the next switch position 26 | bool ploidy_change = false ; 27 | if ( i >= ploidy_switch_position[ploidy_index+1] ) { 28 | ploidy_index ++ ; 29 | if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index-1] ) { 30 | ploidy_change = true ; 31 | } 32 | } 33 | 34 | /// resize matrix 35 | alphas[i].resize( transition_probabilites[ploidy_switch[ploidy_index]][1].n_cols ) ; 36 | 37 | /// requires slightly different math if we are transitioning in ploidy between two adjacent sites 38 | if ( ploidy_change == true ) { 39 | 40 | /// transitions across a chromosome boundary will have low self-self rates 41 | if ( transition_probabilites[ploidy_switch[ploidy_index]][i](0,0) < 0.75 ) { 42 | alphas[i].fill( 1 ) ; 43 | } 44 | 45 | //// otherwise, this is a transition across ploidy types on the same chromosome use the interploidy transition rates 46 | else { 47 | alphas[i] = interploidy_transitions[ploidy_switch[ploidy_index-1]-1] * alphas[i-1] % emission_probabilities[i] ; 48 | } 49 | } 50 | 51 | /// otehrwise business as ususal 52 | else { 53 | alphas[i] = transition_probabilites[ploidy_switch[ploidy_index]][i] * alphas[i-1] % emission_probabilities[i] ; 54 | } 55 | 56 | /// normalize and updated likelihood 57 | lnl += normalize( alphas[i] ) ; 58 | 59 | } 60 | 61 | return lnl ; 62 | } 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /src/selection_split_vector.h: -------------------------------------------------------------------------------- 1 | #ifndef __SPLIT_VECTOR_H 2 | #define __SPLIT_VECTOR_H 3 | 4 | /// splits vector (of recombination rates) into two vectors at the selected site. 5 | /// the back vector is generated in reverse order 6 | 7 | // min function because of namespace collision between std and arma 8 | int int_min(int a, int b){ 9 | int minout; 10 | if (a < b) { 11 | minout = a; 12 | } 13 | else { 14 | minout = b; 15 | } 16 | return minout; 17 | } 18 | 19 | // splits a vector of chromosomal positions into two vectors going away from a focal site. 20 | // one vector is reversed going back to 0 21 | // also trims the hmm window used 22 | vector > split_vector(int sel_site, vector &whole_vec, cmd_line &options) 23 | { 24 | vector > split_vecs; 25 | vector fwd_vec; 26 | vector back_vec; 27 | 28 | // trim vector if size is specified in morgans 29 | if (options.win_unit == "m") { 30 | double sum_morgans_fwd; 31 | double sum_morgans_back; 32 | for (int i = sel_site; i < whole_vec.size(); i++) { 33 | sum_morgans_fwd += whole_vec[i]; 34 | if (sum_morgans_fwd <= options.win_morgan) { 35 | fwd_vec.push_back(whole_vec[i]) ; 36 | } 37 | else { 38 | break; 39 | } 40 | } 41 | for (int i = sel_site; i > 0; i--) { 42 | sum_morgans_back += whole_vec[i]; 43 | if (sum_morgans_back <= options.win_morgan) { 44 | back_vec.push_back(whole_vec[i]) ; 45 | } 46 | else { 47 | break; 48 | } 49 | } 50 | } 51 | 52 | // trim vector if size is specified in percent 53 | else if (options.win_unit == "p") { 54 | int percent_size = whole_vec.size() * (options.win_percent/100); 55 | int trim_size_fwd = int_min((whole_vec.size() - sel_site) , percent_size); // Check for off by 1 error 56 | int trim_size_back = int_min(sel_site, percent_size); 57 | 58 | for (int i = sel_site; i < (sel_site + trim_size_fwd); i++) { 59 | fwd_vec.push_back(whole_vec[i]) ; 60 | } 61 | 62 | for (int i = sel_site; i > (sel_site - trim_size_back); i--) { 63 | back_vec.push_back(whole_vec[i]) ; 64 | } 65 | 66 | } 67 | 68 | split_vecs.push_back(fwd_vec); 69 | split_vecs.push_back(back_vec); 70 | return split_vecs; 71 | //cout << "Split vector lengths: " << fwd_vec.size() << ", " << back_vec.size() << endl; 72 | } 73 | 74 | 75 | 76 | #endif -------------------------------------------------------------------------------- /src/compute_backward.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMPUTE_BACKWARD_H 2 | #define __COMPUTE_BACKWARD_H 3 | 4 | /// backward probabilities are same except transpose transition matrix 5 | void markov_chain::compute_backward_probabilities( map > &transition_probabilites, vector &interploidy_transitions ) { 6 | 7 | // resize betas matrix 8 | betas.resize(transition_probabilites[number_chromosomes].size()) ; 9 | 10 | /// index to tract position in ploidy path will iterate through backwards for backward probs 11 | int ploidy_index = ploidy_switch_position.size() - 2 ; 12 | 13 | /// set last states to one 14 | betas.back().resize(alphas.back().size()) ; 15 | 16 | /// start with last position and multiply 17 | betas.back() = emission_probabilities.back() * end_prob ; 18 | normalize(betas.back()) ; 19 | 20 | /// now go from t-1 to 0 to iterate through backwards 21 | for ( int i = emission_probabilities.size()-2 ; i > -1 ; i -- ) { 22 | 23 | /// if we're at or before the last switch position 24 | bool ploidy_change = false ; 25 | if ( i < ploidy_switch_position[ploidy_index] ) { 26 | ploidy_index -- ; 27 | if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index+1] ) { 28 | ploidy_change = true ; 29 | } 30 | } 31 | 32 | /// resize vector 33 | betas[i].zeros( transition_probabilites[ploidy_switch[ploidy_index]][1].n_cols ) ; 34 | 35 | //// if there was a transition in ploidy 36 | if ( ploidy_change == true ) { 37 | 38 | /// if self transition is this low, we switched across a chromosome boundary or marker densities are so low that LAI is pointless, add a check for this 39 | if ( transition_probabilites[ploidy_switch[ploidy_index]][i+1](0,0) < 0.75 ) { 40 | /// new chromosome just starts with flat probs, will normalize to one 41 | betas[i].fill( 1 ) ; 42 | } 43 | //// otherwise, this is a transition across ploidy types on the same chromosome use the interploidy transition rates 44 | else { 45 | betas[i] = interploidy_transitions[ploidy_switch[ploidy_index]-1].t() * betas[i+1] % emission_probabilities[i] ; 46 | } 47 | } 48 | 49 | /// multiply matrices to produce latest betas 50 | else { 51 | betas[i] = transition_probabilites[ploidy_switch[ploidy_index]][i+1].t() * betas[i+1] % emission_probabilities[i] ; 52 | } 53 | 54 | /// normalize vector to prevent underflow issues 55 | normalize(betas[i]) ; 56 | } 57 | } 58 | 59 | #endif 60 | 61 | -------------------------------------------------------------------------------- /src/selection_markov_chain.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_MARKOV_CHAIN_H 2 | #define __SELECTION_MARKOV_CHAIN_H 3 | 4 | /// will include all basic information for input data and functions to compute forward, backward, forward-backward, and viterbi 5 | class markov_chain { 6 | public: 7 | 8 | /// sample attributes 9 | string output_file ; 10 | double number_chromosomes ; 11 | 12 | /// file describing ploidy path across the genome 13 | string path_file ; 14 | 15 | /// data object storing ploidy paths to be looked up during emissions computation with chromosome as key 16 | vector sample_ploidy_path ; 17 | vector ploidy_switch_position ; 18 | vector ploidy_switch ; 19 | 20 | /// read from input file 21 | vector emission_probabilities ; 22 | 23 | /// create initial states to be stored 24 | double start_prob ; 25 | double end_prob ; 26 | 27 | /// forward probs 28 | vector alphas ; 29 | double compute_forward_probabilities( map > &transition_matrix, vector &interploidy_transitions ) ; 30 | 31 | 32 | 33 | /// For selection 34 | //vector genotype_freqs; 35 | 36 | double selection_forward_probabilities( map > &transition_probabilites, vector &interploidy_transitions, selection &point, bool go_downstream ) ; 37 | 38 | void selection_forward_loop_reverse( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) ; 39 | 40 | void selection_forward_loop( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) ; 41 | 42 | double selection_forward_probabilities_genotypes( map > &transition_probabilites, vector &interploidy_transitions, selection &point, bool go_downstream, vector &genofreq , vector &position) ; 43 | 44 | 45 | 46 | 47 | /// backward probs 48 | vector betas ; 49 | void compute_backward_probabilities( map > &transition_matrix, vector &interploidy_transitions ) ; 50 | 51 | /// combine probs 52 | void combine_prob( vector &position, map > > &states, vector &chrom, bool output_pulses, vector &pulses ) ; 53 | 54 | /// output viterbi paths 55 | void viterbi( vector &position, vector &recombination_rate, map > > &states, vector &chrom, map > &transition_probabilites, vector &interploidy_transitions, bool output_pulses, vector &pulses ) ; 56 | 57 | } ; 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /src/selection_trajectory.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_TRAJECTORY_H 2 | #define __SELECTION_TRAJECTORY_H 3 | 4 | 5 | // generates vector with allele frequency of selected allele over time/generations 6 | void selection_trajectory(vector &freq, double s, int tt, double m, int generations, int n) 7 | { 8 | // returns flat vector if selection is 0 (ie, no change in ellele frequency over time) 9 | if ( s == 0) { 10 | freq.assign(generations,m); 11 | return; 12 | } 13 | 14 | int t0 = 0; 15 | int t0min = 0; 16 | bool found = false; 17 | double f; 18 | 19 | // loops over generations until initial frequency (m) is reached + number of generations (gen) has passed 20 | while (found == false) { 21 | f = 1 / (1 + 2 * n * s * exp(-s*t0)); 22 | if (f > m) { 23 | if (tt == 0) { 24 | tt = t0; 25 | } 26 | else if (t0 == (tt + generations)) { 27 | found = true; 28 | } 29 | freq.push_back(f); 30 | } 31 | t0++; 32 | } 33 | 34 | } 35 | 36 | // checks if selective coeffient causes site to go to fixation in the time since introgression 37 | bool selection_reaches_fixation(double s, double m, int generations, int n) 38 | { 39 | double max_freq = 0.99; 40 | int tt = 0; 41 | int t0 = 0; 42 | int t0min = 0; 43 | bool found = false; 44 | double f; 45 | 46 | s *= 0.5; 47 | 48 | // loops over generations until initial frequency (m) is reached + number of generations (gen) has passed 49 | while (found == false) { 50 | f = 1 / (1 + 2 * n * s * exp(-s*t0)); 51 | if (f > max_freq) { 52 | cerr << "Frequency " << f << " generation " << t0-tt << " selection " << s < m) { 57 | if (tt == 0) { 58 | tt = t0; 59 | } 60 | else if (t0 == (tt + generations)) { 61 | break; 62 | } 63 | } 64 | t0++; 65 | } 66 | return found; 67 | } 68 | 69 | // returns selection coeffient that reaches 0.99 in the time since introgression 70 | // used to speed up calculations and prevent division errors 71 | double selection_get_max_sel(double min_s, double max_s, double step_s, double m, int generations, int n) 72 | { 73 | cerr << "selection_get_max_sel " << min_s << " " << max_s << " " << step_s << " " << m << " " << generations << " " << n << endl; 74 | double last_s = 0; 75 | for (double s = min_s; s <= max_s; s += step_s) { 76 | if (selection_reaches_fixation(s, m, generations, n) == true) { 77 | return last_s; 78 | } 79 | last_s = s; 80 | } 81 | return max_s; 82 | } 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /src/distribute_alleles.h: -------------------------------------------------------------------------------- 1 | #ifndef __DISTRIBUTE_ALLELES_H 2 | #define __DISTRIBUTE_ALLELES_H 3 | 4 | /// solution to computing all ways of distributing A reads among vector of read counts 5 | /// using recursion to generate all possible lists of outcomes stored in nA 6 | void compute_allele_counts( vector > &nA, vector nA_iteration, vector &read_counts, double A, double t, int pop ) { 7 | 8 | int min = 0 ; 9 | if ( t - read_counts[pop] < A ) { 10 | min = A - ( t - read_counts[pop] ) ; 11 | } 12 | int max = read_counts[pop] ; 13 | if ( A < read_counts[pop] ) { 14 | max = A ; 15 | } 16 | 17 | for ( int c = min ; c <= max ; c ++ ) { 18 | nA_iteration.push_back(c) ; 19 | if ( pop == read_counts.size() - 1 ) { 20 | nA.push_back( nA_iteration ) ; 21 | } 22 | else { 23 | compute_allele_counts( nA, nA_iteration, read_counts, A - c, t - read_counts[pop], pop + 1 ) ; 24 | } 25 | nA_iteration.pop_back() ; 26 | } 27 | } 28 | 29 | /// distribute alleles (if genotype space) across all possible states 30 | /// read counts is the number of chromosomes from each pulse 31 | void distribute_alleles ( vector &read_counts, double &A, double &read_total, map , double > &A_counts ) { 32 | 33 | vector nA_iteration ; 34 | vector > nA ; 35 | compute_allele_counts( nA, nA_iteration, read_counts, A, read_total, 0 ) ; 36 | 37 | /// now compute ways to get to those arrangements 38 | double total = 0 ; 39 | for ( int n = 0 ; n < nA.size() ; n ++ ) { 40 | double sum = 1 ; 41 | for ( int l = 0 ; l < nA[n].size() ; l ++ ) { 42 | sum *= nCk[read_counts[l]][nA[n][l]] ; 43 | } 44 | A_counts[nA[n]] = sum ; 45 | total += sum ; 46 | } 47 | } 48 | 49 | /// distribute reads (if pileup space) across all possible states 50 | void distribute_reads ( vector &state, double &read_total, vector > &read_counts ) { 51 | 52 | /// list of all possible states to be drawn from where we ignore states with zero chromosomes in that state 53 | vector states ; 54 | for ( int i = 0 ; i < state.size() ; i ++ ) { 55 | if ( state[i] != 0 ) { 56 | states.push_back( i ) ; 57 | } 58 | } 59 | 60 | /// now get all possible arrangements of reads and store them in our list 61 | vector< vector > results = multichoose( read_total, states ) ; 62 | for ( int i = 0 ; i < results.size() ; i ++ ) { 63 | vector state_vector( state.size(), 0 ) ; 64 | for ( int j = 0 ; j < results[i].size() ; j ++ ) { 65 | state_vector.at(results[i][j]) ++ ; 66 | } 67 | read_counts.push_back( state_vector ) ; 68 | } 69 | } 70 | 71 | #endif 72 | 73 | -------------------------------------------------------------------------------- /src/inbred.h: -------------------------------------------------------------------------------- 1 | #ifndef __INBRED_H 2 | #define __INBRED_H 3 | 4 | void fix_ibd_transitions( vector, double > > > &transition_matrix_information, vector > &states, vector &ancestry_pulses, double inbreeding_transition_rate ) { 5 | 6 | /// number of ancestry pulses 7 | int number_pulses = ancestry_pulses.size() ; 8 | 9 | /// this is the first pulse 10 | for ( int i = 0 ; i < transition_matrix_information.size() ; i ++ ) { 11 | /// this is the pulse we're transitioning into 12 | for ( int j = 0 ; j < transition_matrix_information[i].size() ; j ++ ) { 13 | 14 | /// skip situations where we are comparing two outbred states 15 | if ( i < states.size() - number_pulses && j < states.size() - number_pulses ) { 16 | continue ; 17 | } 18 | 19 | /// skip situtions where we compare two inbred states 20 | if ( i >= states.size() - number_pulses && j >= states.size() - number_pulses ) { 21 | continue ; 22 | } 23 | 24 | /// okay then we need to fix it for ibd comparisons 25 | /// clear the existing transisions, they are wrong 26 | transition_matrix_information[i][j].clear() ; 27 | 28 | //// 29 | // ask how many are identical in state in inbred versus outbred? if 0, 30 | // then there are no ways to transitions between the states, we can record the 31 | // proportion of possible donor chromsomes and multiply by the rate of IBD 32 | vector sum ( number_pulses, 0 ) ; 33 | for ( int p = 0 ; p < states[i].size() ; p ++ ) { 34 | sum[0] += states[i][p] ; 35 | sum[1] += states[j][p] ; 36 | } 37 | 38 | /// find and store valid transitions 39 | /// must have one chromsome of same type as ibd state 40 | /// then prob is related to the proportion of chromosomes of the same type 41 | for ( int p = 0 ; p < states[i].size() ; p ++ ) { 42 | if ( states[i][p] > 0 && states[j][p] > 0 ) { 43 | vector new_transition (1) ; 44 | new_transition[0].start_state = p ; 45 | new_transition[0].end_state = p ; 46 | new_transition[0].transition_count = 1 ; 47 | new_transition[0].ibd_transition = true ; 48 | double prob_chrom = (double)states[j][p]/(double)sum[1]* (double)states[i][p]/(double)sum[0] ; 49 | transition_matrix_information[i][j][new_transition] = inbreeding_transition_rate * prob_chrom ; 50 | } 51 | } 52 | } 53 | } 54 | } 55 | 56 | #endif 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/selection_cmd_line.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_CMD_LINE_H 2 | #define __SELECTION_CMD_LINE_H 3 | 4 | /// command line information and global parameters 5 | class cmd_line { 6 | public: 7 | 8 | /// terms to bound the optimization 9 | double t_max ; 10 | double t_min ; 11 | 12 | /// to bound proportion search 13 | double p_max ; 14 | double p_min ; 15 | 16 | /// to create intial simplex points 17 | double t_length ; 18 | double p_length ; 19 | 20 | /// number of restarts 21 | int n_restarts ; 22 | 23 | /// proportion ancestry for 0-n must sum to 1 24 | /// these are therefore the final ancestry proportion, not necessary the proportion that fluxed if additional pulses occured closer to the present 25 | vector ancestry_proportion ; 26 | 27 | /// store relevant ancestry information 28 | vector ancestry_pulses ; 29 | 30 | /// diploid effective population size ( i.e. 2n ) 31 | double ne ; 32 | 33 | /// tolerance for parameter search 34 | double tolerance ; 35 | 36 | /// minimum recombinational distance between markers 37 | double minimum_distance ; 38 | 39 | /// error rates for reads (if read based) or genotypes (if genotype based) 40 | double error_rate ; 41 | 42 | /// bool sample is expressed as genotypes, not read counts 43 | bool genotype ; 44 | 45 | /// ancestral genotype frequencies are fixed 46 | bool ancestral_fixed ; 47 | 48 | /// viterbi output 49 | /// caution: not recommended for more samples of ploidy > 1 50 | bool viterbi ; 51 | 52 | /// number of digits of precision to include 53 | int precision ; 54 | 55 | /// output actual pulses rather than ancestry states 56 | bool output_pulses ; 57 | 58 | /// error rates specifed 59 | bool error_rates ; 60 | 61 | /// input file name 62 | string input_file ; 63 | 64 | /// sample file 65 | string sample_file ; 66 | 67 | /// bootstrap 68 | int n_bootstraps ; 69 | int block_size ; 70 | 71 | /// +=+=+=+=+=+=+ selection +=+=+=+=+=+=+ 72 | 73 | 74 | bool is_limit; // limits to only one chromosome 75 | string limit_chr ; // specifies which chromosome 76 | int limit_win_start ; // window to analyse start 77 | int limit_win_end ; // window to analyze end 78 | 79 | // grid search 80 | bool calc_grid; // set grid search 81 | int grid_pstart; // position start 82 | int grid_pstop; 83 | int grid_pstep; 84 | double grid_sstart; // selection coeffient start 85 | double grid_sstop; 86 | double grid_sstep; 87 | 88 | // single site test 89 | bool test_point; 90 | int test_pos; 91 | double test_sel; 92 | 93 | // golden section search 94 | bool run_gss; // set gss search 95 | int gs_pstart; 96 | int gs_pstop; 97 | int gs_pstep; 98 | double gs_sstart; 99 | double gs_sstop; 100 | double gs_sstep; 101 | int gs_max_iterations; 102 | double gs_precision; 103 | 104 | bool is_coord; // use chromosome coordinates 105 | 106 | bool limit_sel_space; // use full search space for s 107 | 108 | int traj_function = 0; // set trajectory function 109 | 110 | // HMM chain window size 111 | string win_unit; 112 | double win_morgan; 113 | double win_percent; 114 | // int win_bp; 115 | 116 | // stochastic trajectory 117 | bool use_stochastic; // use stochastic trajectory function 118 | int stochastic_reps; // how many repeats 119 | 120 | /// read relevant information 121 | void read_cmd_line ( int argc, char *argv[] ) ; 122 | 123 | } ; 124 | 125 | #endif 126 | 127 | -------------------------------------------------------------------------------- /src/create_pulses.h: -------------------------------------------------------------------------------- 1 | #ifndef __CREATE_PULSES_H 2 | #define __CREATE_PULSES_H 3 | 4 | /// create pulses and create data points for optimization function 5 | int create_pulses ( vector > &vertices, cmd_line &options ) { 6 | 7 | //// count parameters to estimate and figure out which are dependent on each other 8 | int nparams = 0 ; 9 | 10 | /// count dependent pulses 11 | map dependent_pulses ; 12 | 13 | /// figure out how many ancestry types pulse more than once 14 | /// record as pulse and count of pulse 15 | vector proportion_accounted ( options.ancestry_proportion.size(), 0 ) ; 16 | for ( int p = options.ancestry_pulses.size() -1 ; p > -1 ; p-- ) { 17 | proportion_accounted[options.ancestry_pulses[p].type] += options.ancestry_pulses[p].proportion ; 18 | if ( options.ancestry_pulses[p].proportion_fixed == false ) { 19 | if ( dependent_pulses.find( options.ancestry_pulses[p].type ) == dependent_pulses.end() ) { 20 | dependent_pulses[ options.ancestry_pulses[p].type ] = 1 ; 21 | options.ancestry_pulses[p].fraction_of_remainder = 1 ; 22 | options.ancestry_pulses[p].proportion_fixed = true ; 23 | } 24 | else { 25 | options.ancestry_pulses[p].fraction_of_remainder = options.ancestry_pulses[p].proportion/proportion_accounted[options.ancestry_pulses[p].type] ; 26 | dependent_pulses[ options.ancestry_pulses[p].type ] ++ ; 27 | nparams ++ ; /// only have to estimate non-fixed second - n of an ancestry type 28 | } 29 | } 30 | else { 31 | options.ancestry_pulses[p].fraction_of_remainder = options.ancestry_pulses[p].proportion / proportion_accounted[options.ancestry_pulses[p].type] ; 32 | } 33 | if ( options.ancestry_pulses[p].time_fixed == false ) { 34 | nparams ++ ; 35 | } 36 | } 37 | 38 | /// if there are model parameters to estimate, create vertices for the starting simplex 39 | if ( nparams > 0 ) { 40 | 41 | /// create the starting point x0 42 | vector x0 = options.ancestry_pulses ; 43 | vertices.push_back( x0 ) ; 44 | 45 | /// need nparm + 1 points in simplex 46 | for ( int n = 0 ; n < nparams ; n ++ ) { 47 | int param = 0 ; 48 | vector vertex = x0 ; 49 | for ( int p = 0 ; p < options.ancestry_pulses.size() ; p ++ ) { 50 | if ( options.ancestry_pulses[p].time_fixed == false ) { 51 | if ( param == n ) { 52 | if ( options.t_max - vertex[p].time > vertex[p].time - options.t_min ) { 53 | vertex[p].time /= ( 1 - options.t_length ) ; 54 | } 55 | else { 56 | vertex[p].time *= ( 1 - options.t_length ) ; 57 | } 58 | } 59 | param ++ ; 60 | } 61 | if ( options.ancestry_pulses[p].proportion_fixed == false ) { 62 | if ( param == n ) { 63 | if ( options.p_max - vertex[p].fraction_of_remainder > vertex[p].fraction_of_remainder - options.p_min ) { 64 | vertex[p].fraction_of_remainder *= options.p_length ; 65 | } 66 | else { 67 | vertex[p].fraction_of_remainder *= ( 1 - options.p_length ) ; 68 | } 69 | } 70 | param ++ ; 71 | } 72 | } 73 | check_vertex( vertex, options ) ; 74 | vertices.push_back( vertex ) ; 75 | } 76 | } 77 | return nparams ; 78 | } 79 | 80 | #endif 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/selection_print_usage.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_PRINT_USAGE_H 2 | #define __SELECTION_PRINT_USAGE_H 3 | 4 | void print_usage() { 5 | 6 | cerr << endl << endl << "ahmm-s usage:" << endl << endl ; 7 | cerr << "\trequired:" << endl ; 8 | cerr << "\t\t-i [string]\n\t\t\tinput file name" << endl ; 9 | cerr << "\t\t-s [string]\n\t\t\tsample id and ploidy file" << endl ; 10 | 11 | cerr << "\t\t-p [int] [int] [float]" << endl ; 12 | cerr << "\t\t\t ancestry pulse with format, ancestral population, time," << endl ; 13 | cerr << "\t\t\t and proportion of final ancestry from this pulse" << endl ; 14 | cerr << "\t\t--ne [int]\n\t\t\teffective population size of the admixed population" << endl ; 15 | 16 | cerr << "\n\tselect one of the following working modes:" << endl ; 17 | 18 | cerr << "\t\t--gss [int] [int] [int] [float] [float]" << endl ; 19 | cerr << "\t\t\t golden section search for optimal selection coeffient at each site." << endl ; 20 | cerr << "\t\t\t parameters: chromosomal position start, stop, step, selection coefficient start, stop" << endl ; 21 | 22 | cerr << "\t\t--grid [int] [int] [int] [float] [float] [float]" << endl ; 23 | cerr << "\t\t\t calculate likelihood ratios in a grid." << endl ; 24 | cerr << "\t\t\t parameters: chromosomal position start, stop, step, selection coefficient start, stop, step." << endl ; 25 | 26 | cerr << "\t\t--site [int] [float]" << endl ; 27 | cerr << "\t\t\t calculate likelihood ratios for a single value of s at a single site." << endl ; 28 | cerr << "\t\t\t parameters: chromosomal position, selective coeffient" << endl ; 29 | 30 | 31 | cerr << "\n\toptional:" << endl ; 32 | cerr << "\t\t--help\n\t\t\tprint this help statement" << endl ; 33 | cerr << "\t\t-g\n\t\t\tsamples are specified with genotypes rather than read counts" << endl ; 34 | 35 | cerr << "\t\t--chr [string]" << endl ; 36 | cerr << "\t\t\t specify chromosome that will be analyzed" << endl ; 37 | cerr << "\t\t\t (only necessary when there are multiple chromosomes in input file)" << endl ; 38 | cerr << "\t\t--chr_win [int] [int]" << endl ; 39 | cerr << "\t\t\t limit region on chromosome that will be analyzed" << endl ; 40 | 41 | cerr << "\t\t--gss_precision [float]" << endl ; 42 | cerr << "\t\t\t specify precision in finding optimal value of s using golden section search. default: 1e-5" << endl ; 43 | cerr << "\t\t--unit_coords" << endl ; 44 | cerr << "\t\t\t unit for start and stop position in grid and gss search can be defined as chromosome" << endl ; 45 | cerr << "\t\t\t coordinates rather than as line number in input file. default off" << endl ; 46 | cerr << "\t\t--window [string] [float]" << endl ; 47 | cerr << "\t\t\t specify size of Markov chain in percent or Morgans." << endl ; 48 | cerr << "\t\t\t \"p 10\" extends the markov chain 10% of chromosome length on each side of selected site." << endl ; 49 | cerr << "\t\t\t \"m 0.1\" extends the windows 0.1 Morgan on each side of the selected site." << endl ; 50 | cerr << "\t\t\t default: \"p 100\"" << endl ; 51 | cerr << "\t\t--traj [int]" << endl ; 52 | cerr << "\t\t\t change algorithm for generating selection trajectories." << endl ; 53 | cerr << "\t\t\t 4: 4-point approximation, 3: 3-point approximation (legacy option, not recommended)." << endl ; 54 | cerr << "\t\t\t default: forward iteration." << endl ; 55 | cerr << "\t\t--stochastic" << endl ; 56 | cerr << "\t\t\t enables the stochastic method for generation selection trajectory." << endl ; 57 | cerr << "\t\t\t (Experimental. Slow. Use for small values of s.)" << endl ; 58 | cerr << "\t\t--stochastic_reps [int]" << endl ; 59 | cerr << "\t\t\t specifies number of simulations for the stochastic trajectory algorithm." << endl ; 60 | cerr << "\t\t\t default: 10000" << endl ; 61 | cerr << "\t\t--full_selection_space" << endl ; 62 | cerr << "\t\t\t turns off optimization of the selection coeffient search space. (Experimental)" << endl ; 63 | } 64 | 65 | #endif 66 | 67 | -------------------------------------------------------------------------------- /src/multipermute.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | multipermute.h -- multiset permutations for generic vectors 4 | 5 | Follows 'Algorithm 1' from "Loopless Generation of Multiset Permutations using 6 | a Constant Number of Variables by Prefix Shifts." Aaron Williams, 2009 7 | 8 | author: Erik Garrison 9 | last revised: 2010-04-16 10 | 11 | Copyright (c) 2010 by Erik Garrison 12 | 13 | Permission is hereby granted, free of charge, to any person 14 | obtaining a copy of this software and associated documentation 15 | files (the "Software"), to deal in the Software without 16 | restriction, including without limitation the rights to use, 17 | copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | copies of the Software, and to permit persons to whom the 19 | Software is furnished to do so, subject to the following 20 | conditions: 21 | 22 | The above copyright notice and this permission notice shall be 23 | included in all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 27 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 29 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 30 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 31 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 32 | OTHER DEALINGS IN THE SOFTWARE. 33 | 34 | */ 35 | 36 | #ifndef __MULTIPERMUTE_H 37 | #define __MULTIPERMUTE_H 38 | 39 | template 40 | class ListElement { 41 | 42 | public: 43 | T value; 44 | ListElement* next; 45 | 46 | ListElement() { } 47 | 48 | ListElement(T val, ListElement* n) { 49 | value = val; 50 | next = n; 51 | } 52 | 53 | ListElement* nth(int n) { 54 | ListElement* o = this; 55 | int i = 0; 56 | while (i < n && o->next != NULL) { 57 | o = o->next; 58 | ++i; 59 | } 60 | return o; 61 | } 62 | 63 | ~ListElement() { 64 | if (next != NULL) { 65 | delete next; 66 | } 67 | } 68 | 69 | }; 70 | 71 | template 72 | ListElement* list_init(std::vector& multiset) { 73 | std::sort(multiset.begin(), multiset.end()); // ensures proper non-increasing order 74 | typename std::vector::const_iterator item = multiset.begin(); 75 | ListElement* h = new ListElement(*item, NULL); 76 | ++item; 77 | while (item != multiset.end()) { 78 | h = new ListElement(*item, h); 79 | ++item; 80 | } 81 | return h; 82 | } 83 | 84 | template 85 | std::vector linked_list_to_vector(ListElement* h) { 86 | ListElement* o = h; 87 | std::vector l; 88 | while (o != NULL) { 89 | l.push_back(o->value); 90 | o = o->next; 91 | } 92 | return l; 93 | } 94 | 95 | // provides multiset permutations out of the std::vector multiset 96 | template 97 | std::vector< std::vector > multipermute(std::vector& multiset) { 98 | 99 | std::vector< std::vector > results; 100 | 101 | ListElement* h = list_init(multiset); 102 | ListElement* i = h->nth(multiset.size() - 2); 103 | ListElement* j = h->nth(multiset.size() - 1); 104 | ListElement* s; 105 | ListElement* t; 106 | 107 | results.push_back(linked_list_to_vector(h)); 108 | 109 | while (j->next != NULL || j->value < h->value) { 110 | if (j->next != NULL && i->value >= j->next->value) { 111 | s = j; 112 | } else { 113 | s = i; 114 | } 115 | t = s->next; 116 | s->next = t->next; 117 | t->next = h; 118 | if (t->value < h->value) { 119 | i = t; 120 | } 121 | j = i->next; 122 | h = t; 123 | results.push_back(linked_list_to_vector(h)); 124 | } 125 | 126 | delete h; 127 | 128 | return results; 129 | 130 | } 131 | 132 | #endif 133 | -------------------------------------------------------------------------------- /src/forward_backward.h: -------------------------------------------------------------------------------- 1 | #ifndef __FORWARD_BACKWARD_H 2 | #define __FORWARD_BACKWARD_H 3 | 4 | void markov_chain::combine_prob( vector &position, map > > &states, vector &chrom, bool output_pulses, vector &pulses ) { 5 | 6 | /// find unique ploidy entries in the sample and obtain a sorted list of those entries 7 | vector unique_ploidy_entries = ploidy_switch ; 8 | sort( unique_ploidy_entries.begin(), unique_ploidy_entries.end() ) ; 9 | vector::iterator u = unique ( unique_ploidy_entries.begin(), unique_ploidy_entries.end() ) ; 10 | unique_ploidy_entries.resize( distance( unique_ploidy_entries.begin(), u ) ) ; 11 | 12 | /// output stream 13 | ofstream out ( output_file.c_str() ) ; 14 | 15 | //// output ancestry states, rather than pulses 16 | if ( output_pulses == false ) { 17 | 18 | /// pulse to ancestry 19 | /// ploidy >> index of ancestry pulse vector >> ancestry state vector 20 | map > > ploidy2pulse2ancestry = create_pulse_map( states, pulses, unique_ploidy_entries ) ; 21 | 22 | /// get unique and sorted list of vectors of each ancestry type 23 | map,int > > ancestry_states ; 24 | for ( std::map > >::iterator p = ploidy2pulse2ancestry.begin() ; p != ploidy2pulse2ancestry.end() ; ++ p ) { 25 | for ( std::map >::iterator i = p->second.begin() ; i != p->second.end() ; ++ i ) { 26 | ancestry_states[p->first][i->second] = 0 ; 27 | } 28 | } 29 | 30 | /// print those unique lists for each ploidy 31 | for ( map,int > >::iterator p = ancestry_states.begin() ; p != ancestry_states.end() ; ++ p ) { 32 | out << "chrom\tposition" ; 33 | for ( map,int >::reverse_iterator s = p->second.rbegin() ; s != p->second.rend() ; ++ s ) { 34 | out << "\t" ; 35 | for ( int c = 0 ; c < s->first.size() - 1 ; c ++ ) { 36 | out << s->first[c] << "," ; 37 | } 38 | out << s->first.back() ; 39 | } 40 | out << endl ; 41 | } 42 | 43 | /// finally create a state count to ploidy map, this way we can just look up the appropriate map from the alpha matrix size 44 | map statecount2ploidy ; 45 | for ( map > >::iterator s = states.begin() ; s != states.end() ; s ++ ) { 46 | statecount2ploidy[s->second.size()] = s->first ; 47 | } 48 | 49 | //// now iterate through function and print appropriate ancestry states given ploidy at each site 50 | for ( int i = 0 ; i < alphas.size() ; i ++ ) { 51 | 52 | vec smoothed_probs = alphas[i] % betas[i] ; 53 | normalize( smoothed_probs ) ; 54 | 55 | /// current ploidy 56 | int ploidy = statecount2ploidy[smoothed_probs.size()] ; 57 | 58 | /// need to translate pulse state probs to ancestry state probs 59 | map,double> ancestry_states ; 60 | for ( int l = 0 ; l < smoothed_probs.n_rows ; l ++ ) { 61 | ancestry_states[ploidy2pulse2ancestry[ploidy][l]] += smoothed_probs(l) ; 62 | } 63 | 64 | out << chrom.at(i) << "\t" << position.at(i) ; 65 | for ( map,double>::reverse_iterator l = ancestry_states.rbegin() ; l != ancestry_states.rend() ; ++ l ) { 66 | out << "\t" << l->second ; 67 | } 68 | out << endl ; 69 | } 70 | 71 | out.close() ; 72 | } 73 | 74 | /// output ancestry pulses rather than ancestry states [default] 75 | else { 76 | 77 | for ( int p = 0 ; p < unique_ploidy_entries.size() ; p ++ ) { 78 | out << "chrom\tposition" ; 79 | for ( int s = 0 ; s < states[unique_ploidy_entries[p]].size() ; s ++ ) { 80 | out << "\t" ; 81 | for ( int c = 0 ; c < states[unique_ploidy_entries[p]][s].size() - 1 ; c ++ ) { 82 | out << states[unique_ploidy_entries[p]][s][c] << "," ; 83 | } 84 | out << states[unique_ploidy_entries[p]][s].back() ; 85 | } 86 | out << endl ; 87 | } 88 | 89 | for ( int i = 0 ; i < alphas.size() ; i ++ ) { 90 | 91 | vec smoothed_probs = alphas[i] % betas[i] ; 92 | normalize( smoothed_probs ) ; 93 | 94 | out << chrom.at(i) << "\t" << position.at(i) ; 95 | for ( int l = 0 ; l < smoothed_probs.n_rows ; l ++ ) { 96 | out << "\t" << smoothed_probs(l) ; 97 | } 98 | out << endl ; 99 | } 100 | } 101 | out.close() ; 102 | return ; 103 | } 104 | 105 | #endif 106 | -------------------------------------------------------------------------------- /src/golden_search.h: -------------------------------------------------------------------------------- 1 | #ifndef __GOLDEN_SEARCH_H 2 | #define __GOLDEN_SEARCH_H 3 | 4 | /// golden section search for single parameter optimization 5 | /// not currently included, but may be useful to legacy versions 6 | vector golden_search ( cmd_line &options, vector &markov_chain_information, map, double > > > > transition_matrix_information, vector &recombination_rate, vector &position, map > > &state_changes ) { 7 | 8 | /// now do golden search until we reach tolerance threshhold and stop 9 | double phi = ( sqrt(5) - 1 ) / 2 ; 10 | 11 | /// parameter values to hold during search 12 | double low_bracket ; 13 | double high_bracket ; 14 | double param_low ; 15 | double param_high ; 16 | 17 | /// now figure out the parameters that vary 18 | for ( int p = 0 ; p < options.ancestry_pulses.size() ; p ++ ) { 19 | if ( options.ancestry_pulses[p].time_fixed == false ) { 20 | high_bracket = options.t_max ; 21 | low_bracket = options.t_min ; 22 | param_low = options.t_max + phi * ( options.t_min - options.t_max ) ; 23 | param_high = options.t_min + phi * ( options.t_max - options.t_min ) ; 24 | 25 | } 26 | else if ( options.ancestry_pulses[p].proportion_fixed == false ) { 27 | high_bracket = options.p_max ; 28 | low_bracket = options.p_min ; 29 | param_low = options.p_max + phi * ( options.p_min - options.p_max ) ; 30 | param_high = options.p_min + phi * ( options.p_max - options.p_min ) ; 31 | } 32 | } 33 | 34 | /// likelihood information for test points 35 | double lnl_low = 0 ; 36 | double lnl_high = 0 ; 37 | double lnl_diff = 1000 ; 38 | int iteration = 0 ; 39 | 40 | cerr << "\titeration\tlow bound\tlow test\thigh test\thigh bound\tlnl low\tlnl high\n" ; 41 | while ( options.tolerance < lnl_diff ) { 42 | 43 | /// compute probabilty of low point 44 | if ( lnl_low == 0 ) { 45 | vector v_low = options.ancestry_pulses ; 46 | for ( int p = 0 ; p < v_low.size() ; p ++ ) { 47 | if ( options.ancestry_pulses[p].time_fixed == false ) { 48 | v_low[p].time = param_low ; 49 | } 50 | else if ( options.ancestry_pulses[p].proportion_fixed == false ) { 51 | v_low[p].fraction_of_remainder = param_low ; 52 | } 53 | } 54 | 55 | lnl_low += evaluate_vertex( v_low, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 56 | } 57 | 58 | /// compute probability of high point 59 | if ( lnl_high == 0 ) { 60 | vector v_high = options.ancestry_pulses ; 61 | for ( int p = 0 ; p < v_high.size() ; p ++ ) { 62 | if ( options.ancestry_pulses[p].time_fixed == false ) { 63 | v_high[p].time = param_high ; 64 | } 65 | else if ( options.ancestry_pulses[p].proportion_fixed == false ) { 66 | v_high[p].fraction_of_remainder = param_high ; 67 | } 68 | } 69 | lnl_high = evaluate_vertex( v_high, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 70 | } 71 | 72 | /// print update 73 | cerr << "\t" << iteration << "\t" << low_bracket << "\t" << param_low << "\t" << param_high << "\t" << high_bracket << "\t" << lnl_low << "\t" << lnl_high << endl ; 74 | iteration ++ ; 75 | 76 | /// record dfiference 77 | lnl_diff = abs( lnl_low - lnl_high ) ; 78 | 79 | /// if true, we know that the maximum is between param_low and high_bracket 80 | if ( lnl_high >= lnl_low ) { 81 | low_bracket = param_low ; 82 | param_low = param_high ; 83 | param_high = low_bracket + ( high_bracket - low_bracket ) * phi ; 84 | lnl_low = lnl_high ; 85 | lnl_high = 0 ; 86 | } 87 | 88 | /// otherwise, the maximum is between low_bracket and param_high 89 | else { 90 | high_bracket = param_high ; 91 | param_high = param_low ; 92 | param_low = high_bracket + ( low_bracket - high_bracket ) * phi ; 93 | lnl_high = lnl_low ; 94 | lnl_low = 0 ; 95 | } 96 | } 97 | 98 | vector optimum = options.ancestry_pulses ; 99 | for ( int p = 0 ; p < optimum.size() ; p ++ ) { 100 | if ( options.ancestry_pulses[p].time_fixed == false ) { 101 | optimum[p].time = ( param_high + param_low ) / 2 ; 102 | } 103 | else if ( options.ancestry_pulses[p].proportion_fixed == false ) { 104 | optimum[p].fraction_of_remainder = ( param_high + param_low ) / 2 ; 105 | } 106 | } 107 | 108 | return optimum ; 109 | } 110 | 111 | 112 | #endif 113 | 114 | -------------------------------------------------------------------------------- /src/bootstrap.h: -------------------------------------------------------------------------------- 1 | #ifndef __BOOTSTRAP_H 2 | #define __BOOTSTRAP_H 3 | 4 | /// create_bootstraps 5 | vector > bootstraps (vector > &vertices, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_list, vector &chromosomes ) { 6 | 7 | /// vector to store optimum admixture model for each bootstrap 8 | vector > bootstrap_models ; 9 | 10 | /// rng to select block positions 11 | int max = floor(recombination_rate.size()/ options.block_size) ; 12 | 13 | /// create b total bootstraps 14 | for ( int b = 0 ; b < options.n_bootstraps ; b ++ ) { 15 | 16 | //// bootstrap data objects 17 | vector bootstrap_chain ( markov_chain_information.size() ) ; 18 | vector bootstrap_recombination_rate ; 19 | vector bootstrap_positions ; 20 | 21 | /// populate these data objects 22 | while ( bootstrap_positions.size() < position.size() ) { 23 | 24 | /// find admissable start positions 25 | int start = ( rand() % max ) * options.block_size ; 26 | int end = start + options.block_size ; 27 | if ( end > position.size() ) { 28 | end = position.size() - 1 ; 29 | } 30 | 31 | /// create positions in bootstrap region 32 | bootstrap_positions.insert( bootstrap_positions.end(), position.begin() + start , position.begin() + end ) ; 33 | 34 | //// create recombination rate in bootstrap regions 35 | bootstrap_recombination_rate.push_back( 0.5 ) ; 36 | bootstrap_recombination_rate.insert( bootstrap_recombination_rate.end(), recombination_rate.begin() + start + 1 , recombination_rate.begin() + end ) ; 37 | 38 | //// create markov chain information for each sample 39 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 40 | 41 | /// add emissions 42 | bootstrap_chain[m].emission_probabilities.insert( bootstrap_chain[m].emission_probabilities.end() , markov_chain_information[m].emission_probabilities.begin() + start ,markov_chain_information[m].emission_probabilities.begin() + end ) ; 43 | 44 | /// start probs 45 | bootstrap_chain[m].start_prob = 1 ; 46 | bootstrap_chain[m].end_prob = 1 ; 47 | 48 | /// ploidy transitions 49 | bootstrap_chain[m].ploidy_switch_position.push_back( bootstrap_chain[m].emission_probabilities.size() - options.block_size ) ; 50 | bool start_found = false ; 51 | for ( int p = 0 ; p < markov_chain_information[m].ploidy_switch_position.size() ; p ++ ) { 52 | 53 | /// find and insert start ploidy 54 | if ( markov_chain_information[m].ploidy_switch_position[p] > start && start_found == false ) { 55 | bootstrap_chain[m].ploidy_switch.push_back( markov_chain_information[m].ploidy_switch[p-1] ) ; 56 | start_found = true ; 57 | } 58 | 59 | /// insert all ploidy changes across that block 60 | if ( start_found == true && markov_chain_information[m].ploidy_switch_position[p] <= end && markov_chain_information[m].ploidy_switch_position[p] >= start ) { 61 | bootstrap_chain[m].ploidy_switch.push_back( markov_chain_information[m].ploidy_switch[p] ) ; 62 | bootstrap_chain[m].ploidy_switch_position.push_back( bootstrap_chain[m].emission_probabilities.size() - options.block_size + markov_chain_information[m].ploidy_switch_position[p] - start ) ; 63 | } 64 | } 65 | } 66 | } 67 | 68 | //// to avoid look ahead errors 69 | for ( int m = 0 ; m < bootstrap_chain.size() ; m ++ ) { 70 | bootstrap_chain[m].ploidy_switch_position.push_back(2147483647) ; 71 | bootstrap_chain[m].ploidy_switch.push_back( bootstrap_chain[m].ploidy_switch.back() ) ; 72 | } 73 | 74 | /// if there are params to estimate, do amoeba search 75 | if ( vertices.size() > 2 ) { 76 | cerr << "starting nelder-mead search\t\n\tbootstrap no:\t" << b << endl ; 77 | bootstrap_models.push_back( nelder_mead_search( vertices, options, bootstrap_chain, transition_matrix_information, bootstrap_recombination_rate, bootstrap_positions, state_list ) ) ; 78 | } 79 | 80 | /// or do golden section line search for single parameter optimization 81 | else { 82 | cerr << "starting golden section search\t\n\tbootstrap no:\t" << b << endl << endl ; 83 | bootstrap_models.push_back( golden_search( options, bootstrap_chain, transition_matrix_information, bootstrap_recombination_rate, bootstrap_positions, state_list ) ) ; 84 | } 85 | } 86 | 87 | return bootstrap_models ; 88 | } 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /src/read_emissions.h: -------------------------------------------------------------------------------- 1 | #ifndef __READ_EMISSIONS_H 2 | #define __READ_EMISSIONS_H 3 | 4 | /// this will calculate the emissions probabilities using read based methods 5 | void create_emissions_matrix( double n, input_line &new_line, bool &ancestral_fixed, vector > &states, int sample_index, vector &pulses, vec &emission_matrix ) { 6 | 7 | /// if no data in sample, return flat probs 8 | if ( new_line.sample_counts[sample_index][2] == 0 ) { 9 | emission_matrix.ones(states.size()) ; 10 | return ; 11 | } 12 | 13 | /// indexes will correspond to states vector that we passed here 14 | emission_matrix.zeros(states.size()) ; 15 | 16 | /// i is the index of the state we're in 17 | for ( double i = 0 ; i < states.size() ; i ++ ) { 18 | 19 | /// ancestry states 20 | vector ancestry_states = pulses_to_ancestry( states[i], pulses ) ; 21 | 22 | /// find all distributions of reads across existing states using multichoose 23 | /// this data object is the number of A's derived from each ancestry type 24 | vector > read_counts ; 25 | distribute_reads( ancestry_states, new_line.sample_counts[sample_index][2], read_counts ) ; 26 | 27 | /// now the probability that you sample reads from each ancestry type 28 | for ( int rc = 0 ; rc < read_counts.size() ; rc ++ ) { 29 | 30 | // the probability of this particular sampling of reads given the state i 31 | double p_reads = multinomial( n, new_line.sample_counts[sample_index][2], read_counts[rc], ancestry_states ) ; 32 | 33 | /// now we need to distribute our alleles 34 | /// this data object is the number of A reads from each class, conditional on the number of reads from each class, i.e. A_counts[i][j] < read_counts[r][j] 35 | map, double > A_counts ; 36 | distribute_alleles( read_counts[rc], new_line.sample_counts[sample_index][0], new_line.sample_counts[sample_index][2], A_counts ) ; 37 | 38 | /// now compute probability of each sampling arrangement 39 | for ( std::map,double>::iterator a = A_counts.begin() ; a != A_counts.end() ; ++ a ) { 40 | 41 | double prob_counts = 1 ; 42 | for ( int c = 0 ; c < a->first.size() ; c ++ ) { 43 | double sum = 0 ; 44 | /// whether ancestral frequencies are not fixed [default] 45 | if ( ancestral_fixed == false ) { 46 | /// number of samples that are allele A is j 47 | for ( double j = 0 ; j <= ancestry_states[c] + 0.01 ; j ++ ) { 48 | double pA = j/ancestry_states[c]*(1-new_line.error_1)+(1-j/ancestry_states[c])*new_line.error_2 ; 49 | 50 | /// ancestral pop probs + choose j probs ( all genotype probs ) 51 | sum += 1/( new_line.reference_counts[c][2] + ancestry_states[c] + 1 ) 52 | * 1/nCk[ new_line.reference_counts[c][2] + ancestry_states[c] ][ new_line.reference_counts[c][0] + j ] 53 | 54 | /// remaining sample genotype prob 55 | * nCk[ ancestry_states[c] ][ j ] 56 | 57 | /// read probs now, i.e. prob of number of A's given the sampling probs 58 | * pow( pA, a->first[c] ) 59 | * pow( 1-pA, read_counts[rc][c] - a->first[c] ) ; 60 | } 61 | } 62 | 63 | /// if ancestral frequencies are fixed 64 | else { 65 | double f = new_line.reference_counts[c][0] / new_line.reference_counts[c][2] ; 66 | for ( double j = 0 ; j <= ancestry_states[c] + 0.01 ; j ++ ) { 67 | /// error probs are same 68 | double pA = j/ancestry_states[c]*(1-new_line.error_1)+(1-j/ancestry_states[c])*new_line.error_2 ; 69 | 70 | /// no ancestral probs required 71 | 72 | /// prob of sampling these A's 73 | sum += nCk[ ancestry_states[c] ][ j ] 74 | * pow( f, j ) * pow ( 1 - f, ancestry_states[c] - j ) 75 | 76 | //// prob of smapling these reads 77 | * pow( pA, a->first[c] ) * pow( 1-pA, read_counts[rc][c] - a->first[c] ) ; 78 | } 79 | } 80 | prob_counts *= sum ; 81 | } 82 | emission_matrix(i) += p_reads * prob_counts * a->second ; 83 | } 84 | } 85 | } 86 | 87 | /// check if any entries are 0 and set to smallest possible float 88 | for ( int i = 0 ; i < emission_matrix.size() ; i ++ ) { 89 | if ( emission_matrix(i) == 0 ) { 90 | emission_matrix(i) = 1.17549e-38 ; 91 | } 92 | } 93 | 94 | return ; 95 | } 96 | #endif 97 | 98 | -------------------------------------------------------------------------------- /src/selection_forward.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_FORWARD_H 2 | #define __SELECTION_FORWARD_H 3 | 4 | 5 | 6 | // Forward algoritm modified for selection inferrence. 7 | double markov_chain::selection_forward_probabilities_genotypes( map > &transition_probabilites, vector &interploidy_transitions, selection &point, bool go_downstream, vector &genofreq, vector &position ) { 8 | //cerr << "cp2_1 " << genofreq[0] << endl; 9 | 10 | /// return log likelihood which is sum of cts 11 | double lnl = 0 ; 12 | 13 | /// clear the fw probs matrix 14 | alphas.resize( transition_probabilites[ploidy_switch[0]].size() ) ; 15 | 16 | /// ploidy index to tract where in path we are 17 | int ploidy_index = 0 ; 18 | 19 | //// set all values to zero, but mostly just reize 20 | alphas[0].resize( transition_probabilites[ploidy_switch[0]][1].n_cols ) ; 21 | 22 | /// get initial state set 23 | //alphas[0] = emission_probabilities[point.pos] * start_prob ; 24 | 25 | // Populate starting conditions 26 | 27 | // Check how to specify nn. The current way is a bit of a hack. 28 | double nn = transition_probabilites[ploidy_switch[0]][1].n_cols - 1; 29 | for (int k = nn; k >= 0; k--) { 30 | alphas[0][nn-k] = binomial(nn, k, genofreq[0]); 31 | } 32 | 33 | lnl += normalize( alphas[0] ) ; 34 | 35 | /// do all other sites 36 | /// Checks if going upstream or downstream from the selected site. 37 | if (go_downstream == true) { 38 | selection_forward_loop_reverse(transition_probabilites, interploidy_transitions, point, lnl, ploidy_index, position) ; 39 | } 40 | else { 41 | selection_forward_loop(transition_probabilites, interploidy_transitions, point, lnl, ploidy_index, position) ; 42 | } 43 | 44 | return lnl ; 45 | } 46 | 47 | // Loop in forward algorithm going downstream from the selected site 48 | void markov_chain::selection_forward_loop( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) { 49 | // WARNING: Check what +1 index does. May be unnecessary. 50 | //cerr << "Emission probabilities: " << emission_probabilities.size() << " " << point.pos << endl; 51 | 52 | int j = 1; 53 | int k; 54 | double normalpha; 55 | 56 | for ( int i = 1 ; i < transition_probabilites[ploidy_switch[ploidy_index]].size() ; i ++ ) { 57 | k = point.pos + i; 58 | /// if we're at or past the next switch position 59 | bool ploidy_change = false ; 60 | if ( i >= ploidy_switch_position[ploidy_index+1] ) { 61 | ploidy_index ++ ; 62 | if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index-1] ) { 63 | ploidy_change = true ; 64 | } 65 | } 66 | /// resize matrix 67 | alphas[j].resize( transition_probabilites[ploidy_switch[ploidy_index]][1].n_cols ) ; 68 | 69 | /// requires slightly different math if we are transitioning in ploidy between two adjacent sites 70 | if ( ploidy_change == true ) { 71 | /// transitions across a chromosome boundary will have low self-self rates 72 | if ( transition_probabilites[ploidy_switch[ploidy_index]][i](0,0) < 0.75 ) { 73 | alphas[j].fill( 1 ) ; 74 | } 75 | 76 | //// otherwise, this is a transition across ploidy types on the same chromosome use the interploidy transition rates 77 | else { 78 | alphas[j] = interploidy_transitions[ploidy_switch[ploidy_index-1]-1] * alphas[j-1] % emission_probabilities[k] ; 79 | } 80 | } 81 | 82 | /// otehrwise business as ususal 83 | 84 | else { 85 | alphas[j] = transition_probabilites[ploidy_switch[ploidy_index]][j] * alphas[j-1] % emission_probabilities[k] ; 86 | normalpha = normalize( alphas[j] ) ; 87 | } 88 | 89 | /// normalize and updated likelihood 90 | lnl += normalpha ; 91 | j++; 92 | } 93 | } 94 | 95 | // Loop in forward algorithm going upstream from the selected site 96 | void markov_chain::selection_forward_loop_reverse( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) { 97 | // WARNING: Check what +1 index does. May be unnecessary. 98 | 99 | int j = 1; 100 | int k; 101 | double normalpha; 102 | 103 | for ( int i = 0 ; i < transition_probabilites[ploidy_switch[ploidy_index]].size()-1 ; i ++ ) { 104 | k = point.pos - i; 105 | /// if we're at or past the next switch position 106 | bool ploidy_change = false ; 107 | if ( i >= ploidy_switch_position[ploidy_index+1] ) { 108 | ploidy_index ++ ; 109 | if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index-1] ) { 110 | ploidy_change = true ; 111 | } 112 | } 113 | /// resize matrix 114 | alphas[j].resize( transition_probabilites[ploidy_switch[ploidy_index]][1].n_cols ) ; 115 | 116 | /// requires slightly different math if we are transitioning in ploidy between two adjacent sites 117 | if ( ploidy_change == true ) { 118 | /// transitions across a chromosome boundary will have low self-self rates 119 | if ( transition_probabilites[ploidy_switch[ploidy_index]][i](0,0) < 0.75 ) { 120 | alphas[j].fill( 1 ) ; 121 | } 122 | 123 | //// otherwise, this is a transition across ploidy types on the same chromosome use the interploidy transition rates 124 | else { 125 | alphas[j] = interploidy_transitions[ploidy_switch[ploidy_index-1]-1] * alphas[j-1] % emission_probabilities[k] ; 126 | } 127 | } 128 | 129 | /// otehrwise business as ususal 130 | else { 131 | alphas[j] = transition_probabilites[ploidy_switch[ploidy_index]][j] * alphas[j-1] % emission_probabilities[k] ; 132 | normalpha = normalize( alphas[j] ) ; 133 | } 134 | 135 | /// normalize and updated likelihood 136 | lnl += normalpha ; 137 | j++; 138 | } 139 | } 140 | 141 | #endif 142 | -------------------------------------------------------------------------------- /scripts/vcf2ahmm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import csv 4 | 5 | ### get the args 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("-v", type=str, help="input vcf file", required=True ) 8 | parser.add_argument("-s", type=str, help="sample to population file", required=True ) 9 | parser.add_argument("-g", type=int, help="boolean (0 = use sample reads| 1 = use genotypes)", default = 0 ) 10 | parser.add_argument("-r", type=float, help="uniform recombination rate per bp in morgans/bp (float)", default = 1e-8 ) 11 | parser.add_argument("-m", type=int, help="minimum distance between successive snps (int,bp)", default = 1000 ) 12 | parser.add_argument("-o", type=str, help="ploidy file for ahmm input", default = "ahmm.ploidy") 13 | parser.add_argument("--min_total", type=int, help="minimum number of reference population samples (int)", default = 10 ) 14 | parser.add_argument("--min_diff",type=float, help="minimum allele frequency difference between a pair of populations (float)", default = 0.1 ) 15 | args = parser.parse_args() 16 | 17 | ### get sample to population mappings 18 | sample2pop = {} 19 | with open( args.s ) as file : 20 | for line in file : 21 | (key, val) = line.split() 22 | sample2pop[key] = val 23 | 24 | ### IDs 25 | sample_fields = [] 26 | 27 | ### keep track of where we are on the chromosome 28 | last_position = -10000000 29 | current_chrom = "NA" 30 | 31 | ### have we printed the ploidy file yet 32 | ploidy_print = 0 33 | 34 | ## read the vcf file 35 | with open( args.v ) as tsv : 36 | 37 | ### split on tab 38 | for line in csv.reader(tsv, delimiter="\t") : 39 | 40 | ### parse the header line 41 | if ( line[0].startswith('#CHROM') ) : 42 | 43 | ### get our info 44 | sample_files = line 45 | continue 46 | 47 | ### skip lines that start with ## 48 | if ( line[0].startswith('##') ) : 49 | continue 50 | 51 | ### remove sites that are not biallelic snps 52 | if ( len(line[3]) != 1 or len(line[4]) != 1 ) : 53 | continue 54 | 55 | ### now read the line and decide what to do with it 56 | if ( current_chrom != line[0] ) : 57 | current_chrom = line[0] 58 | last_position = -1000000 59 | 60 | ### if position too close skip 61 | elif ( int( line[1] ) - last_position < args.m ) : 62 | continue 63 | 64 | ### otherwise get population allele frequencies 65 | alts = {} 66 | totals = {} 67 | for s in range(9, len( line ) ) : 68 | if ( sample_files[s] in sample2pop ) : 69 | if ( sample2pop[sample_files[s]] not in totals ) : 70 | totals[sample2pop[sample_files[s]]] = 0 71 | alts[sample2pop[sample_files[s]]] = 0 72 | if ( line[s].startswith('0/0') ) : 73 | totals[sample2pop[sample_files[s]]] += 2 74 | elif ( line[s].startswith('0/1') ) : 75 | alts[sample2pop[sample_files[s]]] += 1 76 | totals[sample2pop[sample_files[s]]] += 2 77 | elif ( line[s].startswith('1/1') ) : 78 | alts[sample2pop[sample_files[s]]] += 2 79 | totals[sample2pop[sample_files[s]]] += 2 80 | 81 | ### read haploid genotypes as needed 82 | elif ( line[s].startswith('1') ) : 83 | alts[sample2pop[sample_files[s]]] += 1 84 | totals[sample2pop[sample_files[s]]] += 1 85 | elif ( line[2].startswith('0') ) : 86 | totals[sample2pop[sample_files[s]]] += 1 87 | 88 | ### make sure we have enough samples at this position 89 | too_few = 0 90 | freqs = [] 91 | for population, total in totals.items() : 92 | if ( population != "admixed" ) : 93 | freqs.append( float(alts[population])/float(total) ) 94 | if ( total < args.min_total ) : 95 | too_few += 1 96 | if ( too_few > 0 ) : 97 | continue 98 | 99 | ### check allele frequencies 100 | diff = 0 101 | for p1 in range(0, len(freqs)-1) : 102 | for p2 in range(p1+1,len(freqs)) : 103 | if ( abs( freqs[p1] - freqs[p2] ) > args.min_diff ) : 104 | diff += 1 105 | if ( diff == 0 ) : 106 | continue 107 | 108 | ### if we got this far, we're rpinting the site 109 | print( line[0], line[1], end ="\t", sep = "\t" ) 110 | for population, total in totals.items() : 111 | if ( population != "admixed" ) : 112 | ref = total - alts[population] 113 | print ( ref, alts[population], end = "\t", sep = "\t" ) 114 | 115 | rec = float ( float( line[1] ) - last_position ) * args.r 116 | print ( rec, end = "" ) 117 | 118 | ### now print the read counts for each admixed sample 119 | for s in range(9, len( line ) ) : 120 | 121 | ### only admixed samples 122 | if ( sample_files[s] in sample2pop and sample2pop[sample_files[s]] == "admixed" ) : 123 | 124 | ### print genotypes 125 | if ( args.g == 1 ) : 126 | if ( line[s].startswith('0/0') ) : 127 | print( "\t", 2, "\t", 0, end = "", sep = "" ) 128 | elif ( line[s].startswith('0/1') ) : 129 | print( "\t", 1, "\t", 1, end = "", sep = "" ) 130 | elif ( line[s].startswith('1/1') ) : 131 | print( "\t", 0, "\t", 2, end = "", sep = "" ) 132 | elif ( line[s].startswith('0') ) : 133 | print( "\t", 1, "\t", 0, end = "", sep = "" ) 134 | elif ( line[s].startswith('1') ) : 135 | print( "\t", 0, "\t", 1, end = "", sep = "" ) 136 | else : 137 | print( "\t", 0, "\t", 0, end = "", sep = "" ) 138 | if ( ploidy_print == 0 ) : 139 | print(sample_files[s], 2, sep = "\t", file=open(args.o, "a") ) 140 | 141 | ### print read counts 142 | else : 143 | fields = line[s].split( sep = ":" ) 144 | if ( len( fields ) > 2 ) : 145 | ad = fields[1].split( sep = "," ) 146 | print ( "\t", ad[0], "\t", ad[1], end = "", sep = "" ) 147 | else : 148 | print ( "\t", 0, "\t", 0, end = "", sep = "" ) 149 | if ( ploidy_print == 0 ) : 150 | print(sample_files[s], 2, sep = "\t", file=open(args.o, "a") ) 151 | 152 | ## newline 153 | print () 154 | 155 | ### update last position after printing 156 | last_position = int( line[1] ) 157 | ploidy_print = 1 158 | -------------------------------------------------------------------------------- /src/read_input.h: -------------------------------------------------------------------------------- 1 | #ifndef __READ_INPUT_H 2 | #define __READ_INPUT_H 3 | 4 | void read_file ( cmd_line &options, vector &markov_chain_information, map > > &state_list, vector &position, vector &recombination_rate, vector &chromosomes ) { 5 | 6 | /// vector to hold index of inbred path if we have variable ploidy 7 | vector path_index( markov_chain_information.size(), 0 ) ; 8 | 9 | /// stream in file 10 | ifstream in ( options.input_file.c_str() ) ; 11 | 12 | //// since the first site transition matrix does not matter, we can print anything 13 | double extra_recombination = 1 ; 14 | string last_chrom = "" ; 15 | 16 | while( !in.eof() ) { 17 | 18 | input_line new_line ; 19 | in >> new_line.chrom >> new_line.pos ; 20 | 21 | /// if two adjacent sites have the same positions, skip second 22 | if ( ( position.size() > 0 && new_line.pos == position.back() ) ) { 23 | getline( in, new_line.chrom ) ; 24 | continue ; 25 | } 26 | 27 | // read reference panel genotype counts 28 | new_line.reference_counts.resize( options.ancestry_proportion.size() ) ; 29 | int count = 0 ; 30 | for ( int p = 0 ; p < options.ancestry_proportion.size() ; p ++ ) { 31 | double count1, count2 ; 32 | in >> count1 >> count2 ; 33 | new_line.reference_counts[p].push_back(count1) ; 34 | new_line.reference_counts[p].push_back(count2) ; 35 | new_line.reference_counts[p].push_back(count1+count2) ; 36 | 37 | /// note that this subsampling approach assumes this is only necessary for human data. which for now is the only plausible dataset with reference population sizes thi slarge. 38 | if ( new_line.reference_counts[p][2] > 998 ) { 39 | subsample_reads( new_line.reference_counts[p][0], new_line.reference_counts[p][1], 998 ) ; 40 | new_line.reference_counts[p][2] = new_line.reference_counts[p][0] + new_line.reference_counts[p][1] ; 41 | } 42 | } 43 | 44 | /// read recombination rate 45 | in >> new_line.recombination_rate ; 46 | 47 | /// if line specific error rates are provided 48 | new_line.error_1 = options.error_rate ; 49 | new_line.error_2 = options.error_rate ; 50 | if ( options.error_rates == true ) { 51 | in >> new_line.error_1 >> new_line.error_2 ; 52 | } 53 | 54 | // read sample panel read counts 55 | new_line.sample_counts.resize( markov_chain_information.size() ) ; 56 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 57 | double count1, count2 ; 58 | in >> count1 >> count2 ; 59 | 60 | /// subsample reads to a maximum depth so we can compute multinomial probs without overflow errors 61 | if ( count1 + count2 > 170 ) { 62 | subsample_reads( count1, count2, 170 ) ; 63 | } 64 | 65 | /// now store counts and total for sample 66 | new_line.sample_counts[m].push_back(count1) ; 67 | new_line.sample_counts[m].push_back(count2) ; 68 | new_line.sample_counts[m].push_back(count1+count2) ; 69 | } 70 | 71 | if ( new_line.chrom != last_chrom ) { 72 | recombination_rate.push_back( 0.5 ) ; 73 | last_chrom = new_line.chrom ; 74 | extra_recombination = 0 ; 75 | } 76 | 77 | /// ignore lines where recombination may not be suffiicent to make sites independent 78 | /// this might be useful in place of LD pruning 79 | else { 80 | 81 | extra_recombination += new_line.recombination_rate ; 82 | if ( extra_recombination < options.minimum_distance ) { 83 | continue ; 84 | } 85 | new_line.recombination_rate = extra_recombination ; 86 | extra_recombination = 0 ; 87 | 88 | recombination_rate.push_back( new_line.recombination_rate/ ( new_line.pos - position.back() ) ) ; 89 | } 90 | 91 | /// record position 92 | position.push_back( new_line.pos ) ; 93 | chromosomes.push_back( new_line.chrom ) ; 94 | 95 | /// check all path indexes and update as needed 96 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 97 | 98 | if ( markov_chain_information[m].path_file != "null" ) { 99 | 100 | /// record previous ploidy 101 | int previous_ploidy = markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ; 102 | 103 | /// check to make sure we're on the right ploidy tract 104 | while ( new_line.chrom != markov_chain_information[m].sample_ploidy_path[path_index[m]].chrom ) { 105 | path_index[m] ++ ; 106 | } 107 | while ( new_line.pos > markov_chain_information[m].sample_ploidy_path[path_index[m]].stop ) { 108 | path_index[m] ++ ; 109 | } 110 | 111 | /// record switches 112 | if ( previous_ploidy != markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ) { 113 | 114 | markov_chain_information[m].ploidy_switch_position.push_back( position.size() - 1 ) ; 115 | markov_chain_information[m].ploidy_switch.push_back( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ) ; 116 | } 117 | } 118 | } 119 | 120 | /// 121 | if ( options.genotype == false ) { 122 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 123 | vec emissions ; 124 | create_emissions_matrix( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy, new_line, options.ancestral_fixed, state_list[markov_chain_information.at(m).sample_ploidy_path[path_index[m]].ploidy], m, options.ancestry_pulses, emissions ) ; 125 | markov_chain_information[m].emission_probabilities.push_back( emissions ) ; 126 | } 127 | } 128 | 129 | /// create emissions matrix with genotypes 130 | else { 131 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 132 | vec emissions ; 133 | create_emissions_matrix_genotype( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy, new_line, options.ancestral_fixed, state_list[markov_chain_information.at(m).sample_ploidy_path[path_index[m]].ploidy], m, options.ancestry_pulses, emissions ) ; 134 | markov_chain_information[m].emission_probabilities.push_back( emissions ) ; 135 | } 136 | } 137 | } 138 | 139 | /// to avoid lookahead errors 140 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 141 | markov_chain_information[m].ploidy_switch_position.push_back( position.size() ) ; 142 | } 143 | } 144 | 145 | #endif 146 | 147 | -------------------------------------------------------------------------------- /src/genotype_emissions.h: -------------------------------------------------------------------------------- 1 | #ifndef __GENOTYPE_EMISSIONS_H 2 | #define __GENOTYPE_EMISSIONS_H 3 | 4 | /// each inner vector will be length 4 with, A's that are really A's, a's that are really a's, A's that are really a's, and a's that are really A's 5 | double genotype_total_prob( input_line &new_line, const double &A_count, int &chrom_count, double &real_A_count ) { 6 | 7 | if ( chrom_count == 0 ) { 8 | return 1 ; 9 | } 10 | 11 | /// vector of K balls to go into 12 | vector observed_genotypes ; 13 | for ( int i = 0 ; i < A_count ; i ++ ) { 14 | observed_genotypes.push_back(1) ; 15 | } 16 | for ( int i = A_count ; i < chrom_count ; i ++ ) { 17 | observed_genotypes.push_back(0) ; 18 | } 19 | 20 | /// N bins corresponding to the real (unobserved) genotypes, these are not permuted 21 | vector real_genotypes ; 22 | for ( int i = 0 ; i < real_A_count ; i ++ ) { 23 | real_genotypes.push_back(1) ; 24 | } 25 | for ( int i = real_A_count ; i < chrom_count ; i ++ ) { 26 | real_genotypes.push_back(0) ; 27 | } 28 | 29 | /// multipermute those observed genotypes across the chromosomes 30 | vector > observed_arrangements = multipermute( observed_genotypes ) ; 31 | 32 | /// record total prob of all possible genotype observed/real arrangements 33 | double total_prob = 0 ; 34 | 35 | /// count differences and sames 36 | for ( int o = 0 ; o < observed_arrangements.size() ; o ++ ) { 37 | /// record prob of this specific arrangement 38 | double prob = 1 ; 39 | for ( int s = 0 ; s < observed_arrangements[o].size() ; s ++ ) { 40 | if ( observed_arrangements[o][s] == 1 ) { 41 | if ( real_genotypes[s] == 1 ) { 42 | prob *= ( 1 - new_line.error_1 ) ; 43 | } 44 | else { 45 | prob *= new_line.error_1 ; 46 | } 47 | } 48 | else { 49 | if ( real_genotypes[s] == 1 ) { 50 | prob *= new_line.error_2 ; 51 | } 52 | else { 53 | prob *= ( 1 - new_line.error_2 ) ; 54 | } 55 | } 56 | } 57 | total_prob += prob ; 58 | } 59 | 60 | return total_prob ; 61 | } 62 | 63 | /// this will calculate the emissions probabilities using genotype based methods 64 | void create_emissions_matrix_genotype( double n, input_line &new_line, bool &ancestral_fixed, vector > &states, int &sample_index, vector &pulses, vec &emission_matrix ) { 65 | 66 | /// if no data in sample at site, return flat probs 67 | if ( new_line.sample_counts[sample_index][2] == 0 ) { 68 | emission_matrix.ones(states.size()) ; 69 | return ; 70 | } 71 | 72 | /// indexes will correspond to states vector that we passed here 73 | emission_matrix.zeros(states.size()) ; 74 | 75 | /// i is the index of the state we're in 76 | for ( double i = 0 ; i < states.size() ; i ++ ) { 77 | 78 | /// ancestry states 79 | vector ancestry_states = pulses_to_ancestry( states[i], pulses ) ; 80 | 81 | /// genotype sampling probabilities are 1 by definition, so no p_reads term 82 | /// instead just distribute A alleles across sampled chromosomes wihtout replacement 83 | map, double > A_counts ; 84 | distribute_alleles( ancestry_states, new_line.sample_counts[sample_index][0], new_line.sample_counts[sample_index][2], A_counts ) ; 85 | 86 | /// now compute probability of each sampling arrangement 87 | for ( std::map,double>::iterator a = A_counts.begin() ; a != A_counts.end() ; ++ a ) { 88 | 89 | /// update probs as we iterate across pulses 90 | double prob_counts = 1 ; 91 | 92 | //// c is the ancestry pulse from which we have sampled the genotype 93 | for ( int c = 0 ; c < a->first.size() ; c ++ ) { 94 | 95 | double sum = 0 ; 96 | 97 | /// ancestral frequencies are not fixed [default] 98 | if ( ancestral_fixed == false ) { 99 | 100 | /// sum across all chromosomes for producing a given set of genotypes which allows us to accomodate error in genotyping similarly to how we accomodate reads, but still assumes one chromosome == one genotype, i.e. permutation not binomial 101 | /// j is the number of real A alleles on those chromosomes 102 | for ( double j = 0 ; j <= ancestry_states[c] ; j ++ ) { 103 | 104 | //// then compute probability of ancestral alleles and our sample 105 | sum += 1/( new_line.reference_counts[c][2] + ancestry_states[c] + 1 ) 106 | * 1/nCk[ new_line.reference_counts[c][2] + ancestry_states[c] ][ new_line.reference_counts[c][0] + j ] 107 | 108 | /// and include total prob of the observed genotypes given true genotypes, j 109 | * genotype_total_prob( new_line, a->first[c], ancestry_states[c], j ) ; 110 | 111 | } 112 | } 113 | 114 | /// if ancestral frequencies are fixed 115 | else { 116 | 117 | //// allele frequency is just the allele frequency in the reference panel 118 | double f = new_line.reference_counts[c][0] / new_line.reference_counts[c][2] ; 119 | 120 | /// to accomodate error in genotypes, we still sum across all possible genotype combinations 121 | for ( double j = 0 ; j <= ancestry_states[c] ; j ++ ) { 122 | 123 | /// just figure probabilty of getting j A's given ancestry state and ancestry freq, no integral required 124 | sum += nCk[ ancestry_states[c] ][ j ] 125 | * pow( f, j ) 126 | * pow ( 1 - f, ancestry_states[c] - j ) 127 | 128 | /// and include total prob of the observed genotypes given true genotypes, j 129 | * genotype_total_prob( new_line, a->first[c], ancestry_states[c], j ) ; 130 | } 131 | } 132 | 133 | prob_counts *= sum ; 134 | } 135 | 136 | /// recall that a second records the possible permutations of all genotypes into a given ancestry state and is therefore equivalent to the multinomial combinatoric multiplier 137 | emission_matrix[i] += prob_counts * a->second ; 138 | } 139 | } 140 | 141 | /// check if any entries are 0 and set to smallest possible float 142 | for ( int i = 0 ; i < emission_matrix.size() ; i ++ ) { 143 | if ( emission_matrix(i) == 0 ) { 144 | emission_matrix(i) = 1.17549e-38 ; 145 | } 146 | } 147 | 148 | return ; 149 | 150 | } 151 | 152 | #endif 153 | 154 | -------------------------------------------------------------------------------- /src/selection_read_input.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_READ_INPUT_H 2 | #define __SELECTION_READ_INPUT_H 3 | 4 | void read_file ( cmd_line &options, vector &markov_chain_information, map > > &state_list, vector &position, vector &recombination_rate, vector &chromosomes, int &sel_pos ) { 5 | 6 | /// vector to hold index of inbred path if we have variable ploidy 7 | vector path_index( markov_chain_information.size(), 0 ) ; 8 | 9 | /// stream in file 10 | ifstream in ( options.input_file.c_str() ) ; 11 | 12 | //// since the first site transition matrix does not matter, we can print anything 13 | double extra_recombination = 1 ; 14 | string last_chrom = "" ; 15 | 16 | /// iterator to find selected_site 17 | int ipos = 0; 18 | 19 | while( !in.eof() ) { 20 | 21 | input_line new_line ; 22 | in >> new_line.chrom >> new_line.pos ; 23 | 24 | 25 | /// Limits the genomic region to be parsed ifa range of interest is specified 26 | ///(so far only for selection analysis) 27 | if (options.is_limit == true) { 28 | if ( new_line.chrom != options.limit_chr) { 29 | getline( in, new_line.chrom ) ; 30 | continue ; 31 | } 32 | if ( new_line.pos < options.limit_win_start) { 33 | getline( in, new_line.chrom ) ; 34 | continue ; 35 | } 36 | if ( new_line.pos > options.limit_win_end) { 37 | break ; 38 | } 39 | } 40 | 41 | 42 | /// if two adjacent sites have the same positions, skip second 43 | if ( ( position.size() > 0 && new_line.pos == position.back() ) ) { 44 | getline( in, new_line.chrom ) ; 45 | continue ; 46 | } 47 | 48 | // read reference panel genotype counts 49 | new_line.reference_counts.resize( options.ancestry_proportion.size() ) ; 50 | int count = 0 ; 51 | for ( int p = 0 ; p < options.ancestry_proportion.size() ; p ++ ) { 52 | double count1, count2 ; 53 | in >> count1 >> count2 ; 54 | new_line.reference_counts[p].push_back(count1) ; 55 | new_line.reference_counts[p].push_back(count2) ; 56 | new_line.reference_counts[p].push_back(count1+count2) ; 57 | } 58 | 59 | /// read recombination rate 60 | in >> new_line.recombination_rate ; 61 | 62 | /// if line specific error rates are provided 63 | new_line.error_1 = options.error_rate ; 64 | new_line.error_2 = options.error_rate ; 65 | if ( options.error_rates == true ) { 66 | in >> new_line.error_1 >> new_line.error_2 ; 67 | } 68 | // read sample panel read counts 69 | new_line.sample_counts.resize( markov_chain_information.size() ) ; 70 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 71 | double count1, count2 ; 72 | in >> count1 >> count2 ; 73 | 74 | /// subsample reads to a maximum depth so we can compute multinomial probs without overflow errors 75 | if ( count1 + count2 > 170 ) { 76 | subsample_reads( count1, count2 ) ; 77 | } 78 | 79 | /// now store counts and total for sample 80 | new_line.sample_counts[m].push_back(count1) ; 81 | new_line.sample_counts[m].push_back(count2) ; 82 | new_line.sample_counts[m].push_back(count1+count2) ; 83 | } 84 | 85 | if ( new_line.chrom != last_chrom ) { 86 | recombination_rate.push_back( 0.5 ) ; 87 | last_chrom = new_line.chrom ; 88 | extra_recombination = 0 ; 89 | } 90 | 91 | /// ignore lines where recombination may not be suffiicent to make sites independent 92 | /// this might be useful in place of LD pruning 93 | else { 94 | extra_recombination += new_line.recombination_rate ; 95 | if ( extra_recombination < options.minimum_distance ) { 96 | continue ; 97 | } 98 | new_line.recombination_rate = extra_recombination ; 99 | extra_recombination = 0 ; 100 | 101 | /// Changing recombination rate to no longer be specified per basepair. JS 102 | recombination_rate.push_back( new_line.recombination_rate ) ; 103 | } 104 | 105 | ipos++ ; 106 | 107 | /// record position 108 | position.push_back( new_line.pos ) ; 109 | chromosomes.push_back( new_line.chrom ) ; 110 | 111 | /// check all path indexes and update as needed 112 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 113 | 114 | if ( markov_chain_information[m].path_file != "null" ) { 115 | 116 | /// record previous ploidy 117 | int previous_ploidy = markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ; 118 | 119 | /// check to make sure we're on the right ploidy tract 120 | while ( new_line.chrom != markov_chain_information[m].sample_ploidy_path[path_index[m]].chrom ) { 121 | path_index[m] ++ ; 122 | } 123 | while ( new_line.pos > markov_chain_information[m].sample_ploidy_path[path_index[m]].stop ) { 124 | path_index[m] ++ ; 125 | } 126 | 127 | /// record switches 128 | if ( previous_ploidy != markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ) { 129 | 130 | markov_chain_information[m].ploidy_switch_position.push_back( position.size() - 1 ) ; 131 | markov_chain_information[m].ploidy_switch.push_back( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ) ; 132 | } 133 | } 134 | } 135 | /// 136 | if ( options.genotype == false ) { 137 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 138 | vec emissions ; 139 | create_emissions_matrix( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy, new_line, options.ancestral_fixed, state_list[markov_chain_information.at(m).sample_ploidy_path[path_index[m]].ploidy], m, options.ancestry_pulses, emissions ) ; 140 | markov_chain_information[m].emission_probabilities.push_back( emissions ) ; 141 | } 142 | } 143 | 144 | /// create emissions matrix with genotypes 145 | else { 146 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 147 | vec emissions ; 148 | create_emissions_matrix_genotype( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy, new_line, options.ancestral_fixed, state_list[markov_chain_information.at(m).sample_ploidy_path[path_index[m]].ploidy], m, options.ancestry_pulses, emissions ) ; 149 | markov_chain_information[m].emission_probabilities.push_back( emissions ) ; 150 | } 151 | } 152 | } 153 | 154 | 155 | /// to avoid lookahead errors 156 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 157 | markov_chain_information[m].ploidy_switch_position.push_back( position.size() ) ; 158 | } 159 | } 160 | 161 | #endif 162 | 163 | -------------------------------------------------------------------------------- /src/transition_information.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRANSITION_INFORMATION_H 2 | #define __TRANSITION_INFORMATION_H 3 | 4 | /// binomial information for transitions to look up during computation 5 | class transition_information { 6 | public: 7 | int start_state ; /// will be start state 8 | int end_state ; 9 | int transition_count ; 10 | 11 | bool ibd_transition ; 12 | 13 | friend bool operator < ( transition_information a, transition_information b ) { 14 | if ( a.start_state > b.start_state ) return a.start_state > b.start_state ; 15 | else if ( a.end_state > b.end_state ) return a.end_state > b.end_state ; 16 | else return a.transition_count > b.transition_count ; 17 | } 18 | } ; 19 | 20 | /// object will be sets of sets of outcomes 21 | void enumerate_rows ( double ploidy, int row, vector &pulse_types, vector > &output ) { 22 | 23 | // if we have no chromosomes of this type to transition, skip 24 | if ( row == 0 ) { 25 | vector end (1, 0) ; 26 | output.push_back( end ) ; 27 | return ; 28 | } 29 | 30 | /// find all possible outcomes for a given input 31 | vector > end_states = multichoose( row, pulse_types ) ; 32 | 33 | /// end is then stored as the output pulse (index) and count (value) 34 | for ( int p = 0 ; p < end_states.size() ; p ++ ) { 35 | vector end ( pulse_types.size(), 0 ) ; 36 | for ( int m = 0 ; m < end_states[p].size() ; m ++ ) { 37 | end[end_states[p][m]] ++ ; 38 | } 39 | output.push_back(end) ; 40 | } 41 | } 42 | 43 | /// for each row, compute the binomial coefficient n!/k1!k2!...kn! 44 | /// possible_transitions is the sets for each pulse of possible final transitions 45 | /// binom_coeffs is the final data object to store it 46 | void compute_binomial_coefficients ( vector > &transitions, vector &binomial_coefficients ) { 47 | 48 | for ( int t = 0 ; t < transitions.size() ; t ++ ) { 49 | double multiplier = 1 ; 50 | double sum = 0 ; 51 | for ( int pulse = 0 ; pulse < transitions[t].size() ; pulse ++ ) { 52 | sum += transitions[t][pulse] ; 53 | multiplier *= 1/factorial[transitions[t][pulse]] ; 54 | } 55 | multiplier *= factorial[sum] ; 56 | 57 | binomial_coefficients.push_back( multiplier ) ; 58 | } 59 | } 60 | 61 | void enumerate_combinations ( vector > &groups, vector > > &possible_transitions ) { 62 | 63 | /// to replace groups with updated groups 64 | vector > groups_so_far ; 65 | 66 | /// scroll across all pulses 67 | for ( int p = 0 ; p < possible_transitions.size() ; p ++ ) { 68 | 69 | /// record the groups that we have so far 70 | groups_so_far = groups ; 71 | 72 | /// then empty groups and rebuild the vector 73 | groups.clear() ; 74 | 75 | /// if empty we just make a vector with everything we've got 76 | if ( groups_so_far.empty() ) { 77 | 78 | /// just create a vector of these 79 | for ( int row = 0 ; row < possible_transitions[p].size() ; row ++ ) { 80 | vector new_group ( 1, row ) ; 81 | groups.push_back( new_group ) ; 82 | } 83 | } 84 | 85 | /// otherwise, just append to all existing groups and re-record 86 | else { 87 | for ( int g = 0 ; g < groups_so_far.size() ; g ++ ) { 88 | for ( int row = 0 ; row < possible_transitions[p].size() ; row ++ ) { 89 | groups.push_back( groups_so_far[g] ) ; 90 | groups.back().push_back( row ) ; 91 | } 92 | } 93 | } 94 | } 95 | } 96 | 97 | /// multiply across groups bcs to get ways of producing each matrix 98 | void compute_group_bcs ( vector > &groups, vector > &binomial_coefficients, vector &group_bcs ) { 99 | 100 | for ( int g = 0 ; g < groups.size() ; g ++ ) { 101 | group_bcs.push_back( 1 ) ; 102 | for ( int pulse = 0 ; pulse < groups[g].size() ; pulse ++ ) { 103 | group_bcs.back() *= binomial_coefficients[pulse][groups[g][pulse]] ; 104 | } 105 | } 106 | } 107 | 108 | /// figure out the end state for each group 109 | void determine_end_state( vector > > &possible_transitions, vector &new_transition, vector &group, vector &end ) { 110 | 111 | for ( int p = 0 ; p < possible_transitions.size() ; p ++ ) { 112 | for ( int e = 0 ; e < possible_transitions[p][group[p]].size() ; e ++ ) { 113 | transition_information t ; 114 | t.start_state = p ; 115 | t.end_state = e ; 116 | t.transition_count = possible_transitions[p][group[p]][e] ; 117 | end[e] += possible_transitions[p][group[p]][e] ; 118 | t.ibd_transition = false ; 119 | new_transition.push_back(t) ; 120 | } 121 | } 122 | } 123 | 124 | /// create map of states back onto their position in the state vector 125 | void create_state_map ( vector > &states, map, int> &state_map ) { 126 | for ( int s = 0 ; s < states.size() ; s ++ ) { 127 | state_map[states[s]] = s ; 128 | } 129 | } 130 | 131 | //// find all possible transitions between two adjacent markers and record 132 | void create_transition_information( double &ploidy, map, double > > > > &transition_matrix_information, vector > &states ) { 133 | 134 | /// skip if we already have the ploidy figured out 135 | if ( transition_matrix_information.find(ploidy) != transition_matrix_information.end() ) { 136 | return ; 137 | } 138 | 139 | //// resize vectors 140 | transition_matrix_information[ploidy].resize(states.size()) ; 141 | for ( int i = 0 ; i < states.size() ; i ++ ) { 142 | transition_matrix_information[ploidy][i].resize(states.size()) ; 143 | } 144 | 145 | /// generate vector of all possible pulse types 146 | vector pulse_types ; 147 | for ( int p = 0 ; p < states[0].size() ; p ++ ) { 148 | pulse_types.push_back( p ) ; 149 | } 150 | 151 | /// create state map for quick lookup at the end 152 | map,int> state_map ; 153 | create_state_map( states, state_map ) ; 154 | 155 | /// iterate aross all possible start states 156 | for ( int i = 0 ; i < states.size() ; i ++ ) { 157 | 158 | /// all combinations 159 | vector > > possible_transitions ; 160 | vector > binomial_coefficients ; 161 | 162 | /// iterate through all pulses to find possible outputs (rows) 163 | for ( int row = 0 ; row < states[i].size() ; row ++ ) { 164 | 165 | /// find all possible outputs for the pulse (row) 166 | vector > output ; 167 | enumerate_rows( ploidy, states[i][row], pulse_types, output ) ; 168 | possible_transitions.push_back( output ) ; 169 | 170 | /// find binomial coefficients 171 | vector pulse_bcs ; 172 | compute_binomial_coefficients( output, pulse_bcs ) ; 173 | binomial_coefficients.push_back( pulse_bcs ) ; 174 | } 175 | 176 | /// find all possible combinations among the possible transitions 177 | vector > groups ; 178 | enumerate_combinations( groups, possible_transitions ) ; 179 | 180 | /// compute group bcs 181 | vector group_bcs ; 182 | compute_group_bcs( groups, binomial_coefficients, group_bcs ) ; 183 | 184 | /// determine the end state fo each combination, count transitions 185 | for ( int g = 0 ; g < groups.size() ; g ++ ) { 186 | vector end ( pulse_types.size(), 0 ) ; 187 | vector new_transition ; 188 | determine_end_state( possible_transitions, new_transition, groups[g], end ) ; 189 | transition_matrix_information[ploidy][i][state_map[end]][new_transition] += group_bcs[g] ; 190 | } 191 | } 192 | } 193 | 194 | #endif 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /src/ahmms.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | copyright: Russ Corbett-Detig 4 | rucorbet@ucsc.edu 5 | 6 | Jesper Svedberg 7 | jsvedber@ucsc.edu 8 | 9 | This is software distributed under the gnu public license version 3. 10 | 11 | */ 12 | 13 | /// headers 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | // Includes specific for Ancestry_HMM-S 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include // ++++ REQUIRES C++11 ++++ 29 | 30 | 31 | /// linear algebra library is armadillo 32 | #define ARMA_NO_DEBUG 33 | #include 34 | using namespace arma ; 35 | using namespace std ; 36 | 37 | /// our header files in /src directory 38 | #include "selection_print_usage.h" // JS 39 | #include "factorial.h" 40 | #include "nchoosek.h" 41 | #include "selection_subsample.h" 42 | #include "multichoose.h" 43 | #include "multipermute.h" 44 | #include "normalize.h" 45 | #include "ancestry_pulse.h" 46 | #include "ploidy_path.h" 47 | #include "selection_class.h" // JS 48 | #include "selection_markov_chain.h" 49 | #include "read_samples.h" 50 | #include "pulses_to_ancestry.h" 51 | #include "compute_forward.h" 52 | #include "compute_backward.h" 53 | #include "forward_backward.h" 54 | #include "viterbi.h" 55 | #include "transition_information.h" 56 | #include "exponentiate_matrix.h" 57 | #include "selection_cmd_line.h" 58 | #include "create_transition_rates.h" 59 | #include "selection_read_cmd_line.h" 60 | #include "evaluate_vertex.h" 61 | #include "check_vertex.h" 62 | #include "sort_vertices.h" 63 | #include "create_pulses.h" 64 | #include "create_states.h" 65 | #include "input_line.h" 66 | #include "distribute_alleles.h" 67 | #include "binomial.h" 68 | #include "read_emissions.h" 69 | #include "genotype_emissions.h" 70 | #include "selection_read_input.h" 71 | #include "nelder_mead.h" 72 | #include "golden_search.h" 73 | #include "bootstrap.h" 74 | 75 | // Includes specific for Ancestry_HMM-S 76 | #include "selection_get_position.h" 77 | #include "selection_optimize_test_func.h" // Function for testing Nelder-Mead. Remove? 78 | #include "selection_fwd_iter.h" 79 | #include "selection_trajectory.h" 80 | #include "selection_split_vector.h" 81 | #include "selection_forward.h" 82 | #include "selection_stochastic_traj.h" 83 | #include "selection_transition_rates.h" 84 | 85 | 86 | 87 | 88 | int main ( int argc, char *argv[] ) { 89 | 90 | /// time tracking 91 | clock_t t = clock() ; 92 | clock_t total = clock() ; 93 | 94 | /// seed prng 95 | srand (t) ; 96 | 97 | // read cmd line 98 | cmd_line options ; 99 | cerr << "reading command line" ; t = clock(); 100 | options.read_cmd_line( argc, argv ) ; 101 | 102 | /// chain objects for each sample 103 | vector markov_chain_information ; 104 | 105 | /// get sample ids and ploidy from input file 106 | cerr << "\t\t\t\t" << (double) (clock() - t) << " ms\n" << "reading sample ids and ploidy" ; t = clock(); 107 | read_samples( markov_chain_information, options.sample_file, options.viterbi ) ; 108 | 109 | /// create states matrix 110 | cerr << "\t\t\t" << (double) (clock() - t) << " ms\n" << "creating states matrix" ; t = clock(); 111 | /// store all possible state space arranged by ploidy and then vector of state counts 112 | map > > state_list ; 113 | /// now create initial state list 114 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 115 | for ( int p = 0 ; p < markov_chain_information[m].sample_ploidy_path.size() ; p ++ ) { 116 | create_initial_states( markov_chain_information.at(m).sample_ploidy_path[p].ploidy, options.ancestry_pulses, state_list ) ; 117 | } 118 | } 119 | 120 | /// read in panels and update matrices 121 | cerr << "\t\t\t\t" << (double) (clock() - t) << " ms\n" << "reading data and creating emissions matrices\t" ; t = clock() ; 122 | /// store recombination rates and positions 123 | vector position ; 124 | vector recombination_rate ; 125 | vector chromosomes ; 126 | int sel_pos ; 127 | read_file( options, markov_chain_information, state_list, position, recombination_rate, chromosomes, sel_pos ) ; 128 | 129 | 130 | 131 | /// create basic transition information 132 | cerr << (double) (clock() - t) << " ms" << endl << "computing transition routes\t\t\t" ; t = clock() ; 133 | /// 3d map to look up by ploidy, start state, end state, and then relevant transition information 134 | map, double > > > > transition_matrix_information ; 135 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 136 | for ( int p = 0 ; p < markov_chain_information[m].sample_ploidy_path.size() ; p ++ ) { 137 | create_transition_information( markov_chain_information.at(m).sample_ploidy_path[p].ploidy, transition_matrix_information, state_list[markov_chain_information.at(m).sample_ploidy_path[p].ploidy] ) ; 138 | } 139 | } 140 | cerr << endl; 141 | 142 | 143 | // Below are ahmm-s specific options 144 | 145 | // If using grid search with --grid flag 146 | if (options.calc_grid == true) { 147 | int p_start = options.grid_pstart; 148 | int p_stop = options.grid_pstop; 149 | int p_step = options.grid_pstep; 150 | 151 | double s_start = options.grid_sstart; 152 | double s_stop = options.grid_sstop; 153 | 154 | if ( options.limit_sel_space == true ) { 155 | s_stop = selection_get_max_sel(options.grid_sstart, options.grid_sstop, options.grid_sstep, options.ancestry_pulses[1].proportion, options.ancestry_pulses[1].time, options.ne); 156 | } 157 | double s_step = options.grid_sstep; 158 | 159 | cerr << "Grid search. Likelihood calculated for values of selection between " << s_start << " and " << s_stop << endl; 160 | 161 | selection_grid(p_start, p_stop, p_step, s_start, s_stop, s_step, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list); 162 | return 0; 163 | } 164 | 165 | 166 | 167 | // If testing a single point using --site flag. 168 | if (options.test_point == true) { 169 | cerr << "Evaluating point: " << options.test_pos << ", " << options.test_sel << endl; 170 | 171 | map > sel_trajectories; 172 | vector > split_vecs; 173 | int testpos; 174 | selection point0; 175 | 176 | if (options.is_coord == true) { 177 | testpos = get_position(options.test_pos, position); 178 | 179 | if (testpos == -1) { 180 | cerr << "ERROR: specified site not found on chromosome" << endl; 181 | exit(1); 182 | } 183 | 184 | } 185 | else { 186 | testpos = options.test_pos; 187 | } 188 | 189 | point0.pos = testpos; 190 | point0.sel = 0; 191 | selection_evaluate_point_genotypes( point0, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list, split_vecs, sel_trajectories ) ; 192 | 193 | selection point1; 194 | point1.pos = testpos; 195 | point1.sel = options.test_sel; 196 | selection_evaluate_point_genotypes( point1, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list, split_vecs, sel_trajectories ) ; 197 | 198 | cout << "lnL for a selected site s=" << options.test_sel << " at position " << position[point0.pos] << " is: " << point1.lnl-point0.lnl << endl; 199 | 200 | return 0; 201 | } 202 | 203 | 204 | 205 | // If using Golden Section Search with --gss flag 206 | if (options.run_gss == true) { 207 | selection_golden_section(markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list); 208 | return 0; 209 | } 210 | 211 | 212 | return 0 ; 213 | } 214 | 215 | 216 | -------------------------------------------------------------------------------- /src/create_transition_rates.h: -------------------------------------------------------------------------------- 1 | #ifndef __CREATE_TRANSITION_RATES_H 2 | #define __CREATE_TRANSITION_RATES_H 3 | 4 | //// create transition matrix for a given admixture model 5 | void create_transition_matrix ( map > &transition_matrix , vector, double > > > &transition_info, vector &recombination_rate, vector &positions, double &number_chromosomes, mat &transition_rates ) { 6 | 7 | /// check if we already computed this for this sample ploidy 8 | if ( transition_matrix.find( number_chromosomes ) != transition_matrix.end() ) { 9 | return ; 10 | } 11 | 12 | /// else, have to create entire matrix 13 | /// first create data object of approporate size 14 | transition_matrix[number_chromosomes].resize(recombination_rate.size()) ; 15 | 16 | //// iterate across all positions and compute transition matrixes 17 | for ( int p = 1 ; p < recombination_rate.size() ; p ++ ) { 18 | 19 | /// create actual transition matrix 20 | transition_matrix[number_chromosomes][p].set_size( transition_info.size(), transition_info.size() ) ; 21 | transition_matrix[number_chromosomes][p].fill( 0 ) ; 22 | 23 | /// if onto a new chromosome 24 | if ( recombination_rate[p] > 0.4 ) { 25 | transition_matrix[number_chromosomes][p].fill(1/double(transition_info.size())) ; 26 | continue ; 27 | } 28 | 29 | /// otherwise normal transition rates 30 | /// create matrix of per bp changes 31 | mat site_transitions = transition_rates * recombination_rate[p] ; 32 | 33 | /// now extract the sum of all non-self transitions in a row, and take remainder 34 | for ( int i = 0 ; i < transition_rates.n_rows ; i ++ ) { 35 | double sum = 1 ; 36 | for ( int j = 0 ; j < transition_rates.n_rows ; j ++ ) { 37 | if ( j != i ) { 38 | sum -= site_transitions(i,j) ; 39 | } 40 | } 41 | site_transitions(i,i) = sum ; 42 | } 43 | 44 | /// create final transition matrix via exponentiation by squaring 45 | mat segment_transitions ; 46 | if ( recombination_rate[p] > 0.49 ) { 47 | segment_transitions = exp_matrix( site_transitions, 2 ) ; 48 | } 49 | else { 50 | segment_transitions = exp_matrix( site_transitions, positions[p] - positions[p-1] ) ; 51 | } 52 | 53 | /// population transitions by summing across all routes 54 | for ( int i = 0 ; i < transition_info.size() ; i ++ ) { 55 | for ( int j = 0 ; j < transition_info[i].size() ; j ++ ) { 56 | for ( std::map,double>::iterator t = transition_info[i][j].begin() ; t != transition_info[i][j].end() ; ++ t ) { 57 | double prob_t = 1 ; 58 | for ( int r = 0 ; r < t->first.size() ; r ++ ) { 59 | prob_t *= pow( segment_transitions(t->first[r].start_state,t->first[r].end_state), t->first[r].transition_count ) ; 60 | } 61 | transition_matrix[number_chromosomes][p](j,i) += prob_t * t->second ; 62 | } 63 | } 64 | } 65 | } 66 | } 67 | 68 | //// create all transition rates between ancestry types for a single chromosome 69 | mat create_transition_rates ( vector admixture_pulses, double n, vector ancestry_proportion ) { 70 | 71 | /// determine ancestry proportions based on the fraction of remainder 72 | for ( int p = 0 ; p < admixture_pulses.size() ; p ++ ) { 73 | admixture_pulses[p].proportion = admixture_pulses[p].fraction_of_remainder * ancestry_proportion[admixture_pulses[p].type] ; 74 | ancestry_proportion[admixture_pulses[p].type] -= admixture_pulses[p].proportion ; 75 | } 76 | 77 | //// sort by time so oldest events are at the top 78 | sort( admixture_pulses.begin(), admixture_pulses.end() ) ; 79 | 80 | /// all ancestry proprionts to this point are in units of final ancestry 81 | /// however before the pulses that came before, there was more 82 | double ancestry_accounted = 0 ; 83 | for ( int p = admixture_pulses.size() - 1 ; p > 0 ; p -- ) { 84 | double accounted_here = admixture_pulses[p].proportion ; 85 | admixture_pulses[p].proportion = admixture_pulses[p].proportion/( 1 - ancestry_accounted ) ; 86 | ancestry_accounted += accounted_here ; 87 | } 88 | 89 | /// matrix to hold transition rates 90 | mat transition_rates( admixture_pulses.size(), admixture_pulses.size(), fill::zeros ) ; 91 | 92 | /// iterate through all states 93 | for ( int s1 = 0 ; s1 < admixture_pulses.size() ; s1 ++ ) { 94 | for ( int s2 = 0 ; s2 < admixture_pulses.size() ; s2 ++ ) { 95 | 96 | //// self transition rates are just going to be 1-all others 97 | if ( s2 == s1 ) continue ; 98 | 99 | /// rates calculated one way if greater than 100 | if ( s1 > s2 ) { 101 | for ( int t = s1 ; t < admixture_pulses.size() ; t ++ ) { 102 | 103 | /// basic recombination rates in each epoch 104 | double rate ; 105 | if ( t != admixture_pulses.size() - 1 ) { 106 | rate = n * ( 1 - exp( (admixture_pulses[t+1].time-admixture_pulses[t].time)/n ) ) ; 107 | } 108 | else { 109 | rate = n * ( 1 - exp( (-1*admixture_pulses[t].time)/n ) ) ; 110 | } 111 | 112 | /// probability of no recombination in prior epochs 113 | /// will skip epoch of s1 since that's in the above equation by definition 114 | for ( int t2 = t - 1 ; t2 > s1 - 1 ; t2 -- ) { 115 | rate *= exp((admixture_pulses[t2+1].time-admixture_pulses[t2].time)/n) ; 116 | } 117 | 118 | /// now probability of not selecting other acnestry types 119 | for ( int a = t ; a > s2 ; a -- ) { 120 | rate *= ( 1 - admixture_pulses[a].proportion ) ; 121 | } 122 | 123 | /// now select the correct ancestry type 124 | if ( s2 != 0 ) { 125 | rate *= admixture_pulses[s2].proportion ; 126 | } 127 | 128 | transition_rates(admixture_pulses[s2].entry_order,admixture_pulses[s1].entry_order) += rate ; 129 | } 130 | } 131 | else { 132 | for ( int t = s2 ; t < admixture_pulses.size() ; t ++ ) { 133 | 134 | /// basic recombination rates in each epoch 135 | double rate ; 136 | if ( t != admixture_pulses.size() - 1 ) { 137 | rate = n * ( 1 - exp( (admixture_pulses[t+1].time-admixture_pulses[t].time)/n ) ) ; 138 | } 139 | else { 140 | rate = n * ( 1 - exp( (-1*admixture_pulses[t].time)/n ) ) ; 141 | } 142 | 143 | /// probability of no recombination in prior epochs 144 | /// will skip epoch of s1 since that's in the above equation by definition 145 | for ( int t2 = t - 1 ; t2 > s2 - 1 ; t2 -- ) { 146 | rate *= exp((admixture_pulses[t2+1].time-admixture_pulses[t2].time)/n) ; 147 | } 148 | 149 | /// now deal with selecting lineage of the correct ancestry type 150 | for ( int a = t ; a > s2 ; a -- ) { 151 | rate *= ( 1 - admixture_pulses[a].proportion ) ; 152 | } 153 | 154 | /// now select the correct ancestry type 155 | rate *= admixture_pulses[s2].proportion ; 156 | 157 | /// and augment by this rate for this epoch 158 | transition_rates(admixture_pulses[s2].entry_order,admixture_pulses[s1].entry_order) += rate ; 159 | } 160 | } 161 | } 162 | } 163 | 164 | return transition_rates.t() ; 165 | } 166 | 167 | #endif 168 | 169 | -------------------------------------------------------------------------------- /src/read_cmd_line.h: -------------------------------------------------------------------------------- 1 | #ifndef __READ_CMD_LINE_H 2 | #define __READ_CMD_LINE_H 3 | 4 | void cmd_line::read_cmd_line ( int argc, char *argv[] ) { 5 | 6 | ///defaults 7 | ancestral_fixed = false ; /// set to true for qtl or experimental evolution application if ancestral genotypes are known and at fixed frequencies. 8 | 9 | /// ideally we recommend pruning LD in advance 10 | minimum_distance = 0 ; /// minimum distance in morgans between sites to consider 11 | ne = 2e4 ; /// actually 2ne 12 | 13 | // time params to bound our search 14 | t_max = 10000 ; 15 | t_min = 1 ; 16 | p_max = 0.99999 ; 17 | p_min = 0.00001 ; 18 | t_length = 0.8 ; 19 | p_length = 0.8 ; 20 | 21 | /// if set, we clear once 22 | bool clear = false ; 23 | 24 | /// error rates 25 | error_rates = false ; 26 | 27 | // the default behavior is a single pulse of ancestry 1 into ancestry 0 28 | ancestry_pulses.resize( 2 ) ; 29 | ancestry_pulses[0].type = 0 ; 30 | ancestry_pulses[1].type = 1 ; 31 | 32 | /// default is 50:50 with single pulse of 1 into 0 33 | ancestry_pulses[0].proportion = 0.5 ; 34 | ancestry_pulses[1].proportion = 0.5 ; 35 | ancestry_pulses[0].proportion_fixed = true ; 36 | ancestry_pulses[1].proportion_fixed = true ; 37 | 38 | /// also the ancestry proportions are known 39 | ancestry_proportion.assign(2,0.5) ; 40 | 41 | /// time is not fixed by default, pulse of 1 into 0 42 | /// does not matter, really since 0>1 would be identical in formulation 43 | ancestry_pulses[0].time = 3000 ; 44 | ancestry_pulses[0].time_fixed = true ; 45 | ancestry_pulses[1].time = 10 ; 46 | ancestry_pulses[1].time_fixed = false ; 47 | 48 | /// end parameter this will be in lnl units 49 | /// i.e. must obtain <= this amount of improvement between all vertices to quit 50 | tolerance = 1e-5 ; 51 | 52 | /// restart number 53 | n_restarts = -1 ; 54 | 55 | /// per site per read error rate 56 | error_rate = 0.01 ; 57 | 58 | /// genotype data rather than read data? 59 | genotype = false ; 60 | 61 | // viterbi 62 | viterbi = false ; 63 | 64 | /// output pulses rather than ancestry counts 65 | output_pulses = true ; 66 | 67 | /// set output precision 68 | precision = 10 ; 69 | 70 | /// sample file 71 | sample_file = "null" ; 72 | 73 | // intput file 74 | input_file = "null" ; 75 | 76 | /// bootstraps 77 | n_bootstraps = 0 ; 78 | block_size = 0 ; 79 | 80 | /// accept command line parameters 81 | for (int i=1; i 0 ) { 101 | new_ancestry_pulse.time_fixed = true ; 102 | } 103 | else { 104 | new_ancestry_pulse.time = new_ancestry_pulse.time * -1 ; 105 | new_ancestry_pulse.time_fixed = false ; 106 | } 107 | 108 | // if proporion is set, we are not estimating it 109 | ////// set proporiton with a negative number to provide the starting guess for this parameter 110 | if ( new_ancestry_pulse.proportion > 0 ) { 111 | new_ancestry_pulse.proportion_fixed = true ; 112 | } 113 | else { 114 | new_ancestry_pulse.proportion_fixed = false ; 115 | new_ancestry_pulse.proportion = -1 * new_ancestry_pulse.proportion ; 116 | } 117 | ancestry_pulses.push_back( new_ancestry_pulse ) ; 118 | ancestry_pulses.back().entry_order = ancestry_pulses.size() - 1 ; 119 | } 120 | 121 | //// for each ancestry type, set the total ancestry fraction 122 | //// this must be set and equal to all the ancestry types listed above 123 | if ( strcmp(argv[i],"-a") == 0 ) { 124 | ancestry_proportion.clear() ; 125 | int stop = atoi(argv[++i]) ; 126 | float sum = 0 ; 127 | for ( int l = 0 ; l < stop ; l ++ ) { 128 | ancestry_proportion.push_back( atof(argv[++i]) ) ; 129 | sum += ancestry_proportion.back() ; 130 | } 131 | 132 | //// check that ancestry proportions sum to one 133 | if ( sum < 0.9999 || sum > 1.0001 ) { 134 | cerr << "\n\n\t\t ERROR: ancestry proportions must sum to one\n\n" ; 135 | print_usage() ; 136 | exit(1) ; 137 | } 138 | } 139 | 140 | if ( strcmp(argv[i],"--help") == 0 ) { 141 | print_usage() ; 142 | exit(0) ; 143 | } 144 | 145 | if ( strcmp(argv[i],"-g") == 0 ) { 146 | genotype = true ; 147 | } 148 | 149 | if ( strcmp(argv[i],"--output-ancestry") == 0 ) { 150 | output_pulses = false ; 151 | } 152 | if ( strcmp(argv[i],"--precision") == 0 ) { 153 | precision = atoi(argv[++i]) ; 154 | cout.precision(precision) ; 155 | cerr.precision(precision) ; 156 | } 157 | 158 | if ( strcmp(argv[i],"-v") == 0 ) { 159 | viterbi = true ; 160 | } 161 | 162 | if ( strcmp(argv[i],"-r") == 0 ) { 163 | n_restarts = atoi(argv[++i]) ; 164 | } 165 | 166 | /// bootstraps supplied as '-b 167 | if ( strcmp(argv[i],"-b") == 0 ) { 168 | n_bootstraps = atoi(argv[++i]) ; 169 | block_size = atoi(argv[++i]) ; 170 | } 171 | 172 | /// to bound possible pulse times 173 | if ( strcmp(argv[i],"--tmax") == 0 ) { 174 | t_max = atof(argv[++i]) ; 175 | } 176 | if ( strcmp(argv[i],"--tmin") == 0 ) { 177 | t_min = atof(argv[++i]) ; 178 | } 179 | if ( strcmp(argv[i],"--pmin") == 0 ) { 180 | p_min = atof(argv[++i]) ; 181 | } 182 | if ( strcmp(argv[i],"--pmax") == 0 ) { 183 | p_max = atof(argv[++i]) ; 184 | } 185 | if ( strcmp(argv[i],"--tlength") == 0 ) { 186 | t_length = atof(argv[++i]) ; 187 | } 188 | if ( strcmp(argv[i],"--plength") == 0 ) { 189 | p_length = atof(argv[++i]) ; 190 | } 191 | if ( strcmp(argv[i],"--tolerance") == 0 ) { 192 | tolerance = atof(argv[++i]) ; 193 | } 194 | if ( strcmp(argv[i], "-e" ) == 0 ) { 195 | error_rate = atof(argv[++i]) ; 196 | } 197 | if ( strcmp(argv[i], "-E" ) == 0 ) { 198 | error_rates = true ; 199 | } 200 | if ( strcmp(argv[i],"--ne") == 0 ) { 201 | ne = 2 * atof(argv[++i]) ; 202 | } 203 | 204 | /// this version will allow inputting all samples in a single file with separate posterior output files 205 | if ( strcmp(argv[i],"-i") == 0 ) { 206 | input_file = string(argv[++i]) ; 207 | } 208 | 209 | /// sample file 210 | if ( strcmp(argv[i],"-s") == 0 ) { 211 | sample_file = string(argv[++i]) ; 212 | } 213 | 214 | if ( strcmp(argv[i],"-d") == 0 ) { 215 | minimum_distance = atof(argv[++i]) ; 216 | } 217 | if ( strcmp(argv[i],"--fix") == 0 ) { 218 | ancestral_fixed = true ; 219 | } 220 | } 221 | 222 | if ( input_file == "null" ) { 223 | cerr << "\n\n\t\tERROR: must provide input file\n\n\t\t\t-i [path/to/input_file]\n\n" ; 224 | print_usage() ; 225 | exit(1) ; 226 | } 227 | if ( sample_file == "null" ) { 228 | cerr << "\n\n\t\tERROR: must provide sample file\n\n\t\t\t-s [path/to/sample_file]\n\n" ; 229 | print_usage() ; 230 | exit(1) ; 231 | } 232 | if ( ancestry_proportion.size() > ancestry_pulses.size() ) { 233 | cerr << "\n\n\t\tERROR: insufficient ancestry pulses specified\n\n" ; 234 | print_usage() ; 235 | exit(1) ; 236 | } 237 | return ; 238 | } 239 | 240 | #endif 241 | 242 | -------------------------------------------------------------------------------- /src/viterbi.h: -------------------------------------------------------------------------------- 1 | #ifndef __VITERBI_H 2 | #define __VITERBI_H 3 | 4 | void markov_chain::viterbi( vector &position, vector &recombination_rate, map > > &states, vector &chrom, map > &transition_probabilites, vector &interploidy_transitions, bool output_pulses, vector &pulses ) { 5 | 6 | //// find largest states group that's possible for this sample 7 | int max_n_states = 2 ; 8 | for ( std::map > >::iterator p = states.begin() ; p != states.end() ; ++ p ) { 9 | if ( p->second.size() > max_n_states ) { 10 | max_n_states = p->second.size() ; 11 | } 12 | } 13 | 14 | //// probs and state paths as we transition through 15 | vector > viterbi_path(max_n_states) ; 16 | vector path_probs(max_n_states,0) ; 17 | 18 | //// starting probs 19 | for ( int s = 0 ; s < emission_probabilities[0].size() ; s ++ ) { 20 | 21 | path_probs[s] += log( emission_probabilities[0](s) ) + log( start_prob ); 22 | viterbi_path[s].push_back(s) ; 23 | } 24 | 25 | /// ploidy index to tract where in path we are 26 | int ploidy_index = 0 ; 27 | 28 | /// now go across all incoming paths for each state, and select highest prob path 29 | for ( int i = 1 ; i < emission_probabilities.size() ; i ++ ) { 30 | 31 | /// if we're at or past the next switch position 32 | bool ploidy_change = false ; 33 | if ( i >= ploidy_switch_position[ploidy_index+1] ) { 34 | ploidy_index ++ ; 35 | if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index-1] ) { 36 | ploidy_change = true ; 37 | } 38 | } 39 | 40 | /// to record best paths and their probabilities for each state 41 | vector max_paths(emission_probabilities[i].size()) ; 42 | vector max_path_probs(emission_probabilities[i].size(), -1.7976931348623157E+308) ; 43 | 44 | /// if the ploidy between adjacent states is identical, we can do a fairly normal transition 45 | if ( ploidy_change == false ) { 46 | for ( int s = 0 ; s < emission_probabilities[i].size() ; s ++ ) { 47 | for ( int p = 0 ; p < emission_probabilities[i].size() ; p ++ ) { 48 | /// find all ways of getting to the current state 49 | double path_prob_state = path_probs[p] + log( transition_probabilites[ploidy_switch[ploidy_index]][i](s,p) ) + log( emission_probabilities[i](s) ) ; 50 | /// check if this path prob is greater and update to that 51 | if ( path_prob_state > max_path_probs[s] ) { 52 | max_paths[s] = p ; 53 | max_path_probs[s] = path_prob_state ; 54 | } 55 | } 56 | } 57 | } 58 | 59 | /// if we have transitioned between adjancent ploidy tracts 60 | else { 61 | for ( int s = 0 ; s < emission_probabilities[i].size() ; s ++ ) { 62 | for ( int p = 0 ; p < emission_probabilities[i-1].size() ; p ++ ) { 63 | 64 | /// do normal interploidy transition 65 | double path_prob_state = path_probs[p] + log( interploidy_transitions[ploidy_switch[ploidy_index-1]-1](s,p) ) + log( emission_probabilities[i](s) ) ; 66 | 67 | /// if this is a between chromosomes transition all states are equally likely 68 | if ( transition_probabilites[ploidy_switch[ploidy_index]][i](0,0) < 0.75 ) { 69 | path_prob_state = path_probs[p] + log( emission_probabilities[i](s) ) ; 70 | } 71 | 72 | if ( path_prob_state > max_path_probs[s] ) { 73 | max_paths[s] = p ; 74 | max_path_probs[s] = path_prob_state ; 75 | } 76 | } 77 | } 78 | } 79 | 80 | /// update recorded paths and probabilities 81 | vector > new_viterbi_paths ( emission_probabilities[i].size() ) ; 82 | vector new_path_probs ( emission_probabilities[i].size() ) ; 83 | for ( int s = 0 ; s < emission_probabilities[i].size() ; s ++ ) { 84 | new_viterbi_paths[s] = viterbi_path[max_paths[s]] ; 85 | new_viterbi_paths[s].push_back(s) ; 86 | new_path_probs[s] = max_path_probs[s] ; 87 | } 88 | 89 | /// now replace viterbi paths/probs 90 | swap( viterbi_path, new_viterbi_paths ) ; 91 | swap( path_probs, new_path_probs ) ; 92 | } 93 | 94 | /// find optimal viterbi path 95 | int optimal_path = 0 ; 96 | for ( int p = 0 ; p < path_probs.size() ; p ++ ) { 97 | if ( path_probs[p] > path_probs[optimal_path] ) { 98 | optimal_path = p ; 99 | } 100 | } 101 | 102 | /// print optimal viterbi path for all states 103 | ofstream out ( output_file.c_str() ) ; 104 | string current_chrom = chrom[0] ; 105 | int start = 0 ; 106 | ploidy_index = 0 ; 107 | int current_state = viterbi_path[optimal_path][0] ; 108 | 109 | /// to keep tract of distance in morgans 110 | double recombination_distance = 0 ; 111 | double recombination_start = 0 ; 112 | 113 | /// iterate across paths 114 | for ( int p = 0 ; p < chrom.size() ; p ++ ) { 115 | 116 | /// if we're at or past the next switch position 117 | bool ploidy_change = false ; 118 | if ( p >= ploidy_switch_position[ploidy_index+1] ) { 119 | ploidy_index ++ ; 120 | if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index-1] ) { 121 | ploidy_change = true ; 122 | } 123 | } 124 | 125 | //// no ploidy change but switch in chromosome 126 | if ( ploidy_change == false && ( current_chrom != chrom[p] || viterbi_path[optimal_path][p] != current_state ) ) { 127 | if ( current_chrom != chrom[p] ) { 128 | out << current_chrom << "\t" << start << "\t" << position[p-1] << "\t" << recombination_start << "\t" << recombination_distance << "\t" ; 129 | start = 0 ; 130 | recombination_start = 0 ; 131 | recombination_distance = 0 ; 132 | } 133 | else { 134 | out << current_chrom << "\t" << start << "\t" << position[p]-1 << "\t" << recombination_start << "\t" << recombination_distance << "\t" ; 135 | start = position[p] ; 136 | recombination_start = recombination_distance ; 137 | } 138 | for ( int c = 0 ; c < states[ploidy_switch[ploidy_index]][current_state].size() - 1 ; c ++ ) { 139 | out << states[ploidy_switch[ploidy_index]][current_state][c] << "," ; 140 | } 141 | out << states[ploidy_switch[ploidy_index]][current_state].back() << endl ; 142 | 143 | current_state = viterbi_path[optimal_path][p] ; 144 | current_chrom = chrom[p] ; 145 | } 146 | 147 | else if ( ploidy_change == true ) { 148 | if ( current_chrom != chrom[p] ) { 149 | out << current_chrom << "\t" << start << "\t" << position[p-1] << "\t" << recombination_start << "\t" << recombination_distance << "\t" ; 150 | start = 0 ; 151 | recombination_start = 0 ; 152 | recombination_distance = 0 ; 153 | } 154 | else { 155 | out << current_chrom << "\t" << start << "\t" << position[p] - 1 << "\t" << recombination_start << "\t" << recombination_distance << "\t" ; 156 | start = position[p] ; 157 | recombination_start = recombination_distance ; 158 | } 159 | 160 | for ( int c = 0 ; c < states[ploidy_switch[ploidy_index-1]][current_state].size() - 1 ; c ++ ) { 161 | out << states[ploidy_switch[ploidy_index-1]][current_state][c] << "," ; 162 | } 163 | out << states[ploidy_switch[ploidy_index-1]][current_state].back() << endl ; 164 | current_chrom = chrom[p] ; 165 | current_state = viterbi_path[optimal_path][p] ; 166 | } 167 | 168 | /// add additional recombination 169 | if ( recombination_rate[p] < 0.4 ) { 170 | if ( p > 0 ) { 171 | recombination_distance += recombination_rate[p] * ( position[p] - position[p-1] ) ; 172 | } 173 | else { 174 | recombination_distance += recombination_rate[p] * position[p] ; 175 | } 176 | } 177 | } 178 | out << current_chrom << "\t" << start << "\t" << position.back() << "\t" << recombination_start << "\t" << recombination_distance << "\t" ; 179 | for ( int c = 0 ; c < states[ploidy_switch[ploidy_index]][current_state].size() - 1 ; c ++ ) { 180 | out << states[ploidy_switch[ploidy_index]][current_state][c] << "," ; 181 | } 182 | out << states[ploidy_switch[ploidy_index]][current_state].back() << endl ; 183 | } 184 | 185 | #endif 186 | -------------------------------------------------------------------------------- /src/nelder_mead.h: -------------------------------------------------------------------------------- 1 | #ifndef __NELDER_MEAD_H 2 | #define __NELDER_MEAD_H 3 | 4 | /// identify centroid for simplex 5 | vector create_centroid ( vector > &vertices ) { 6 | vector centroid = vertices[0] ; 7 | for ( int p = 0 ; p < vertices[0].size() ; p ++ ) { 8 | if ( centroid[p].time_fixed == false ) { 9 | for ( int v = 1 ; v < vertices.size()-1 ; v ++ ) { 10 | centroid[p].time += vertices[v][p].time ; 11 | } 12 | centroid[p].time /= ( vertices.size() - 1 ) ; 13 | } 14 | if ( centroid[p].proportion_fixed == false ) { 15 | for ( int v = 1 ; v < vertices.size()-1 ; v ++ ) { 16 | centroid[p].fraction_of_remainder += vertices[v][p].fraction_of_remainder ; 17 | } 18 | centroid[p].fraction_of_remainder /= ( vertices.size() - 1 ) ; 19 | } 20 | } 21 | return centroid ; 22 | } 23 | 24 | /// create reflection point 25 | vector create_reflection ( vector ¢roid, vector &worst, double &alpha ) { 26 | vector reflection = centroid ; 27 | for ( int p = 0 ; p < centroid.size() ; p ++ ) { 28 | if ( centroid[p].time_fixed == false ) { 29 | reflection[p].time = centroid[p].time + alpha * ( centroid[p].time - worst[p].time ) ; 30 | } 31 | if ( centroid[p].proportion_fixed == false ) { 32 | reflection[p].fraction_of_remainder = centroid[p].fraction_of_remainder + alpha * ( centroid[p].fraction_of_remainder - worst[p].fraction_of_remainder ) ; 33 | } 34 | } 35 | return reflection ; 36 | } 37 | 38 | /// create expansion, contraction, etc 39 | vector create_expansion ( vector ¢roid, vector &worst, double &scale ) { 40 | 41 | vector expansion = centroid ; 42 | for ( int p = 0 ; p < centroid.size() ; p ++ ) { 43 | if ( centroid[p].time_fixed == false ) { 44 | expansion[p].time = centroid[p].time + scale * ( worst[p].time - centroid[p].time ) ; 45 | } 46 | if ( centroid[p].proportion_fixed == false ) { 47 | expansion[p].fraction_of_remainder = centroid[p].fraction_of_remainder + scale * ( worst[p].fraction_of_remainder - centroid[p].fraction_of_remainder ) ; 48 | } 49 | } 50 | return expansion ; 51 | } 52 | 53 | /// retstart nelder_mead with random simplex 54 | void random_restart( vector > &vertices, cmd_line &options ) { 55 | for ( int v = 0 ; v < vertices.size() ; v ++ ) { 56 | for ( int p = 0 ; p < vertices[v].size() ; p ++ ) { 57 | if ( vertices[v][p].time_fixed == false ) { 58 | vertices[v][p].time = options.t_min + rand()/((double)RAND_MAX) * ( options.t_max - options.t_min ) ; 59 | } 60 | if ( vertices[v][p].proportion_fixed == false ) { 61 | vertices[v][p].fraction_of_remainder = options.p_min + rand()/((double)RAND_MAX) * ( options.p_max - options.p_min ) ; 62 | } 63 | } 64 | check_vertex( vertices[v], options ) ; 65 | } 66 | } 67 | 68 | /// optimation via amoeba search 69 | vector nelder_mead_search( vector > &vertices, cmd_line &options, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, map > > &state_changes ) { 70 | 71 | /// amoeba search parameters 72 | double alpha = 1 ; 73 | double gamma = 2 ; 74 | double rho = 0.5 ; 75 | double sigma = 0.5 ; 76 | 77 | /// record best optimum so far 78 | vector best_optimum = vertices[0] ; 79 | double best_lnl = -1.7976931348623157E+308 ; // set initial best to minimum double value, if we underflow that, shiiittttt 80 | 81 | for ( int r = 0 ; r < options.n_restarts+1 ; r ++ ) { 82 | 83 | /// lnl of each vertex to be stored in vector 84 | vector lnls (vertices.size(),0) ; 85 | 86 | /// evaluate likelihood of initial points 87 | for ( int v = 0 ; v < vertices.size() ; v ++ ) { 88 | lnls[v] = evaluate_vertex( vertices[v], markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 89 | } 90 | 91 | /// improvement in new position relative to old position both in lnl units 92 | int iteration = 0 ; 93 | while ( lnls[0]-lnls.back() > options.tolerance ) { 94 | 95 | cerr << r << "\t" << iteration << "\t" << lnls[0]-lnls.back() << "\t" ; 96 | iteration ++ ; 97 | 98 | cerr << endl ; 99 | for ( int v = 0 ; v < vertices.size() ; v ++ ) { 100 | cerr << "\tvertex: " << v << "\t" << "lnl: " << lnls[v] << endl ; 101 | for ( int p = 0 ; p < vertices[v].size() ; p ++ ) { 102 | vertices[v][p].print() ; 103 | } 104 | } 105 | cerr << endl ; 106 | 107 | /// create centroid and evaluate 108 | vector centroid = create_centroid ( vertices ) ; 109 | 110 | /// reflection point 111 | vector reflection = create_reflection( centroid, vertices.back(), alpha ) ; 112 | check_vertex( reflection, options ) ; 113 | double lnl_reflection = evaluate_vertex( reflection, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 114 | 115 | cerr << "\treflection\t" << "lnl: " << lnl_reflection << endl ; 116 | for ( int p = 0 ; p < reflection.size() ; p ++ ) { 117 | reflection[p].print() ; 118 | } 119 | cerr << endl ; 120 | 121 | if ( lnl_reflection < lnls[0] && lnl_reflection > lnls[lnls.size()-2] ) { 122 | cerr << "\t\t\t\tREFLECTION ACCEPTED\n" ; 123 | lnls.back() = lnl_reflection ; 124 | vertices.back() = reflection ; 125 | sort_vertices( vertices, lnls ) ; 126 | continue ; 127 | } 128 | 129 | /// expansion point 130 | else if ( lnl_reflection >= lnls[0] ) { 131 | vector expansion = create_expansion( centroid, reflection, gamma ) ; 132 | check_vertex( expansion, options ) ; 133 | double lnl_expansion = evaluate_vertex( expansion, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 134 | 135 | cerr << "\texpansion\t" << "lnl: " << lnl_expansion << endl ; 136 | for ( int p = 0 ; p < expansion.size() ; p ++ ) { 137 | expansion[p].print() ; 138 | } 139 | cerr << endl ; 140 | 141 | if ( lnl_expansion > lnl_reflection ) { 142 | cerr << "\t\t\t\tEXPANSION ACCEPTED\n" ; 143 | lnls.back() = lnl_expansion ; 144 | vertices.back() = expansion ; 145 | sort_vertices( vertices, lnls ) ; 146 | continue ; 147 | } 148 | else { 149 | cerr << "\t\t\t\tREFLECTION ACCEPTED\n" ; 150 | lnls.back() = lnl_reflection ; 151 | vertices.back() = reflection ; 152 | sort_vertices( vertices, lnls ) ; 153 | continue ; 154 | } 155 | } 156 | 157 | /// contraction 158 | vector contraction = create_expansion( centroid, vertices.back(), rho ) ; 159 | check_vertex( contraction, options ) ; 160 | double lnl_contraction = evaluate_vertex( contraction, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 161 | 162 | cerr << "\tcontraction\t" << "lnl: " << lnl_contraction << endl ; 163 | for ( int p = 0 ; p < contraction.size() ; p ++ ) { 164 | contraction[p].print() ; 165 | } 166 | cerr << endl ; 167 | 168 | if ( lnl_contraction > lnls.back() ) { 169 | cerr << "\t\t\t\tCONTRACTION ACCEPTED\n" ; 170 | lnls.back() = lnl_contraction ; 171 | vertices.back() = contraction ; 172 | sort_vertices( vertices, lnls ) ; 173 | continue ; 174 | } 175 | 176 | /// shrink 177 | else { 178 | cerr << "\t\t\t\tSHRINK ACCEPTED\n" ; 179 | for ( int v = 1 ; v < vertices.size() ; v ++ ) { 180 | vector replacement = create_expansion( vertices[0], vertices[v], sigma ) ; 181 | check_vertex( replacement, options ) ; 182 | lnls[v] = evaluate_vertex( replacement, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 183 | vertices[v] = replacement ; 184 | } 185 | sort_vertices( vertices, lnls ) ; 186 | continue ; 187 | } 188 | } 189 | 190 | if ( best_lnl < lnls[0] ) { 191 | best_optimum = vertices[0] ; 192 | best_lnl = lnls[0] ; 193 | } 194 | 195 | cerr << "restart: " << r << "\t" << lnls[0]-lnls.back() << endl ; 196 | for ( int p = 0 ; p < vertices[0].size() ; p ++ ) { 197 | vertices[0][p].print() ; 198 | } 199 | cerr << endl ; 200 | 201 | random_restart( vertices, options ) ; 202 | for ( int v = 0 ; v < vertices.size() ; v++ ) { 203 | lnls[v] = evaluate_vertex( vertices[v], markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes ) ; 204 | } 205 | sort_vertices( vertices, lnls ) ; 206 | } 207 | 208 | /// sort points by lnl, with point 0 having highest lnl 209 | /// sort_vertices( vertices, lnls ) ; 210 | return best_optimum ; 211 | } 212 | 213 | 214 | #endif 215 | 216 | -------------------------------------------------------------------------------- /src/ancestry_hmm.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | copyright: Russ Corbett-Detig 4 | rucorbet@ucsc.edu 5 | 6 | This is software distributed under the gnu public license version 3. 7 | 8 | */ 9 | 10 | /// headers 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | using namespace std ; 19 | 20 | /// linear algebra library is armadillo 21 | #define ARMA_NO_DEBUG 22 | #include 23 | using namespace arma ; 24 | 25 | /// our header files in /src directory 26 | #include "print_usage.h" 27 | #include "factorial.h" 28 | #include "nchoosek.h" 29 | #include "subsample.h" 30 | #include "multichoose.h" 31 | #include "multipermute.h" 32 | #include "normalize.h" 33 | #include "ancestry_pulse.h" 34 | #include "ploidy_path.h" 35 | #include "markov_chain.h" 36 | #include "read_samples.h" 37 | #include "pulses_to_ancestry.h" 38 | #include "compute_forward.h" 39 | #include "compute_backward.h" 40 | #include "forward_backward.h" 41 | #include "viterbi.h" 42 | #include "transition_information.h" 43 | #include "exponentiate_matrix.h" 44 | #include "cmd_line.h" 45 | #include "create_transition_rates.h" 46 | #include "read_cmd_line.h" 47 | #include "evaluate_vertex.h" 48 | #include "check_vertex.h" 49 | #include "sort_vertices.h" 50 | #include "create_pulses.h" 51 | #include "create_states.h" 52 | #include "input_line.h" 53 | #include "distribute_alleles.h" 54 | #include "binomial.h" 55 | #include "read_emissions.h" 56 | #include "genotype_emissions.h" 57 | #include "read_input.h" 58 | #include "nelder_mead.h" 59 | #include "golden_search.h" 60 | #include "bootstrap.h" 61 | 62 | int main ( int argc, char *argv[] ) { 63 | 64 | /// time tracking 65 | clock_t t = clock() ; 66 | clock_t total = clock() ; 67 | 68 | /// seed prng 69 | srand (t) ; 70 | 71 | // read cmd line 72 | cmd_line options ; 73 | cerr << "reading command line" ; t = clock(); 74 | options.read_cmd_line( argc, argv ) ; 75 | 76 | /// chain objects for each sample 77 | vector markov_chain_information ; 78 | 79 | /// get sample ids and ploidy from input file 80 | cerr << "\t\t\t\t" << (double) (clock() - t) << " ms\n" << "reading sample ids and ploidy" ; t = clock(); 81 | read_samples( markov_chain_information, options.sample_file, options.viterbi ) ; 82 | 83 | /// create states matrix 84 | cerr << "\t\t\t" << (double) (clock() - t) << " ms\n" << "creating states matrix" ; t = clock(); 85 | /// store all possible state space arranged by ploidy and then vector of state counts 86 | map > > state_list ; 87 | /// now create initial state list 88 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 89 | for ( int p = 0 ; p < markov_chain_information[m].sample_ploidy_path.size() ; p ++ ) { 90 | create_initial_states( markov_chain_information.at(m).sample_ploidy_path[p].ploidy, options.ancestry_pulses, state_list ) ; 91 | } 92 | } 93 | 94 | /// read in panels and update matrices 95 | cerr << "\t\t\t\t" << (double) (clock() - t) << " ms\n" << "reading data and creating emissions matrices\t" ; t = clock() ; 96 | /// store recombination rates and positions 97 | vector position ; 98 | vector recombination_rate ; 99 | vector chromosomes ; 100 | read_file( options, markov_chain_information, state_list, position, recombination_rate, chromosomes ) ; 101 | 102 | /// create basic transition information 103 | cerr << (double) (clock() - t) << " ms" << endl << "computing transition routes\t\t\t" ; t = clock() ; 104 | /// 3d map to look up by ploidy, start state, end state, and then relevant transition information 105 | map, double > > > > transition_matrix_information ; 106 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 107 | for ( int p = 0 ; p < markov_chain_information[m].sample_ploidy_path.size() ; p ++ ) { 108 | create_transition_information( markov_chain_information.at(m).sample_ploidy_path[p].ploidy, transition_matrix_information, state_list[markov_chain_information.at(m).sample_ploidy_path[p].ploidy] ) ; 109 | } 110 | } 111 | 112 | /// create admixture model(s) 113 | cerr << (double) (clock() - t) << " ms" << endl << "creating initial admixture model(s)\t\t" ; t = clock(); 114 | vector > vertices ; 115 | int nparams = create_pulses( vertices, options ) ; 116 | 117 | /// set number of restarts if unspecified default is factorial * 2 118 | cerr << (double) (clock() - t) << " ms" << endl << "estimating " << nparams << " parameters\n" ; 119 | if ( options.n_restarts < 0 ) { 120 | options.n_restarts = factorial[nparams] * 2 ; 121 | } 122 | 123 | /// vector of models to be evaluated and optimized 124 | vector optimum ; 125 | 126 | /// if there are params to estimate, do amoeba search 127 | if ( nparams > 1 ) { 128 | cerr << "starting nelder-mead search\t\t" << endl ; 129 | optimum = nelder_mead_search( vertices, options, markov_chain_information, transition_matrix_information, recombination_rate, position, state_list ) ; 130 | cerr << "\n\t\t\t\t\tSEARCH TIME: " << (double) (clock() - t) << " ms" << endl << endl << "optimal model found:\n\n" ; 131 | } 132 | 133 | /// or do golden section line search for single parameter optimization 134 | else if ( nparams == 1 ) { 135 | cerr << "starting golden section search\t\t" << endl ; 136 | optimum = golden_search( options, markov_chain_information, transition_matrix_information, recombination_rate, position, state_list ) ; 137 | cerr << "\n\t\t\t\t\tSEARCH TIME: " << (double) (clock() - t) << " ms" << endl << endl << "optimal model found:\n\n" ; 138 | } 139 | 140 | /// otherwise just evaluate the supplied model 141 | else { 142 | optimum = options.ancestry_pulses ; 143 | cerr << endl << endl << "evaluating supplied model:\n\n" ; 144 | } 145 | 146 | /// print model 147 | cerr << "\ttype\ttime\tproportion\n" ; 148 | cout << "optimum: \n" ; 149 | cout << "\ttype\ttime\tproportion\n" ; 150 | 151 | vector a = options.ancestry_proportion ; 152 | for ( int p = 0 ; p < optimum.size() ; p ++ ) { 153 | optimum[p].proportion = a[optimum[p].type] * optimum[p].fraction_of_remainder ; 154 | a[optimum[p].type] -= optimum[p].proportion ; 155 | cerr << "\t" << optimum[p].type << "\t" << optimum[p].time << "\t" << optimum[p].proportion << endl ; 156 | cout << "\t" << optimum[p].type << "\t" << optimum[p].time << "\t" << optimum[p].proportion << endl ; 157 | } 158 | 159 | /// bootstrap models as necessary 160 | if ( options.n_bootstraps > 0 ) { 161 | cerr << "computing " << options.n_bootstraps << " bootstrap models" << endl ; 162 | 163 | vector > bootstrap = bootstraps( vertices, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list, chromosomes ) ; 164 | 165 | //// print out bootstrapped admixture models 166 | for ( int b = 0 ; b < options.n_bootstraps ; b ++ ) { 167 | 168 | cout << "bootstrap: " << b << endl ; 169 | cout << "\ttype\ttime\tproportion\n" ; 170 | 171 | cerr << endl << "bootstrap: " << b << endl ; 172 | cerr << "\ttype\ttime\tproportion\n" ; 173 | 174 | vector a = options.ancestry_proportion ; 175 | for ( int p = 0 ; p < optimum.size() ; p ++ ) { 176 | bootstrap[b][p].proportion = a[optimum[p].type] * bootstrap[b][p].fraction_of_remainder ; 177 | a[bootstrap[b][p].type] -= bootstrap[b][p].proportion ; 178 | cerr << "\t" << bootstrap[b][p].type << "\t" << bootstrap[b][p].time << "\t" << bootstrap[b][p].proportion << endl ; 179 | cout << "\t" << bootstrap[b][p].type << "\t" << bootstrap[b][p].time << "\t" << bootstrap[b][p].proportion << endl ; 180 | } 181 | } 182 | } 183 | 184 | /// create transition rates for the optimal or supplied set of pulses 185 | cerr << endl << "creating per morgan transition rates\t\t" ; t = clock(); 186 | mat transition_rates = create_transition_rates( optimum, options.ne, options.ancestry_proportion ) ; 187 | 188 | /// create transition information 189 | cerr << (double) (clock() - t) << " ms" << endl << "creating transition matrices\t\t\t" ; t = clock(); 190 | map > transition_matrix ; 191 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 192 | create_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information.at(m).number_chromosomes], recombination_rate, position, markov_chain_information.at(m).number_chromosomes, transition_rates ) ; 193 | for ( int p = 0 ; p < markov_chain_information[m].ploidy_switch.size() ; p ++ ) { 194 | create_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information[m].ploidy_switch[p]], recombination_rate, position, markov_chain_information[m].ploidy_switch[p], transition_rates ) ; 195 | } 196 | } 197 | 198 | //// create interploidy transition matrix 199 | vector interploidy_transitions = create_interploidy_transitions ( state_list, optimum, options.ancestry_proportion ) ; 200 | 201 | /// output viterbi path for optimized model 202 | if ( options.viterbi == true ) { 203 | cerr << (double) (clock() - t) << " ms" << endl << "viterbi posterior decoding and printing\t" ; t = clock() ; 204 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 205 | markov_chain_information[m].viterbi( position, recombination_rate, state_list, chromosomes, transition_matrix, interploidy_transitions, options.output_pulses, optimum ) ; 206 | } 207 | } 208 | 209 | /// output forward-backward full probability distribution by default 210 | else { 211 | cerr << (double) (clock() - t) << " ms" << endl << "computing forward probabilities\t" ; t = clock() ; 212 | double lnl = 0 ; 213 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 214 | lnl += markov_chain_information[m].compute_forward_probabilities( transition_matrix, interploidy_transitions ) ; 215 | } 216 | 217 | cerr << "lnl: " << lnl << "\t\t" << (double) (clock() - t) << " ms" << endl << "computing backward probabilities\t\t\t" ; t = clock() ; 218 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 219 | markov_chain_information[m].compute_backward_probabilities( transition_matrix, interploidy_transitions ) ; 220 | } 221 | 222 | cerr << (double) (clock() - t) << " ms" << endl << "forward-backward posterior decoding and printing\t\t\t" ; t = clock() ; 223 | for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { 224 | markov_chain_information[m].combine_prob( position, state_list, chromosomes, options.output_pulses, optimum ) ; 225 | } 226 | } 227 | 228 | cerr << (double) (clock() - t) << " ms" << endl ; 229 | cerr << "total run time:\t\t\t" << (double) (clock() - total) << " ms" << endl ; 230 | 231 | return 0 ; 232 | } 233 | 234 | 235 | -------------------------------------------------------------------------------- /src/selection_read_cmd_line.h: -------------------------------------------------------------------------------- 1 | #ifndef __SELECTION_READ_CMD_LINE_H 2 | #define __SELECTION_READ_CMD_LINE_H 3 | 4 | void cmd_line::read_cmd_line ( int argc, char *argv[] ) { 5 | 6 | ///defaults 7 | ancestral_fixed = false ; /// set to true for qtl or experimental evolution application if ancestral genotypes are known and at fixed frequencies. 8 | 9 | /// ideally we recommend pruning LD in advance 10 | minimum_distance = 0 ; /// minimum distance in morgans between sites to consider 11 | ne = 2e4 ; /// actually 2ne 12 | 13 | // time params to bound our search 14 | t_max = 10000 ; 15 | t_min = 1 ; 16 | p_max = 0.99999 ; 17 | p_min = 0.00001 ; 18 | t_length = 0.8 ; 19 | p_length = 0.8 ; 20 | 21 | /// if set, we clear once 22 | bool clear = false ; 23 | 24 | /// error rates 25 | error_rates = false ; 26 | 27 | // the default behavior is a single pulse of ancestry 1 into ancestry 0 28 | ancestry_pulses.resize( 2 ) ; 29 | ancestry_pulses[0].type = 0 ; 30 | ancestry_pulses[1].type = 1 ; 31 | 32 | /// default is 50:50 with single pulse of 1 into 0 33 | ancestry_pulses[0].proportion = 0.5 ; 34 | ancestry_pulses[1].proportion = 0.5 ; 35 | ancestry_pulses[0].proportion_fixed = true ; 36 | ancestry_pulses[1].proportion_fixed = true ; 37 | 38 | /// also the ancestry proportions are known 39 | ancestry_proportion.assign(2,0.5) ; 40 | 41 | /// time is not fixed by default, pulse of 1 into 0 42 | /// does not matter, really since 0>1 would be identical in formulation 43 | ancestry_pulses[0].time = 3000 ; 44 | ancestry_pulses[0].time_fixed = true ; 45 | ancestry_pulses[1].time = 10 ; 46 | ancestry_pulses[1].time_fixed = false ; 47 | 48 | /// end parameter this will be in lnl units 49 | /// i.e. must obtain <= this amount of improvement between all vertices to quit 50 | tolerance = 1e-5 ; 51 | 52 | /// restart number 53 | n_restarts = -1 ; 54 | 55 | /// per site per read error rate 56 | error_rate = 0.01 ; 57 | 58 | /// genotype data rather than read data? 59 | genotype = false ; 60 | 61 | // viterbi 62 | viterbi = false ; 63 | 64 | /// output pulses rather than ancestry counts 65 | output_pulses = true ; 66 | 67 | /// set output precision 68 | precision = 10 ; 69 | 70 | /// sample file 71 | sample_file = "null" ; 72 | 73 | // intput file 74 | input_file = "null" ; 75 | 76 | /// bootstraps 77 | n_bootstraps = 0 ; 78 | block_size = 0 ; 79 | 80 | // selection 81 | is_limit = false ; 82 | calc_grid = false; 83 | test_point = false; 84 | is_coord = false; 85 | 86 | // if --chr_win is not set, read whole chromosome 87 | limit_win_start = 0; 88 | limit_win_end = 1000000000; 89 | 90 | win_unit = "p"; // set default window size unit to percent 91 | win_percent = 100; // default window size in percent 92 | //win_morgan = 0.1; // default window size in morgans 93 | 94 | // golden section search (gss) parameters 95 | gs_precision = 1e-5; // minimum recision in estimation of selection coeffient in gss 96 | gs_sstep = 0.001; 97 | 98 | // number of runs for the stochastic trajectory function 99 | stochastic_reps = 1000; 100 | 101 | // optimizes upper bound for the selection coefficient search space. Makes things faster. 102 | // Default true. Can be turned off with --full_selection_space 103 | limit_sel_space = true; 104 | 105 | /// accept command line parameters 106 | for (int i=1; i 0 ) { 126 | new_ancestry_pulse.time_fixed = true ; 127 | } 128 | else { 129 | new_ancestry_pulse.time = new_ancestry_pulse.time * -1 ; 130 | new_ancestry_pulse.time_fixed = false ; 131 | } 132 | 133 | // if proporion is set, we are not estimating it 134 | ////// set proporiton with a negative number to provide the starting guess for this parameter 135 | if ( new_ancestry_pulse.proportion > 0 ) { 136 | new_ancestry_pulse.proportion_fixed = true ; 137 | } 138 | else { 139 | new_ancestry_pulse.proportion_fixed = false ; 140 | new_ancestry_pulse.proportion = -1 * new_ancestry_pulse.proportion ; 141 | } 142 | ancestry_pulses.push_back( new_ancestry_pulse ) ; 143 | ancestry_pulses.back().entry_order = ancestry_pulses.size() - 1 ; 144 | } 145 | 146 | 147 | 148 | 149 | 150 | //// for each ancestry type, set the total ancestry fraction 151 | //// this must be set and equal to all the ancestry types listed above 152 | if ( strcmp(argv[i],"-a") == 0 ) { 153 | ancestry_proportion.clear() ; 154 | int stop = atoi(argv[++i]) ; 155 | float sum = 0 ; 156 | for ( int l = 0 ; l < stop ; l ++ ) { 157 | ancestry_proportion.push_back( atof(argv[++i]) ) ; 158 | sum += ancestry_proportion.back() ; 159 | } 160 | 161 | //// check that ancestry proportions sum to one 162 | if ( sum < 0.9999 || sum > 1.0001 ) { 163 | cerr << "\n\n\t\t ERROR: ancestry proportions must sum to one\n\n" ; 164 | print_usage() ; 165 | exit(1) ; 166 | } 167 | } 168 | 169 | if ( strcmp(argv[i],"--help") == 0 ) { 170 | print_usage() ; 171 | exit(0) ; 172 | } 173 | 174 | if ( strcmp(argv[i],"-g") == 0 ) { 175 | genotype = true ; 176 | } 177 | 178 | if ( strcmp(argv[i],"--output-ancestry") == 0 ) { 179 | output_pulses = false ; 180 | } 181 | if ( strcmp(argv[i],"--precision") == 0 ) { 182 | precision = atoi(argv[++i]) ; 183 | cout.precision(precision) ; 184 | cerr.precision(precision) ; 185 | } 186 | 187 | if ( strcmp(argv[i],"-v") == 0 ) { 188 | viterbi = true ; 189 | } 190 | 191 | if ( strcmp(argv[i],"-r") == 0 ) { 192 | n_restarts = atoi(argv[++i]) ; 193 | } 194 | 195 | /// bootstraps supplied as '-b 196 | if ( strcmp(argv[i],"-b") == 0 ) { 197 | n_bootstraps = atoi(argv[++i]) ; 198 | block_size = atoi(argv[++i]) ; 199 | } 200 | 201 | /// to bound possible pulse times 202 | if ( strcmp(argv[i],"--tmax") == 0 ) { 203 | t_max = atof(argv[++i]) ; 204 | } 205 | if ( strcmp(argv[i],"--tmin") == 0 ) { 206 | t_min = atof(argv[++i]) ; 207 | } 208 | if ( strcmp(argv[i],"--pmin") == 0 ) { 209 | p_min = atof(argv[++i]) ; 210 | } 211 | if ( strcmp(argv[i],"--pmax") == 0 ) { 212 | p_max = atof(argv[++i]) ; 213 | } 214 | if ( strcmp(argv[i],"--tlength") == 0 ) { 215 | t_length = atof(argv[++i]) ; 216 | } 217 | if ( strcmp(argv[i],"--plength") == 0 ) { 218 | p_length = atof(argv[++i]) ; 219 | } 220 | if ( strcmp(argv[i],"--tolerance") == 0 ) { 221 | tolerance = atof(argv[++i]) ; 222 | } 223 | if ( strcmp(argv[i], "-e" ) == 0 ) { 224 | error_rate = atof(argv[++i]) ; 225 | } 226 | if ( strcmp(argv[i], "-E" ) == 0 ) { 227 | error_rates = true ; 228 | } 229 | if ( strcmp(argv[i],"--ne") == 0 ) { 230 | ne = 2 * atof(argv[++i]) ; 231 | } 232 | 233 | /// this version will allow inputting all samples in a single file with separate posterior output files 234 | if ( strcmp(argv[i],"-i") == 0 ) { 235 | input_file = string(argv[++i]) ; 236 | } 237 | 238 | /// sample file 239 | if ( strcmp(argv[i],"-s") == 0 ) { 240 | sample_file = string(argv[++i]) ; 241 | } 242 | 243 | if ( strcmp(argv[i],"-d") == 0 ) { 244 | minimum_distance = atof(argv[++i]) ; 245 | } 246 | if ( strcmp(argv[i],"--fix") == 0 ) { 247 | ancestral_fixed = true ; 248 | } 249 | 250 | 251 | 252 | ///// Adaptive introgression stuff below 253 | 254 | 255 | /// activate selection detection module 256 | /// uses the following format -j chromosome_of_interest (str) site_of_interest start_window (int) 257 | /// window_start (int) window_end (int) 258 | if ( strcmp(argv[i],"--chr") == 0 ) { 259 | is_limit = true ; 260 | limit_chr = string(argv[++i]) ; 261 | } 262 | 263 | if ( strcmp(argv[i],"--chr_win") == 0 ) { 264 | limit_win_start = atoi(argv[++i]) ; 265 | limit_win_end = atoi(argv[++i]) ; 266 | 267 | cerr << endl << limit_chr << "\t" << limit_win_start << "\t" << limit_win_end << "\t" << endl ; 268 | 269 | /// check if win_start < win_end and site is located within window 270 | if ( limit_win_end <= limit_win_start ) { 271 | cerr << "\n\n\t\t ERROR: formatting for window is wrong\n\n" ; 272 | print_usage() ; 273 | exit(1) ; 274 | } 275 | } 276 | 277 | if ( strcmp(argv[i],"--grid") == 0 ) { 278 | calc_grid = true; 279 | grid_pstart = atoi(argv[++i]); 280 | grid_pstop = atoi(argv[++i]); 281 | grid_pstep = atoi(argv[++i]); 282 | grid_sstart = atof(argv[++i]); 283 | grid_sstop = atof(argv[++i]); 284 | grid_sstep = atof(argv[++i]); 285 | } 286 | 287 | if ( strcmp(argv[i],"--gss") == 0 ) { 288 | run_gss = true; 289 | gs_pstart = atoi(argv[++i]); 290 | gs_pstop = atoi(argv[++i]); 291 | gs_pstep = atoi(argv[++i]); 292 | gs_sstart = atof(argv[++i]); 293 | gs_sstop = atof(argv[++i]); 294 | } 295 | 296 | if ( strcmp(argv[i],"--gss_precision") == 0 ) { 297 | gs_precision = atof(argv[++i]); 298 | } 299 | 300 | if ( strcmp(argv[i], "--unit_coords" ) == 0 ) { 301 | is_coord = true ; 302 | } 303 | 304 | if ( strcmp(argv[i],"--site") == 0 ) { 305 | test_point = true; 306 | test_pos = atoi(argv[++i]); 307 | test_sel = atof(argv[++i]); 308 | } 309 | 310 | // control window size for selection 311 | if ( strcmp(argv[i],"--window") == 0 ) { 312 | win_unit = string(argv[++i]); 313 | 314 | if ( win_unit == "m") { 315 | win_morgan = atof(argv[++i]); 316 | if (win_morgan <= 0) { 317 | cerr << "\n\n\t\tERROR: windows size has to be specified with positive value.\n\n" ; 318 | exit(1) ; 319 | } 320 | } 321 | else if (win_unit == "p") { 322 | win_percent = atof(argv[++i]); 323 | if (win_percent <= 0 || win_percent > 100) { 324 | cerr << "\n\n\t\tERROR: windows size has to be specified in percent (1-100).\n\n" ; 325 | exit(1) ; 326 | } 327 | } 328 | else { 329 | cerr << "\n\n\t\tERROR: wrong unit for window size.\n\n" ; 330 | print_usage() ; 331 | exit(1) ; 332 | } 333 | } 334 | 335 | if ( strcmp(argv[i],"--traj") == 0 ) { 336 | traj_function = atoi(argv[++i]) ; 337 | } 338 | 339 | if ( strcmp(argv[i],"--stochastic") == 0 ) { 340 | use_stochastic = true; 341 | } 342 | 343 | if ( strcmp(argv[i],"--stochastic_reps") == 0 ) { 344 | stochastic_reps = atoi(argv[++i]) ; 345 | } 346 | 347 | if ( strcmp(argv[i],"--full_selection_space") == 0 ) { 348 | limit_sel_space = false; 349 | } 350 | } 351 | 352 | if ( input_file == "null" ) { 353 | cerr << "\n\n\t\tERROR: must provide input file\n\n\t\t\t-i [path/to/input_file]\n\n" ; 354 | print_usage() ; 355 | exit(1) ; 356 | } 357 | if ( sample_file == "null" ) { 358 | cerr << "\n\n\t\tERROR: must provide sample file\n\n\t\t\t-s [path/to/sample_file]\n\n" ; 359 | print_usage() ; 360 | exit(1) ; 361 | } 362 | 363 | return ; 364 | } 365 | 366 | #endif 367 | 368 | -------------------------------------------------------------------------------- /src/selection_fwd_iter.h: -------------------------------------------------------------------------------- 1 | #ifndef __FWD_ITER_H 2 | #define __FWD_ITER_H 3 | 4 | // Calculates transition rate when selection is 0 5 | double neutral_rate(int n, double mm, double gen) { 6 | double m = 1 - mm; 7 | return 2*n*m*(1-exp(-gen/(2*n))); 8 | } 9 | 10 | // Return a flat/constant vector for the case when selection is 0 11 | vector neutral_rates_vector(vector &recombination_rate, double m, int n, int generations) { 12 | vector transition_rates ; 13 | int r; 14 | 15 | double tr01 = neutral_rate(n, m, generations); 16 | double tr10 = neutral_rate(n, 1-m, generations); 17 | 18 | for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { 19 | r = recombination_rate[site] ; 20 | mat tr_mat(2,2); 21 | 22 | tr_mat(0,0) = 1-r*tr01; 23 | tr_mat(0,1) = r*tr01; 24 | tr_mat(1,0) = r*tr10; 25 | tr_mat(1,1) = 1-r*tr01; 26 | 27 | transition_rates.push_back(tr_mat); 28 | } 29 | 30 | return transition_rates; 31 | 32 | } 33 | 34 | // forward iteration algoritm for calculating transition rates across a chromosome 35 | // takes vector of recombination rates (sites) and vector of allele frequency change of the selected site over time 36 | // uses the 2-site version with back coalescence. 37 | // Not used at the moment 38 | vector fwd_iter(vector &recombination_rate, vector &basefreq, double m, int n) 39 | { 40 | vector freq(basefreq) ; 41 | vector freq_(basefreq); 42 | vector transition_rates ; 43 | double sum ; 44 | double h11; 45 | double h12; 46 | double h21; 47 | double h22; 48 | double r; 49 | 50 | double a1; 51 | double a1_; 52 | 53 | double h11_; 54 | double h12_; 55 | double h21_; 56 | double h22_; 57 | 58 | double p_coal; 59 | 60 | for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { 61 | r = recombination_rate[site] ; 62 | mat tr_mat(2,2); 63 | 64 | h11 = m; 65 | h12 = 0; 66 | h21 = 0; 67 | h22 = 1-m; 68 | 69 | p_coal = 1 ; 70 | 71 | for ( int t = 0 ; t < basefreq.size()-1 ; t++ ) { 72 | a1 = h11 + h12; 73 | a1_ = h11 + h21; 74 | 75 | h11_ = h11*(1-r) + a1*r*a1_*p_coal; 76 | h12_ = h12*(1-r) + a1*r*(1-a1_)*p_coal; 77 | h21_ = h21*(1-r) + (1-a1)*r*a1_*p_coal; 78 | h22_ = h22*(1-r) + (1-a1)*r*(1-a1_)*p_coal; 79 | 80 | freq_[t] = ( h11_ + h21_ )/( h11 + h12 + h21 + h22 ); 81 | 82 | h11 = h11_ ; 83 | h12 = h12_ ; 84 | h21 = h21_ ; 85 | h22 = h22_ ; 86 | 87 | h11 = h11/freq[t]*freq[t+1]; 88 | h12 = h12/freq[t]*freq[t+1]; 89 | 90 | h21 = h21/(1-freq[t])*(1-freq[t+1]); 91 | h22 = h22/(1-freq[t])*(1-freq[t+1]); 92 | 93 | sum = h11 + h12 + h21 + h22; 94 | h11 = h11/sum; 95 | h12 = h12/sum; 96 | h21 = h21/sum; 97 | h22 = h22/sum; 98 | 99 | freq_[t+1] = (h11 + h21) ; 100 | p_coal *= ( 1 - 1/(2*n) ) ; 101 | 102 | } 103 | 104 | // matrix with transition rates 105 | tr_mat(0,0) = 1 - h12/(h11+h12); 106 | tr_mat(0,1) = h12/(h11+h12); 107 | tr_mat(1,0) = h21/(h21+h22); 108 | tr_mat(1,1) = 1 - h21/(h21+h22); 109 | 110 | transition_rates.push_back(tr_mat) ; 111 | 112 | freq = freq_ ; 113 | } 114 | 115 | return transition_rates; 116 | } 117 | 118 | // Forward iteration function for calculating transition rates 119 | vector fwd_iter_genotype_freq(vector &recombination_rate, vector &basefreq, double m, int n, vector