├── .gitignore ├── Changes ├── LICENSE ├── README.md ├── app ├── dune ├── test_app.ml └── zipper_explorer.ml ├── dune-project ├── lib ├── ID_monad.ml ├── ID_monad.mli ├── IUPAC_AA.ml ├── IUPAC_AA.mli ├── TKF91.ml ├── TKF91.mli ├── alignment.ml ├── alignment.mli ├── alphabet.ml ├── amino_acid.ml ├── amino_acid.mli ├── birth_death.ml ├── birth_death.mli ├── bppsuite.ml ├── bppsuite.mli ├── codon.ml ├── codon.mli ├── discrete_pd.ml ├── discrete_pd.mli ├── dna.ml ├── dna.mli ├── dune ├── felsenstein.ml ├── felsenstein.mli ├── fitch.ml ├── fitch.mli ├── index.mld ├── iupac_nucleotide.ml ├── iupac_nucleotide.mli ├── let_syntax.ml ├── linear_algebra.ml ├── linear_algebra.mli ├── linear_algebra_tools.ml ├── linear_algebra_tools.mli ├── list1.ml ├── list1.mli ├── mCMC.ml ├── mG94.ml ├── mG94.mli ├── mutsel.ml ├── mutsel.mli ├── nelder_mead.ml ├── nelder_mead.mli ├── newick.ml ├── newick.mli ├── newick_ast.ml ├── newick_lexer.mll ├── newick_parser.messages ├── newick_parser.mly ├── nucleotide.ml ├── nucleotide.mli ├── nucleotide_process.ml ├── nucleotide_process.mli ├── phylip.ml ├── phylip.mli ├── phylo_ctmc.ml ├── phylo_ctmc.mli ├── phylogenetic_tree.ml ├── phylogenetic_tree.mli ├── rate_matrix.ml ├── rate_matrix.mli ├── rejection_sampling.ml ├── seq.ml ├── seq.mli ├── sequence_generation.ml ├── sequence_simulator.ml ├── sequence_simulator.mli ├── sigs.ml ├── simulator.ml ├── simulator.mli ├── site_evolution_model.ml ├── site_evolution_model.mli ├── stat_tools.ml ├── stat_tools.mli ├── tree.ml ├── tree.mli ├── utils.ml ├── utils.mli ├── wag.ml ├── wag.mli ├── zipper.ml └── zipper.mli ├── phylogenetics.opam └── tests ├── data ├── tiny1.fasta ├── tiny2.fasta └── wag.dat ├── dune ├── expect ├── birth_death_simulator.expected ├── birth_death_simulator.ml ├── dune ├── phylo_ctmc_conditional_simulation.expected ├── phylo_ctmc_conditional_simulation.ml ├── phylo_ctmc_conditional_simulation_missing_values.ml ├── phylo_ctmc_substitution_mappings.expected ├── phylo_ctmc_substitution_mappings.ml ├── simulator.expected └── simulator.ml ├── test_MCMC.ml ├── test_alignment.ml ├── test_felsenstein.ml ├── test_newick.ml ├── test_phylo_ctmc.ml ├── test_rejection_sampling.ml ├── test_sequence.ml ├── test_site_evolution_model.ml ├── test_topotree.ml ├── test_utils.ml ├── test_utils.mli └── test_zipper.ml /.gitignore: -------------------------------------------------------------------------------- 1 | .merlin 2 | .ocamlinit 3 | _build 4 | *.native 5 | *.byte 6 | *~ 7 | test_data/new_xp.sh 8 | tmp* 9 | *.install 10 | .vscode 11 | -------------------------------------------------------------------------------- /Changes: -------------------------------------------------------------------------------- 1 | v0.3.0 2024-07-08 Coublanc 2 | -------------------------- 3 | 4 | - better documentation (thanks to Corentin Moumard) 5 | - implementation of Muse and Gaut 94 evolution model 6 | - few bug fixes and API changes 7 | 8 | v0.2.0 2023-05-05 Maillezais 9 | ---------------------------- 10 | 11 | - Phylo_ctmc: pruning algorithm with ambiguous observations 12 | - Phylo_ctmc: introduced matrix decomposition to reduce pruning complexity 13 | - Codon: implemented all genetic codes 14 | - Simulator: optimized implementation for Gillespie direct method 15 | - Newick: improved representation and conversion to Tree 16 | - few other optimizations, more tests 17 | 18 | 19 | v0.1.0 2021-11-09 Villeurbanne 20 | ------------------------------ 21 | 22 | - new Nucleotide_process module, implementing classical nucleotide 23 | evolution models 24 | - Mutsel: added persistent positive selection 25 | - Rate_matrix: added HKY85 model 26 | - Alphabet, Alignment: a few more functions 27 | 28 | v0.0.0 2021-11-09 Villeurbanne 29 | ------------------------------ 30 | 31 | First release on OPAM 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biocaml/phylogenetics/8a8f2163e7a1acf924ef30762760aec8ab2f4060/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Phylogenetics 2 | 3 | This library provides algorithms and datastructures to perform 4 | inferences in molecular evolution. It features: 5 | - typed representation of DNA/amino-acid/codon alphabets 6 | - rate matrices and probability transition matrices for various 7 | evolution models (JC69, K80, GTR, Mutsel) 8 | - site-independent Gillespie simulators 9 | - a simulator for gapped alignments under the TKF91 model 10 | - a tree simulator under the birth-death model 11 | - parsers for Newick, NHX and phylip formats 12 | - a GSL-based implementation of the pruning algorithm, with underflow 13 | avoidance 14 | 15 | ## Installation 16 | 17 | Using [opam](http://opam.ocaml.org/), simply type 18 | 19 | ``` 20 | opam install phylogenetics 21 | ``` 22 | 23 | to install the library, or: 24 | 25 | ``` 26 | opam pin add -y phylogenetics --dev-repo 27 | ``` 28 | to get the current development version. 29 | -------------------------------------------------------------------------------- /app/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (name test_app) 3 | (modules test_app) 4 | (libraries alcotest phylogenetics_test) 5 | (preprocess (pps ppx_jane))) 6 | 7 | (rule 8 | (alias runtest) 9 | (action (run ./test_app.exe -q)) 10 | (deps ../tests/data/tiny1.fasta ../tests/data/tiny2.fasta)) 11 | 12 | (rule 13 | (alias fulltest) 14 | (action (run ./test_app.exe)) 15 | (deps ../tests/data/tiny1.fasta ../tests/data/tiny2.fasta)) 16 | -------------------------------------------------------------------------------- /app/test_app.ml: -------------------------------------------------------------------------------- 1 | open Phylogenetics_test 2 | 3 | let () = 4 | Alcotest.run "All tests" [ 5 | "Sequence", Test_sequence.tests; 6 | "Alignment", Test_alignment.tests; 7 | "Phylogenetic_tree", Test_topotree.tests; 8 | "Zipper", Test_zipper.tests; 9 | "Models", Test_site_evolution_model.tests; 10 | "Felsenstein", Test_felsenstein.tests; 11 | "Rejection_sampling", Test_rejection_sampling.tests; 12 | "MCMC", Test_MCMC.tests ; 13 | "Newick", Test_newick.tests ; 14 | "PhyloCTMC", Test_phylo_ctmc.tests ; 15 | ] 16 | -------------------------------------------------------------------------------- /app/zipper_explorer.ml: -------------------------------------------------------------------------------- 1 | open Printf 2 | open Core_kernel.Std 3 | open Biocaml_phylogeny_core.Zipper 4 | 5 | let zipper_explorer z = 6 | let rec get_command () = 7 | match In_channel.input_line In_channel.stdin with 8 | | Some s -> s 9 | | _ -> failwith "No command." 10 | and parse_command s z = 11 | match String.strip s |> String.split ~on:' ' with 12 | | ["move"; s] -> dir_of_string s |> move z |> display 13 | | ["slide"; sd; sf] -> slide z (dir_of_string sd) (float_of_string sf) |> display 14 | | ["goto"; si] -> goto z (int_of_string si) |> display 15 | | ["random_node"] -> random_node z |> display 16 | | ["exit"] -> () 17 | | _ -> help z 18 | and help z = printf "Available commands:\n* move dir\n* slide dir len\n* goto int\n* random_node\n* exit\n" ; prompt z 19 | and display z = print_fancy z ; flush_all () ; prompt z 20 | and prompt z = printf "Type a command: " ; flush_all () ; 21 | let c = get_command () in 22 | try parse_command c z with 23 | | Failure s -> printf "Error: %s\n" s ; help z 24 | | _ -> printf "Unknown error.\n" ; help z 25 | in display z 26 | 27 | let _ = Biocaml_phylogeny_core.Phylogenetic_tree.make_random 20 28 | |> of_tree 29 | |> init_routing 30 | |> zipper_explorer 31 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 3.6) 2 | (using menhir 2.0) 3 | (generate_opam_files true) 4 | 5 | (name phylogenetics) 6 | (source (github biocaml/phylogenetics)) 7 | (homepage "https://github.com/biocaml/phylogenetics/") 8 | (bug_reports "https://github.com/biocaml/phylogenetics/issues") 9 | (license CeCILL-B) 10 | (authors 11 | "Louis Duchemin" 12 | "Vincent Lanore" 13 | "Corentin Moumard" 14 | "Philippe Veber") 15 | (maintainers "philippe.veber@gmail.com") 16 | 17 | (package 18 | (name phylogenetics) 19 | (synopsis "Algorithms and datastructures for phylogenetics") 20 | (tags (bioinformatics evolution phylogeny)) 21 | (depends 22 | (alcotest :with-test) 23 | angstrom-unix 24 | binning 25 | (biotk (>= 0.2.0)) 26 | (core (>= v0.16.0)) 27 | dune 28 | gsl 29 | (lacaml (>= 10.0.2)) 30 | menhir 31 | ppx_deriving 32 | (printbox (>= 0.6.1)) 33 | printbox-text 34 | (yojson (>= 1.6.0)))) 35 | -------------------------------------------------------------------------------- /lib/ID_monad.ml: -------------------------------------------------------------------------------- 1 | type 'a t = int -> 'a * int 2 | 3 | let return x = fun s -> x, s 4 | 5 | let (let*) (m : 'a t) (f : 'a -> 'b t) : 'b t = 6 | fun state -> 7 | let y, state' = m state in 8 | f y state' 9 | 10 | let (let+) (m : 'a t) (f : 'a -> 'b) : 'b t = 11 | fun state -> 12 | let y, state' = m state in 13 | f y, state' 14 | 15 | let new_id id = (id, id + 1) 16 | 17 | let run m = fst (m 0) 18 | -------------------------------------------------------------------------------- /lib/ID_monad.mli: -------------------------------------------------------------------------------- 1 | (** A state monad to generate integer identifiers *) 2 | 3 | type 'a t 4 | 5 | val return : 'a -> 'a t 6 | val (let*) : 'a t -> ('a -> 'b t) -> 'b t 7 | val (let+) : 'a t -> ('a -> 'b) -> 'b t 8 | 9 | val new_id : int t 10 | 11 | val run : 'a t -> 'a 12 | -------------------------------------------------------------------------------- /lib/IUPAC_AA.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | module AA = Amino_acid 4 | 5 | let chars_of_aa_string = "ACDEFGHIKLMNPQRSTVWYBJZ" 6 | 7 | 8 | include Alphabet.Make(struct let card = 23 end) 9 | 10 | let to_char i = chars_of_aa_string.[i] 11 | 12 | let code_A = Char.to_int 'A' 13 | (** ASCII code for 'A' *) 14 | 15 | let aa_of_chars = 16 | let t = Array.create ~len:26 None in 17 | for i = 0 to card - 1 do 18 | t.(Char.to_int chars_of_aa_string.[i] - code_A) <- Some i 19 | done ; 20 | t 21 | 22 | let of_char = function 23 | | 'A'..'Z' as c -> aa_of_chars.(Char.to_int c - code_A) 24 | | _ -> None 25 | 26 | let of_char_exn c = Option.value_exn (of_char c) 27 | 28 | let aa_B = of_char_exn 'B' 29 | let aa_J = of_char_exn 'J' 30 | let aa_Z = of_char_exn 'Z' 31 | let aa_D = AA.of_char_exn 'D' 32 | let aa_N = AA.of_char_exn 'N' 33 | let aa_I = AA.of_char_exn 'I' 34 | let aa_L = AA.of_char_exn 'L' 35 | let aa_E = AA.of_char_exn 'E' 36 | let aa_Q = AA.of_char_exn 'Q' 37 | 38 | let of_amino_acid (aa : Amino_acid.t) : t = (aa :> int) 39 | 40 | let to_amino_acid x = 41 | Option.some_if (x < AA.card) (Amino_acid.of_int_exn (x:>int)) 42 | 43 | let fold ~init x ~f = 44 | if x < Amino_acid.card then f init (Amino_acid.of_int_exn x) 45 | else if x = aa_B then f (f init aa_D) aa_N 46 | else if x = aa_J then f (f init aa_I) aa_L 47 | else if x = aa_Z then f (f init aa_E) aa_Q 48 | else assert false 49 | 50 | let multiplicity x = if x < AA.card then 1 else 2 51 | 52 | let mem (x:t) (aa:AA.t) = 53 | let aa = (aa :> int) in 54 | if x < AA.card then equal (x :> int) aa 55 | else if x = aa_B then (aa = (aa_N :> int) || aa = (aa_D :> int)) 56 | else if x = aa_J then (aa = (aa_I :> int) || aa = (aa_L :> int)) 57 | else if x = aa_Z then (aa = (aa_E :> int) || aa = (aa_Q :> int)) 58 | else assert false 59 | 60 | let%test "card" = (card = String.length chars_of_aa_string) 61 | 62 | let%test "AA card" = (card = AA.card + 3) 63 | 64 | let%test "mem" = 65 | mem aa_B (AA.of_char_exn 'D') && 66 | mem aa_B (AA.of_char_exn 'N') && 67 | mem aa_J (AA.of_char_exn 'I') && 68 | mem aa_J (AA.of_char_exn 'L') && 69 | mem aa_Z (AA.of_char_exn 'Q') && 70 | mem aa_Z (AA.of_char_exn 'E') 71 | 72 | let%test "of_char" = 73 | let open Poly in 74 | of_char (to_char 19) = Some 19 75 | && of_char 'Z' = Some aa_Z 76 | && of_char 'U' = None 77 | 78 | let%test "to_char" = 79 | Char.(to_char aa_Z = 'Z') && 80 | Char.(to_char aa_B = 'B') && 81 | Char.(to_char aa_J = 'J') && 82 | Char.(to_char (of_amino_acid aa_D) = 'D') 83 | 84 | let%test "AA equivalence" = 85 | (of_char_exn 'D') = (aa_D :> int) && 86 | (of_char_exn 'D') = ((AA.of_char_exn 'D') :> int) 87 | -------------------------------------------------------------------------------- /lib/IUPAC_AA.mli: -------------------------------------------------------------------------------- 1 | (** An extended amino acid alphabet, including B,J and Z equivoque symbols *) 2 | 3 | 4 | include Alphabet.S_int 5 | 6 | val to_char : t -> char 7 | val of_char : char -> t option 8 | val of_char_exn : char -> t 9 | val of_amino_acid : Amino_acid.t -> t 10 | 11 | (** [to_amino_acid x] is Some amino acid iff [x] is univoque *) 12 | val to_amino_acid : t -> Amino_acid.t option 13 | 14 | (** [mem x aa] tests that [x] denotes [aa] *) 15 | val mem : t -> Amino_acid.t -> bool 16 | val fold : init:'a -> t -> f:('a -> Amino_acid.t -> 'a) -> 'a 17 | val multiplicity : t -> int 18 | -------------------------------------------------------------------------------- /lib/TKF91.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | module Make_simulator 4 | (A : Alphabet.S_int) 5 | (BI : Simulator.Branch_info) = 6 | struct 7 | let symbol_sample rng v = 8 | Gsl.Randist.discrete_preproc v 9 | |> Gsl.Randist.discrete rng 10 | |> A.of_int_exn 11 | 12 | type site = Site of (A.t, (A.t * int) option, branch_info) Tree.t 13 | and branch_info = { 14 | original_branch_info : BI.t ; 15 | length : float ; 16 | insertions : (A.t, (A.t * int) option, branch_info) Tree.branch list 17 | } 18 | 19 | let fold_alignment (Site site) ~init ~f = 20 | let open Tree in 21 | let rec node col ((next_col, acc) as acc2) = function 22 | | Leaf (Some (state, row)) -> next_col, f acc ~row ~col state 23 | | Leaf None -> acc2 24 | | Node n -> 25 | List1.fold n.branches ~init:acc2 ~f:(branch col) 26 | 27 | and branch col acc2 (Branch b) = 28 | List.fold b.data.insertions ~init:(node col acc2 b.tip) ~f:(fun (next_col, acc) b -> 29 | branch next_col (next_col + 1, acc) b 30 | ) 31 | in 32 | snd (node 0 (1, init) site) 33 | 34 | let nb_alignment_columns site = 35 | fold_alignment site ~init:0 ~f:(fun prev_col ~row:_ ~col _ -> max col prev_col) + 1 36 | 37 | let alignment_of_site char_of_state tree site = 38 | let ncols = nb_alignment_columns site in 39 | let nrows = List.length (Tree.leaves tree) in 40 | let ali = Array.init nrows ~f:(fun _ -> Bytes.init ncols ~f:(fun _ -> '-')) in 41 | fold_alignment site ~init:() ~f:(fun () ~row ~col state -> 42 | Bytes.set ali.(row) col (char_of_state state) 43 | ) ; 44 | Array.map ali ~f:Bytes.to_string 45 | 46 | let index_leaves tree = 47 | let open Tree in 48 | let rec node state = function 49 | | Node n -> 50 | let state, branches = 51 | List1.fold_right n.branches ~init:(state, []) ~f:(fun b (state, acc) -> 52 | let state, b = branch state b in 53 | state, b :: acc 54 | ) 55 | in 56 | state, Tree.node n.data (List1.of_list_exn branches) 57 | | Leaf l -> 58 | state + 1, Leaf (state, l) 59 | and branch state (Branch b) = 60 | let state, tip = node state b.tip in 61 | state, Tree.branch b.data tip 62 | in 63 | snd (node 0 tree) 64 | 65 | let site_gillespie_direct rng tree ~root ~rate_matrix ~rates_upon_insertion ~lambda ~mu = 66 | 67 | let simulation_step remaining_time state ~insertion ~deletion ~substitution ~rate_matrix ~rates_upon_insertion ~branch_end = 68 | let substitution_rates = A.Table.init (fun m -> if A.equal m state then 0. else rate_matrix.A.%{state, m}) in 69 | let total_substitution_rate = Utils.array_sum (substitution_rates :> float array) in 70 | let total_rate = total_substitution_rate +. lambda +. mu in 71 | let tau = Gsl.Randist.exponential rng ~mu:(1. /. total_rate) in 72 | let remaining_time = Float.(remaining_time - tau) in 73 | if Float.(remaining_time < 0.) then branch_end () 74 | else 75 | let x = total_rate *. Gsl.Rng.uniform rng in 76 | if Float.(x < total_substitution_rate) then 77 | let next_state = symbol_sample rng (substitution_rates :> float array) in 78 | substitution remaining_time next_state 79 | else if Float.(x < total_substitution_rate +. lambda) then 80 | let next_state = symbol_sample rng (rates_upon_insertion :> float array) in 81 | insertion remaining_time next_state 82 | else 83 | deletion remaining_time 84 | in 85 | 86 | let rec simulate_tree : (_, _, BI.t) Tree.t -> A.t option -> site = 87 | fun tree maybe_state -> 88 | match tree, maybe_state with 89 | | Leaf (id, _), Some state -> Site (Tree.leaf (Some (state, id))) 90 | | _, None -> Site (Tree.leaf None) 91 | | Node n, Some state -> 92 | let branches = List1.map n.branches ~f:(fun b -> simulate_branch b state) in 93 | Site (Tree.node state branches) 94 | 95 | and simulate_branch : (_, _, BI.t) Tree.branch -> A.t -> (A.t, (A.t * int) option, branch_info) Tree.branch = 96 | fun (Branch b) state -> 97 | let branch_length = BI.length b.data in 98 | let rate_matrix = rate_matrix b.data in 99 | let rates_upon_insertion = rates_upon_insertion b.data in 100 | let rec loop remaining_time state acc = 101 | simulation_step remaining_time state 102 | ~rate_matrix 103 | ~rates_upon_insertion 104 | ~branch_end:(fun () -> Some state, 0., acc) 105 | ~substitution:(fun remaining_time next_state -> 106 | loop remaining_time next_state acc 107 | ) 108 | ~insertion:(fun remaining_time next_state -> 109 | let insertions = simulate_insertion b.data b.tip rate_matrix rates_upon_insertion remaining_time next_state in 110 | loop remaining_time state (insertions @ acc) 111 | ) 112 | ~deletion:(fun remaining_time -> None, branch_length -. remaining_time, acc) 113 | in 114 | let next_state, length, insertions = loop branch_length state [] in 115 | let Site tip = simulate_tree b.tip next_state in 116 | let bi = { 117 | original_branch_info = b.data ; 118 | length ; insertions ; 119 | } 120 | in 121 | Tree.branch bi tip 122 | 123 | and simulate_insertion (branch_data : BI.t) (rest_of_the_tree : _ Tree.t) rate_matrix rates_upon_insertion (remaining_time : float) (state : A.t) : _ Tree.branch list = 124 | simulation_step remaining_time state 125 | ~rate_matrix 126 | ~rates_upon_insertion 127 | ~branch_end:(fun () -> 128 | [ 129 | let bi = { 130 | original_branch_info = branch_data ; 131 | length = remaining_time ; 132 | insertions = [] ; 133 | } 134 | in 135 | let Site tip = simulate_tree rest_of_the_tree (Some state) in 136 | Tree.branch bi tip 137 | ] 138 | ) 139 | ~substitution:(fun remaining_time next_state -> simulate_insertion branch_data rest_of_the_tree rate_matrix rates_upon_insertion remaining_time next_state) 140 | ~insertion:(fun remaining_time next_state -> 141 | simulate_insertion branch_data rest_of_the_tree rate_matrix rates_upon_insertion remaining_time state 142 | @ simulate_insertion branch_data rest_of_the_tree rate_matrix rates_upon_insertion remaining_time next_state 143 | ) 144 | ~deletion:(fun _ -> []) 145 | in 146 | simulate_tree (index_leaves tree) (Some root) 147 | end 148 | -------------------------------------------------------------------------------- /lib/TKF91.mli: -------------------------------------------------------------------------------- 1 | (** Simulator for alignments with indels under the model by Thorne, 2 | Kishino and Felsenstein. *) 3 | 4 | module Make_simulator 5 | (A : Alphabet.S_int) 6 | (BI : Simulator.Branch_info) : 7 | sig 8 | type site = Site of (A.t, (A.t * int) option, branch_info) Tree.t 9 | and branch_info = { 10 | original_branch_info : BI.t ; 11 | length : float ; 12 | insertions : (A.t, (A.t * int) option, branch_info) Tree.branch list 13 | } 14 | 15 | val site_gillespie_direct : 16 | Gsl.Rng.t -> 17 | (_, _, BI.t) Tree.t -> 18 | root:A.t -> 19 | rate_matrix:(BI.t -> A.matrix) -> 20 | rates_upon_insertion:(BI.t -> float array) -> 21 | lambda:float -> 22 | mu:float -> 23 | site 24 | 25 | val fold_alignment : 26 | site -> 27 | init:'a -> 28 | f:('a -> row:int -> col:int -> A.t -> 'a) -> 29 | 'a 30 | 31 | val alignment_of_site : 32 | (A.t -> char) -> 33 | _ Tree.t -> 34 | site -> 35 | string Array.t 36 | end 37 | -------------------------------------------------------------------------------- /lib/alignment.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | type t = { 4 | descriptions : string array ; 5 | sequences : string array ; 6 | } 7 | 8 | let sequence t i = t.sequences.(i) 9 | 10 | let description t i = t.descriptions.(i) 11 | 12 | let nrows a = Array.length a.sequences 13 | let ncols a = String.length a.sequences.(0) 14 | 15 | 16 | type error = [ 17 | | `Empty_alignment 18 | | `Unequal_sequence_lengths 19 | ] 20 | [@@deriving show] 21 | 22 | type parsing_error = [ 23 | | `Syntax_error of string 24 | | error 25 | ] 26 | [@@deriving show] 27 | 28 | let check_non_empty_list = function 29 | | [] -> Error (`Empty_alignment) 30 | | items -> Ok items 31 | 32 | let of_tuples items = 33 | let descriptions, sequences = Array.unzip items in 34 | let n = String.length sequences.(0) in 35 | if (Array.for_all sequences ~f:(fun s -> String.length s = n)) 36 | then Ok { descriptions ; sequences ; } 37 | else Error `Unequal_sequence_lengths 38 | 39 | let of_assoc_list l = 40 | match check_non_empty_list l with 41 | | Ok items -> Array.of_list items |> of_tuples 42 | | Error e -> Error e 43 | 44 | let map t ~f = 45 | Array.map2_exn t.descriptions t.sequences 46 | ~f:(fun description sequence -> f ~description ~sequence) 47 | |> of_tuples 48 | 49 | let array_mapi t ~f = 50 | Array.mapi t.descriptions 51 | ~f:(fun i description -> f i ~description ~sequence:t.sequences.(i)) 52 | 53 | let fold t ~init ~f = Array.fold2_exn t.descriptions t.sequences ~init 54 | ~f:(fun acc description sequence -> f acc ~description ~sequence) 55 | 56 | module Fasta = struct 57 | 58 | let of_fasta_items (items:Biotk.Fasta.item list) = 59 | List.map items ~f:(fun x -> x.description, x.sequence) 60 | |> of_assoc_list 61 | 62 | let from_file fn = 63 | let open Biotk.Let_syntax.Result in 64 | let* _, items = 65 | Biotk.Fasta.from_file fn 66 | |> Result.map_error ~f:(fun msg -> `Syntax_error msg) 67 | in 68 | of_fasta_items items 69 | 70 | let from_file_exn fn = 71 | match from_file fn with 72 | | Ok al -> al 73 | | Error e -> failwith (show_parsing_error e) 74 | 75 | let to_channel ({sequences ; descriptions}) oc = 76 | Array.iter2_exn descriptions sequences ~f:(fun desc seq -> 77 | Out_channel.output_lines oc [ 78 | sprintf ">%s" desc ; 79 | seq; 80 | ]) 81 | 82 | let to_file al fn = 83 | Out_channel.with_file fn ~f:(to_channel al) 84 | end 85 | 86 | module Phylip = struct 87 | 88 | let of_phylip { Phylip.items ; _ } = 89 | List.map items ~f:(fun { name ; sequence } -> name, sequence) 90 | |> of_assoc_list 91 | 92 | let to_phylip ({sequences ; descriptions} as ali) = 93 | let number_of_sequences = nrows ali in 94 | let items = List.init number_of_sequences ~f:(fun i -> 95 | { Phylip.name = descriptions.(i) ; sequence = sequences.(i) } 96 | ) in 97 | Phylip.make_exn items 98 | 99 | let from_file ?strict fn = 100 | let open Biotk.Let_syntax.Result in 101 | let* phylip = 102 | Phylip.read ?strict fn 103 | |> Result.map_error ~f:(fun (`Msg msg) -> `Syntax_error msg) 104 | in 105 | of_phylip phylip 106 | 107 | let from_file_exn ?strict fn = 108 | match from_file ?strict fn with 109 | | Ok al -> al 110 | | Error e -> failwith (show_parsing_error e) 111 | 112 | end 113 | 114 | let find_sequence t id = 115 | Array.findi t.descriptions ~f:(fun _ x -> String.equal x id) 116 | |> Option.map ~f:(fun (i, _) -> t.sequences.(i)) 117 | 118 | let indel_free_columns ali = 119 | Array.init (nrows ali) ~f:(fun j -> 120 | Array.for_all ali.sequences ~f:(fun s -> Char.(s.[j] <> '-')) 121 | ) 122 | 123 | let residues al ~column:j = 124 | if j < 0 || j > ncols al then raise (Invalid_argument "Alignment.residues") ; 125 | Array.fold al.sequences ~init:Char.Set.empty ~f:(fun acc s -> Set.add acc s.[j]) 126 | 127 | let number_of_residues_per_column_stats al = 128 | let x = 129 | Array.init (ncols al) ~f:(fun column -> 130 | residues al ~column 131 | |> Set.length 132 | ) 133 | in 134 | Binning.counts (Stdlib.Array.to_seq x) 135 | |> Stdlib.List.of_seq 136 | 137 | let composition al = 138 | let module C = Binning.Counter in 139 | let acc = C.create () in 140 | let n = float (nrows al * ncols al) in 141 | Array.iter al.sequences ~f:(fun s -> 142 | String.iter s ~f:(fun c -> C.tick acc c) 143 | ) ; 144 | Stdlib.Seq.map (fun (c, k) -> (c, float k /. n)) (Binning.seq acc) 145 | |> Stdlib.List.of_seq 146 | 147 | let constant_site al j = 148 | let m = nrows al in 149 | let rec find_state i = 150 | if i < m then 151 | match al.sequences.(i).[j] with 152 | | '-' -> find_state (i + 1) 153 | | c -> find_other_state c (i + 1) 154 | else true 155 | and find_other_state c i = 156 | if i < m then 157 | match al.sequences.(i).[j] with 158 | | '-' -> find_other_state c (i + 1) 159 | | c' when Char.equal c c' -> find_other_state c (i + 1) 160 | | _ -> false 161 | else true 162 | in 163 | find_state 0 164 | 165 | open Core 166 | 167 | module Make(S : Seq.S) = struct 168 | type base = S.base 169 | type sequence = S.t 170 | type index = string 171 | type t = (index, sequence) Hashtbl.t 172 | module Sequence = S 173 | 174 | let get_base tab ~seq ~pos = S.get (Hashtbl.find_exn tab seq) pos 175 | 176 | let of_assoc_list l = 177 | let align = String.Table.create ~size:(List.length l) () in 178 | List.iter ~f:(fun (i,s) -> Hashtbl.add_exn ~key:i ~data:s align) l ; 179 | align 180 | 181 | let of_string_list l = 182 | let align = String.Table.create ~size:(List.length l) () in 183 | (* arbitrarily indexes sequences by string Ti where i is an integer; 184 | this mimics the format used by bppseqgen *) 185 | List.iteri ~f:(fun i s -> Hashtbl.add_exn ~key:(Printf.sprintf "T%d" i) ~data:(S.of_string_exn s) align) l ; 186 | align 187 | 188 | let of_fasta filename = 189 | let align = String.Table.create ~size:10 () in (* placeholder size TODO *) 190 | let _, items = Biotk.Fasta.from_file_exn filename in 191 | List.iter items ~f:(fun item -> 192 | let data = S.of_string_exn item.Biotk.Fasta.sequence in 193 | Hashtbl.add_exn ~key:item.Biotk.Fasta.description ~data:data align 194 | ) ; 195 | align 196 | 197 | let length x = (* this is the length of the sequences, not the nb of sequences! *) 198 | if Hashtbl.is_empty x then invalid_arg "empty alignment" 199 | else Hashtbl.fold x ~init:0 ~f:(fun ~key:_ ~data acc -> 200 | let l = S.length data in 201 | if l=0 then invalid_arg "alignment with empty sequence" 202 | else if acc<>0 && acc<>l then invalid_arg "sequence length mismatch" 203 | else l (* returns only if all lengths were equal *) 204 | ) 205 | 206 | let nb_seq x = Hashtbl.length x 207 | 208 | let pp fmt x = 209 | Hashtbl.to_alist x 210 | |> List.map ~f:(fun (i,s) -> Printf.sprintf "%s: %s" i (S.to_string s)) 211 | |> String.concat ~sep:"\n" 212 | |> Format.fprintf fmt "%s" 213 | 214 | let to_file x filename = 215 | Hashtbl.to_alist x 216 | |> List.map ~f:(fun (i,s) -> Printf.sprintf ">%s\n%s" i (S.to_string s)) 217 | |> Out_channel.write_lines filename 218 | 219 | let equal (x : t) y = Hashtbl.equal Poly.equal x y 220 | end 221 | -------------------------------------------------------------------------------- /lib/alignment.mli: -------------------------------------------------------------------------------- 1 | (** A representation for sequence alignments 2 | 3 | An alignment is a non-empty collection of sequences where all 4 | sequences have the same length. Each sequence comes with a {i 5 | description}. A sequence description provides additional textual 6 | information about a sequence in an alignment, such as its name or 7 | biological context. It helps identify and differentiate sequences 8 | without directly inspecting the sequence data. *) 9 | 10 | type t 11 | 12 | val nrows : t -> int 13 | (** [nrows alignment] returns the number of sequences in the 14 | alignment. *) 15 | 16 | val ncols : t -> int 17 | (** [ncols alignment] returns the number of columns in the alignment, 18 | i.e., the length of the sequences. *) 19 | 20 | val description : t -> int -> string 21 | (** [description alignment index] returns the description of the 22 | sequence at the given index in the alignment. *) 23 | 24 | val sequence : t -> int -> string 25 | (** [sequence alignment index] returns the sequence at the given index 26 | in the alignment. *) 27 | 28 | type error = [ 29 | | `Empty_alignment 30 | | `Unequal_sequence_lengths 31 | ] 32 | [@@deriving show] 33 | (** The possible errors that can occur when creating an alignment. *) 34 | 35 | type parsing_error = [ 36 | | `Syntax_error of string 37 | | error 38 | ] 39 | [@@deriving show] 40 | 41 | val map : 42 | t -> 43 | f:(description:string -> sequence:string -> string * string) -> 44 | (t, [> `Unequal_sequence_lengths]) result 45 | (** [map alignment ~f] applies the function [f] to each sequence in 46 | the alignment and returns a new alignment with the modified 47 | descriptions and sequences. If the sequences have unequal lengths, 48 | an error of type [`Unequal_sequence_lengths] is returned. *) 49 | 50 | val array_mapi : 51 | t -> 52 | f:(int -> description:string -> sequence:string -> 'a) -> 53 | 'a array 54 | (** [array_mapi alignment ~f] applies the function [f] to each 55 | sequence in the alignment, preserving the sequence indices, and 56 | returns an array of the resulting values. *) 57 | 58 | val fold : 59 | t -> 60 | init:'a -> 61 | f:('a -> description:string -> sequence:string -> 'a) -> 62 | 'a 63 | (** [fold alignment ~init ~f] applies the function [f] to each 64 | sequence in the alignment using an initial accumulator [init], and 65 | returns the final value of the accumulator. *) 66 | 67 | val find_sequence : 68 | t -> 69 | string -> 70 | string option 71 | (** [find_sequence alignment id] searches for a sequence in the 72 | alignment using its description as the search criterion. It 73 | returns the corresponding sequence as a string option, or [None] 74 | if the sequence is not found. *) 75 | 76 | val of_assoc_list : (string * string) list -> (t, [> error]) result 77 | (** [of_assoc_list l] creates an alignment from a list of key-value 78 | associations, where the key represents the description of the 79 | sequence and the value represents the sequence itself. It returns 80 | the created alignment, or an error if the input data does not 81 | satisfy the alignment invariants. *) 82 | 83 | module Fasta : sig 84 | val from_file : 85 | string -> 86 | (t, [> parsing_error]) result 87 | (** [from_file filename] reads an alignment from a FASTA file with the 88 | given [filename]. It returns the alignment as a result, or a 89 | parsing error if any error occurs during parsing. *) 90 | 91 | val from_file_exn : string -> t 92 | (** Same as {! from_file} but raises [Failure] if some error happens *) 93 | 94 | val to_channel : t -> out_channel -> unit 95 | (** [to_channel alignment oc] writes the alignment to the specified 96 | output channel [oc] in FASTA format. *) 97 | 98 | val to_file : t -> string -> unit 99 | (** [to_file alignment filename] writes the alignment to a FASTA file with the given [filename]. *) 100 | end 101 | 102 | module Phylip : sig 103 | val of_phylip : Phylip.t -> ( t, error) result 104 | 105 | val to_phylip : t -> Phylip.t 106 | 107 | val from_file : ?strict:bool -> string -> (t, parsing_error) result 108 | 109 | val from_file_exn : ?strict:bool -> string -> t 110 | end 111 | 112 | val indel_free_columns : t -> bool array 113 | (** [indel_free_columns alignment] checks each column of the alignment 114 | for the presence of indels ("-"). It returns a boolean array 115 | indicating whether each column is indel-free. *) 116 | 117 | val residues : t -> column:int -> Core.Char.Set.t 118 | (** [residues alignment ~column:j] returns the set of residues present 119 | in the specified column [j] of the alignment. *) 120 | 121 | val number_of_residues_per_column_stats : t -> (int * int) list 122 | (** [number_of_residues_per_column_stats alignment] calculates the 123 | number of unique residues per column in the alignment and returns 124 | the statistics as a list of pairs, where the first element of each 125 | pair represents the column index and the second element represents 126 | the number of unique residues. *) 127 | 128 | val composition : t -> (Char.t * float) list 129 | (** [composition alignment] calculates the residue composition of the 130 | alignment and returns the results as a list of pairs, where each 131 | pair consists of a residue and its relative frequency in the 132 | alignment. *) 133 | 134 | val constant_site : t -> int -> bool 135 | (** [constant_site alignment j] checks whether the specified column 136 | [j] of the alignment contains a constant site, i.e., all sequences 137 | have the same residue at that column. *) 138 | 139 | (** Legacy code: provides a functor to build from SEQUENCE modules. *) 140 | open Sigs 141 | 142 | module Make (S : Seq.S): 143 | ALIGNMENT with type base = S.base and type sequence = S.t 144 | -------------------------------------------------------------------------------- /lib/alphabet.ml: -------------------------------------------------------------------------------- 1 | (** Integer-based representation of sequence alphabets *) 2 | 3 | open Core 4 | 5 | module type S = sig 6 | type t 7 | type vector 8 | type matrix 9 | type 'a table 10 | val equal : t -> t -> bool 11 | val compare : t -> t -> int 12 | val all : t list 13 | val card : int 14 | val to_int : t -> int 15 | val counts : t Sequence.t -> int table 16 | module Table : sig 17 | val init : (t -> 'a) -> 'a table 18 | val get : 'a table -> t -> 'a 19 | val set : 'a table -> t -> 'a -> unit 20 | val map : 'a table -> f:('a -> 'b) -> 'b table 21 | val map2 : 'a table -> 'a table -> f:('a -> 'a -> 'b) -> 'b table 22 | val mapi : 'a table -> f:(t -> 'a -> 'b) -> 'b table 23 | val of_array_exn : 'a array -> 'a table 24 | val of_vector : vector -> float table 25 | val choose : float table -> rng:Gsl.Rng.t -> t 26 | val fold : 'a table -> init:'b -> f:('b -> 'a -> 'b) -> 'b 27 | val foldi : 'a table -> init:'b -> f:(t -> 'b -> 'a -> 'b) -> 'b 28 | val count : 'a table -> f:('a -> bool) -> int 29 | val counti : 'a table -> f:(t -> 'a -> bool) -> int 30 | end 31 | module Vector : sig 32 | type symbol = t 33 | include Linear_algebra.Vector with type t = vector 34 | val init : (symbol -> float) -> vector 35 | val map : vector -> f:(float -> float) -> vector 36 | val mapi : vector -> f:(symbol -> float -> float) -> vector 37 | val map2 : vector -> vector -> f:(float -> float -> float) -> vector 38 | val fold : vector -> init:'a -> f:('a -> float -> 'a) -> 'a 39 | val foldi : vector -> init:'a -> f:(symbol -> 'a -> float -> 'a) -> 'a 40 | val iteri : vector -> f:(symbol -> float -> unit) -> unit 41 | val sum : vector -> float 42 | val maxi : t -> symbol * float 43 | val normalize : vector -> vector 44 | val of_array : float array -> vector option 45 | val of_array_exn : float array -> vector 46 | val upcast_exn : Linear_algebra.vec -> vector 47 | val get : t -> symbol -> float 48 | val set : t -> symbol -> float -> unit 49 | end 50 | val flat_profile : unit -> vector 51 | val random_profile : Gsl.Rng.t -> float -> vector 52 | module Matrix : sig 53 | type symbol = t 54 | include Linear_algebra.Matrix with type t = matrix 55 | and type vec := vector 56 | val init : (symbol -> symbol -> float) -> matrix 57 | val init_sym : (symbol -> symbol -> float) -> matrix 58 | val of_arrays : float array array -> matrix option 59 | val of_arrays_exn : float array array -> matrix 60 | end 61 | val ( .%() ) : vector -> t -> float 62 | val ( .%()<- ) : vector -> t -> float -> unit 63 | val ( .%{} ) : matrix -> t * t -> float 64 | val ( .%{}<- ) : matrix -> t * t -> float -> unit 65 | end 66 | 67 | module type S_int = sig 68 | include S with type t = private int 69 | and type vector = private Linear_algebra.vec 70 | and type matrix = private Linear_algebra.mat 71 | and type 'a table = private 'a array 72 | val of_int : int -> t option 73 | val of_int_exn : int -> t 74 | end 75 | 76 | module Make(X : sig val card : int end) = struct 77 | type t = int 78 | include X 79 | let of_int i = 80 | if i < 0 || i >= card then None 81 | else Some i 82 | 83 | let of_int_exn n = 84 | if n < 0 || n >= card then raise (Invalid_argument "of_int_exn") 85 | else n 86 | 87 | let equal = Int.( = ) 88 | let compare = Int.compare 89 | let all = List.init card ~f:Fn.id 90 | 91 | let counts xs = 92 | let r = Array.create ~len:card 0 in 93 | Sequence.iter ~f:(fun aa -> r.(aa) <- r.(aa) + 1) xs ; 94 | r 95 | 96 | type 'a table = 'a array 97 | module Table = struct 98 | let init f = Array.init card ~f 99 | let get xs a = xs.(a) 100 | let set xs a v = xs.(a) <- v 101 | let map = Array.map 102 | let map2 = Array.map2_exn 103 | let mapi = Array.mapi 104 | let of_array_exn a = 105 | if Array.length a <> card then raise (Invalid_argument "vector_of_array_exn") 106 | else a 107 | let of_vector v = 108 | let open Linear_algebra.Vector in 109 | Array.init (length v) ~f:(get v) 110 | let choose xs ~rng = 111 | Gsl.Randist.(discrete_preproc xs |> discrete rng) 112 | 113 | let fold = Array.fold 114 | let foldi = Array.foldi 115 | let count = Array.count 116 | let counti = Array.counti 117 | end 118 | module Vector = struct 119 | type symbol = t 120 | include Linear_algebra.Vector 121 | 122 | let iteri v ~f = 123 | for i = 0 to card - 1 do 124 | f i (get v i) 125 | done 126 | 127 | let init f = init card ~f 128 | let normalize v = 129 | let s = sum v in 130 | map v ~f:(fun x -> x /. s) 131 | let of_array_exn a = 132 | if Array.length a <> card then raise (Invalid_argument "vector_of_array_exn") 133 | else init (fun i -> a.(i)) 134 | let of_array a = 135 | try Some (of_array_exn a) 136 | with _ -> None 137 | let upcast_exn a = 138 | let n = Linear_algebra.Vector.length a in 139 | if n = card 140 | then a 141 | else 142 | invalid_argf "vector_of_arr_exn: argument has shape %d" n () 143 | 144 | let map2 x y ~f = 145 | init (fun i -> f (get x i ) (get y i)) 146 | 147 | let mapi x ~f = 148 | init (fun i -> f i (get x i)) 149 | 150 | let foldi (x:t) ~init ~f = 151 | let rec loop x i acc ~f = 152 | if i = card then acc else loop x (i+1) (f i acc (get x i)) ~f 153 | in loop x 0 init ~f 154 | 155 | let fold (x:t) ~init ~f = 156 | foldi x ~init ~f:(fun _ acc x -> f acc x) 157 | 158 | let counti (x:t) ~f = 159 | foldi x ~init:0 ~f:(fun i acc x -> if f i x then acc + 1 else acc) 160 | 161 | let count (x:t) ~f = 162 | counti x ~f:(fun _ x -> f x) 163 | 164 | (** Element with maximum value. In case of ties, it is the first one encountered *) 165 | let maxi (x:t) = 166 | let rec loop x i m = 167 | if i = card then m 168 | else 169 | let m = if Float.((get x i) > snd m) then i, get x i else m in 170 | loop x (i+1) m 171 | in loop x 1 (0, get x 0) 172 | 173 | end 174 | 175 | let flat_profile () = 176 | let theta = Float.(1. / of_int card) in 177 | Vector.init (fun _ -> theta) 178 | 179 | let random_profile rng alpha = 180 | let v = Table.init (fun _ -> 0.) in 181 | Gsl.Randist.dirichlet rng ~alpha:(Array.create ~len:card alpha) ~theta:v ; 182 | Vector.init (fun i -> v.(i)) 183 | 184 | module Matrix = struct 185 | type symbol = t 186 | include Linear_algebra.Matrix 187 | 188 | let init f = init card ~f 189 | let init_sym f = init_sym card ~f 190 | let of_arrays_exn xs = 191 | let m = Array.length xs in 192 | if m = card then of_arrays_exn xs 193 | else failwith "Incorrect dimension" 194 | let of_array xs = 195 | let m = Array.length xs in 196 | if m = card then of_arrays xs 197 | else None 198 | end 199 | 200 | let to_int i = i 201 | type vector = Linear_algebra.vec 202 | 203 | let ( .%() ) v i = Vector.get v i 204 | let ( .%()<- ) v i x = Vector.set v i x 205 | type matrix = Linear_algebra.mat 206 | let ( .%{} ) m (i,j) = Matrix.get m i j 207 | let ( .%{}<- ) m (i, j) x = Matrix.set m i j x 208 | end 209 | -------------------------------------------------------------------------------- /lib/amino_acid.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | include Alphabet.Make(struct let card = 20 end) 3 | 4 | let chars_of_aa_string = "ACDEFGHIKLMNPQRSTVWY" 5 | 6 | 7 | let%test "chars_of_aa_string" = 8 | String.length chars_of_aa_string = card 9 | 10 | let to_char i = chars_of_aa_string.[i] 11 | 12 | let yojson_of_vector vec : Yojson.Safe.t = 13 | `Assoc ( 14 | Vector.to_array vec 15 | |> Array.mapi ~f:(fun aa x -> to_char aa |> Char.to_string, `Float x) 16 | |> Array.to_list 17 | ) 18 | 19 | let vector_of_yojson aa_list = 20 | (* Optimization to validate that all AA are present *) 21 | let aa_queue = 22 | Yojson.Safe.Util.to_assoc aa_list 23 | |> List.sort ~compare:(fun (x, _) (y, _) -> String.compare x y) 24 | |> Queue.of_list in 25 | Vector.init (fun aa -> 26 | let aa_char = to_char aa in 27 | let is_expected aa_str = Char.(aa_str |> of_string = aa_char) in 28 | match Queue.dequeue aa_queue with 29 | | Some (key, x) when is_expected key -> Yojson.Safe.Util.to_number x 30 | | Some (key, _ ) -> 31 | failwithf "Expected value for %c ; found %s" aa_char key () 32 | | None -> 33 | failwithf "Missing AA : %c ; reached end of json record without finding it" 34 | aa_char () 35 | ) 36 | 37 | let code_A = Char.to_int 'A' 38 | 39 | let aa_of_chars = 40 | let t = Array.create ~len:26 None in 41 | for i = 0 to String.length chars_of_aa_string - 1 do 42 | t.(Char.to_int chars_of_aa_string.[i] - code_A) <- Some i 43 | done ; 44 | t 45 | 46 | let of_char = function 47 | | 'A'..'Z' as c -> aa_of_chars.(Char.to_int c - code_A) 48 | | _ -> None 49 | 50 | let of_char_exn c = Option.value_exn (of_char c) 51 | 52 | let%test "of_char" = 53 | let open Poly in 54 | of_char (to_char 19) = Some 19 55 | && of_char 'Z' = None 56 | -------------------------------------------------------------------------------- /lib/amino_acid.mli: -------------------------------------------------------------------------------- 1 | (** Integer-based representation for amino acids 2 | 3 | This module implements an amino acid alphabet, which is a specific 4 | instance of the `Alphabet.S_int` module. It represents the set of 5 | 20 standard amino acids. 6 | 7 | The alphabet is represented as an integer alphabet, where each 8 | amino acid is assigned a unique integer index. 9 | 10 | Conversion functions are provided to convert between the integer 11 | index and the corresponding character representation of an amino 12 | acid. 13 | 14 | Note: The integer indices of amino acids start from 0 and go up to 19. 15 | *) 16 | 17 | include Alphabet.S_int 18 | 19 | val yojson_of_vector : vector -> Yojson.Safe.t 20 | (** [yojson_of_vector vec] converts a vector representing the values 21 | of amino acids into a Yojson.Safe.t JSON object. 22 | 23 | {[ 24 | let amino_acid_values = [|1.2; 3.4; 2.1|] (* Vector representing amino acid values *) 25 | let json_repr = yojson_of_vector amino_acid_values (* Conversion to JSON object *) 26 | ]} 27 | 28 | In this example, if [amino_acid_values] represents the values of three amino acids in the order 29 | "Alanine", "Aspartic Acid", and "Cysteine", then [json_repr] will be a JSON object with the following structure: 30 | 31 | {[ 32 | { 33 | "A": 1.2, 34 | "D": 3.4, 35 | "C": 2.1 36 | } 37 | ]} 38 | 39 | Each amino acid is represented as a key-value pair in the JSON object, where the key is the character representation 40 | and the value is the corresponding value in the vector. 41 | 42 | This function is useful when you need to serialize amino acid values in a JSON format for storage or communication purposes. 43 | *) 44 | 45 | val vector_of_yojson : Yojson.Safe.t -> vector 46 | (** [vector_of_yojson aa_list] converts a JSON object of type 47 | `Yojson.Safe.t` into a vector of amino acid values. The JSON 48 | object should contain key-value pairs, where the key is the 49 | character representation of an amino acid and the value is the 50 | corresponding value. The order of the key-value pairs should 51 | follow the alphabetical order. If any amino acid is missing or the 52 | order is not correct, an exception is raised. *) 53 | 54 | val to_char : t -> char 55 | (** [to_char aa] converts the integer index of an amino acid [aa] to 56 | its character representation. *) 57 | 58 | val of_char : char -> t option 59 | (** [of_char c] converts a character representation [c] of an amino 60 | acid to its corresponding integer index, returning an option. If 61 | the character is not a valid amino acid, [None] is returned. *) 62 | 63 | val of_char_exn : char -> t 64 | (** Same as {! of_char} but raises an exception if some error 65 | happens *) 66 | -------------------------------------------------------------------------------- /lib/birth_death.ml: -------------------------------------------------------------------------------- 1 | open Base 2 | open Gsl 3 | 4 | type t = { 5 | birth_rate : float ; 6 | death_rate : float ; 7 | } 8 | 9 | let make ~birth_rate ~death_rate = 10 | if Float.(birth_rate < 0. || death_rate < 0.) 11 | then invalid_arg "birth rate and death rate should be positive" ; 12 | { birth_rate ; death_rate } 13 | 14 | let simulation p rng ~time = 15 | let open ID_monad in 16 | let birth_mean_time = 1. /. p.birth_rate 17 | and death_mean_time = 1. /. p.death_rate in 18 | let rec branch remaining_time = 19 | let next_birth = Randist.exponential rng ~mu:birth_mean_time in 20 | let next_death = Randist.exponential rng ~mu:death_mean_time in 21 | let next_event = Float.min next_birth next_death in 22 | let* id = new_id in 23 | if Float.(next_event > remaining_time) then 24 | return @@ Tree.branch remaining_time (Tree.leaf id) 25 | else if Float.(next_birth < next_death) then 26 | let remaining_time' = remaining_time -. next_event in 27 | let* left_branch = branch remaining_time' in 28 | let+ right_branch = branch remaining_time' in 29 | Tree.branch next_birth (Tree.binary_node id left_branch right_branch) 30 | else 31 | return @@ Tree.branch next_death (Tree.leaf id) 32 | in 33 | run (branch time) 34 | 35 | let sample_different_ints rng n = 36 | let i = Rng.uniform_int rng n in 37 | let j = Rng.uniform_int rng (n - 1) in 38 | let j = if j < i then j else j + 1 in 39 | if i < j then i, j 40 | else j, i 41 | 42 | let sample_branch rng times = 43 | let n = Array.length times + 1 in 44 | let particles = Array.init n ~f:(fun i -> Tree.leaf i, 0.) in 45 | let rec loop i = 46 | if i = 1 then fst particles.(0) 47 | else 48 | let k, l = sample_different_ints rng i in 49 | let t = times.(n - i) in 50 | let length_k = t -. snd particles.(k) in 51 | let branch_k = Tree.branch length_k (fst particles.(k)) in 52 | let length_l = t -. snd particles.(l) in 53 | let branch_l = Tree.branch length_l (fst particles.(l)) in 54 | particles.(k) <- (Tree.binary_node () branch_k branch_l), t ; 55 | particles.(l) <- particles.(i - 1) ; 56 | loop (i - 1) 57 | in 58 | loop n 59 | 60 | (* TODO: check the implementation against the TESS R package, for 61 | * instance by inspecting the distribution of a summary statistics 62 | * like the gamma on simulation from both implementations *) 63 | let age_ntaxa_simulation ?sampling_probability:(rho = 1.) p rng ~age ~ntaxa = 64 | if Float.(p.birth_rate <= p.death_rate) then invalid_arg "expected birth_rate > death_rate" ; 65 | let n_inner_nodes = ntaxa - 1 in 66 | if n_inner_nodes < 1 then invalid_arg "not enough taxa" ; 67 | let u = Array.init n_inner_nodes ~f:(fun _ -> Rng.uniform rng) in 68 | let b = p.birth_rate and d = p.death_rate in 69 | let speciation_times = Array.map u ~f:Float.(fun u -> 70 | age 71 | - 72 | ( 73 | log ( 74 | ( 75 | (b - d) 76 | / (1. - u * (1. - ((b-d)*exp((d-b)*age))/(rho*b+(b*(1.-rho)-d)*exp((d-b)*age) ) ) ) 77 | - (b * (1. - rho) - d) 78 | ) 79 | / 80 | (rho * b) 81 | ) 82 | + (d - b) * age ) / (d-b) 83 | ) 84 | in 85 | Array.sort speciation_times ~compare:Float.compare ; 86 | sample_branch rng speciation_times 87 | -------------------------------------------------------------------------------- /lib/birth_death.mli: -------------------------------------------------------------------------------- 1 | (** Simulation for birth-death models 2 | 3 | Provides functionality for simulating tree structures given a 4 | certain rate for speciation (births) and extinction (deaths) 5 | rates. *) 6 | 7 | type t = private { 8 | birth_rate : float ; 9 | death_rate : float ; 10 | } 11 | 12 | val make : birth_rate:float -> death_rate:float -> t 13 | (** [make ~birth_rate ~death_rate] creates a birth death parameter object 14 | with the given birth rate and death rate. It raises an [Invalid_argument] 15 | exception if either the birth rate or the death rate is negative. *) 16 | 17 | val simulation : 18 | t -> 19 | Gsl.Rng.t -> 20 | time:float -> 21 | (int, int, float) Tree.branch 22 | (** [simulation p rng ~time] simulates a birth death process using the 23 | parameters [p] and the random number generator [rng] for the given 24 | [time] duration. It returns a tree structure where nodes have an 25 | integer ID and where branches are annotated with their length. *) 26 | 27 | val age_ntaxa_simulation : 28 | ?sampling_probability:float -> 29 | t -> 30 | Gsl.Rng.t -> 31 | age:float -> 32 | ntaxa:int -> 33 | (unit, int, float) Tree.t 34 | (** [age_ntaxa_simulation p rng ~age ~ntaxa] simulates a birth death 35 | process of parameters [p] using random generator [rng], 36 | conditioned on the age of the MRCA being [age] and having [ntaxa] 37 | leaves. 38 | Raises [Invalid_argument] if the death rate is greater than the 39 | birth rate. The algorithm is adapted from the TESS R package, see 40 | function [tess.sim.taxa.age.constant], and the underlying algorithm 41 | is described in "Inferring Speciation and Extinction Rates under 42 | Different Sampling Schemes" by Sebastian Höhna et al. *) 43 | -------------------------------------------------------------------------------- /lib/bppsuite.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | type alphabet = 4 | | DNA 5 | | RNA 6 | | Protein 7 | | Binary 8 | | Word of { letter : [`DNA | `RNA | `Protein] ; 9 | length : int } 10 | | Codon of { letter : [`DNA | `RNA] } 11 | 12 | let string_of_letter = function 13 | | `DNA -> "DNA" 14 | | `RNA -> "RNA" 15 | | `Protein -> "Protein" 16 | 17 | let string_of_alphabet = function 18 | | DNA -> "DNA" 19 | | RNA -> "RNA" 20 | | Protein -> "Protein" 21 | | Binary -> "Binary" 22 | | Word { letter ; length } -> 23 | sprintf "Word(letter=%s, length=%d)" 24 | (string_of_letter letter) length 25 | | Codon { letter } -> 26 | sprintf "Codon(letter=%s)" (string_of_letter letter) 27 | 28 | type model = 29 | | JC69 30 | | K80 of { kappa : float option } 31 | 32 | let string_of_model = function 33 | | JC69 -> "JC69" 34 | | K80 { kappa } -> 35 | Option.value_map kappa ~default:"" ~f:(sprintf "kappa=%f") 36 | |> sprintf "K80(%s)" 37 | 38 | module Cmd = struct 39 | let bppml ~alphabet ~model ~input_tree_file ~input_sequence_file ?output_tree_file () = 40 | Printf.sprintf 41 | "bppml \ 42 | input.tree.file=%s \ 43 | input.sequence.file=%s \ 44 | alphabet=\"%s\" \ 45 | model=\"%s\" \ 46 | %s \ 47 | optimization=None" 48 | input_tree_file input_sequence_file 49 | (string_of_alphabet alphabet) 50 | (string_of_model model) 51 | (Option.value_map output_tree_file ~default:"" ~f:(sprintf "output.tree.file=%s")) 52 | 53 | let bppseqgen ~alphabet ~model ~number_of_sites ~input_tree_file ~output_sequence_file = 54 | Printf.sprintf 55 | "bppseqgen \ 56 | input.tree.file=%s \ 57 | output.sequence.file=%s \ 58 | alphabet=\"%s\" \ 59 | model=\"%s\" \ 60 | number_of_sites=%d" 61 | input_tree_file output_sequence_file (string_of_alphabet alphabet) 62 | (string_of_model model) number_of_sites 63 | end 64 | -------------------------------------------------------------------------------- /lib/bppsuite.mli: -------------------------------------------------------------------------------- 1 | (** Utility functions to call programs from the Bio++ library *) 2 | 3 | type alphabet = 4 | | DNA 5 | | RNA 6 | | Protein 7 | | Binary 8 | | Word of { letter : [`DNA | `RNA | `Protein] ; 9 | length : int } 10 | | Codon of { letter : [`DNA | `RNA] } 11 | 12 | val string_of_alphabet : alphabet -> string 13 | 14 | type model = 15 | | JC69 16 | | K80 of { kappa : float option } 17 | 18 | val string_of_model : model -> string 19 | 20 | module Cmd : sig 21 | val bppml : 22 | alphabet:alphabet -> 23 | model:model -> 24 | input_tree_file:string -> 25 | input_sequence_file:string -> 26 | ?output_tree_file:string -> 27 | unit -> 28 | string 29 | 30 | val bppseqgen : 31 | alphabet:alphabet -> 32 | model:model -> 33 | number_of_sites:int -> 34 | input_tree_file:string -> 35 | output_sequence_file:string -> 36 | string 37 | end 38 | -------------------------------------------------------------------------------- /lib/discrete_pd.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | type t = { 4 | shift : int ; 5 | weights : float array ; 6 | } 7 | 8 | let is_leaf dpd i = i >= dpd.shift 9 | 10 | let init n ~f = 11 | let shift = Float.(to_int (2. ** round_up (log (float n) /. log 2.))) - 1 in 12 | let m = shift + n in 13 | let weights = Array.create ~len:m 0. in 14 | for i = 0 to n - 1 do 15 | weights.(shift + i) <- f (i) 16 | done ; 17 | for i = shift - 1 downto 0 do 18 | if 2 * i + 1 < m then weights.(i) <- weights.(2 * i + 1) ; 19 | if 2 * i + 2 < m then weights.(i) <- weights.(i) +. weights.(2 * i + 2) 20 | done ; 21 | { shift ; weights } 22 | 23 | let draw dpd rng = 24 | let x = dpd.weights.(0) *. Gsl.Rng.uniform rng in 25 | let rec loop acc i = 26 | if is_leaf dpd i then i 27 | else if Float.( >= ) (acc +. dpd.weights.(2 * i + 1)) x then 28 | loop acc (2 * i + 1) 29 | else loop (acc +. dpd.weights.(2 * i + 1)) (2 * i + 2) 30 | in 31 | loop 0. 0 - dpd.shift 32 | 33 | let update dpd i w_i = 34 | let m = Array.length dpd.weights in 35 | let j = i + dpd.shift in 36 | dpd.weights.(j) <- w_i ; 37 | let rec loop k = 38 | dpd.weights.(k) <- dpd.weights.(2 * k + 1) ; 39 | if 2 * k + 2 < m then dpd.weights.(k) <- dpd.weights.(k) +. dpd.weights.(2 * k + 2) ; 40 | if k > 0 then loop ((k - 1) / 2) 41 | in 42 | loop ((j - 1) / 2) 43 | 44 | let total_weight dpd = dpd.weights.(0) 45 | 46 | let demo ~n ~ncat = 47 | let rng = Gsl.Rng.(make (default ())) in 48 | let probs = Array.init ncat ~f:(fun _ -> Gsl.Rng.uniform rng) in 49 | let sum = Array.fold probs ~init:0. ~f:( +. ) in 50 | let pd = init ncat ~f:(fun _ -> 0.) in 51 | let counts = Array.create ~len:ncat 0 in 52 | Array.iteri probs ~f:(update pd) ; 53 | for _ = 1 to n do 54 | let k = draw pd rng in 55 | counts.(k) <- counts.(k) + 1 56 | done ; 57 | Array.map probs ~f:(fun x -> x /. sum), 58 | Array.map counts ~f:(fun k -> float k /. float n) 59 | -------------------------------------------------------------------------------- /lib/discrete_pd.mli: -------------------------------------------------------------------------------- 1 | (** Updatable discrete probability distribution 2 | 3 | This module provides functionality for working with discrete 4 | probability distributions. A discrete probability distribution 5 | (DPD) represents the probability distribution of a discrete random 6 | variable, where each possible value (category) has an associated 7 | probability weight. 8 | 9 | The DPD in this module supports operations such as initialization, 10 | updating category weights, sampling categories according to their 11 | probabilities, and calculating the total weight of the 12 | distribution. 13 | 14 | The DPD module is designed to be used in various applications that 15 | involve discrete probability distributions, such as stochastic 16 | simulations, Markov chain Monte Carlo (MCMC) methods, and 17 | probabilistic modeling. 18 | 19 | Example usage: 20 | 21 | {[ 22 | let dpd = Discrete_pd.init 3 ~f:(fun i -> float (i + 1)) 23 | Discrete_pd.update dpd 1 4.0 24 | let rng = Gsl.Rng.(make (default ())) 25 | let category = Discrete_pd.draw dpd rng 26 | ]} 27 | 28 | In the example above, a discrete probability distribution (dpd) 29 | with 3 categories is initialized, and the weight of category 1 is 30 | updated. Then, a category index is sampled using the DPD and a 31 | random number generator (rng). 32 | *) 33 | 34 | type t 35 | 36 | val init : int -> f:(int -> float) -> t 37 | (** [init n ~f] initializes an updatable discrete probability 38 | distribution with [n] categories. It takes a function [f] that 39 | specifies the weight for each category. 40 | 41 | Example: 42 | {[ 43 | let dpd = init 3 ~f:(fun i -> float (i + 1)) 44 | ]} 45 | 46 | This initializes a discrete probability distribution with 3 47 | categories and assigns weights 1.0, 2.0, and 3.0 to each category 48 | respectively. *) 49 | 50 | val update : t -> int -> float -> unit 51 | (** [update dpd i w_i] updates the weight of category [i] in the 52 | discrete probability distribution [dpd] to the new weight [w_i]. 53 | 54 | Example: 55 | {[ 56 | let dpd = init 3 ~f:(fun i -> float (i + 1)) 57 | update dpd 1 4.0 58 | ]} 59 | 60 | This updates the weight of category 1 in the discrete probability 61 | distribution [dpd] to 4.0. *) 62 | 63 | val draw : t -> Gsl.Rng.t -> int 64 | (** [draw dpd rng] samples a category index from the discrete 65 | probability distribution [dpd] using the random number generator 66 | [rng]. It returns the index of the sampled category. 67 | 68 | Example: 69 | {[ 70 | let dpd = init 3 ~f:(fun i -> float (i + 1)) 71 | let rng = Gsl.Rng.(make (default ())) 72 | let category = draw dpd rng 73 | ]} 74 | *) 75 | 76 | val total_weight : t -> float 77 | (** [total_weight dpd] returns the total weight of all categories in 78 | the discrete probability distribution [dpd]. 79 | 80 | Example: 81 | {[ 82 | let dpd = init 3 ~f:(fun i -> float (i + 1)) 83 | let weight = total_weight dpd 84 | ]} 85 | 86 | This calculates the total weight of all categories in the discrete 87 | probability distribution [dpd] and assigns it to the variable 88 | [weight]. *) 89 | 90 | val demo : n:int -> ncat:int -> float array * float array 91 | (** [demo ~n ~ncat] is a demo function that generates a discrete 92 | probability distribution with [ncat] categories and generates a 93 | sample of size [n] from it. It returns a tuple formed by the 94 | probability distribution and the frequencies in the sample. 95 | 96 | Example: 97 | {[ 98 | let probabilities, counts = demo ~n:1000 ~ncat:5 99 | ]} 100 | 101 | This generates random probabilities, creates an updatable discrete 102 | probability distribution with 5 categories, performs 1000 sampling 103 | trials, and assigns the generated probabilities normalized by 104 | their sum to the variable [probabilities] and the counts of each 105 | sampled category divided by 1000 to the variable [counts]. *) 106 | -------------------------------------------------------------------------------- /lib/dna.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | type t = string 4 | 5 | let of_string_unsafe x = x 6 | 7 | let of_string_exn s = 8 | match String.find s ~f:(function 9 | | 'a' | 'A' | 'c' | 'C' 10 | | 'g' | 'G' | 't' | 'T' -> false 11 | | _ -> true 12 | ) 13 | with 14 | | None -> s 15 | | Some c -> invalid_argf "of_string_exn: unexpected character '%c'" c () 16 | 17 | 18 | let of_codons codons = 19 | Array.map codons ~f:Codon.Universal_genetic_code.NS.to_string 20 | |> String.concat_array 21 | |> of_string_exn 22 | 23 | let gc_contents s = 24 | let n = String.count s ~f:(function 25 | | 'C' | 'G' -> true 26 | | 'A' | 'T' -> false 27 | | _ -> assert false 28 | ) 29 | in 30 | float n /. float (String.length s) 31 | -------------------------------------------------------------------------------- /lib/dna.mli: -------------------------------------------------------------------------------- 1 | (** Strings that represent valid DNA sequences *) 2 | 3 | type t = private string 4 | 5 | val of_string_unsafe : string -> t 6 | (** [of_string_unsafe s] creates a DNA sequence from a string without validation. 7 | {[ 8 | let dna_sequence = of_string_unsafe "ACGT" in 9 | (* dna_sequence is now "ACGT". If an unexpected character is encountered, the result is undefined. *) 10 | ]} 11 | *) 12 | 13 | val of_string_exn : string -> t 14 | (** [of_string_exn s] creates a DNA sequence from a string, raising an 15 | exception if an unexpected character is encountered. Raises 16 | [Invalid_argument] if an unexpected character is found. 17 | 18 | Example: 19 | {[ 20 | let dna_sequence = of_string_exn "ACGT" in 21 | (* dna_sequence is now "ACGT" *) 22 | ]} 23 | *) 24 | 25 | val of_codons : Codon.Universal_genetic_code.NS.t array -> t 26 | (** [of_codons codons] creates a DNA sequence from an array of codons. 27 | 28 | Example: 29 | 30 | {[ 31 | let codons = [|Codon.Universal_genetic_code.NS.Adenine; Codon.Universal_genetic_code.NS.Cytosine; Codon.Universal_genetic_code.NS.Guanine; Codon.Universal_genetic_code.NS.Thymine|] in 32 | let dna_sequence = of_codons codons in 33 | (* dna_sequence is now "ACGT" *) 34 | ]} 35 | *) 36 | 37 | val gc_contents : t -> float 38 | (** [gc_contents s] calculates the GC (Guanine/Cytosine) contents of a 39 | DNA sequence, that is it counts the occurrences of 'C' and 'G' in 40 | [s] and returns the ratio of GC bases to the total length of the 41 | sequence. 42 | 43 | Example: 44 | 45 | {[ 46 | let dna_sequence = of_string_exn "ACGT" in 47 | let gc_content = gc_contents dna_sequence in 48 | (* gc_content is approximately 0.5 *) 49 | ]} 50 | *) 51 | -------------------------------------------------------------------------------- /lib/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name phylogenetics) 3 | (public_name phylogenetics) 4 | (inline_tests 5 | (deps ../tests/data/wag.dat)) 6 | (libraries 7 | angstrom.unix 8 | biotk 9 | core 10 | gsl 11 | lacaml 12 | yojson 13 | menhirLib 14 | printbox 15 | printbox-text 16 | rresult) 17 | (preprocess 18 | (pps ppx_deriving.show ppx_inline_test ppx_jane))) 19 | 20 | (menhir 21 | (modules newick_parser) 22 | (flags --table)) 23 | 24 | (ocamllex newick_lexer) 25 | 26 | ;; No built-in support for Menhir's parser messages yet 27 | 28 | (rule 29 | (with-stdout-to 30 | newick_parser.messages.new 31 | (run menhir %{dep:newick_parser.mly} --list-errors))) 32 | 33 | (rule 34 | (with-stdout-to 35 | newick_parser_errors.ml 36 | (run 37 | menhir 38 | %{dep:newick_parser.mly} 39 | --compile-errors 40 | %{dep:newick_parser.messages}))) 41 | 42 | (rule 43 | (with-stdout-to 44 | newick_parser.messages.updated 45 | (run 46 | menhir 47 | %{dep:newick_parser.mly} 48 | --update-errors 49 | %{dep:newick_parser.messages}))) 50 | 51 | (rule 52 | (alias update-newick_parser-messages) 53 | (action 54 | (diff newick_parser.messages newick_parser.messages.updated))) 55 | 56 | (rule 57 | (alias create-newick_parser-messages) 58 | (action 59 | (diff newick_parser.messages newick_parser.messages.new))) 60 | 61 | (documentation 62 | (package phylogenetics)) 63 | -------------------------------------------------------------------------------- /lib/felsenstein.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetic_tree 3 | 4 | module type Alignment = sig 5 | type t 6 | type base 7 | val get_base : t -> seq:string -> pos:int -> base 8 | val length : t -> int 9 | end 10 | 11 | module Make 12 | (A : Alphabet.S) 13 | (Align : Alignment with type base := A.t) 14 | (E : Site_evolution_model.S with type mat := A.matrix 15 | and type vec := A.vector) = 16 | struct 17 | let known_vector x = A.Vector.init (fun y -> if A.equal x y then 1. else 0.) 18 | 19 | (* ======================= *) 20 | (* | Generic Felsenstein | *) 21 | (* ======================= *) 22 | let felsenstein_single ?(shift=fun _ _ v->v,0.0) param = 23 | (* First, specialize the eMt function to compute the diagonalization 24 | of the transition matrix once and for all. *) 25 | let spec_eMt = E.transition_probability_matrix param in 26 | 27 | fun ~site tree seq -> 28 | let rec aux = function (* then, go recursively through the topology tree *) 29 | | Leaf {index; _} -> leaf index 30 | | Node {left=l1,t1; right=l2,t2; _} -> node l1 t1 l2 t2 31 | 32 | (* On leaves, the probability vector is of the form (0,0,..,0,1,0,...0) 33 | where the position of the 1 is the position of the observed base. *) 34 | and leaf i = Align.get_base seq ~seq:i ~pos:site 35 | |> known_vector |> shift 0.0 0.0 36 | (* there is no need to shift here, but the function is called anyways in case 37 | shift does not return the vector directly*) 38 | 39 | and node l1 t1 l2 t2 = 40 | let (v_l, s_l), (v_r, s_r) = aux t1, aux t2 in (* recursive calls *) 41 | A.Vector.mul 42 | (A.Matrix.apply (spec_eMt l1) v_l) (* vector of child x exp(branch_length x transition matrix) *) 43 | (A.Matrix.apply (spec_eMt l2) v_r) 44 | |> shift s_l s_r (* shift is a function that is responsible for preventing float underflows, 45 | it is allowed to divide the result and carry the log along the tree*) 46 | 47 | in let res_vec, res_shift = aux tree in 48 | res_vec |> A.Vector.mul (E.stationary_distribution param) |> A.Vector.sum |> log |> (+.) res_shift 49 | (* In the end, multiply result vector by the static distribution of the model, 50 | then sum elements to get the likelihood, then take its log and add the log shifts 51 | that were performed to avoid underflows. *) 52 | 53 | 54 | (* ============================ *) 55 | (* | Specific implementations | *) 56 | (* ============================ *) 57 | let shift_normal thre acc1 acc2 v = 58 | (* if the min value of the vector is below a certain threshold... *) 59 | if Float.(A.Vector.min v > thre) then (v, acc1 +. acc2) 60 | else 61 | let mv = A.Vector.max v in (* then divide by its max and add the log of the max to *) 62 | (A.Vector.scal_mul (1.0 /. mv) v, acc1 +. acc2 +. (log mv)) (* an accumulator *) 63 | 64 | let felsenstein_single_shift ?threshold:(threshold=0.0000001) param = 65 | felsenstein_single 66 | ~shift:(shift_normal threshold) 67 | param 68 | 69 | 70 | (* ======================= *) 71 | (* | Multi-site versions | *) 72 | (* ======================= *) 73 | let multisite (f: site:int -> Phylogenetic_tree.t -> Align.t -> float) tree seq = 74 | let l = Align.length seq in 75 | List.fold (List.range 0 l) ~init:0.0 ~f:(fun acc x -> (f ~site:x tree seq) +. acc) 76 | 77 | let felsenstein_noshift param = multisite (felsenstein_single param) 78 | let felsenstein param = multisite (felsenstein_single_shift param) 79 | 80 | end 81 | -------------------------------------------------------------------------------- /lib/felsenstein.mli: -------------------------------------------------------------------------------- 1 | (** Deprecated, see {! Phylo_ctmc}. Functions that implement Felsenstein's "pruning" algorithm to compute 2 | likelihood of phylogenetic trees with known sequences at leaves.*) 3 | 4 | module type Alignment = sig 5 | type t 6 | type base 7 | val get_base : t -> seq:string -> pos:int -> base 8 | val length : t -> int 9 | end 10 | 11 | module Make 12 | (A : Alphabet.S) 13 | (Align : Alignment with type base := A.t) 14 | (E : Site_evolution_model.S with type mat := A.matrix 15 | and type vec := A.vector) : 16 | sig 17 | (** Single-site. felsenstein without underflow prevention. *) 18 | val felsenstein_single : 19 | ?shift:(float -> float -> A.vector -> A.vector * float) -> 20 | E.param -> 21 | site:int -> 22 | Phylogenetic_tree.t -> 23 | Align.t -> 24 | float 25 | 26 | (** Single-site felsenstein with underflow prevention (configure threshold through threshold parameter). *) 27 | val felsenstein_single_shift : 28 | ?threshold:float -> 29 | E.param -> 30 | site:int -> 31 | Phylogenetic_tree.t -> 32 | Align.t -> 33 | float 34 | 35 | (** Multisite felsenstein without underflow prevention. *) 36 | val felsenstein_noshift : 37 | E.param -> 38 | Phylogenetic_tree.t -> 39 | Align.t -> 40 | float 41 | 42 | (** Multisite felsenstein with underflow prevention (use this by default). *) 43 | val felsenstein : 44 | E.param -> 45 | Phylogenetic_tree.t -> 46 | Align.t -> 47 | float 48 | end 49 | -------------------------------------------------------------------------------- /lib/fitch.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | let default_cost x y = 4 | if x = y then 0. else 1. 5 | 6 | let rec forward ?(cost = default_cost) ~n ~category (t : (_,'l,_) Tree.t) = 7 | match t with 8 | | Leaf l -> 9 | let costs = 10 | match category l with 11 | | Some cat -> 12 | if cat < 0 || cat >= n then invalid_arg "category returned integer not in [0;n[" ; 13 | Array.init n ~f:(fun i -> if i = cat then 0. else Float.infinity) 14 | | None -> Array.create ~len:n 0. 15 | in 16 | costs, Tree.leaf l 17 | | Node node -> 18 | let children_costs, children = 19 | List1.map node.branches ~f:(fun (Branch b) -> 20 | let cost, child = forward ~cost ~n ~category b.tip in 21 | cost, Tree.branch b.data child 22 | ) 23 | |> List1.unzip 24 | in 25 | let costs, choices = 26 | Array.init n ~f:(fun i -> 27 | let costs_for_root_i = 28 | List1.map children_costs ~f:(fun costs -> 29 | let cost j = costs.(j) +. cost i j in 30 | let rec loop j best_cost best_choice = 31 | if j = n then (best_cost, best_choice) 32 | else 33 | let candidate_cost = cost j in 34 | if Float.(candidate_cost < best_cost) then 35 | loop (j + 1) candidate_cost j 36 | else 37 | loop (j + 1) best_cost best_choice 38 | in 39 | loop 1 (cost 0) 0 40 | ) 41 | in 42 | let costs, choices = List1.unzip costs_for_root_i in 43 | let total_cost = List1.fold costs ~init:0. ~f:( +. ) in 44 | total_cost, choices 45 | ) 46 | |> Array.unzip 47 | in 48 | costs, Tree.node (node.data, choices) children 49 | 50 | let rec backward_aux t i = match t with 51 | | Tree.Leaf l -> Tree.leaf (l, i) 52 | | Node n -> 53 | Tree.node (fst n.data, i) ( 54 | List1.map2_exn n.branches (snd n.data).(i) ~f:(fun (Branch b) choice -> 55 | Tree.branch b.data (backward_aux b.tip choice) 56 | ) 57 | ) 58 | 59 | let array_min_elt_index xs ~compare = 60 | match xs with 61 | | [| |] -> None 62 | | _ -> 63 | let n = Array.length xs in 64 | let rec loop acc i = 65 | if i >= n then Some acc 66 | else 67 | let acc' = 68 | match compare xs.(acc) xs.(i) with 69 | | -1 70 | | 0 -> acc 71 | | 1 -> i 72 | | _ -> assert false 73 | in 74 | loop acc' (i + 1) 75 | in 76 | loop 0 1 77 | 78 | let backward costs t = 79 | match array_min_elt_index costs ~compare:Float.compare with 80 | | None -> assert false 81 | | Some root -> backward_aux t root 82 | 83 | let fitch ?cost ~n ~category t = 84 | let costs, routing = forward ?cost ~n ~category t in 85 | backward costs routing 86 | 87 | let%expect_test "fitch" = 88 | let node x y = Tree.node () List1.(cons (Tree.branch () x) [ Tree.branch () y ]) in 89 | let leaf x = Tree.leaf x in 90 | let t = node (node (leaf 0) (leaf 1)) (leaf 0) in 91 | let p (_, i) = Int.to_string i in 92 | fitch ~category:Option.return ~n:2 t 93 | |> Tree.to_printbox ~leaf:p ~node:p 94 | |> PrintBox_text.output stdout ; 95 | [%expect {| 96 | 0 97 | ├─0 98 | │ ├─0 99 | │ └─1 100 | └─0 |}] 101 | 102 | let%expect_test "fitch_2" = 103 | let node x y = Tree.node () List1.(cons (Tree.branch () x) [ Tree.branch () y ]) in 104 | let leaf x = Tree.leaf x in 105 | let p (_, i) = Int.to_string i in 106 | let t = node (node (leaf 0) (leaf 1)) (node (leaf 1) (leaf 2)) in 107 | fitch ~n:3 ~category:Option.return t 108 | |> Tree.to_printbox ~leaf:p ~node:p 109 | |> PrintBox_text.output stdout ; 110 | [%expect {| 111 | 1 112 | ├─1 113 | │ ├─0 114 | │ └─1 115 | └─1 116 | ├─1 117 | └─2 |}] 118 | -------------------------------------------------------------------------------- /lib/index.mld: -------------------------------------------------------------------------------- 1 | {0 Phylogenetics} 2 | 3 | This library provides data structures and algorithms to parse, 4 | transform and draw phylogenies, perform simulations or statistical 5 | inference on evolutionary models. 6 | 7 | The full API is available {{! Phylogenetics}here}. 8 | 9 | {1 Quickstart} 10 | 11 | Let's start using phylogenetics in the ocaml interpreter, by calling 12 | `utop` or `ocaml` (with `down` installed for instance): 13 | {[ 14 | # #require "phylogenetics";; 15 | [...] 16 | ]} 17 | 18 | {2 Loading a newick tree} 19 | 20 | Several functions are available in the {! Phylogenetics.Newick} module to parse from 21 | a file or a string: 22 | {[ 23 | # let t = Phylogenetics.Newick.from_string_exn "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;";; 24 | val t : Phylogenetics.Newick.t = 25 | {Phylogenetics.Newick.name = Some "A"; tags = []; parent_branch = None; 26 | children = 27 | [{Phylogenetics.Newick.name = Some "F"; tags = []; 28 | parent_branch = Some 0.1; 29 | children = 30 | [...] 31 | ]} 32 | -------------------------------------------------------------------------------- /lib/iupac_nucleotide.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | include Alphabet.Make(struct let card = 16 end) 4 | 5 | let chars_of_aa_string = "ACGTRYSWKMBDHVN" 6 | 7 | let code_A = Char.to_int 'A' 8 | 9 | let iupac_of_chars = 10 | let t = Array.create ~len:26 None in 11 | for i = 0 to String.length chars_of_aa_string - 1 do 12 | t.(Char.to_int chars_of_aa_string.[i] - code_A) <- Some i 13 | done ; 14 | t 15 | 16 | let of_char = function 17 | | 'A'..'Z' as c -> iupac_of_chars.(Char.to_int c - code_A) 18 | | _ -> None 19 | 20 | let is_ambiguous c = c > 3 21 | 22 | let%test "IUPAC ambiguity" = 23 | Option.map (of_char 'R') ~f:is_ambiguous 24 | |> Option.value ~default:false 25 | -------------------------------------------------------------------------------- /lib/iupac_nucleotide.mli: -------------------------------------------------------------------------------- 1 | (** Nucleotide IUPAC symbol 2 | 3 | IUPAC symbols are characters that can be used to represent a 4 | subset of nucleotides, for instance when the nucleotide at a given 5 | position is not precisely known. A IUPAC symbol is said {i 6 | ambiguous} if it corresponds to a set of size at least 2. 7 | 8 | Valid IUPAC symbols for nucleotides can be found {{: 9 | https://www.bioinformatics.org/sms/iupac.html} here}. 10 | *) 11 | 12 | include Alphabet.S_int 13 | 14 | val of_char : char -> t option 15 | (** [of_char c] returns the IUPAC symbol corresponding to the given 16 | character [c]. It returns [None] if the character is not a valid 17 | IUPAC nucleotide symbol. 18 | 19 | Example: 20 | {[ 21 | let symbol = Iupac_nucleotide.of_char 'A' in 22 | (* symbol = Some 0 *) 23 | ]} 24 | *) 25 | 26 | val is_ambiguous : t -> bool 27 | (** [is_ambiguous sym] checks if the given IUPAC symbol [sym] 28 | represents an ambiguous nucleotide. It returns [true] if the 29 | symbol is ambiguous, and [false] otherwise. 30 | 31 | Example: 32 | {[ 33 | let ambiguous = Iupac_nucleotide.is_ambiguous 4 in 34 | (* ambiguous = true *) 35 | ]} 36 | *) 37 | -------------------------------------------------------------------------------- /lib/let_syntax.ml: -------------------------------------------------------------------------------- 1 | module Result = struct 2 | let (let*) = Result.bind 3 | let (let+) x f = Result.map f x 4 | end 5 | 6 | module Option = struct 7 | let (let*) = Option.bind 8 | let (let+) x f = Option.map f x 9 | let (and+) x y = 10 | match x, y with 11 | | Some x, Some y -> Some (x, y) 12 | | Some _, None 13 | | None, Some _ 14 | | None, None -> None 15 | end 16 | -------------------------------------------------------------------------------- /lib/linear_algebra_tools.ml: -------------------------------------------------------------------------------- 1 | let rec fact n = if n=0 || n=1 then 1 else n * (fact (n-1)) 2 | 3 | module Lacaml = struct 4 | open Lacaml.D 5 | 6 | type mat = Lacaml__Float64.mat 7 | type vec = Lacaml__Float64.vec 8 | 9 | let mat_vec_mul m v = gemv m v 10 | 11 | let inplace_scal_mat_mul f a = Mat.scal f a 12 | 13 | let scal_mat_mul f a = 14 | let r = lacpy a in 15 | inplace_scal_mat_mul f r ; 16 | r 17 | 18 | let inplace_scal_vec_mul s v = scal s v 19 | 20 | let scal_vec_mul s v = 21 | let r = copy v in 22 | scal s r ; r 23 | 24 | let scal_vec_add s v = Lacaml.D.Vec.add_const s v 25 | 26 | module Mat = struct 27 | type t = mat 28 | let init size ~f = Mat.init_rows size size f 29 | let init_diag v = Mat.of_diag v 30 | let mul a ?alpha:(al=1.) b = gemm a b ~alpha:al 31 | 32 | let row mat r = Mat.copy_row mat r (* FIXME: costly operation! *) 33 | 34 | (* FIXME: do a lot better by breaking in 2 exponent for even 35 | exponent *) 36 | let rec pow a ?alpha:(al=1.) n = 37 | if n = 0 then 38 | Mat.init_cols (Mat.dim1 a) (Mat.dim2 a) (fun x y -> if x=y then 1.0 else 0.0) 39 | else if n=1 then a 40 | else if n=2 then pow a (n-1) ~alpha:al |> mul a ~alpha:al 41 | else pow a (n-1) ~alpha:al |> mul a 42 | 43 | let add a b = Mat.add a b 44 | 45 | let expm a = 46 | (* matrix exponentiation using the series *) 47 | let rec aux i acc = 48 | if i<15 then 49 | let factor = 1. /. (float_of_int (fact i)) in 50 | let newacc = add (pow a i ~alpha:factor) acc in 51 | (aux (i+1) newacc) 52 | else 53 | acc 54 | in aux 0 (Mat.make0 (Mat.dim1 a) (Mat.dim2 a)) 55 | 56 | let log m = Mat.log m 57 | 58 | let compare ~tol:p m1 m2 = 59 | let diff = add m1 (scal_mat_mul (-1.) m2) in (* substract two matrices *) 60 | let relative_diff = (* element-wise diff/m1 *) 61 | mul diff (Mat.map (fun x -> 1./.x) m1) 62 | |> Mat.abs 63 | in 64 | let maxdiff = Mat.fold_cols (fun acc vec -> max acc (Vec.max vec)) 0.0 relative_diff in 65 | maxdiff <= p 66 | 67 | let get m i j = m.{i,j} 68 | 69 | let inverse m = 70 | let tmp = lacpy m in (* copy matrix to avoid erasing original *) 71 | let tmp_vec = getrf tmp in (* getri requires a previous call to getrf (LU factorization) *) 72 | getri ~ipiv:tmp_vec tmp ; (* inversion *) 73 | tmp 74 | 75 | let diagonalize m = 76 | let tmp = lacpy m in (* copy matrix to avoid erasing original *) 77 | match syevr ~vectors:true tmp with (* syevr = find eigenvalues and eigenvectors *) 78 | | (_,v,c,_) -> c, v, Mat.transpose_copy c (* extract only the relevant data *) 79 | 80 | let pp = pp_mat 81 | end 82 | 83 | module Vec = struct 84 | type t = vec 85 | let init size ~f = Vec.init size (fun i -> f (i - 1)) 86 | let add v1 v2 = Vec.add v1 v2 87 | let mul v1 v2 = Vec.mul v1 v2 88 | let sum v = Vec.sum v 89 | let log v = Vec.log v 90 | let exp v = Vec.exp v 91 | let min v = Vec.min v 92 | let max v = Vec.max v 93 | let get v i = v.{i} 94 | let pp = pp_vec 95 | end 96 | 97 | let stat_dist m = 98 | (* copy matrix to avoid erasing original *) 99 | let tmp = lacpy m in 100 | (* get eigenvector for eigenvalue 0 *) 101 | match syevr ~vectors:true ~range:(`V(-0.001,0.001)) tmp with 102 | | (_,_,c,_) -> 103 | let vec = Lacaml.D.Mat.col c 1 in 104 | (* normalize so the sum of elements equals 1 *) 105 | scal_vec_mul (1. /. (Vec.sum vec)) vec 106 | 107 | end 108 | -------------------------------------------------------------------------------- /lib/linear_algebra_tools.mli: -------------------------------------------------------------------------------- 1 | (** Deprecated, see {! Linear_algebra}. This modules wraps linear algebra functions (from Lacaml) 2 | and provide a few completely new functions (such as exponentiation)*) 3 | 4 | module Lacaml : sig 5 | (** A square matrix of floats. *) 6 | type mat = Lacaml.D.mat 7 | 8 | (** A vector of floats. *) 9 | type vec = Lacaml.D.vec 10 | 11 | module Vec : sig 12 | type t = vec 13 | 14 | (** Initialises a vector from a int->float function. *) 15 | val init : int -> f:(int -> float) -> vec 16 | 17 | (** Vector addition. *) 18 | val add : vec -> vec -> vec 19 | 20 | (** Element-wise product of two vectors. *) 21 | val mul : vec -> vec -> vec 22 | 23 | (** Sum of the elements of a vector. *) 24 | val sum : vec -> float 25 | 26 | (** Element-wise logarithm of vector *) 27 | val log : vec -> vec 28 | 29 | (** Element-wise exponential of matrix*) 30 | val exp : vec -> vec 31 | 32 | (** Minimum element in a vector. *) 33 | val min : vec -> float 34 | 35 | (** Maximum element in a vector. *) 36 | val max : vec -> float 37 | 38 | (** Access a specific element of a vector. *) 39 | val get : vec -> int -> float 40 | 41 | (** Prints a vector to the standard output. *) 42 | val pp : Format.formatter -> vec -> unit 43 | end 44 | 45 | module Mat : sig 46 | type t = mat 47 | 48 | (** {5 Matrix and vector creation} *) 49 | 50 | (** Initialises a square matrix from a int->int->float function. *) 51 | val init : int -> f:(int -> int -> float) -> mat 52 | 53 | (** Initializes a square diagonal matrix from the vector of its diagonal elements. *) 54 | val init_diag : vec -> mat 55 | 56 | (** Computes the product of two matrices. 57 | If optional argument alpha is provided then the result is also 58 | multiplied by scalar alpha.*) 59 | val mul : mat -> ?alpha:float -> mat -> mat 60 | 61 | (** Elevates a matrix to an integer power. 62 | If optional argument alpha is provided then the result is also 63 | multiplied by scalar alpha. *) 64 | val pow: mat -> ?alpha:float -> int -> mat 65 | 66 | (** Matrix addition. *) 67 | val add : mat -> mat -> mat 68 | 69 | (** Matrix exponentiation. *) 70 | val expm : mat -> mat 71 | 72 | (** Element-wise logarithm of matrix *) 73 | val log : mat -> mat 74 | 75 | (** Compares two matrices and tolerates a certain relative difference. 76 | Let f be the float parameter, it returns true iff the elements of the second matrix 77 | are between 1-f and 1+f times the corresponding elements of the first *) 78 | val compare: tol:float -> mat -> mat -> bool 79 | 80 | (** Access a specific element of a matrix. *) 81 | val get : mat -> int -> int -> float 82 | 83 | (** Copy row from a matrix *) 84 | val row : mat -> int -> vec 85 | 86 | (** Diagonalizes a matrix M so that M = PxDxP^T; returns P,v,P^T where 87 | v is the diagonal vector of D.*) 88 | val diagonalize : mat -> mat * vec * mat 89 | 90 | (** Computes the inverse of a matrix. *) 91 | val inverse: mat -> mat 92 | 93 | (** Prints a matrix to the standard output (display may be messy). *) 94 | val pp : Format.formatter -> mat -> unit 95 | end 96 | 97 | (** Matrix-vector product. *) 98 | val mat_vec_mul: mat -> vec -> vec 99 | 100 | (** Multiplication of a matrix by a scalar. *) 101 | val scal_mat_mul : float -> mat -> mat 102 | 103 | (** Scalar-vector product (in-place). *) 104 | val inplace_scal_vec_mul: float -> vec -> unit 105 | 106 | (** Scalar-vector product *) 107 | val scal_vec_mul : float -> vec -> vec 108 | 109 | (** Scalar-vector addition. *) 110 | val scal_vec_add: float -> vec -> vec 111 | 112 | (** Computes the static distribution (ie, eigenvector for eigenvalue 0) of a given matrix. *) 113 | val stat_dist: mat -> vec 114 | end 115 | -------------------------------------------------------------------------------- /lib/list1.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | type 'a t = Cons of 'a * 'a list 4 | [@@deriving sexp] 5 | 6 | let length (Cons (_, xs)) = 1 + List.length xs 7 | 8 | let init n ~f = 9 | if n < 1 then invalid_arg "Non_empty_list should be at least of length 1" ; 10 | Cons (f 0, List.init (n - 1) ~f:(fun i -> f (i + 1))) 11 | 12 | let cons h t = Cons (h, t) 13 | 14 | let cons1 h (Cons (h', t')) = Cons (h, h' :: t') 15 | 16 | let rev (Cons (h, t)) = 17 | let rec loop (h, acc) = function 18 | | [] -> Cons (h, acc) 19 | | h' :: t' -> 20 | loop (h', h :: acc) t' 21 | in 22 | loop (h, []) t 23 | 24 | let fold (Cons (h, t)) ~init ~f = 25 | List.fold t ~f ~init:(f init h) 26 | 27 | let fold_right (Cons (h, t)) ~init ~f = 28 | f h (List.fold_right t ~f ~init) 29 | 30 | let reduce (Cons (h, t)) ~f = 31 | List.reduce_exn (h :: t) ~f 32 | 33 | let map (Cons (h, t)) ~f = 34 | cons (f h) (List.map t ~f) 35 | 36 | let mapi (Cons (h, t)) ~f = 37 | cons (f 0 h) (List.mapi t ~f:(fun i -> f (i + 1))) 38 | 39 | let map2 (Cons (h1, t1)) (Cons (h2, t2)) ~f = 40 | match List.map2 t1 t2 ~f with 41 | | List.Or_unequal_lengths.Ok l -> 42 | Ok (cons (f h1 h2) l) 43 | | List.Or_unequal_lengths.Unequal_lengths -> 44 | Error `Unequal_lengths 45 | 46 | let map2_exn (Cons (h1, t1)) (Cons (h2, t2)) ~f = 47 | cons (f h1 h2) (List.map2_exn t1 t2 ~f) 48 | 49 | let iter (Cons (h, t)) ~f = 50 | f h ; List.iter t ~f 51 | 52 | let to_list (Cons (h, t)) = 53 | h :: t 54 | 55 | let filter_map (Cons (h, t)) ~f = 56 | match f h, List.filter_map t ~f with 57 | | None, [] -> [] 58 | | None, (_ :: _ as r) -> r 59 | | Some h, l -> h :: l 60 | 61 | let filter (Cons (h, t)) ~f = 62 | match f h, List.filter t ~f with 63 | | false, [] -> [] 64 | | false, (_ :: _ as r) -> r 65 | | true, l -> h :: l 66 | 67 | let unzip (Cons ((h1, h2), t)) = 68 | let t1, t2 = List.unzip t in 69 | Cons (h1, t1), Cons (h2, t2) 70 | 71 | let for_all (Cons (h, t)) ~f = 72 | f h && List.for_all t ~f 73 | 74 | let exists (Cons (h, t)) ~f = 75 | f h || List.exists t ~f 76 | 77 | let hd (Cons (h, _)) = h 78 | 79 | let singleton x = Cons (x, []) 80 | 81 | let of_list = function 82 | | [] -> None 83 | | h :: t -> Some (Cons (h, t)) 84 | 85 | let of_list_exn = function 86 | | [] -> failwith "empty list" 87 | | h :: t -> Cons (h, t) 88 | 89 | let sort (Cons (h, t)) ~compare = 90 | match List.sort (h:: t) ~compare with 91 | | h :: t -> Cons (h, t) 92 | | [] -> assert false 93 | -------------------------------------------------------------------------------- /lib/mCMC.ml: -------------------------------------------------------------------------------- 1 | (** Deprecated. A simple implementation of the Monte Carlo Markov Chain algorithm *) 2 | 3 | open Core 4 | 5 | let accept p = 6 | Float.(Random.float 1.0 <= p) 7 | 8 | let run (theta0:'a) (step:'a->'a*float) (likelihood:'a->float) (nb_points:int) = 9 | let rec aux i prev acc = 10 | if i>=nb_points then acc 11 | else 12 | let candidate, hastings_ratio = step prev in 13 | let full_ratio = hastings_ratio *. (likelihood candidate) /. (likelihood prev) in 14 | (* FIXME unnecessary computation of f prev *) 15 | if accept full_ratio then aux (i+1) candidate (candidate::acc) 16 | else aux i prev (prev::acc) 17 | in 18 | aux 0 theta0 [] 19 | 20 | module M = struct 21 | module Alignment = Alignment.Make(Seq.DNA) 22 | module Felsenstein = Felsenstein.Make(Nucleotide)(Alignment)(Site_evolution_model.K80) 23 | end 24 | 25 | type vector = { 26 | tree : Phylogenetic_tree.t ; 27 | align : M.Alignment.t 28 | } 29 | 30 | let my_likelihood v = 31 | let open M in 32 | Stdlib.exp (Felsenstein.felsenstein 2.0 v.tree v.align) *. 33 | (let newlength = List.nth_exn (Phylogenetic_tree.get_branch_lengths v.tree) 5 in 34 | if Float.(newlength>0. && newlength<5.) then 1.0 else 0.0) 35 | 36 | let my_align = M.Alignment.of_string_list ["A";"A";"A";"T"] 37 | let my_basetree = Phylogenetic_tree.of_preorder "0.1;0.1;0.1;0.1;0;1;3.0;0.1;2;3" 38 | let my_theta0 = { align=my_align ; tree=my_basetree } 39 | 40 | let my_step v = 41 | let lengths = Phylogenetic_tree.get_branch_lengths v.tree |> List.mapi ~f:( 42 | let range = 5.0 in 43 | fun i x -> if i=5 44 | then x -. (range/.2.) +. (Random.float range) 45 | else x 46 | ) in 47 | let new_tree = Phylogenetic_tree.set_branch_lengths v.tree lengths in 48 | {align=v.align; tree=new_tree}, 1. 49 | -------------------------------------------------------------------------------- /lib/mG94.ml: -------------------------------------------------------------------------------- 1 | module NSCodon = Codon.Universal_genetic_code.NS 2 | 3 | module Nucleotide_rates = Rate_matrix.Nucleotide 4 | module NSCodon_rate_matrix = Rate_matrix.Make(NSCodon) 5 | 6 | type param = { 7 | nucleotide_rates : Rate_matrix.Nucleotide.t ; 8 | nucleotide_stat_dist : Nucleotide.vector ; 9 | omega : float ; 10 | } 11 | 12 | let rate_matrix { nucleotide_rates ; omega ; _ } = 13 | let nuc_rates = (nucleotide_rates :> Nucleotide.matrix) in 14 | NSCodon_rate_matrix.make (fun p q -> 15 | match NSCodon.neighbours p q with 16 | | Some (_, x_a, x_b) -> 17 | let q_ab = nuc_rates.Nucleotide.%{x_a, x_b} in 18 | if NSCodon.synonym p q then q_ab 19 | else omega *. q_ab 20 | | None -> 0. 21 | ) 22 | 23 | let stationary_distribution { nucleotide_stat_dist = pi ; _ } = 24 | let open Nucleotide in 25 | let pi_a = pi.%(a) 26 | and pi_g = pi.%(g) 27 | and pi_t = pi.%(t) in 28 | let pi_stop = 29 | pi_t *. pi_a *. pi_g +. pi_t *. pi_g *. pi_a +. pi_t *. pi_a *. pi_a 30 | in 31 | let alpha = 1. /. (1. -. pi_stop) in 32 | NSCodon.Vector.init (fun c -> 33 | let i, j, k = NSCodon.nucleotides c in 34 | pi.%(i) *. pi.%(j) *. pi.%(k) *. alpha 35 | ) 36 | 37 | let assert_stationary_distributions_are_equal_calculation pi pi' = 38 | let open Linear_algebra in 39 | let pi = (pi : NSCodon.vector :> vec) in 40 | let pi' = (pi' : NSCodon.vector :> vec) in 41 | let res = Linear_algebra.Vector.robust_equal ~tol:1e-6 pi pi' in 42 | if not res then ( 43 | Format.eprintf "found: %a\nwanted: %a\n" Linear_algebra.Vector.pp pi Linear_algebra.Vector.pp pi' 44 | ); 45 | res 46 | 47 | let%test "MG94 stationary distribution" = 48 | let rng = Gsl.Rng.(make (default ())) in 49 | let nucleotide_process = Nucleotide_process.Random.gtr rng ~alpha:0.3 in 50 | let nucleotide_rates = Nucleotide_process.rate_matrix nucleotide_process in 51 | let nucleotide_stat_dist = Nucleotide_process.stationary_distribution nucleotide_process in 52 | let p = { nucleotide_rates ; nucleotide_stat_dist ; omega = 0.7 } in 53 | let mat = rate_matrix p in 54 | assert_stationary_distributions_are_equal_calculation 55 | (stationary_distribution p) 56 | (NSCodon_rate_matrix.stationary_distribution mat) 57 | -------------------------------------------------------------------------------- /lib/mG94.mli: -------------------------------------------------------------------------------- 1 | (** Codon rate matrix from Muse and Gaut 94 2 | 3 | This modules provides functions to build rates matrices for codon 4 | evolution following the model proposed by Muse and Gaut [1]. The 5 | exact formulation follows the parameterization used by Latrille 6 | and Lartillot [2]. 7 | 8 | References: 9 | @see 10 | @see 11 | 12 | *) 13 | 14 | module NSCodon = Codon.Universal_genetic_code.NS 15 | module NSCodon_rate_matrix : module type of Rate_matrix.Make(NSCodon) 16 | 17 | type param = { 18 | nucleotide_rates : Rate_matrix.Nucleotide.t ; 19 | nucleotide_stat_dist : Nucleotide.vector ; 20 | omega : float ; 21 | } 22 | 23 | val rate_matrix : param -> NSCodon_rate_matrix.t 24 | val stationary_distribution : param -> NSCodon.vector 25 | -------------------------------------------------------------------------------- /lib/mutsel.mli: -------------------------------------------------------------------------------- 1 | (** Mutation-selection model for codons 2 | 3 | This module defines a parameterization for the Mutsel evolutionary 4 | model for codons, as well as functions to compute the 5 | corresponding transition rate matrix and the associated stationary 6 | distribution. 7 | 8 | References: 9 | @see 10 | @see 11 | *) 12 | 13 | module NSCodon = Codon.Universal_genetic_code.NS 14 | module NSCodon_rate_matrix : module type of Rate_matrix.Make(NSCodon) 15 | 16 | type param = { 17 | nucleotide_rates : Rate_matrix.Nucleotide.t ; 18 | nucleotide_stat_dist : Nucleotide.vector ; 19 | omega : float ; (* dN/dS *) 20 | scaled_fitness : Amino_acid.vector ; 21 | gBGC : float ; 22 | pps : float ; (* persistent positive selection intensity Z as in Tamuri & dos Reis 2021 *) 23 | } 24 | 25 | val random_param : 26 | Gsl.Rng.t -> 27 | nucleotide_process:Nucleotide_process.t -> 28 | alpha:float -> 29 | param 30 | 31 | val flat_param : unit -> param 32 | 33 | val rate_matrix : param -> NSCodon_rate_matrix.t 34 | (** 35 | [rate_matrix param] computes the rate matrix for codon 36 | substitutions based on the given parameter values [param]. The 37 | resulting matrix is a rate matrix, meaning that its off-diagonal 38 | elements are positive and its lines sum to 0. 39 | 40 | Example: 41 | {[ 42 | let param = flat_param () in 43 | let matrix = rate_matrix param in 44 | ]} 45 | *) 46 | 47 | val stationary_distribution : param -> NSCodon.vector 48 | (** 49 | [stationary_distribution param] calculates the stationary 50 | distribution under parameter value [param]. 51 | 52 | Example: 53 | {[ 54 | let param = flat_param () in 55 | let distribution = stationary_distribution param in 56 | ]} 57 | *) 58 | 59 | val transition_probability_matrix : param -> float -> NSCodon.matrix 60 | (** 61 | [transition_probability_matrix param t] computes the transition 62 | probability matrix for codon substitutions over a specified time 63 | period [t] based on the given parameter values [param]. The 64 | resulting matrix is a probability matrix, meaning that all elements 65 | are probabilities and lines sum to 1. 66 | 67 | Example: 68 | {[ 69 | let param = flat_param () in 70 | let time = 0.1 in 71 | let transition_matrix = transition_probability_matrix param time in 72 | ]} 73 | *) 74 | -------------------------------------------------------------------------------- /lib/nelder_mead.ml: -------------------------------------------------------------------------------- 1 | (** 2 | Implements method as described in https://en.wikipedia.org/wiki/Nelder%E2%80%93Mead_method 3 | some tests from original publication: 4 | 5 | A simplex method for function minimization 6 | J. A. Nelder and R. Mead 7 | 8 | 9 | Here is what the different functions of this file do: 10 | 11 | - `centroid` calculates the centroid of a set of points. In the 12 | context of the Nelder-Mead method, it is used to compute the center 13 | of gravity of a simplex, which is a polytope composed of points in a 14 | multidimensional space. 15 | 16 | - update performs an update of a point using a specified 17 | direction. This can be used to update a point in the Nelder-Mead 18 | method according to the specific rules of the algorithm. 19 | 20 | - `minimize` is the main function that implements the Nelder-Mead 21 | method to minimize a given function. It takes parameters such as 22 | the function to minimize (f), a sample generation function (sample) 23 | to generate initial points, as well as optional parameters like 24 | tolerance (tol), maximum number of iterations (maxit), and debug 25 | mode (debug). 26 | 27 | 28 | Using these functions, you can perform the minimization of a given 29 | function using the Nelder-Mead method. The algorithm iterates over a 30 | set of points (a simplex) and performs updates to get closer to the 31 | minimum of the function. The process repeats until a convergence 32 | condition is met (e.g., when the difference between function values 33 | becomes smaller than a given tolerance). The minimize function 34 | returns the obtained minimum, the corresponding point, and the 35 | number of iterations performed. 36 | *) 37 | 38 | open Core 39 | 40 | let centroid xs = 41 | let n = Array.length xs in 42 | if n = 0 then raise (Invalid_argument "Nelder_mead.centroid: empty array") ; 43 | let d = Array.length xs.(0) in 44 | Array.init d ~f:(fun i -> 45 | Array.fold xs ~init:0. ~f:(fun acc x -> acc +. x.(i)) 46 | /. float n 47 | ) 48 | 49 | let update ~from:c alpha ~towards:x = 50 | let d = Array.length c in 51 | Array.init d ~f:(fun i -> c.(i) +. alpha *. (x.(i) -. c.(i))) 52 | 53 | let minimize ?(tol = 1e-8) ?(maxit = 100_000) ?(debug = false) ~f ~sample () = 54 | let alpha = 1. in 55 | let gamma = 2. in 56 | let rho = 0.5 in 57 | let sigma = 0.5 in 58 | let x0 = sample () in 59 | let n = Array.length x0 in 60 | if n = 0 then raise (Invalid_argument "Nelder_mead.minimize: sample returns empty vectors") ; 61 | let sample () = 62 | let y = sample () in 63 | if Array.length y <> n then raise (Invalid_argument "Nelder_mead.minimize: sample returns vectors of varying lengths") ; 64 | y 65 | in 66 | let points = Array.init (n + 1) ~f:(fun _ -> sample ()) in 67 | let obj = Array.map points ~f in 68 | let rec loop i = 69 | let ranks = Utils.array_order ~compare:Float.compare obj in 70 | if debug then ( 71 | printf "\n\nIteration %d: %f\n%!" i obj.(ranks.(0)) ; 72 | printf "Delta: %g\n%!" (obj.(ranks.(n)) -. obj.(ranks.(0))) 73 | ) ; 74 | let c = 75 | Array.sub ranks ~pos:0 ~len:n 76 | |> Array.map ~f:(Array.get points) 77 | |> centroid 78 | in 79 | let x_r = update ~from:c (-. alpha) ~towards:points.(ranks.(n)) in 80 | let f_r = f x_r in 81 | if debug then ( 82 | printf "Candidate: %f\n" f_r ; 83 | ) ; 84 | ( 85 | match Float.(f_r < obj.(ranks.(0)), f_r < obj.(ranks.(Int.(n - 1)))) with 86 | | false, true -> 87 | if debug then printf "Reflection\n" ; 88 | points.(ranks.(n)) <- x_r ; 89 | obj.(ranks.(n)) <- f_r ; 90 | | true, _ -> 91 | if debug then printf "Expansion\n" ; 92 | let x_e = update ~from:c gamma ~towards:x_r in 93 | let f_e = f x_e in 94 | points.(ranks.(n)) <- if Float.(f_e < f_r) then x_e else x_r ; 95 | obj.(ranks.(n)) <- Float.min f_r f_e ; 96 | | false, false -> 97 | let x_c, f_c, candidate_accepted = 98 | if Float.(f_r < obj.(ranks.(n))) then (* outside contraction *) 99 | let x_c = update ~from:c rho ~towards:x_r in 100 | let f_c = f x_c in 101 | x_c, f_c, Float.(f_c <= f_r) 102 | else (* inside contraction *) 103 | let x_cc = update ~from:c ~towards:points.(ranks.(n)) rho in 104 | let f_cc = f x_cc in 105 | x_cc, f_cc, Float.(f_cc < obj.(ranks.(n))) 106 | in 107 | if candidate_accepted then ( 108 | if debug then printf "Contraction, f_c = %f\n" f_c ; 109 | points.(ranks.(n)) <- x_c ; 110 | obj.(ranks.(n)) <- f_c ; 111 | ) 112 | else ( 113 | if debug then printf "Shrink\n" ; 114 | Array.iteri points ~f:(fun i x_i -> 115 | if i <> ranks.(0) then ( 116 | let x_i = update ~from:points.(ranks.(0)) sigma ~towards:x_i in 117 | points.(i) <- x_i ; 118 | obj.(i) <- f x_i 119 | ) 120 | ) 121 | ) 122 | ) ; 123 | let sigma = Gsl.Stats.sd obj in 124 | if debug then ( 125 | printf "Sigma: %f\n" sigma ; 126 | printf "Values: %s\n" (Utils.show_float_array (Array.init (n + 1) ~f:(fun i -> obj.(ranks.(i))))) 127 | ) ; 128 | if Float.(sigma < tol) || i >= maxit 129 | then obj.(ranks.(0)), points.(ranks.(0)), i 130 | else loop (i + 1) 131 | in 132 | loop 0 133 | 134 | let%test "Parabola" = 135 | let f x = x.(0) ** 2. in 136 | let sample () = [| Random.float 200. -. 100. |] in 137 | let obj, _, _ = minimize ~f ~tol:1e-3 ~sample () in 138 | Float.(abs obj < 1e-3) 139 | 140 | let%test "Rosenbrock" = 141 | let f x = 100. *. (x.(1) -. x.(0) ** 2.) ** 2. +. (1. -. x.(0)) ** 2. in 142 | let rfloat _ = Random.float 200. -. 100. in 143 | let sample () = Array.init 2 ~f:rfloat in 144 | let obj, _, _ = minimize ~f ~sample () in 145 | Float.(abs obj < 1e-3) 146 | 147 | let%test "Powell quartic" = 148 | let f x = 149 | let open Float in 150 | (x.(0) + 10. * x.(1)) ** 2. + 5. *. (x.(2) - x.(3)) ** 2. 151 | + (x.(1) - 2. *. x.(2)) ** 4. + 10. * (x.(0) - x.(3)) ** 4. 152 | in 153 | let rfloat _ = Random.float 200. -. 100. in 154 | let sample () = Array.init 4 ~f:rfloat in 155 | let obj, _, _ = minimize ~f ~sample () in 156 | Float.(abs obj < 1e-3) 157 | -------------------------------------------------------------------------------- /lib/nelder_mead.mli: -------------------------------------------------------------------------------- 1 | (** An implementation of Nelder-Mead algorithm for function 2 | optimization 3 | 4 | Implements method as described in 5 | {{:https://en.wikipedia.org/wiki/Nelder%E2%80%93Mead_method}Wikipedia} 6 | and uses some tests from the original publication: 7 | 8 | A simplex method for function minimization 9 | J. A. Nelder and R. Mead 10 | *) 11 | 12 | (** Minimizes cost function [f]. 13 | [sample] function must return a vector of initial parameters. 14 | Returns the minimum value of [f], the vector of optimized parameters, 15 | and the number of iterations. 16 | *) 17 | val minimize : 18 | ?tol:float -> 19 | ?maxit:int -> 20 | ?debug:bool -> 21 | f:(float array -> float) -> 22 | sample:(unit -> float array) -> 23 | unit -> 24 | float * float array * int 25 | -------------------------------------------------------------------------------- /lib/newick.mli: -------------------------------------------------------------------------------- 1 | (** New Hampshire tree (a.k.a. newick) format parsing 2 | 3 | This module provides parsing utilities for Newick tree format [1] and 4 | its extension with tags [2]. 5 | 6 | [1] https://phylipweb.github.io/phylip/newicktree.html 7 | [2] http://www.phylosoft.org/NHX/nhx.pdf 8 | *) 9 | include module type of Newick_ast 10 | 11 | val from_file : string -> (t, [> error]) result 12 | 13 | val from_file_exn : string -> t 14 | 15 | val from_string : string -> (t, [> error]) result 16 | 17 | val from_string_exn : string -> t 18 | 19 | val of_tree : 20 | ?node_id:('a -> string option) -> 21 | ?node_tags:('a -> tag list) -> 22 | ?leaf_id:('b -> string option) -> 23 | ?leaf_tags:('b -> tag list) -> 24 | ?branch_length:('c -> float option) -> 25 | ?parent_branch:float -> 26 | ('a, 'b, 'c) Tree.t -> 27 | t 28 | 29 | val to_string : t -> string 30 | val to_file : t -> string -> unit 31 | 32 | module Tree_repr : sig 33 | type ast = t 34 | 35 | type node_info = { 36 | name : string option ; 37 | tags : tag list ; 38 | } 39 | 40 | type tree = (node_info, node_info, float option) Tree.t 41 | type branch = (node_info, node_info, float option) Tree.branch 42 | 43 | type t = 44 | | Tree of tree 45 | | Branch of branch 46 | 47 | val of_ast : ast -> t 48 | val to_ast : t -> ast 49 | 50 | val map_inner_tree : t -> f:(tree -> tree) -> t 51 | val with_inner_tree : t -> f:(tree -> 'a) -> 'a 52 | end 53 | -------------------------------------------------------------------------------- /lib/newick_ast.ml: -------------------------------------------------------------------------------- 1 | type tag = string * string 2 | 3 | type t = { 4 | name : string option ; 5 | tags : tag list ; 6 | parent_branch : float option ; 7 | children : t list ; 8 | } 9 | 10 | type error_desc = { 11 | offset : int ; 12 | line : int ; 13 | column : int ; 14 | msg : string ; 15 | } 16 | 17 | let string_of_error_desc e = 18 | Printf.sprintf "Error at line %d, column %d: %s" e.line e.column e.msg 19 | 20 | type error = [`Newick_parser_error of error_desc] 21 | 22 | let mkerror lexbuf msg = 23 | let pos = Lexing.lexeme_start_p lexbuf in 24 | let line = pos.pos_lnum in 25 | let column = pos.pos_cnum - pos.pos_bol + 1 in 26 | let offset = pos.pos_cnum in 27 | `Newick_parser_error { offset ; line ; column ; msg } 28 | 29 | let string_of_error (`Newick_parser_error ed) = string_of_error_desc ed 30 | -------------------------------------------------------------------------------- /lib/newick_lexer.mll: -------------------------------------------------------------------------------- 1 | { 2 | open Newick_parser 3 | 4 | exception Error of Newick_ast.error 5 | } 6 | 7 | rule token = parse 8 | | [' ''\t''\n''\r']+ { token lexbuf } 9 | | ':' { COLON } 10 | | ';' { SEMICOLON } 11 | | '(' { LPAREN } 12 | | ')' { RPAREN } 13 | | ',' { COMMA } 14 | | '[' { LBRACKET } 15 | | ']' { RBRACKET } 16 | | "&&NHX" { NHXTAG } 17 | | '=' { EQUAL } 18 | | eof { EOF } 19 | 20 | | ['0'-'9']+ as i 21 | { INT i } 22 | | ['0'-'9']+ ('.' ['0'-'9']*)? (['e''E'] ['+' '-']? ['0'-'9']*)? as f 23 | { FLOAT f } 24 | 25 | | ['A'-'Z''a'-'z''0'-'9''-''_''.''/']+ as lxm { STRING(lxm) } 26 | 27 | | _ as c 28 | { raise (Error (Newick_ast.mkerror lexbuf (Printf.sprintf "unexpected character: %c" c))) } 29 | -------------------------------------------------------------------------------- /lib/newick_parser.messages: -------------------------------------------------------------------------------- 1 | start: LPAREN RPAREN EOF 2 | start: LPAREN RPAREN COLON FLOAT EOF 3 | 4 | semi-colon expected at end of input 5 | 6 | ### 7 | 8 | start: LPAREN EOF 9 | 10 | unclosed parenthesis 11 | -------------------------------------------------------------------------------- /lib/newick_parser.mly: -------------------------------------------------------------------------------- 1 | %{ 2 | (* http://evolution.genetics.washington.edu/phylip/newicktree.html *) 3 | let list_of_opt = Base.Option.value ~default:[] 4 | %} 5 | 6 | %token FLOAT 7 | %token INT 8 | %token STRING 9 | %token COMMA COLON SEMICOLON LPAREN RPAREN LBRACKET RBRACKET NHXTAG EQUAL EOF 10 | 11 | %start start 12 | %type start 13 | 14 | %% 15 | 16 | start: node SEMICOLON EOF {$1}; 17 | 18 | node: 19 | | name = option(STRING) parent_branch = option(length) tags = option(tags) 20 | { Newick_ast.{name ; tags = list_of_opt tags ; parent_branch ; children = []} 21 | } 22 | | LPAREN children = separated_nonempty_list(COMMA, node) RPAREN name = option(STRING) parent_branch = option(length) tags = option(tags) 23 | { Newick_ast.{name ; tags = list_of_opt tags ; parent_branch ; children } 24 | } 25 | ; 26 | 27 | length: 28 | | COLON l = FLOAT { float_of_string l } 29 | | COLON l = INT { float_of_string l } 30 | ; 31 | 32 | tags: 33 | | LBRACKET NHXTAG tags = nonempty_list(tag) RBRACKET { tags } 34 | ; 35 | 36 | tag: 37 | | COLON key = STRING EQUAL data = STRING { (key, data) } 38 | | COLON key = STRING EQUAL data = FLOAT { (key, data) } 39 | | COLON key = STRING EQUAL data = INT { (key, data) } 40 | ; 41 | -------------------------------------------------------------------------------- /lib/nucleotide.ml: -------------------------------------------------------------------------------- 1 | include Alphabet.Make(struct let card = 4 end) 2 | 3 | let a = 0 4 | let c = 1 5 | let g = 2 6 | let t = 3 7 | 8 | let of_char_exn = function 9 | | 'a' | 'A' -> 0 10 | | 'c' | 'C' -> 1 11 | | 'g' | 'G' -> 2 12 | | 't' | 'T' -> 3 13 | | _ -> invalid_arg "Nucleotide.of_char_exn" 14 | 15 | let transversion p q = 16 | match p, q with 17 | | 0, 2 | 2, 0 18 | | 1, 3 | 3, 1 -> false 19 | | _ -> true 20 | 21 | type repr = A | C | G | T 22 | 23 | let inspect = function 24 | | 0 -> A 25 | | 1 -> C 26 | | 2 -> G 27 | | 3 -> T 28 | | _ -> assert false 29 | 30 | let to_char = function 31 | | 0 -> 'A' 32 | | 1 -> 'C' 33 | | 2 -> 'G' 34 | | 3 -> 'T' 35 | | _ -> assert false 36 | -------------------------------------------------------------------------------- /lib/nucleotide.mli: -------------------------------------------------------------------------------- 1 | (** A single DNA base *) 2 | 3 | include Alphabet.S_int 4 | 5 | val a : t 6 | (** The base 'A'. *) 7 | 8 | val c : t 9 | (** The base 'C'. *) 10 | 11 | val g : t 12 | (** The base 'G'. *) 13 | 14 | val t : t 15 | (** The base 'T'. *) 16 | 17 | val of_char_exn : Char.t -> t 18 | (** Creates a DNA base from a char (case insensitive). 19 | Raises invalid_arg in case of incorrect char parameter.*) 20 | 21 | val to_char : t -> Char.t 22 | (** Returns a single capital character representing the base. *) 23 | 24 | val transversion : t -> t -> bool 25 | (** [transversion x y] is [true] if [x] and [y] belong to different 26 | purine/pyrimidine groups. *) 27 | 28 | type repr = A | C | G | T 29 | 30 | val inspect : t -> repr 31 | (** Returns the representation of a DNA base as a variant type. 32 | The representation can be 'A', 'C', 'G', or 'T'. *) 33 | -------------------------------------------------------------------------------- /lib/nucleotide_process.ml: -------------------------------------------------------------------------------- 1 | module RM = Rate_matrix.Nucleotide 2 | 3 | type t = 4 | | JC69 5 | | K80 of float 6 | | HKY85 of { 7 | stationary_distribution : Nucleotide.vector ; 8 | transition_rate : float ; 9 | transversion_rate : float ; 10 | } 11 | | GTR of { 12 | stationary_distribution : Nucleotide.vector ; 13 | exchangeabilities : Nucleotide.matrix ; 14 | } 15 | 16 | let rate_matrix = function 17 | | JC69 -> RM.jc69 () 18 | | K80 kappa -> RM.k80 kappa 19 | | HKY85 { stationary_distribution ; transversion_rate ; transition_rate } -> 20 | RM.hky85 ~stationary_distribution ~transition_rate ~transversion_rate 21 | | GTR { stationary_distribution ; exchangeabilities } -> 22 | RM.gtr ~stationary_distribution ~exchangeabilities 23 | 24 | let stationary_distribution = function 25 | | JC69 26 | | K80 _ -> Nucleotide.Vector.init (fun _ -> 0.25) 27 | | HKY85 { stationary_distribution ; _ } 28 | | GTR { stationary_distribution ; _ } -> stationary_distribution 29 | 30 | module Random = struct 31 | let gtr rng ~alpha = 32 | let stationary_distribution = Nucleotide.random_profile rng alpha in 33 | let exchangeabilities = Rate_matrix.Nucleotide.make_symetric (fun _ _ -> Gsl.Randist.gamma rng ~a:1. ~b:1.) in 34 | GTR { stationary_distribution ; exchangeabilities } 35 | 36 | let hky85 rng ~alpha = 37 | let stationary_distribution = Nucleotide.random_profile rng alpha in 38 | let transition_rate = Gsl.Randist.gamma rng ~a:1. ~b:1. in 39 | let transversion_rate = Gsl.Randist.gamma rng ~a:1. ~b:1. in 40 | HKY85 { stationary_distribution ; transversion_rate ; transition_rate } 41 | end 42 | -------------------------------------------------------------------------------- /lib/nucleotide_process.mli: -------------------------------------------------------------------------------- 1 | (** A representation of classical parametric processes on nucleotides 2 | 3 | Example usage of functions 4 | 5 | Example 1: Computing a rate matrix 6 | {[ 7 | let example_rate_matrix = 8 | let model = JC69 in 9 | let rm = rate_matrix model in 10 | rm (* The computed rate matrix *) 11 | ]} 12 | 13 | Example 2: Computing the stationary distribution 14 | {[ 15 | let example_stationary_distribution = 16 | let model = K80 2.0 in 17 | let sd = stationary_distribution model in 18 | sd (* The computed stationary distribution *) 19 | 20 | ]} 21 | 22 | Example 3: Sampling a mutation model using HKY85 23 | {[ 24 | let example_hky85_model = 25 | let rng = Gsl.Rng.make Gsl.Rng.MT19937 in 26 | let alpha = 0.5 in 27 | let model = Random.hky85 rng ~alpha in 28 | model (* The sampled HKY85 mutation model *) 29 | ]} 30 | 31 | Example 4: Sampling a mutation model using GTR 32 | {[ 33 | let example_gtr_model = 34 | let rng = Gsl.Rng.make Gsl.Rng.MT19937 in 35 | let alpha = 1.0 in 36 | let model = Random.gtr rng ~alpha in 37 | model (* The sampled GTR mutation model *) 38 | ]} 39 | 40 | *) 41 | 42 | type t = 43 | | JC69 44 | | K80 of float 45 | | HKY85 of { 46 | stationary_distribution : Nucleotide.vector ; 47 | transition_rate : float ; 48 | transversion_rate : float ; 49 | } 50 | | GTR of { 51 | stationary_distribution : Nucleotide.vector ; 52 | exchangeabilities : Nucleotide.matrix ; 53 | } 54 | 55 | val rate_matrix : t -> Rate_matrix.Nucleotide.t 56 | 57 | val stationary_distribution : t -> Nucleotide.vector 58 | 59 | module Random : sig 60 | val hky85 : Gsl.Rng.t -> alpha:float -> t 61 | (** [hky85 rng ~alpha] uses [alpha] as Dirichlet parameter to 62 | sample a stationary profile, and draws transversion/transition 63 | rates from a Gamma(1, 1) *) 64 | 65 | val gtr : Gsl.Rng.t -> alpha:float -> t 66 | (** [gtr rng alpha] uses [alpha] as Dirichlet parameter to sample 67 | a stationary profile, and draws exchangeabilities from a 68 | Gamma(1, 1) *) 69 | end 70 | -------------------------------------------------------------------------------- /lib/phylip.ml: -------------------------------------------------------------------------------- 1 | (* 2 | http://evolution.genetics.washington.edu/phylip/doc/sequence.html 3 | http://scikit-bio.org/docs/0.2.3/generated/skbio.io.phylip.html 4 | *) 5 | open Core 6 | open Rresult 7 | 8 | type item = { 9 | name : string ; 10 | sequence : string ; 11 | } 12 | 13 | type t = { 14 | number_of_sequences : int ; 15 | sequence_length : int ; 16 | items : item list ; 17 | } 18 | 19 | let make_exn = function 20 | | [] -> invalid_arg "empty list of items" 21 | | h :: t as items -> 22 | let n = String.length h.sequence in 23 | ( 24 | match List.findi t ~f:(fun _ it -> String.length it.sequence <> n) with 25 | | Some (i, it) -> invalid_argf "Sequence %d has length %d while it is expected to have length %d" i (String.length it.sequence) n () 26 | | None -> () 27 | ) ; 28 | { number_of_sequences = List.length items ; sequence_length = n ; items } 29 | 30 | module Relaxed_parser = struct 31 | let parse_header l = 32 | match 33 | String.split_on_chars l ~on:['\t' ; ' '] 34 | |> List.filter ~f:(String.( <> ) "") 35 | with 36 | | [ m ; n ] -> ( 37 | try Ok (Int.of_string m, Int.of_string n) 38 | with _ -> Error (`Msg "Incorrect header") 39 | ) 40 | | _ -> Error (`Msg "Incorrect header") 41 | 42 | let check_nb_lines lines number_of_sequences = 43 | if List.length lines = number_of_sequences then Ok () 44 | else Error (`Msg "Unexpected number of lines in file") 45 | 46 | let parse_item ~sequence_length i l = 47 | let err () = Error (`Msg (sprintf "incorrect sequence syntax on line %d" (i + 1))) in 48 | match String.lsplit2 l ~on:'\t' with 49 | | Some (name, sequence) -> 50 | if String.length sequence = sequence_length then Ok { name ; sequence } 51 | else err () 52 | | None -> err () 53 | 54 | let read fn = 55 | let open Result.Monad_infix in 56 | match In_channel.read_lines fn with 57 | | [] -> Error (`Msg "Empty file") 58 | | header :: sequences -> 59 | parse_header header >>= fun (number_of_sequences, sequence_length) -> 60 | check_nb_lines sequences number_of_sequences >>= fun () -> 61 | List.mapi sequences ~f:(parse_item ~sequence_length) |> Result.all >>= fun items -> 62 | Ok { number_of_sequences ; sequence_length ; items } 63 | end 64 | 65 | let make ~number_of_sequences ~sequence_length ~items = 66 | let nseq = List.length items in 67 | if nseq <> number_of_sequences then 68 | R.error_msgf "Declared %d sequences but provided %d" number_of_sequences nseq 69 | else if List.exists items ~f:(fun { sequence = s ; _ } -> String.length s <> sequence_length) then 70 | R.error_msgf "Not all sequences have declared length of %d" sequence_length 71 | else Ok { 72 | number_of_sequences ; 73 | sequence_length ; 74 | items ; 75 | } 76 | 77 | module Parser = struct 78 | open Angstrom 79 | 80 | let is_space = function 81 | | ' ' -> true 82 | | _ -> false 83 | 84 | let space = skip_while is_space "space" 85 | let space1 = (satisfy is_space *> space) "space1" 86 | 87 | let integer = 88 | ( 89 | take_while1 ( 90 | function 91 | | '0'..'9' -> true 92 | | _ -> false 93 | ) >>= fun s -> 94 | try return (Int.of_string s) 95 | with Failure msg -> fail msg 96 | ) "integer" 97 | 98 | let header_parser = 99 | ( 100 | space *> integer >>= fun number_of_sequences -> 101 | space1 *> integer >>= fun sequence_length -> 102 | space *> char '\n' >>= fun _ -> 103 | return (number_of_sequences, sequence_length) 104 | ) "header_parser" 105 | 106 | let id_parser = 107 | count 10 (not_char '\n') >>| String.of_char_list >>| Stdlib.String.trim 108 | 109 | let sequence_parser = 110 | take_while1 (function 111 | | 'A'..'Z' | 'a'..'z' | '-' | '.' -> true 112 | | _ -> false 113 | ) 114 | 115 | let item = 116 | id_parser >>= fun name -> 117 | sequence_parser >>= fun sequence -> 118 | return { name ; sequence } 119 | 120 | let file = 121 | header_parser >>= fun (number_of_sequences, sequence_length) -> 122 | sep_by1 (char '\n') item >>= fun items -> 123 | match make ~number_of_sequences ~items ~sequence_length with 124 | | Ok x -> return x 125 | | Error (`Msg m) -> fail m 126 | end 127 | 128 | let read ?(strict = true) fn = 129 | if strict then 130 | In_channel.with_file fn ~f:(fun ic -> 131 | Angstrom_unix.parse Parser.file ic 132 | ) 133 | |> snd 134 | |> Result.map_error ~f:(fun s -> `Msg s) 135 | else 136 | Relaxed_parser.read fn 137 | 138 | let read_exn ?strict fn = 139 | match read ?strict fn with 140 | | Ok r -> r 141 | | Error (`Msg msg) -> failwith msg 142 | 143 | let write_strict data fn = 144 | Out_channel.with_file fn ~f:(fun oc -> 145 | fprintf oc "%d %d\n" data.number_of_sequences data.sequence_length ; 146 | List.iter data.items ~f:(fun it -> 147 | let id = 148 | let n = String.length it.name in 149 | if n <= 10 then (it.name ^ String.make (10 - n) ' ') 150 | else String.prefix it.name 10 151 | in 152 | fprintf oc "%s%s\n" id it.sequence 153 | ) 154 | ) 155 | 156 | let write_relaxed data fn = 157 | Out_channel.with_file fn ~f:(fun oc -> 158 | fprintf oc "%d\t%d\n" data.number_of_sequences data.sequence_length ; 159 | List.iter data.items ~f:(fun it -> 160 | fprintf oc "%s\t%s\n" it.name it.sequence 161 | ) 162 | ) 163 | 164 | let write ?(strict = true) t fn = 165 | if strict then write_strict t fn 166 | else write_relaxed t fn 167 | -------------------------------------------------------------------------------- /lib/phylip.mli: -------------------------------------------------------------------------------- 1 | (** Parsing for the PHYLIP format 2 | 3 | This module provides functionality for working with the [Phylip] 4 | file format, which is commonly used in phylogenetic analysis and 5 | sequence alignment. The [Phylip] format is a plain text format 6 | that represents multiple biological sequences along with their 7 | associated metadata. The original format is described 8 | {{:http://evolution.genetics.washington.edu/phylip.html}there}. This 9 | implementation also allows a somewhat more relaxed syntax: 10 | 11 | {v 12 | 13 | TAB 14 | TAB 15 | ... 16 | v} 17 | which is specified with the option [~strict:false]. 18 | 19 | Example usage of the functions: 20 | {[ 21 | let items = [ 22 | { name = "Seq1"; sequence = "ACGT" }; 23 | { name = "Seq2"; sequence = "CGTA" }; 24 | ] in 25 | let data = make_exn items in 26 | write data "output.txt"; 27 | let parsed_data = read "output.txt" in 28 | match parsed_data with 29 | | Ok parsed -> Printf.printf "Number of sequences: %d\n" parsed.number_of_sequences 30 | | Error (`Msg msg) -> Printf.printf "Parsing error: %s\n" msg 31 | ]} 32 | *) 33 | 34 | type item = { 35 | name : string ; 36 | sequence : string ; 37 | } 38 | 39 | type t = private { 40 | number_of_sequences : int ; 41 | sequence_length : int ; 42 | items : item list ; 43 | } 44 | 45 | val make_exn : item list -> t 46 | (** [make_exn items] creates a new [t] value from a list of items. 47 | It validates the input by checking that the list is not empty and 48 | that all sequences have the same length. 49 | Raises [Invalid_argument] if the list of items is empty or if 50 | the sequences have different lengths. *) 51 | 52 | val read : 53 | ?strict:bool -> 54 | string -> 55 | (t, [> `Msg of string]) result 56 | (** [read ?strict fn] reads the contents of the specified file [fn] 57 | and returns a [t] value representing the data. The [strict] 58 | parameter specifies whether to use strict parsing or relaxed 59 | parsing. If [strict] is set to [true], the function expects the 60 | file to follow the strict format. If [strict] is set to [false], 61 | the function allows a more relaxed syntax. 62 | 63 | Returns: 64 | - [Ok t] if the file is successfully parsed and the [t] value is 65 | created. 66 | - [Error (`Msg msg)] if an error occurs during parsing, with [msg] 67 | describing the error. *) 68 | 69 | val read_exn : 70 | ?strict:bool -> 71 | string -> 72 | t 73 | (** [read_exn ?strict fn] is similar to [read], but raises an 74 | exception with the error message if the parsing fails. *) 75 | 76 | val write : 77 | ?strict:bool -> 78 | t -> 79 | string -> 80 | unit 81 | (** [write ?strict t fn] writes the data from the [t] value to the 82 | specified file [fn]. The [strict] parameter specifies whether to 83 | use strict writing or relaxed writing. If [strict] is set to 84 | [true], the function writes the data in strict format. If [strict] 85 | is set to [false], the function writes the data in a more relaxed 86 | format. *) 87 | -------------------------------------------------------------------------------- /lib/phylogenetic_tree.mli: -------------------------------------------------------------------------------- 1 | (** Module for phylogenetic trees. *) 2 | 3 | (** {5 Types} *) 4 | 5 | (** Type for evolutionary trees: binary trees 6 | whose edges are labelled with lengths (floats) 7 | and whose leaves are labelled with sequence indexes (strings)*) 8 | type t = 9 | | Node of {meta:metadata; left:branch; right:branch} 10 | | Leaf of {meta:metadata; index:Sigs.index} 11 | and branch = float * t 12 | and metadata = {id:int; routing_no:int} 13 | 14 | 15 | (** {5 Creation/Conversion} *) 16 | 17 | val of_preorder: string -> t 18 | 19 | val of_newick: string -> t 20 | 21 | val of_newick_file: string -> t 22 | 23 | val make_random: int -> t 24 | 25 | val to_newick: t -> string 26 | 27 | val to_newick_file: t -> string -> unit 28 | 29 | val to_dot: t -> string 30 | 31 | val to_tree : t -> (metadata, metadata * string, float) Tree.t 32 | 33 | (** {5 Parameters and transformations} *) 34 | 35 | val nb_branches: t -> int 36 | 37 | val get_branch_lengths: t -> float list 38 | 39 | val set_branch_lengths: t -> float list -> t 40 | 41 | 42 | (** {5 Constructors} *) 43 | 44 | val build_leaf: ?routing_no:int -> Sigs.index -> t 45 | 46 | val build_node: ?routing_no:int -> float * t -> float * t -> t 47 | 48 | val index_of_int: int -> Sigs.index 49 | 50 | val index_of_string: string -> Sigs.index 51 | 52 | 53 | (** {5 Getters and setters} *) 54 | 55 | val get_id: t -> int 56 | 57 | val get_meta: t -> metadata 58 | 59 | val set_meta: t -> metadata -> t 60 | 61 | val get_routing_no: t -> int 62 | 63 | val set_routing_no: t -> int -> t 64 | 65 | 66 | (** {5 Comparison} *) 67 | 68 | val equal: t -> t -> bool 69 | 70 | 71 | (** {5 Pretty printers} *) 72 | 73 | val pp: Format.formatter -> t -> unit 74 | 75 | val pp_fancy: Format.formatter -> t -> unit 76 | 77 | val print: t -> unit 78 | 79 | val print_fancy: t -> unit 80 | -------------------------------------------------------------------------------- /lib/rate_matrix.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Linear_algebra 3 | 4 | module type S = sig 5 | type vector 6 | type matrix 7 | type symbol 8 | type t = matrix 9 | 10 | val make : (symbol -> symbol -> float) -> t 11 | 12 | val make_symetric : (symbol -> symbol -> float) -> t 13 | 14 | val jc69 : unit -> t 15 | 16 | val gtr : 17 | stationary_distribution:vector -> 18 | exchangeabilities:matrix -> 19 | t 20 | 21 | val stationary_distribution : t -> vector 22 | 23 | val scaled_rate_matrix : vector -> t -> t 24 | 25 | val scale : t -> t 26 | end 27 | 28 | module Make(A : Alphabet.S_int) = struct 29 | type t = A.matrix 30 | let sum f = 31 | List.fold A.all ~init:0. ~f:(fun acc n -> acc +. f n) 32 | 33 | let stationary_distribution (m : A.matrix) = 34 | Matrix.zero_eigen_vector (m :> mat) 35 | |> A.Vector.upcast_exn 36 | 37 | let jc69 () = 38 | let r = Float.(1. / (of_int A.card - 1.)) in 39 | A.Matrix.init (fun i j -> if A.equal i j then -1. else r) 40 | 41 | let make f = 42 | let r = A.Matrix.init (fun _ _ -> 0.) in 43 | List.iter A.all ~f:(fun i-> 44 | let total = ref 0. in 45 | List.iter A.all ~f:(fun j -> 46 | if not (A.equal i j) then ( 47 | let r_ij = f i j in 48 | if Float.(r_ij < 0.) then (failwith "Rates should be positive") ; 49 | total := r_ij +. !total ; 50 | A.(r.%{i, j} <- r_ij) 51 | ) 52 | ) ; 53 | A.(r.%{i, i} <- -. !total) 54 | ) ; 55 | r 56 | 57 | let make_symetric f = 58 | let r = A.Matrix.init (fun _ _ -> 0.) in 59 | List.iter A.all ~f:(fun i-> 60 | List.iter A.all ~f:(fun j -> 61 | if (i :> int) > (j :> int) then ( 62 | let r_ij = f i j in 63 | if Float.(r_ij < 0.) then (failwith "Rates should be positive") ; 64 | A.(r.%{i, j} <- r_ij) ; 65 | A.(r.%{j, i} <- r_ij) 66 | ) 67 | ) ; 68 | let total = List.fold A.all ~init:0. ~f:(fun acc j -> acc +. r.A.%{i, j}) in 69 | A.(r.%{i, i} <- -. total) 70 | ) ; 71 | r 72 | 73 | let scaled_rate_matrix profile rate = 74 | let mu = -. sum Float.(fun i -> 75 | profile.A.%(i) * rate.A.%{i, i} 76 | ) 77 | in 78 | make Float.(fun i j -> rate.A.%{i, j} / mu) 79 | 80 | let scale rate = 81 | let mu = sum (fun i -> rate.A.%{i, i}) in 82 | A.Matrix.init Float.(fun i j -> rate.A.%{i, j} / mu) 83 | 84 | let ut_index i j = 85 | let n = A.card in 86 | n * (n - 1) / 2 - (n - i) * (n - i -1) / 2 + j - i - 1 87 | 88 | let%test "upper triangular indexation" = 89 | Poly.equal 90 | ( 91 | List.init A.card ~f:(fun i -> 92 | List.init (A.card - i - 1) ~f:(fun j -> 93 | let j = i + j + 1 in 94 | ut_index i j 95 | ) 96 | ) 97 | |> List.concat 98 | ) 99 | (List.init (A.card * (A.card - 1) / 2) ~f:Fn.id) 100 | 101 | let gtr ~stationary_distribution ~exchangeabilities = 102 | let m = make (fun i j -> 103 | A.Matrix.get exchangeabilities (i :> int) (j :> int) *. stationary_distribution.A.%(j) 104 | ) in 105 | scaled_rate_matrix stationary_distribution m 106 | 107 | let%test "gtr stationary distribution" = 108 | let rng = Utils.rng_of_int 12334 in 109 | let pi = A.random_profile rng 10. in 110 | let exchangeabilities = make_symetric (fun _ _ -> Gsl.Randist.gamma rng ~a:1. ~b:1.) in 111 | let gtr_rates = gtr ~stationary_distribution:pi ~exchangeabilities in 112 | let pi' = stationary_distribution gtr_rates in 113 | Vector.robust_equal ~tol:1e-6 (pi :> vec) (pi' :> vec) 114 | end 115 | 116 | let make n ~f = 117 | let r = Matrix.init n ~f:(fun _ _ -> 0.) in 118 | for i = 0 to n - 1 do 119 | let total = ref 0. in 120 | for j = 0 to n - 1 do 121 | if i <> j then ( 122 | let r_ij = f i j in 123 | if Float.(r_ij < 0.) then (failwith "Rates should be positive") ; 124 | total := r_ij +. !total ; 125 | Matrix.set r i j r_ij 126 | ) 127 | done ; 128 | Matrix.set r i i (-. !total) 129 | done ; 130 | r 131 | 132 | let transition_probability_matrix ~tau ~rates = 133 | Matrix.(( 134 | (of_arrays_exn rates 135 | |> scal_mul tau 136 | |> expm) :> Lacaml.D.mat) 137 | ) 138 | |> Lacaml.D.Mat.to_array 139 | 140 | module Nucleotide = struct 141 | include Make(Nucleotide) 142 | 143 | let k80 kappa = 144 | Nucleotide.Matrix.init (fun i j -> 145 | if Nucleotide.equal i j then -1. 146 | else if Nucleotide.transversion i j then 1. /. (kappa +. 2.) 147 | else kappa /. (kappa +. 2.) 148 | ) 149 | 150 | let hky85 151 | ~(stationary_distribution : Nucleotide.vector) 152 | ~transition_rate ~transversion_rate = 153 | let m = make (fun i j -> 154 | let coef = if Nucleotide.equal i j 155 | then -1. 156 | else if Nucleotide.transversion i j 157 | then transversion_rate 158 | else transition_rate 159 | in 160 | coef *. stationary_distribution.Nucleotide.%(j) 161 | ) in 162 | scaled_rate_matrix stationary_distribution m 163 | 164 | let%test "HKY85 stationary distribution" = 165 | let rng = Utils.rng_of_int 420 in 166 | let pi = Nucleotide.random_profile rng 10. in 167 | let transition_rate = Gsl.Rng.uniform rng 168 | and transversion_rate = Gsl.Rng.uniform rng in 169 | let hky_rates = hky85 ~stationary_distribution:pi ~transition_rate ~transversion_rate 170 | in 171 | let pi' = stationary_distribution hky_rates in 172 | Vector.robust_equal ~tol:1e-6 (pi :> vec) (pi' :> vec) 173 | end 174 | 175 | module Amino_acid = struct 176 | include Make(Amino_acid) 177 | end 178 | -------------------------------------------------------------------------------- /lib/rate_matrix.mli: -------------------------------------------------------------------------------- 1 | (** Continuous Time Markov Chain rate matrix 2 | 3 | A rate matrix is the 4 | {{:https://en.wikipedia.org/wiki/Infinitesimal_generator_(stochastic_processes)}infinitesimal 5 | generator} for a discrete-space continuous time markov process. It 6 | is basically a matrix such that all off-diagonal elements are 7 | positive and each diagonal element is minus the sum of all other 8 | elements in the row. *) 9 | 10 | 11 | module type S = sig 12 | type vector 13 | type matrix 14 | type symbol 15 | type t = matrix 16 | 17 | val make : (symbol -> symbol -> float) -> t 18 | (** [make f] is a matrix such that [f i j] is the rate of transition 19 | from state [i] to state [j]. [f] is called only for [i <> j]. *) 20 | 21 | val make_symetric : (symbol -> symbol -> float) -> t 22 | 23 | val jc69 : unit -> t 24 | (** {{:https://en.wikipedia.org/wiki/Models_of_DNA_evolution#JC69_model_(Jukes_and_Cantor_1969)}Jukes and Cantor 1969} *) 25 | 26 | val gtr : 27 | stationary_distribution:vector -> 28 | exchangeabilities:matrix -> 29 | t 30 | (** 31 | {{:https://en.wikipedia.org/wiki/Models_of_DNA_evolution#GTR_model_(Tavar%C3%A9_1986)}Generalised 32 | Time-Reversible model}. [exchangeabilities] should be a symetric 33 | matrix with arbitrary diagonal *) 34 | 35 | val stationary_distribution : t -> vector 36 | (** [stationary_distribution r] numerically computes the asymptotic 37 | probability distribution [pi] of the CTMC defined by [r]. *) 38 | 39 | val scaled_rate_matrix : vector -> t -> t 40 | (** [scaled_rate_matrix pi r] is a new matrix rate such that the 41 | corresponding CTMC has one expected transition per unit of time 42 | under the distribution [pi]. In addition, if [r] is symetrical, 43 | the result has [pi] as stationary distribution. *) 44 | 45 | val scale : t -> t 46 | (** rescale matrix such that the sum of off-diagonal elements is 1. *) 47 | end 48 | 49 | module Make(A : Alphabet.S_int) : S with type symbol := A.t 50 | and type vector := A.vector 51 | and type matrix := A.matrix 52 | 53 | 54 | module Nucleotide : sig 55 | include module type of Make(Nucleotide) 56 | val k80 : float -> t 57 | val hky85 : 58 | stationary_distribution:Nucleotide.vector -> 59 | transition_rate:float -> 60 | transversion_rate:float -> 61 | t 62 | end 63 | 64 | module Amino_acid : sig 65 | include module type of Make(Amino_acid) 66 | end 67 | 68 | val make : int -> f:(int -> int -> float) -> Linear_algebra.mat 69 | (** [make n f] is a [n] x [n] matrix such that [f i j] is the rate of 70 | transition from state [i] to state [j]. [f] is only called for [i 71 | <> j]. *) 72 | 73 | val transition_probability_matrix : 74 | tau:float -> 75 | rates:float array array -> 76 | float array array 77 | -------------------------------------------------------------------------------- /lib/rejection_sampling.ml: -------------------------------------------------------------------------------- 1 | (** Deprecated. Utility functions for rejection sampling *) 2 | 3 | open Core 4 | open Sigs 5 | 6 | module Make(Align : ALIGNMENT) = struct 7 | let generate_trees ~(sampler:unit->Phylogenetic_tree.t) amount = 8 | List.init amount ~f:(fun _ -> sampler ()) 9 | 10 | let reject seqgen p align trees = 11 | List.filter_map trees ~f:(fun t -> 12 | if Align.equal align (seqgen p t 1) then Some t else None 13 | ) 14 | 15 | let mean_floats l = 16 | List.fold l ~init:0.0 ~f:(fun acc x -> acc +. x) 17 | /. float_of_int (List.length l) 18 | 19 | let get_branch i trees = 20 | List.map trees ~f:(fun t -> List.nth_exn (Phylogenetic_tree.get_branch_lengths t) i) 21 | 22 | let mean_specific_branch i trees = 23 | get_branch i trees |> mean_floats 24 | 25 | end 26 | -------------------------------------------------------------------------------- /lib/seq.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | module type Base = sig 4 | type t 5 | val to_char : t -> char 6 | val of_char_exn : char -> t 7 | end 8 | 9 | module type S = sig 10 | type base 11 | type t 12 | val get : t -> int -> base 13 | val length : t -> int 14 | val of_list : base list -> t 15 | val of_string_exn : string -> t 16 | val to_string : t -> string 17 | val pp : Format.formatter -> t -> unit 18 | end 19 | 20 | module Make(B : Base) = struct 21 | type base = B.t 22 | type t = {length:int ; array:base array} 23 | 24 | let get seq i = Array.get seq.array i 25 | 26 | let length seq = seq.length 27 | 28 | let of_string_exn str = { 29 | length = String.length str ; 30 | array = Array.init (String.length str) ~f:( 31 | fun i -> B.of_char_exn (str.[i]) 32 | ) 33 | } 34 | 35 | let of_list l = { 36 | length = List.length l ; 37 | array = Array.init (List.length l) ~f:( 38 | fun i -> List.nth_exn l i 39 | ) 40 | } 41 | 42 | let to_string seq = 43 | String.init (length seq) ~f:( 44 | fun i -> B.to_char (get seq i) 45 | ) 46 | 47 | let pp fmt seq = 48 | to_string seq 49 | |> Format.fprintf fmt "%s" 50 | end 51 | 52 | module Make_list (B : Base) = struct 53 | type base = B.t 54 | type t = base list 55 | 56 | let get seq i = match List.nth seq i with Some b->b | None->failwith "Base not found" 57 | 58 | let length seq = List.length seq 59 | 60 | let of_string_exn str = 61 | let rec aux i acc = 62 | if (i >= String.length str) then 63 | List.rev acc 64 | else 65 | match B.of_char_exn str.[i] with 66 | | b -> aux (i+1) (b::acc) 67 | | exception _ -> invalid_arg "input string" 68 | in 69 | aux 0 [] 70 | 71 | let of_list l = l (* wow *) 72 | 73 | let to_string seq = 74 | List.map ~f:B.to_char seq 75 | |> String.of_char_list 76 | 77 | let pp fmt seq = to_string seq |> Format.fprintf fmt "%s" 78 | end 79 | 80 | module DNA = Make(Nucleotide) 81 | -------------------------------------------------------------------------------- /lib/seq.mli: -------------------------------------------------------------------------------- 1 | (** Module for sequences of bases. Provides a functor to build the module 2 | for any base module with the BASE signature.*) 3 | 4 | module type Base = sig 5 | type t 6 | val to_char : t -> char 7 | val of_char_exn : char -> t 8 | end 9 | 10 | (** Module type for sequences of bases (eg, DNA). *) 11 | module type S = sig 12 | type base 13 | type t 14 | val get : t -> int -> base 15 | val length : t -> int 16 | val of_list : base list -> t 17 | val of_string_exn : string -> t 18 | val to_string : t -> string 19 | val pp : Format.formatter -> t -> unit 20 | end 21 | 22 | (** Main functor (using arrays) *) 23 | module Make (B : Base): S with type base = B.t 24 | 25 | (** List-based constructor (no particular reason to use it) *) 26 | module Make_list (B : Base): S with type base = B.t 27 | 28 | (** Pre-built DNA module. *) 29 | module DNA: module type of Make(Nucleotide) 30 | -------------------------------------------------------------------------------- /lib/sequence_generation.ml: -------------------------------------------------------------------------------- 1 | (** Deprecated. Simulation functions for sequences *) 2 | 3 | open Core 4 | open Sigs 5 | 6 | module type Base = sig 7 | type t 8 | val to_int : t -> int 9 | val of_int_exn : int -> t 10 | end 11 | 12 | module type Sequence = sig 13 | type t 14 | type base 15 | val to_string: t -> string 16 | val of_list: base list -> t 17 | end 18 | 19 | module Make 20 | (A : Alphabet.S_int) 21 | (Seq : Sequence with type base = A.t) 22 | (Align : ALIGNMENT with type sequence = Seq.t) 23 | (E : Site_evolution_model.S with type mat := A.matrix 24 | and type vec := A.vector) = 25 | struct 26 | let proba param = 27 | let my_eMt = E.transition_probability_matrix param in 28 | fun base t -> A.Matrix.row (my_eMt t) (A.to_int base) 29 | 30 | let draw_base vec = 31 | (* for all base check if x is smaller than transition proba, 32 | if yes return base else decrement x *) 33 | let rec aux i x = 34 | let s = A.of_int_exn i in 35 | let proba = A.Vector.get vec s in 36 | if Float.(x < proba) then s 37 | else aux (i+1) (x-.proba) 38 | in 39 | Random.float 1.0 |> aux 0 40 | 41 | let seqgen_raw param = 42 | let my_proba = proba param in 43 | let stat_dist = E.stationary_distribution param in 44 | fun tree size -> 45 | let rec aux tree bl = match tree with 46 | | Phylogenetic_tree.Leaf {index=i; _} -> [(i,bl)] 47 | | Phylogenetic_tree.Node {left=t1,l; right=t2,r; _} -> 48 | aux l (List.map bl ~f:(fun b->draw_base (my_proba b t1))) 49 | @ aux r (List.map bl ~f:(fun b->draw_base (my_proba b t2))) 50 | in 51 | List.init size ~f:(fun _->draw_base stat_dist) 52 | |> aux tree 53 | 54 | let seqgen param = 55 | let my_seqgen = seqgen_raw param in 56 | fun tree size -> 57 | my_seqgen tree size 58 | |> List.map ~f:(fun (i,s)->(i,Seq.of_list s)) 59 | |> Align.of_assoc_list 60 | 61 | let seqgen_string_list param = 62 | let my_seqgen = seqgen_raw param in 63 | fun tree size -> 64 | let raw = my_seqgen tree size in 65 | List.init (List.length raw) ~f:( 66 | fun i -> 67 | List.Assoc.find_exn ~equal:String.( = ) raw (Phylogenetic_tree.index_of_int i) 68 | |> Seq.of_list |> Seq.to_string 69 | ) 70 | end 71 | -------------------------------------------------------------------------------- /lib/sequence_simulator.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | module type Alphabet = sig 4 | type t 5 | val card : int 6 | val of_int_exn : int -> t 7 | val to_char : t -> char 8 | module Vector : sig 9 | type t 10 | val of_array_exn : float array -> t 11 | end 12 | end 13 | 14 | module Make(A : Alphabet) = struct 15 | type profile = Profile of { 16 | probs : float array ; 17 | dist : Gsl.Randist.discrete ; 18 | } 19 | 20 | let profile_of_array_exn probs = 21 | if Array.length probs <> A.card then failwith "Array has incorrect size" ; 22 | let dist = Gsl.Randist.discrete_preproc probs in 23 | Profile { probs ; dist } 24 | 25 | let random_profile ~alpha rng = 26 | let alpha = Array.create ~len:A.card alpha in 27 | let theta = Array.create ~len:A.card 0. in 28 | Gsl.Randist.dirichlet rng ~alpha ~theta ; 29 | profile_of_array_exn theta 30 | 31 | let draw_from_profile (Profile p) rng = 32 | Gsl.Randist.discrete rng p.dist 33 | |> A.of_int_exn 34 | 35 | type pwm = profile array 36 | 37 | let random_pwm ~alpha n rng = 38 | Array.init n ~f:(fun _ -> random_profile ~alpha rng) 39 | 40 | let draw_from_pwm pwm rng = 41 | let n = Array.length pwm in 42 | String.init n ~f:(fun i -> 43 | draw_from_profile pwm.(i) rng 44 | |> A.to_char 45 | ) 46 | 47 | let vec_of_profile (Profile p) = 48 | A.Vector.of_array_exn p.probs 49 | end 50 | -------------------------------------------------------------------------------- /lib/sequence_simulator.mli: -------------------------------------------------------------------------------- 1 | (** Functions to simulate sequences *) 2 | 3 | module type Alphabet = sig 4 | type t 5 | val card : int 6 | val of_int_exn : int -> t 7 | val to_char : t -> char 8 | module Vector : sig 9 | type t 10 | val of_array_exn : float array -> t 11 | end 12 | end 13 | 14 | module Make(A : Alphabet) : sig 15 | type profile 16 | val profile_of_array_exn : float array -> profile 17 | val random_profile : alpha:float -> Gsl.Rng.t -> profile 18 | val draw_from_profile : profile -> Gsl.Rng.t -> A.t 19 | val vec_of_profile : profile -> A.Vector.t 20 | 21 | type pwm = profile array 22 | val random_pwm : alpha:float -> int -> Gsl.Rng.t -> pwm 23 | val draw_from_pwm : pwm -> Gsl.Rng.t -> string 24 | end 25 | -------------------------------------------------------------------------------- /lib/sigs.ml: -------------------------------------------------------------------------------- 1 | (** Compilation of module signatures used elsewhere. *) 2 | 3 | open Linear_algebra (* to get the vec and mat types *) 4 | 5 | (** Index type for tree leaves and sequences in alignments *) 6 | type index = string 7 | 8 | (** Module type for individual bases (eg, A, T, C, G). 9 | Mostly conversion to/from strings and ints *) 10 | module type BASE = sig 11 | type t 12 | val of_char: char -> t 13 | val of_int: int -> t 14 | val to_int: t -> int 15 | val print_base: t -> unit 16 | val to_string: t -> string 17 | val alphabet_size: int 18 | end 19 | 20 | (** Module type for sequences of bases (eg, DNA). *) 21 | module type SEQUENCE = sig 22 | type base 23 | type t 24 | val get: t -> int -> base 25 | val length: t -> int 26 | val of_list: base list -> t 27 | val of_string: string -> t 28 | val to_string: t -> string 29 | val pp: Format.formatter -> t -> unit 30 | end 31 | 32 | (** Module type for alignments of sequences (eg, DNA alignment.) *) 33 | module type ALIGNMENT = sig 34 | type t 35 | type base 36 | type sequence 37 | val of_string_list: string list -> t 38 | val of_assoc_list: (index*sequence) list -> t 39 | val of_fasta: string -> t 40 | val pp: Format.formatter -> t -> unit 41 | val get_base: t -> seq:index -> pos:int -> base 42 | val length: t -> int 43 | val nb_seq: t -> int 44 | val to_file: t -> string -> unit 45 | val equal: t -> t -> bool 46 | end 47 | 48 | 49 | (** Evolution model with linear algebra functions to compute static distribution and 50 | transition matrix diagonalization.*) 51 | module type EVOL_MODEL = sig 52 | type t 53 | type base 54 | val transition: t -> base -> base -> float 55 | val of_string: string -> t 56 | val to_string: t -> string 57 | val eMt_mat: t -> float -> mat 58 | val eMt_series: t -> float -> mat 59 | val stat_dist_vec: t -> vec 60 | val known_vector: base -> vec 61 | end 62 | -------------------------------------------------------------------------------- /lib/simulator.ml: -------------------------------------------------------------------------------- 1 | (** 2 | Useful documents: 3 | - https://www.stat.wisc.edu/~larget/phylogeny.pdf 4 | - https://bmcevolbiol.biomedcentral.com/articles/10.1186/s12862-017-0979-y 5 | - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5854120/ 6 | *) 7 | open Core 8 | 9 | module type Branch_info = sig 10 | type t 11 | val length : t -> float 12 | end 13 | 14 | module Make 15 | (A : Alphabet.S_int) 16 | (BI : Branch_info) = 17 | struct 18 | let symbol_sample rng v = 19 | Gsl.Randist.discrete_preproc v 20 | |> Gsl.Randist.discrete rng 21 | |> A.of_int_exn 22 | 23 | let transition_matrix rate_matrix b = 24 | A.Matrix.(expm (scal_mul (BI.length b) (rate_matrix b))) 25 | 26 | let evolve tree ~f ~root = 27 | Tree.propagate tree ~init:root 28 | ~node:(fun x_in ni -> x_in, (ni, x_in)) 29 | ~leaf:(fun x_in li -> li, x_in) 30 | ~branch:(fun x_in bi -> 31 | let x_out = f bi x_in in 32 | x_out, bi 33 | ) 34 | 35 | let site_exponential_method rng tree ~root ~transition_matrix = 36 | evolve tree ~root ~f:(fun bi (x_in : A.t) -> 37 | A.Matrix.row (transition_matrix bi) (x_in :> int) 38 | |> A.Vector.to_array 39 | |> symbol_sample rng 40 | ) 41 | 42 | (* Gillespie "first reaction" method *) 43 | let site_gillespie_first_reaction rng tree ~root ~rate_matrix = 44 | evolve tree ~root ~f:(fun bi (x_in : A.t) -> 45 | let rate_matrix = rate_matrix bi in 46 | let rec loop state remaining_time = 47 | let waiting_times = 48 | A.Table.init (fun m -> 49 | if A.equal m state then (Float.infinity, m) 50 | else 51 | let rate = rate_matrix.A.%{state, m} in 52 | if Float.(rate < 1e-30) then (Float.infinity, m) 53 | else 54 | let tau = Gsl.Randist.exponential rng ~mu:(1. /. rate) in 55 | tau, m 56 | ) 57 | in 58 | let tau, next_state = 59 | Array.min_elt (waiting_times :> (float * A.t) array) ~compare:Poly.compare 60 | |> (fun x -> Option.value_exn x) 61 | in 62 | if Float.(tau > remaining_time) then state 63 | else loop next_state Float.(remaining_time - tau) 64 | in 65 | loop x_in (BI.length bi) 66 | ) 67 | 68 | (* Gillespie "direct" method *) 69 | let branch_gillespie_direct rng ~start_state ~rate_matrix ~branch_length ~init ~f = 70 | let rec loop state t acc = 71 | let rates = A.Table.init (fun m -> if A.equal m state then 0. else rate_matrix.A.%{state, m}) in 72 | let total_rate = -. rate_matrix.A.%{state, state} in 73 | let t' = t +. Gsl.Randist.exponential rng ~mu:(1. /. total_rate) in 74 | if Float.(t' > branch_length) then acc 75 | else 76 | let next_state = symbol_sample rng (rates :> float array) in 77 | loop next_state t' (f acc next_state t') 78 | in 79 | loop start_state 0. init 80 | 81 | let site_gillespie_direct rng tree ~(root : A.t) ~rate_matrix = 82 | evolve tree ~root ~f:(fun bi (x_in : A.t) -> 83 | let rate_matrix = rate_matrix bi in 84 | branch_gillespie_direct rng 85 | ~start_state:x_in ~rate_matrix ~branch_length:(BI.length bi) 86 | ~init:x_in ~f:(fun _ n _ -> n) 87 | ) 88 | 89 | let sequence_gillespie_direct rng tree ~update_iterator ~root ~rate_vector = 90 | evolve tree ~root ~f:(fun bi seq -> 91 | let state = Array.copy seq in 92 | let n = Array.length state in 93 | let rate i = rate_vector bi state i in 94 | let rates = Array.init n ~f:rate in 95 | let pos_rate i = Utils.array_sum (rates.(i) : float A.table :> float array) in 96 | let pos_rates = Discrete_pd.init n ~f:pos_rate in 97 | let rec loop remaining_time = 98 | let total_rate = Discrete_pd.total_weight pos_rates in 99 | let tau = Gsl.Randist.exponential rng ~mu:(1. /. total_rate) in 100 | if Float.(tau > remaining_time) then state 101 | else 102 | let pos = Discrete_pd.draw pos_rates rng in 103 | let next_letter = symbol_sample rng (rates.(pos) :> float array) in 104 | state.(pos) <- next_letter ; 105 | update_iterator ~n ~pos (fun pos -> 106 | rates.(pos) <- rate pos ; 107 | Discrete_pd.update pos_rates pos (pos_rate pos) 108 | ) ; 109 | loop Float.(remaining_time - tau) 110 | in 111 | loop (BI.length bi) 112 | ) 113 | 114 | let hmm0 rng ~len ~dist = 115 | Array.init len ~f:(fun i -> symbol_sample rng (dist i : float A.table :> float array)) 116 | end 117 | 118 | module NSCodon(BI : Branch_info) = struct 119 | include Make(Mutsel.NSCodon)(BI) 120 | 121 | let alignment rng tree ~root ~rate_matrix = 122 | List.init (Array.length root) ~f:(fun i -> 123 | let rate_matrix = rate_matrix i in 124 | site_gillespie_direct rng tree ~root:root.(i) ~rate_matrix 125 | |> Tree.leaves 126 | |> List.map ~f:(fun (_, c) -> Codon.Universal_genetic_code.NS.to_string c) 127 | ) 128 | |> List.transpose_exn 129 | |> List.map ~f:String.concat 130 | |> List.map ~f:Dna.of_string_unsafe 131 | end 132 | -------------------------------------------------------------------------------- /lib/simulator.mli: -------------------------------------------------------------------------------- 1 | module type Branch_info = sig 2 | type t 3 | val length : t -> float 4 | end 5 | 6 | module Make 7 | (A : Alphabet.S_int) 8 | (BI : Branch_info) : 9 | sig 10 | val transition_matrix : 11 | (BI.t -> A.matrix) -> 12 | BI.t -> 13 | A.matrix 14 | 15 | val site_exponential_method : 16 | Gsl.Rng.t -> 17 | ('n, 'l, BI.t) Tree.t -> 18 | root:A.t -> 19 | transition_matrix:(BI.t -> A.matrix) -> 20 | ('n * A.t, 'l * A.t, BI.t) Tree.t 21 | 22 | val branch_gillespie_direct : 23 | Gsl.Rng.t -> 24 | start_state:A.t -> 25 | rate_matrix:A.matrix -> 26 | branch_length:float -> 27 | init:'a -> 28 | f:('a -> A.t -> float -> 'a) -> 29 | 'a 30 | 31 | val site_gillespie_direct : 32 | Gsl.Rng.t -> 33 | ('n, 'l, BI.t) Tree.t -> 34 | root:A.t -> 35 | rate_matrix:(BI.t -> A.matrix) -> 36 | ('n * A.t, 'l * A.t, BI.t) Tree.t 37 | 38 | val site_gillespie_first_reaction : 39 | Gsl.Rng.t -> 40 | ('n, 'l, BI.t) Tree.t -> 41 | root:A.t -> 42 | rate_matrix:(BI.t -> A.matrix) -> 43 | ('n * A.t, 'l * A.t, BI.t) Tree.t 44 | 45 | val sequence_gillespie_direct : 46 | Gsl.Rng.t -> 47 | ('n, 'l, BI.t) Tree.t -> 48 | update_iterator:(n:int -> pos:int -> (int -> unit) -> unit) -> 49 | root:A.t array -> 50 | rate_vector:(BI.t -> A.t array -> int -> float A.table) -> 51 | ('n * A.t array, 'l * A.t array, BI.t) Tree.t 52 | 53 | val hmm0 : 54 | Gsl.Rng.t -> 55 | len:int -> 56 | dist:(int -> float A.table) -> 57 | A.t array 58 | end 59 | 60 | module NSCodon(BI : Branch_info) : sig 61 | include module type of Make(Mutsel.NSCodon)(BI) 62 | 63 | val alignment : 64 | Gsl.Rng.t -> 65 | (_, _, BI.t) Tree.t -> 66 | root:Mutsel.NSCodon.t array -> 67 | rate_matrix:(int -> BI.t -> Mutsel.NSCodon_rate_matrix.t) -> 68 | Dna.t list 69 | end 70 | -------------------------------------------------------------------------------- /lib/site_evolution_model.ml: -------------------------------------------------------------------------------- 1 | module type S = sig 2 | type param 3 | type vec 4 | type mat 5 | val rate_matrix : param -> mat 6 | val transition_probability_matrix : param -> float -> mat 7 | val stationary_distribution : param -> vec 8 | end 9 | 10 | module type S_with_reduction = sig 11 | include S 12 | val rate_matrix_reduction : param -> mat * vec * mat 13 | end 14 | 15 | module type Rate_matrix = sig 16 | type param 17 | type mat 18 | val rate_matrix : param -> mat 19 | end 20 | 21 | module type Diagonalizable_rate_matrix = sig 22 | include Rate_matrix 23 | type vec 24 | val rate_matrix_reduction : param -> mat * vec * mat 25 | end 26 | 27 | module Make 28 | (A : Alphabet.S) 29 | (M : Rate_matrix with type mat := A.matrix) = 30 | struct 31 | let transition_probability_matrix param t = 32 | A.Matrix.(expm (scal_mul t (M.rate_matrix param))) 33 | let stationary_distribution param = 34 | A.Matrix.zero_eigen_vector (M.rate_matrix param) 35 | end 36 | 37 | 38 | module Make_diag 39 | (A : Alphabet.S) 40 | (M : Diagonalizable_rate_matrix with type vec := A.vector 41 | and type mat := A.matrix) 42 | = struct 43 | let transition_probability_matrix param = 44 | let diag_p, diag, diag_p_inv = M.rate_matrix_reduction param in 45 | fun t -> 46 | let d = A.Matrix.diagm A.Vector.(exp (scal_mul t diag)) in 47 | A.Matrix.(dot (dot diag_p d) diag_p_inv) 48 | 49 | let stationary_distribution param = 50 | A.Matrix.zero_eigen_vector (M.rate_matrix param) 51 | end 52 | 53 | module JC69 = struct 54 | module Base = struct 55 | type param = unit 56 | let rate_matrix () = Rate_matrix.Nucleotide.jc69 () 57 | let rate_matrix_reduction () = 58 | let p = Nucleotide.Matrix.of_arrays_exn [| 59 | [| 1. ; -1. ; -1. ; -1. |] ; 60 | [| 1. ; 1. ; 0. ; 0. |] ; 61 | [| 1. ; 0. ; 1. ; 0. |] ; 62 | [| 1. ; 0. ; 0. ; 1. |] ; 63 | |] 64 | in 65 | let diag = Nucleotide.Vector.of_array_exn [| 0. ; -4./.3. ; -4./.3. ; -4./.3. |] in 66 | let p_inv = Nucleotide.Matrix.of_arrays_exn [| 67 | [| 0.25 ; 0.25 ; 0.25 ; 0.25 |] ; 68 | [| -0.25 ; 0.75 ; -0.25 ; -0.25 |] ; 69 | [| -0.25 ; -0.25 ; 0.75 ; -0.25 |] ; 70 | [| -0.25 ; -0.25 ; -0.25 ; 0.75 |] ; 71 | |] 72 | in 73 | p, diag, p_inv 74 | 75 | let%test "JC69_reduction 1" = 76 | let p, _, p_inv = rate_matrix_reduction () in 77 | Nucleotide.Matrix.(robust_equal ~tol:1e-6 (dot p p_inv) (init (fun i j -> if i = j then 1. else 0.))) 78 | 79 | let%test "JC69_reduction 2" = 80 | let p, d, p_inv = rate_matrix_reduction () in 81 | Nucleotide.Matrix.(robust_equal ~tol:1e-6 (dot p (dot (diagm d) p_inv)) (rate_matrix ())) 82 | end 83 | include Base 84 | include Make_diag(Nucleotide)(Base) 85 | let stationary_distribution () = Nucleotide.Vector.init (fun _ -> 0.25) 86 | end 87 | 88 | module JC69_numerical = struct 89 | include JC69.Base 90 | include Make(Nucleotide)(JC69.Base) 91 | end 92 | 93 | module K80 = struct 94 | module Base = struct 95 | type param = float 96 | let rate_matrix kappa = Rate_matrix.Nucleotide.k80 kappa 97 | let rate_matrix_reduction k = 98 | let p = Nucleotide.Matrix.of_arrays_exn [| 99 | [| 1. ; -1. ; 0. ; -1. |] ; 100 | [| 1. ; 1. ; -1. ; 0. |] ; 101 | [| 1. ; -1. ; 0. ; 1. |] ; 102 | [| 1. ; 1. ; 1. ; 0. |] ; 103 | |] 104 | in 105 | let diag = 106 | let lambda_3 = (-2. *. k -. 2.) /. (k +. 2.) in 107 | [| 0. ; -4. /. (k +. 2.) ; lambda_3 ; lambda_3 |] 108 | |> Nucleotide.Vector.of_array_exn 109 | in 110 | let p_inv = Nucleotide.Matrix.of_arrays_exn [| 111 | [| 0.25 ; 0.25 ; 0.25 ; 0.25 |] ; 112 | [| -0.25 ; 0.25 ; -0.25 ; 0.25 |] ; 113 | [| 0. ; -0.5 ; 0. ; 0.5 |] ; 114 | [| -0.5 ; 0. ; 0.5 ; 0. |] 115 | |] 116 | in 117 | p, diag, p_inv 118 | 119 | let%test "K80_reduction 1" = 120 | let p, _, p_inv = rate_matrix_reduction 2. in 121 | Nucleotide.Matrix.(robust_equal ~tol:1e-6 (dot p p_inv) (init (fun i j -> if i = j then 1. else 0.))) 122 | 123 | let%test "K80_reduction 2" = 124 | let p, d, p_inv = rate_matrix_reduction 2. in 125 | Nucleotide.Matrix.(robust_equal ~tol:1e-6 (dot p (dot (diagm d) p_inv)) (rate_matrix 2.)) 126 | end 127 | include Base 128 | include Make_diag(Nucleotide)(Base) 129 | let stationary_distribution _ = Nucleotide.Vector.init (fun _ -> 0.25) 130 | end 131 | 132 | module K80_numerical = struct 133 | include K80.Base 134 | include Make(Nucleotide)(K80.Base) 135 | end 136 | 137 | module type Nucleotide_S_with_reduction = 138 | S_with_reduction 139 | with type vec := Nucleotide.vector 140 | and type mat := Nucleotide.matrix 141 | -------------------------------------------------------------------------------- /lib/site_evolution_model.mli: -------------------------------------------------------------------------------- 1 | (** Deprecated. Compilation of modules implementing evolution models and 2 | providing relevant mathematical procedure (eg, exponential of transition matrix); 3 | also includes functors to build models from transition matrices.*) 4 | 5 | (** Evolution model with linear algebra functions to compute 6 | stationary distribution and transition matrix diagonalization.*) 7 | module type S = sig 8 | type param 9 | type vec 10 | type mat 11 | val rate_matrix : param -> mat 12 | val transition_probability_matrix : param -> float -> mat 13 | val stationary_distribution : param -> vec 14 | end 15 | 16 | module type S_with_reduction = sig 17 | include S 18 | val rate_matrix_reduction : param -> mat * vec * mat 19 | end 20 | 21 | module type Rate_matrix = sig 22 | type param 23 | type mat 24 | val rate_matrix : param -> mat 25 | end 26 | 27 | module type Diagonalizable_rate_matrix = sig 28 | include Rate_matrix 29 | type vec 30 | val rate_matrix_reduction : param -> mat * vec * mat 31 | end 32 | 33 | module Make 34 | (A : Alphabet.S) 35 | (M : Rate_matrix with type mat := A.matrix) : 36 | sig 37 | val transition_probability_matrix : M.param -> float -> A.matrix 38 | val stationary_distribution : M.param -> A.vector 39 | end 40 | 41 | module Make_diag 42 | (A : Alphabet.S) 43 | (M : Diagonalizable_rate_matrix with type vec := A.vector 44 | and type mat := A.matrix) : 45 | sig 46 | val transition_probability_matrix : M.param -> float -> A.matrix 47 | val stationary_distribution : M.param -> A.vector 48 | end 49 | 50 | module type Nucleotide_S_with_reduction = 51 | S_with_reduction 52 | with type vec := Nucleotide.vector 53 | and type mat := Nucleotide.matrix 54 | 55 | (** Jukes-Cantor model with analytical diagonalization of transition 56 | matrix. *) 57 | module JC69 : Nucleotide_S_with_reduction with type param = unit 58 | 59 | (** Jukes-Cantor model with numerical calculation of probability 60 | transition matrix *) 61 | module JC69_numerical : Nucleotide_S_with_reduction with type param = unit 62 | 63 | (** K80 model with analytical diagonalization of transition matrix (parametrized by kappa) *) 64 | module K80 : Nucleotide_S_with_reduction with type param = float 65 | 66 | (** K80 model with numerical calculation of probability 67 | transition matrix *) 68 | module K80_numerical : Nucleotide_S_with_reduction with type param = float 69 | -------------------------------------------------------------------------------- /lib/stat_tools.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | type sample_list = float list 4 | 5 | let sample_branch_lengths ~(branchs:int->bool) ~(sampler:unit->float) tree () = 6 | Phylogenetic_tree.get_branch_lengths tree 7 | |> List.mapi ~f:(fun i l -> if branchs i then sampler () else l) 8 | |> Phylogenetic_tree.set_branch_lengths tree 9 | 10 | let sample_list_of_file path = 11 | In_channel.read_lines path 12 | |> List.map ~f:(float_of_string) 13 | 14 | let sample_list_extrema d = 15 | match 16 | List.min_elt ~compare:Float.compare d, List.max_elt ~compare:Float.compare d 17 | with 18 | (Some mi, Some ma) -> (mi, ma) | _ -> failwith "empty input distribution" 19 | 20 | let sample_list_mean d = 21 | List.fold d ~init:(0., 0) ~f:(fun (s, c) x -> (s+.x, c+1)) 22 | |> fun (s, c) -> s /. (float_of_int c) 23 | 24 | -------------------------------------------------------------------------------- /lib/stat_tools.mli: -------------------------------------------------------------------------------- 1 | (** Probability and statistics tools (eg, samplers and distribution handling). *) 2 | 3 | (** {5 Samplers} *) 4 | 5 | val sample_branch_lengths: branchs:(int->bool) -> sampler:(unit->float) -> Phylogenetic_tree.t -> unit -> Phylogenetic_tree.t 6 | 7 | 8 | (** {5 Distributions} *) 9 | 10 | type sample_list = float list 11 | 12 | val sample_list_of_file: string -> sample_list 13 | 14 | val sample_list_extrema: sample_list -> float * float 15 | 16 | val sample_list_mean: sample_list -> float 17 | -------------------------------------------------------------------------------- /lib/tree.mli: -------------------------------------------------------------------------------- 1 | (** Tree structure with annotations on leaves, internal nodes and 2 | branches. *) 3 | 4 | type ('n, 'l, 'b) t = 5 | | Node of { 6 | data : 'n ; 7 | branches : ('n, 'l, 'b) branch List1.t ; 8 | } 9 | | Leaf of 'l 10 | [@@deriving sexp] 11 | (** [Node { data; branches }] represents a node in the tree with node 12 | data of type ['n] and a non-empty list of branches 13 | 14 | [Leaf l] represents a leaf in the tree with leaf data of type 15 | ['l]. 16 | *) 17 | 18 | and ('n, 'l, 'b) branch = Branch of { 19 | data : 'b ; 20 | tip : ('n, 'l, 'b) t ; 21 | } 22 | 23 | val leaf : 'l -> (_, 'l, 'b) t 24 | (** [leaf l] constructs a leaf node with data [l]. *) 25 | 26 | val node : 27 | 'a -> 28 | ('a, 'b, 'c) branch List1.t -> 29 | ('a, 'b, 'c) t 30 | (** [node a branches] constructs a node with data [a] and the given 31 | list of branches. *) 32 | 33 | val binary_node : 34 | 'a -> 35 | ('a, 'b, 'c) branch -> 36 | ('a, 'b, 'c) branch -> 37 | ('a, 'b, 'c) t 38 | (** [binary_node a b1 b2] constructs a binary node with data [a] and 39 | the two branches [b1] and [b2]. *) 40 | 41 | val branch : 42 | 'c -> 43 | ('a, 'b, 'c) t -> 44 | ('a, 'b, 'c) branch 45 | (** [branch c tip] constructs a branch with data [c] and the given 46 | sub-tree [tip]. *) 47 | 48 | val data : ('a, 'a, _) t -> 'a 49 | (** [data t] returns the data contained in the leaf or root node of 50 | the tree [t]. *) 51 | 52 | val to_printbox : 53 | ?node:('n -> string) -> 54 | ?leaf:('l -> string) -> 55 | ?branch:('b -> string option) -> 56 | ('n, 'l, 'b) t -> 57 | PrintBox.t 58 | (** [to_printbox ?node ?leaf ?branch t] converts the tree [t] to a 59 | printable representation using the provided functions for 60 | converting node, leaf, and branch data to strings. *) 61 | 62 | val map : 63 | ('a, 'b, 'c) t -> 64 | node:('a -> 'd) -> 65 | leaf:('b -> 'e) -> 66 | branch:('c -> 'f) -> 67 | ('d, 'e, 'f) t 68 | (** [map t ~node ~leaf ~branch] maps the node, leaf, and branch data 69 | of the tree [t] using the provided functions [node], [leaf], and 70 | [branch], respectively. *) 71 | 72 | val map_branches : 73 | ('a, 'b, 'c) t -> 74 | node:('a -> 'd) -> 75 | leaf:('b -> 'd) -> 76 | branch:('d -> 'c -> 'd -> 'e) -> 77 | ('a, 'b, 'e) t 78 | 79 | val map2_exn : 80 | ('a, 'b, 'c) t -> 81 | ('d, 'e, 'f) t -> 82 | node:('a -> 'd -> 'x) -> 83 | leaf:('b -> 'e -> 'y) -> 84 | branch:('c -> 'f -> 'z) -> 85 | ('x, 'y, 'z) t 86 | (** [map2_exn t1 t2 ~node ~leaf ~branch] maps the corresponding node, 87 | leaf, and branch data of the trees [t1] and [t2] using functions 88 | [node], [leaf], and [branch], respectively. Raises an exception if 89 | the two trees have different structures. *) 90 | 91 | val map_branch2_exn : 92 | ('a, 'b, 'c) branch -> 93 | ('d, 'e, 'f) branch -> 94 | node:('a -> 'd -> 'x) -> 95 | leaf:('b -> 'e -> 'y) -> 96 | branch:('c -> 'f -> 'z) -> 97 | ('x, 'y, 'z) branch 98 | (** [map_branch2_exn b1 b2 ~node ~leaf ~branch] maps the corresponding 99 | node, leaf, and branch data of the branches [b1] and [b2] using 100 | functions [node], [leaf], and [branch], respectively. Raises an 101 | exception if the two branches have different structures. *) 102 | 103 | val propagate : 104 | ('n1, 'l1, 'b1) t -> 105 | init:'s -> 106 | node:('s -> 'n1 -> 's * 'n2) -> 107 | leaf:('s -> 'l1 -> 'l2) -> 108 | branch:('s -> 'b1 -> 's * 'b2) -> 109 | ('n2, 'l2, 'b2) t 110 | (** [propagate t ~init ~node ~leaf ~branch] propagates the values of 111 | node, leaf, and branch data in the tree [t] using the provided 112 | update functions [node], [leaf], and [branch]. The initial state 113 | [init] is threaded through the propagation. *) 114 | 115 | val prefix_traversal : 116 | ('n, 'l, 'b) t -> 117 | init:'c -> 118 | node:('c -> 'n -> 'c) -> 119 | leaf:('c -> 'l -> 'c) -> 120 | branch:('c -> 'b -> 'c) -> 121 | 'c 122 | (** [prefix_traversal t ~init ~node ~leaf ~branch] performs a prefix 123 | traversal of the tree [t] using the provided update functions 124 | [node], [leaf], and [branch]. The initial state [init] is threaded 125 | through the traversal. Returns the final state after the 126 | traversal. *) 127 | 128 | val leaves : (_, 'l, _) t -> 'l list 129 | (** [leaves t] returns a list of all leaf data in the tree [t]. *) 130 | 131 | val fold_leaves : 132 | (_, 'l, _) t -> 133 | init:'a -> 134 | f:('a -> 'l -> 'a) -> 135 | 'a 136 | (** [fold_leaves t ~init ~f] folds over the leaf data in the tree [t] 137 | using the provided folding function [f] and initial accumulator 138 | [init]. Returns the final accumulator after folding. *) 139 | 140 | val unfold : 141 | ('n, 'l, 'b) t -> 142 | init:'a -> 143 | branch:('e -> 'b -> 'a * 'bb) -> 144 | leaf:('a -> 'l -> 'e * 'll) -> 145 | node:('a -> 'n -> 'e * 'nn) -> 146 | ('nn, 'll, 'bb) t 147 | (** [unfold t ~init ~branch ~leaf ~node] unfolds the tree [t] into a 148 | new tree using the provided functions [branch], [leaf], and 149 | [node]. The initial state [init] is threaded through the 150 | unfolding. *) 151 | 152 | (* val node_prefix_synthesis : 153 | * ('a, 'b, 'l) t -> 154 | * init:'c -> 155 | * f:('c -> 'a -> 'd list -> 'c * 'd) -> 156 | * ('d, 'b) t *) 157 | 158 | val leafset_generated_subtree : 159 | ('n, 'l, 'b) t -> 160 | ('l -> string option) -> 161 | string list -> 162 | ('n, 'l, 'b) t option 163 | (** [leafset_generated_subtree t f xs] returns the maximal subtree of 164 | [t] whose leaves all return a string in [xs] when applied to [f] if 165 | it exists (and [None] otherwise) *) 166 | 167 | val simplify_node_with_single_child : 168 | merge_branch_data:('b list -> 'b) -> 169 | ('n, 'l, 'b) t -> 170 | ('n, 'l, 'b) t 171 | (** [simplify_node_with_single_child ~merge_branch_data t] simplifies 172 | the tree [t] by merging nodes with a single child using the 173 | provided merging function [merge_branch_data]. *) 174 | -------------------------------------------------------------------------------- /lib/utils.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | let marker = function 4 | | "" -> "\027[0m" 5 | | "red" -> "\027[31m" 6 | | "green" -> "\027[32m" 7 | | "yellow" -> "\027[33m" 8 | | "blue" -> "\027[34m" 9 | | "magenta" -> "\027[35m" 10 | | "cyan" -> "\027[36m" 11 | | s -> failwith (sprintf "Unrecognized marker %s." s) 12 | 13 | let rec insert_colors str = 14 | match String.lsplit2 ~on:'$' str with 15 | | None -> str 16 | | Some (beg, en) -> beg ^ (match String.lsplit2_exn ~on:'$' en with 17 | | m, en2 -> (marker m) ^ (insert_colors en2) 18 | ) 19 | 20 | let rec defancy str = 21 | match String.lsplit2 ~on:'\027' str with 22 | | None -> str 23 | | Some (beg, en) -> 24 | beg ^ (match String.lsplit2_exn ~on:'m' en with _, en2 -> defancy en2) 25 | 26 | let fancy_length str = defancy str |> String.length 27 | 28 | let fancy_format format = 29 | Scanf.format_from_string 30 | (insert_colors (string_of_format format)) 31 | format 32 | 33 | (** sprintf variant that recognizes markers for colored output.*) 34 | let fancy_sprintf format = sprintf (fancy_format format) 35 | 36 | let colorize color to_colorize string = 37 | String.concat_map string ~f:(fun c -> 38 | if String.contains to_colorize c 39 | then fancy_sprintf "%s%c$$" (marker color) c 40 | else sprintf "%c" c) 41 | 42 | let apply_options options s = 43 | List.fold options ~init:s ~f:(fun s f -> f s) 44 | 45 | let print f = fun s -> f s |> defancy |> printf "%s" 46 | 47 | let print_fancy ?(options=[]) f = fun s -> f s |> apply_options options |> printf "%s" 48 | 49 | let pp f = fun fmt s -> f s |> defancy |> Format.fprintf fmt "%s" 50 | 51 | let pp_fancy ?(options=[]) f = fun fmt s -> f s |> apply_options options |> Format.fprintf fmt "%s" 52 | 53 | let all_printers ?(options=[]) f = pp f, pp_fancy ~options f, print f, print_fancy ~options f 54 | 55 | type float_array = float array 56 | [@@deriving show] 57 | 58 | type float_array_array = float array array 59 | [@@deriving show] 60 | 61 | let robust_equal x y = 62 | Float.robustly_compare x y = 0 63 | 64 | let float_array_robust_equal x y = 65 | let res = Array.for_alli x ~f:(fun i _ -> robust_equal x.(i) y.(i)) in 66 | if not res then ( 67 | fprintf stderr "expected: %s\ngot: %s\n" (show_float_array x) (show_float_array y) 68 | ); 69 | res 70 | 71 | let random_profile rng n = 72 | let v = Array.init n ~f:(fun _ -> Gsl.Rng.uniform rng) in 73 | let s = Array.fold v ~init:0. ~f:( +. ) in 74 | Linear_algebra.Vector.init n ~f:(fun i -> v.(i) /. s) 75 | 76 | let array_sum xs = Array.fold xs ~f:( +. ) ~init:0. 77 | 78 | let array_order xs ~compare = 79 | let ys = Array.mapi xs ~f:(fun i x -> x, i) in 80 | Array.sort ys ~compare:(Tuple2.compare ~cmp1:compare ~cmp2:Int.compare) ; 81 | Array.map ~f:snd ys 82 | 83 | let rng_of_int seed = 84 | let res = Gsl.Rng.(make (default ())) in 85 | Gsl.Rng.set res (Nativeint.of_int seed) ; 86 | res 87 | -------------------------------------------------------------------------------- /lib/utils.mli: -------------------------------------------------------------------------------- 1 | val fancy_sprintf : ('a, unit, string, string, string, string) format6 -> 'a 2 | val fancy_length : string -> int 3 | val all_printers : 4 | ?options:(string -> string) list -> 5 | ('a -> string) -> 6 | (Format.formatter -> 'a -> unit) * 7 | (Format.formatter -> 'a -> unit) * 8 | ('a -> unit) * ('a -> unit) 9 | val colorize : string -> string -> string -> string 10 | 11 | type float_array = float array 12 | [@@deriving show] 13 | 14 | type float_array_array = float array array 15 | [@@deriving show] 16 | 17 | val robust_equal : float -> float -> bool 18 | val float_array_robust_equal : float array -> float array -> bool 19 | 20 | val random_profile : Gsl.Rng.t -> int -> Linear_algebra.vec 21 | 22 | val array_sum : float array -> float 23 | 24 | val array_order : 25 | 'a array -> 26 | compare:('a -> 'a -> int) -> 27 | int array 28 | 29 | val rng_of_int : int -> Gsl.Rng.t 30 | -------------------------------------------------------------------------------- /lib/wag.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | 3 | type t = { 4 | rate_matrix : Amino_acid.matrix ; 5 | freqs : Amino_acid.vector ; 6 | } 7 | 8 | let parse_aa_order l = 9 | let chars = String.filter l ~f:(function 'A'..'Y' -> true | _ -> false) in 10 | Amino_acid.Table.init (fun aa -> 11 | let aa = Amino_acid.to_char aa in 12 | match String.lfindi chars ~f:(fun _ c -> Char.equal c aa) with 13 | | Some i -> i 14 | | None -> failwith l 15 | ) 16 | 17 | let parse_floats l = 18 | String.split ~on:' ' l 19 | |> List.filter ~f:(String.( <> ) "") 20 | |> List.map ~f:Float.of_string 21 | |> Array.of_list 22 | 23 | let parse_freqs aa_order l = 24 | let values = parse_floats l in 25 | Amino_acid.Vector.init (fun aa -> values.(Amino_acid.Table.get aa_order aa)) 26 | 27 | let parse_rate_matrix aa_order lines = 28 | let values = Array.map lines ~f:parse_floats in 29 | Rate_matrix.Amino_acid.make (fun aa1 aa2 -> 30 | let i = Amino_acid.Table.get aa_order aa1 in 31 | let j = Amino_acid.Table.get aa_order aa2 in 32 | values.(max i j - 1).(min i j) 33 | ) 34 | 35 | let from_lines lines = 36 | let lines = 37 | List.filter lines ~f:(Fn.non String.is_empty) 38 | |> Array.of_list 39 | in 40 | if Array.length lines < 21 then failwithf "Syntax error: only %d line(s)" (Array.length lines) () ; 41 | let rate_matrix_lines = Array.sub lines ~pos:0 ~len:19 in 42 | let freqs_line = lines.(19) in 43 | let aa_line = lines.(20) in 44 | let aa_order = parse_aa_order aa_line in 45 | let freqs = parse_freqs aa_order freqs_line in 46 | let rate_matrix = parse_rate_matrix aa_order rate_matrix_lines in 47 | { rate_matrix ; freqs } 48 | 49 | let from_string_exn s = 50 | String.split_lines s |> from_lines 51 | let from_file_exn fn = 52 | In_channel.read_lines fn |> from_lines 53 | 54 | let parse fn = from_file_exn fn 55 | 56 | let%test "WAG parsing works" = 57 | try ignore (from_file_exn "../tests/data/wag.dat" : t) ; true 58 | with exn -> ( 59 | print_endline (Exn.to_string exn) ; 60 | Printexc.print_backtrace stderr ; 61 | false 62 | ) 63 | -------------------------------------------------------------------------------- /lib/wag.mli: -------------------------------------------------------------------------------- 1 | (** WAG matrix parser 2 | 3 | Parser for the WAG matrix as available {{:https://www.ebi.ac.uk/goldman-srv/WAG/}here}. 4 | *) 5 | 6 | type t = { 7 | rate_matrix : Amino_acid.matrix ; 8 | freqs : Amino_acid.vector ; 9 | } 10 | 11 | val parse : string -> t 12 | [@@ocaml.alert deprecated "Use from_file_exn"] 13 | 14 | val from_file_exn : string -> t 15 | val from_string_exn : string -> t 16 | -------------------------------------------------------------------------------- /lib/zipper.mli: -------------------------------------------------------------------------------- 1 | (** Zipper type for fast operations on large phylogenetic trees. *) 2 | 3 | (** {5 Types} *) 4 | 5 | type t 6 | 7 | type branch = float * Phylogenetic_tree.t 8 | 9 | type direction = 10 | | Dir0 11 | | Dir1 12 | | Dir2 13 | 14 | type location_type = 15 | | LocLeaf 16 | | LocBranch 17 | | LocNode 18 | 19 | type oriented_zipper 20 | 21 | 22 | (** {5 Creation / conversion} *) 23 | 24 | val of_tree: Phylogenetic_tree.t -> t 25 | 26 | val of_tree_dir: Phylogenetic_tree.t -> oriented_zipper 27 | 28 | val to_tree: t -> Phylogenetic_tree.t 29 | 30 | val orient: t -> direction -> oriented_zipper 31 | 32 | val unorient: oriented_zipper -> t 33 | 34 | 35 | (** {5 Constructors / object manipulation} *) 36 | 37 | val string_of_dir: direction -> string 38 | 39 | val dir_of_string: string -> direction 40 | 41 | 42 | (** {5 Observation} *) 43 | 44 | val equal: t -> t -> bool 45 | 46 | val get_branch: t -> direction -> branch 47 | 48 | val get_index: t -> Sigs.index 49 | 50 | val get_length: t -> direction -> float 51 | 52 | val get_tree: t -> direction -> Phylogenetic_tree.t 53 | 54 | val length_left: oriented_zipper -> float 55 | 56 | val length_right: oriented_zipper -> float 57 | 58 | val location: t -> location_type 59 | 60 | 61 | (** {5 Movement} *) 62 | 63 | val goto: t -> int -> t 64 | 65 | val init_routing: t -> t 66 | 67 | val left: oriented_zipper -> direction 68 | 69 | val move: t -> direction -> t 70 | 71 | val move_left: oriented_zipper -> oriented_zipper 72 | 73 | val move_right: oriented_zipper -> oriented_zipper 74 | 75 | val random_node: t -> t 76 | 77 | val right: oriented_zipper -> direction 78 | 79 | val slide: t -> direction -> float -> t 80 | 81 | 82 | (** {5 Pretty printing} *) 83 | 84 | val pp: Format.formatter -> t -> unit 85 | 86 | val pp_fancy: Format.formatter -> t -> unit 87 | 88 | val print: t -> unit 89 | 90 | val print_fancy: t -> unit 91 | -------------------------------------------------------------------------------- /phylogenetics.opam: -------------------------------------------------------------------------------- 1 | # This file is generated by dune, edit dune-project instead 2 | opam-version: "2.0" 3 | synopsis: "Algorithms and datastructures for phylogenetics" 4 | maintainer: ["philippe.veber@gmail.com"] 5 | authors: [ 6 | "Louis Duchemin" "Vincent Lanore" "Corentin Moumard" "Philippe Veber" 7 | ] 8 | license: "CeCILL-B" 9 | tags: ["bioinformatics" "evolution" "phylogeny"] 10 | homepage: "https://github.com/biocaml/phylogenetics/" 11 | bug-reports: "https://github.com/biocaml/phylogenetics/issues" 12 | depends: [ 13 | "alcotest" {with-test} 14 | "angstrom-unix" 15 | "binning" 16 | "biotk" {>= "0.2.0"} 17 | "core" {>= "v0.16.0"} 18 | "dune" {>= "3.6"} 19 | "gsl" 20 | "lacaml" {>= "10.0.2"} 21 | "menhir" 22 | "ppx_deriving" 23 | "printbox" {>= "0.6.1"} 24 | "printbox-text" 25 | "yojson" {>= "1.6.0"} 26 | "odoc" {with-doc} 27 | ] 28 | build: [ 29 | ["dune" "subst"] {dev} 30 | [ 31 | "dune" 32 | "build" 33 | "-p" 34 | name 35 | "-j" 36 | jobs 37 | "@install" 38 | "@runtest" {with-test} 39 | "@doc" {with-doc} 40 | ] 41 | ] 42 | dev-repo: "git+https://github.com/biocaml/phylogenetics.git" 43 | -------------------------------------------------------------------------------- /tests/data/tiny1.fasta: -------------------------------------------------------------------------------- 1 | #Best fasta file in the universe 2 | >T0 3 | ATT 4 | >T1 5 | TGC 6 | >T2 7 | GTC 8 | -------------------------------------------------------------------------------- /tests/data/tiny2.fasta: -------------------------------------------------------------------------------- 1 | >T0 2 | ATT 3 | >T1 4 | TGC 5 | >T2 6 | GTC 7 | -------------------------------------------------------------------------------- /tests/data/wag.dat: -------------------------------------------------------------------------------- 1 | 0.551571 2 | 0.509848 0.635346 3 | 0.738998 0.147304 5.429420 4 | 1.027040 0.528191 0.265256 0.0302949 5 | 0.908598 3.035500 1.543640 0.616783 0.0988179 6 | 1.582850 0.439157 0.947198 6.174160 0.021352 5.469470 7 | 1.416720 0.584665 1.125560 0.865584 0.306674 0.330052 0.567717 8 | 0.316954 2.137150 3.956290 0.930676 0.248972 4.294110 0.570025 0.249410 9 | 0.193335 0.186979 0.554236 0.039437 0.170135 0.113917 0.127395 0.0304501 0.138190 10 | 0.397915 0.497671 0.131528 0.0848047 0.384287 0.869489 0.154263 0.0613037 0.499462 3.170970 11 | 0.906265 5.351420 3.012010 0.479855 0.0740339 3.894900 2.584430 0.373558 0.890432 0.323832 0.257555 12 | 0.893496 0.683162 0.198221 0.103754 0.390482 1.545260 0.315124 0.174100 0.404141 4.257460 4.854020 0.934276 13 | 0.210494 0.102711 0.0961621 0.0467304 0.398020 0.0999208 0.0811339 0.049931 0.679371 1.059470 2.115170 0.088836 1.190630 14 | 1.438550 0.679489 0.195081 0.423984 0.109404 0.933372 0.682355 0.243570 0.696198 0.0999288 0.415844 0.556896 0.171329 0.161444 15 | 3.370790 1.224190 3.974230 1.071760 1.407660 1.028870 0.704939 1.341820 0.740169 0.319440 0.344739 0.967130 0.493905 0.545931 1.613280 16 | 2.121110 0.554413 2.030060 0.374866 0.512984 0.857928 0.822765 0.225833 0.473307 1.458160 0.326622 1.386980 1.516120 0.171903 0.795384 4.378020 17 | 0.113133 1.163920 0.0719167 0.129767 0.717070 0.215737 0.156557 0.336983 0.262569 0.212483 0.665309 0.137505 0.515706 1.529640 0.139405 0.523742 0.110864 18 | 0.240735 0.381533 1.086000 0.325711 0.543833 0.227710 0.196303 0.103604 3.873440 0.420170 0.398618 0.133264 0.428437 6.454280 0.216046 0.786993 0.291148 2.485390 19 | 2.006010 0.251849 0.196246 0.152335 1.002140 0.301281 0.588731 0.187247 0.118358 7.821300 1.800340 0.305434 2.058450 0.649892 0.314887 0.232739 1.388230 0.365369 0.314730 20 | 21 | 0.0866279 0.043972 0.0390894 0.0570451 0.0193078 0.0367281 0.0580589 0.0832518 0.0244313 0.048466 0.086209 0.0620286 0.0195027 0.0384319 0.0457631 0.0695179 0.0610127 0.0143859 0.0352742 0.0708956 22 | 23 | 24 | A R N D C Q E G H I L K M F P S T W Y V 25 | Ala Arg Asn Asp Cys Gln Glu Gly His Ile Leu Lys Met Phe Pro Ser Thr Trp Tyr Val 26 | 27 | 28 | Symmetrical part of the WAG rate matrix and aa frequencies, 29 | estimated from 3905 globular protein amino acid sequences forming 182 30 | protein families. 31 | The first part above indicates the symmetric 'exchangeability' 32 | parameters, where s_ij = s_ji. The s_ij above are not scaled, but the 33 | PAML package will perform this scaling. 34 | The second part gives the amino acid frequencies (pi_i) 35 | estimated from the 3905 sequences. The net replacement rate from i to 36 | j is Q_ij = s_ij*pi_j. 37 | Prepared by Simon Whelan and Nick Goldman, December 2000. 38 | 39 | Citation: 40 | Whelan, S. and N. Goldman. 2001. A general empirical model of 41 | protein evolution derived from multiple protein families using 42 | a maximum likelihood approach. Molecular Biology and 43 | Evolution 18:691-699. 44 | -------------------------------------------------------------------------------- /tests/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name phylogenetics_test) 3 | (libraries alcotest phylogenetics) 4 | (preprocess (pps ppx_deriving.show ppx_jane))) 5 | -------------------------------------------------------------------------------- /tests/expect/birth_death_simulator.expected: -------------------------------------------------------------------------------- 1 | (n0:4.926876639043,((n1:0.092501292168,n2:0.092501292168):2.151229766277,(n4:0.179025801152,n3:0.179025801152):2.064705257293):2.683145580598); 2 | 3 | Branch length mean: 0.281977 vs 0.2823492 (ref) 4 | Branch length variance: 0.052816 vs 0.05321786 (ref) 5 | -------------------------------------------------------------------------------- /tests/expect/birth_death_simulator.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | 4 | let rec node_to_nhx ?(parent_branch) = function 5 | | Tree.Node n -> Newick_ast.{ 6 | name = None ; 7 | tags = [] ; 8 | parent_branch ; 9 | children = List1.map n.branches ~f:branch_to_nhx |> List1.to_list; 10 | } 11 | | Tree.Leaf l -> { 12 | name = Some (sprintf "n%d" l) ; 13 | tags = [] ; 14 | parent_branch ; 15 | children = [] 16 | } 17 | and branch_to_nhx (Branch b) = 18 | node_to_nhx ~parent_branch:b.data b.tip 19 | 20 | let () = 21 | Birth_death.age_ntaxa_simulation (Birth_death.make ~birth_rate:2. ~death_rate:1.) Gsl.Rng.(make (default ())) ~age:5. ~ntaxa:5 22 | |> node_to_nhx 23 | |> Newick.to_string 24 | |> print_endline 25 | 26 | let branch_lengths t = 27 | Tree.prefix_traversal t 28 | ~init:[] 29 | ~branch:(fun acc bl -> bl :: acc) 30 | ~node:(fun acc _ -> acc) 31 | ~leaf:(fun acc _ -> acc) 32 | |> Array.of_list 33 | 34 | (* 35 | Test against TESS generated by: 36 | 37 | library(TESS) 38 | sims = tess.sim.taxa.age(n=10000, 0.4, 0.3, nTaxa=10, samplingProbability=0.7, age=1, MRCA=FALSE) 39 | mean(sapply(sims, function(sim) mean(sim$edge.length))) 40 | mean(sapply(sims, function(sim) var(sim$edge.length))) 41 | *) 42 | let () = 43 | let bdp = Birth_death.make ~birth_rate:0.4 ~death_rate:0.3 in 44 | let rng = Gsl.Rng.(make (default ())) in 45 | let sims = Array.init 10_000 ~f:(fun _ -> 46 | let t = Birth_death.age_ntaxa_simulation bdp rng ~age:1. ~ntaxa:10 ~sampling_probability:0.7 in 47 | let branch_lengths = branch_lengths t in 48 | Gsl.Stats.(mean branch_lengths, variance branch_lengths) 49 | ) 50 | in 51 | let means, vars = Core.Array.unzip sims in 52 | let mean = Gsl.Stats.mean means in 53 | let var = Gsl.Stats.mean vars in 54 | printf "\nBranch length mean: %f vs 0.2823492 (ref)\n" mean ; 55 | printf "Branch length variance: %f vs 0.05321786 (ref)\n" var 56 | -------------------------------------------------------------------------------- /tests/expect/dune: -------------------------------------------------------------------------------- 1 | (tests 2 | (names birth_death_simulator phylo_ctmc_conditional_simulation phylo_ctmc_conditional_simulation_missing_values phylo_ctmc_substitution_mappings simulator) 3 | (libraries phylogenetics) 4 | (deps ../data/wag.dat)) 5 | -------------------------------------------------------------------------------- /tests/expect/phylo_ctmc_conditional_simulation.expected: -------------------------------------------------------------------------------- 1 | 2 | Tests if we obtain the same probability distribution for internal nodes of a tree 3 | given observations on the leaves, whether we compute them using the pruning algorithm 4 | or a rejection sampling algorithm. 5 | 6 | 7 | ┌─────┬─────┬─────┬─────┐ 8 | │0.039│0.037│0.909│0.015│ 9 | ├─────┼─────┼─────┼─────┤ 10 | │0.054│0.035│0.900│0.011│ 11 | ├─────┴─────┴─────┴─────┘ 12 | ├─┬─────┬─────┬─────┬─────┐ 13 | │ │0.075│0.050│0.855│0.020│ 14 | │ ├─────┼─────┼─────┼─────┤ 15 | │ │0.072│0.052│0.856│0.020│ 16 | │ ├─────┴─────┴─────┴─────┘ 17 | │ ├─┬─────┬─────┬─────┬─────┐ 18 | │ │ │0.000│0.000│1.000│0.000│ 19 | │ │ ├─────┼─────┼─────┼─────┤ 20 | │ │ │0.000│0.000│1.000│0.000│ 21 | │ │ └─────┴─────┴─────┴─────┘ 22 | │ └─┬─────┬─────┬─────┬─────┐ 23 | │ │0.000│0.000│1.000│0.000│ 24 | │ ├─────┼─────┼─────┼─────┤ 25 | │ │0.000│0.000│1.000│0.000│ 26 | │ └─────┴─────┴─────┴─────┘ 27 | └─┬─────┬─────┬─────┬─────┐ 28 | │0.026│0.021│0.945│0.008│ 29 | ├─────┼─────┼─────┼─────┤ 30 | │0.026│0.022│0.940│0.012│ 31 | ├─────┴─────┴─────┴─────┘ 32 | ├─┬─────┬─────┬─────┬─────┐ 33 | │ │0.000│0.000│1.000│0.000│ 34 | │ ├─────┼─────┼─────┼─────┤ 35 | │ │0.000│0.000│1.000│0.000│ 36 | │ └─────┴─────┴─────┴─────┘ 37 | └─┬─────┬─────┬─────┬─────┐ 38 | │0.000│0.000│1.000│0.000│ 39 | ├─────┼─────┼─────┼─────┤ 40 | │0.000│0.000│1.000│0.000│ 41 | └─────┴─────┴─────┴─────┘ -------------------------------------------------------------------------------- /tests/expect/phylo_ctmc_conditional_simulation.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | 4 | let rng = Gsl.Rng.(make (default ())) 5 | let _B_ = 1_000 6 | 7 | let small_tree = 8 | let open Tree in 9 | binary_node () 10 | (branch 0.4 ( 11 | binary_node () 12 | (branch 0.8 (leaf "A1")) 13 | (branch 1.2 (leaf "A2")))) 14 | (branch 0.1 ( 15 | binary_node () 16 | (branch 0.6 (leaf "B")) 17 | (branch 0.3 (leaf "C")))) 18 | 19 | module Branch_info = struct 20 | type t = float 21 | let length x = x 22 | end 23 | 24 | module NucSim = Phylogenetics.Simulator.Make(Nucleotide)(Branch_info) 25 | module SeqSim = Sequence_simulator.Make(Nucleotide) 26 | 27 | let reference_simulation, profile, rate_matrix = 28 | let profile = SeqSim.random_profile ~alpha:1. rng in 29 | let root = SeqSim.draw_from_profile profile rng in 30 | let stationary_distribution = SeqSim.vec_of_profile profile in 31 | let exchangeabilities = 32 | Rate_matrix.Nucleotide.make_symetric (fun a b -> 33 | float ((a :> int) + 7 + (b :> int) + 7) /. 20. 34 | ) 35 | in 36 | let rate_matrix = Rate_matrix.Nucleotide.gtr ~stationary_distribution ~exchangeabilities in 37 | let sim = 38 | NucSim.site_gillespie_direct rng small_tree ~root ~rate_matrix:(fun _ -> 39 | rate_matrix 40 | ) 41 | in 42 | sim, profile, rate_matrix 43 | 44 | let conditional_simulations = 45 | let map_tree = 46 | Tree.map 47 | ~leaf:Nucleotide.of_int_exn 48 | ~node:Nucleotide.of_int_exn 49 | ~branch:fst 50 | in 51 | let cl = 52 | Phylo_ctmc.conditional_likelihoods 53 | reference_simulation 54 | ~nstates:Nucleotide.card 55 | ~leaf_state:(fun (_, nuc) -> Nucleotide.to_int nuc) 56 | ~transition_probabilities:(fun bl -> 57 | let mat = Nucleotide.Matrix.(expm (scal_mul bl rate_matrix)) in 58 | (mat :> Linear_algebra.mat) 59 | ) 60 | in 61 | let stationary_distribution = SeqSim.vec_of_profile profile in 62 | let root_frequencies = (stationary_distribution :> Linear_algebra.vec) in 63 | Array.init _B_ ~f:(fun _ -> 64 | Phylo_ctmc.conditional_simulation rng cl ~root_frequencies 65 | |> map_tree 66 | ) 67 | 68 | let rejection_sampling_simulations = 69 | let map_tree = Tree.map ~leaf:snd ~node:snd ~branch:Fun.id in 70 | let reference_leaves = Tree.leaves reference_simulation in 71 | let rec loop () = 72 | let root = SeqSim.draw_from_profile profile rng in 73 | let sim = NucSim.site_gillespie_direct rng small_tree ~root ~rate_matrix:(fun _ -> 74 | rate_matrix 75 | ) 76 | in 77 | let leaves = Tree.leaves sim in 78 | let equal = List.equal (Tuple2.equal ~eq1:String.equal ~eq2:Nucleotide.equal) in 79 | if equal leaves reference_leaves then map_tree sim 80 | else loop () 81 | in 82 | Array.init _B_ ~f:(fun _ -> loop ()) 83 | 84 | let nuc_counts xs = 85 | let create_table _ = Nucleotide.Table.init (Fun.const 0) in 86 | let incr table nuc = Nucleotide.Table.(set table nuc (get table nuc + 1)) in 87 | let rec iter2 acc t = 88 | match acc, t with 89 | | Tree.Leaf table, Tree.Leaf nuc -> incr table nuc 90 | | Node n_acc, Node n_t -> 91 | incr n_acc.data n_t.data ; 92 | List.iter2_exn 93 | (List1.to_list n_acc.branches) 94 | (List1.to_list n_t.branches) 95 | ~f:(fun (Branch b_acc) (Branch b_t) -> iter2 b_acc.tip b_t.tip) 96 | | _ -> assert false 97 | in 98 | let res = 99 | Tree.map reference_simulation ~node:create_table ~leaf:create_table ~branch:Fun.id 100 | in 101 | Array.iter xs ~f:(iter2 res) ; 102 | res 103 | 104 | let all_counts = 105 | let pair x y = x, y in 106 | let first x _ = x in 107 | Tree.map2_exn 108 | (nuc_counts rejection_sampling_simulations) 109 | (nuc_counts conditional_simulations) 110 | ~leaf:pair ~node:pair ~branch:first 111 | 112 | module B = PrintBox 113 | 114 | let display_tree = 115 | let render_counts k = 116 | (k : int Nucleotide.table :> int array) 117 | |> Array.map ~f:(fun k -> B.sprintf "%.3f" (float k /. float _B_)) 118 | in 119 | let render_node_info (k1, k2) = 120 | [| render_counts k1 ; render_counts k2 |] 121 | |> B.grid 122 | |> B.frame 123 | in 124 | let rec render = function 125 | | Tree.Leaf li -> render_node_info li 126 | | Node n -> 127 | B.tree 128 | (render_node_info n.data) 129 | ( 130 | List1.to_list n.branches 131 | |> List.map ~f:(fun (Tree.Branch b) -> render b.tip) 132 | ) 133 | in 134 | render all_counts 135 | 136 | let () = 137 | print_endline {| 138 | Tests if we obtain the same probability distribution for internal nodes of a tree 139 | given observations on the leaves, whether we compute them using the pruning algorithm 140 | or a rejection sampling algorithm. 141 | 142 | |} ; 143 | PrintBox_text.output stdout display_tree 144 | -------------------------------------------------------------------------------- /tests/expect/phylo_ctmc_conditional_simulation_missing_values.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | 4 | let rng = Gsl.Rng.(make (default ())) 5 | let _B_ = 10_000 6 | 7 | let small_tree = 8 | let open Tree in 9 | binary_node () 10 | (branch 0.4 (binary_node () 11 | (branch 0.8 (leaf "A1")) 12 | (branch 1.2 (leaf "A2")))) 13 | (branch 0.1 (binary_node () 14 | (branch 0.6 (binary_node () 15 | (branch 1.2 (leaf "B1")) 16 | (branch 0.3 (leaf "B2")))) 17 | (branch 0.3 (leaf "C")))) 18 | 19 | module Branch_info = struct 20 | type t = float 21 | let length x = x 22 | end 23 | 24 | module NucSim = Phylogenetics.Simulator.Make(Nucleotide)(Branch_info) 25 | module SeqSim = Sequence_simulator.Make(Nucleotide) 26 | 27 | let reference_simulation, profile, rate_matrix = 28 | let profile = SeqSim.random_profile ~alpha:1. rng in 29 | let root = SeqSim.draw_from_profile profile rng in 30 | let stationary_distribution = SeqSim.vec_of_profile profile in 31 | let exchangeabilities = 32 | Rate_matrix.Nucleotide.make_symetric (fun a b -> 33 | float ((a :> int) + 7 + (b :> int) + 7) /. 20. 34 | ) 35 | in 36 | let rate_matrix = Rate_matrix.Nucleotide.gtr ~stationary_distribution ~exchangeabilities in 37 | let sim = 38 | NucSim.site_gillespie_direct rng small_tree ~root ~rate_matrix:(fun _ -> 39 | rate_matrix 40 | ) 41 | in 42 | let censored_sim = 43 | Tree.map sim ~node:Fun.id ~branch:Fun.id ~leaf:(fun (label, nuc) -> 44 | label, 45 | if String.equal label "B2" then None else Some nuc) 46 | in 47 | censored_sim, profile, rate_matrix 48 | 49 | let conditional_simulations = 50 | let map_tree = 51 | Tree.map 52 | ~leaf:Nucleotide.of_int_exn 53 | ~node:Nucleotide.of_int_exn 54 | ~branch:fst 55 | in 56 | let cl = 57 | Phylo_ctmc.Missing_values.conditional_likelihoods 58 | reference_simulation 59 | ~nstates:Nucleotide.card 60 | ~leaf_state:(fun (_, nuc) -> Option.map ~f:Nucleotide.to_int nuc) 61 | ~transition_probabilities:(fun bl -> 62 | let mat = Nucleotide.Matrix.(expm (scal_mul bl rate_matrix)) in 63 | (mat :> Linear_algebra.mat) 64 | ) 65 | in 66 | let stationary_distribution = SeqSim.vec_of_profile profile in 67 | let root_frequencies = (stationary_distribution :> Linear_algebra.vec) in 68 | Array.init _B_ ~f:(fun _ -> 69 | Phylo_ctmc.Missing_values.conditional_simulation rng cl ~root_frequencies 70 | |> map_tree 71 | ) 72 | 73 | let rejection_sampling_simulations = 74 | let map_tree = Tree.map ~leaf:snd ~node:snd ~branch:Fun.id in 75 | let reference_leaves = Tree.leaves reference_simulation in 76 | let rec loop () = 77 | let root = SeqSim.draw_from_profile profile rng in 78 | let sim = NucSim.site_gillespie_direct rng small_tree ~root ~rate_matrix:(fun _ -> 79 | rate_matrix 80 | ) 81 | in 82 | let leaves = Tree.leaves sim in 83 | let compatible (l1, nuc1) (l2, maybe_nuc2) = 84 | String.equal l1 l2 85 | && Option.value_map ~default:true maybe_nuc2 ~f:(Nucleotide.equal nuc1) 86 | in 87 | if List.for_all2_exn ~f:compatible leaves reference_leaves then map_tree sim 88 | else loop () 89 | in 90 | Array.init _B_ ~f:(fun _ -> loop ()) 91 | 92 | let nuc_counts xs = 93 | let create_table _ = Nucleotide.Table.init (Fun.const 0) in 94 | let incr table nuc = Nucleotide.Table.(set table nuc (get table nuc + 1)) in 95 | let rec iter2 acc t = 96 | match acc, t with 97 | | Tree.Leaf table, Tree.Leaf nuc -> incr table nuc 98 | | Node n_acc, Node n_t -> 99 | incr n_acc.data n_t.data ; 100 | List.iter2_exn 101 | (List1.to_list n_acc.branches) 102 | (List1.to_list n_t.branches) 103 | ~f:(fun (Branch b_acc) (Branch b_t) -> iter2 b_acc.tip b_t.tip) 104 | | _ -> assert false 105 | in 106 | let res = 107 | Tree.map reference_simulation ~node:create_table ~leaf:create_table ~branch:Fun.id 108 | in 109 | Array.iter xs ~f:(iter2 res) ; 110 | res 111 | 112 | let all_counts = 113 | let pair x y = x, y in 114 | let first x _ = x in 115 | Tree.map2_exn 116 | (nuc_counts rejection_sampling_simulations) 117 | (nuc_counts conditional_simulations) 118 | ~leaf:pair ~node:pair ~branch:first 119 | 120 | module B = PrintBox 121 | 122 | let display_tree = 123 | let render_counts k = 124 | (k : int Nucleotide.table :> int array) 125 | |> Array.map ~f:(fun k -> B.sprintf "%.3f" (float k /. float _B_)) 126 | in 127 | let render_node_info (k1, k2) = 128 | [| render_counts k1 ; render_counts k2 |] 129 | |> B.grid 130 | |> B.frame 131 | in 132 | let rec render = function 133 | | Tree.Leaf li -> render_node_info li 134 | | Node n -> 135 | B.tree 136 | (render_node_info n.data) 137 | ( 138 | List1.to_list n.branches 139 | |> List.map ~f:(fun (Tree.Branch b) -> render b.tip) 140 | ) 141 | in 142 | render all_counts 143 | 144 | let () = 145 | print_endline {| 146 | Tests if we obtain the same probability distribution for internal nodes of a tree 147 | given observations on the leaves, whether we compute them using the pruning algorithm 148 | or a rejection sampling algorithm. 149 | 150 | |} ; 151 | PrintBox_text.output stdout display_tree 152 | -------------------------------------------------------------------------------- /tests/expect/phylo_ctmc_substitution_mappings.expected: -------------------------------------------------------------------------------- 1 | Test that rejection sampler and uniformized path sampler coincide 2 | 01 0.488 | 02 0.150 | 03 0.221 | 04 0.081 | 05 0.039 | 06 0.015 | 07 0.004 | 08 0.001 | 09 0.001 | 10 0.000 3 | 01 0.481 | 02 0.149 | 03 0.227 | 04 0.087 | 05 0.038 | 06 0.013 | 07 0.005 | 08 0.001 | 09 0.000 4 | Check that the marginal likelihood is greater than the average complete likelihood 5 | -117.326089 >= -101.389767 6 | -------------------------------------------------------------------------------- /tests/expect/phylo_ctmc_substitution_mappings.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | open Phylogenetics.Linear_algebra 4 | 5 | let rng = Gsl.Rng.(make (default ())) 6 | 7 | let sample_tree () = 8 | let bdp = Birth_death.make ~birth_rate:2. ~death_rate:1. in 9 | Birth_death.age_ntaxa_simulation bdp rng ~age:1. ~ntaxa:100 10 | 11 | module Branch_info = struct 12 | type t = float 13 | let length x = x 14 | end 15 | 16 | module AASim = Phylogenetics.Simulator.Make(Amino_acid)(Branch_info) 17 | 18 | let valine = Option.value_exn (Amino_acid.of_char 'V') 19 | let alanine = Option.value_exn (Amino_acid.of_char 'A') 20 | 21 | let wag = Wag.from_file_exn "../data/wag.dat" 22 | 23 | type param = { 24 | stationary_distribution : Amino_acid.vector ; 25 | exchangeability_matrix : Amino_acid.matrix ; 26 | scale : float ; 27 | } 28 | 29 | let substitution_rate p i j = 30 | p.scale *. 31 | p.exchangeability_matrix.Amino_acid.%{i, j} *. 32 | p.stationary_distribution.Amino_acid.%(j) 33 | 34 | let transition_rates p = 35 | Rate_matrix.Amino_acid.make (substitution_rate p) 36 | 37 | let transition_probabilities_of_rates m = 38 | fun bl -> (Amino_acid.Matrix.(expm (scal_mul bl m)) :> mat) 39 | 40 | let uniformized_process p = 41 | let transition_rates = transition_rates p in 42 | let transition_probabilities = 43 | transition_probabilities_of_rates transition_rates 44 | in 45 | Phylo_ctmc.Uniformized_process.make 46 | ~transition_rates:(transition_rates :> mat) 47 | ~transition_probabilities 48 | 49 | let wag_param = { 50 | stationary_distribution = wag.freqs ; 51 | exchangeability_matrix = wag.rate_matrix ; 52 | scale = 1. ; 53 | } 54 | 55 | (* Test on single branch *) 56 | 57 | let int_histogram xs = 58 | let n = List.length xs in 59 | Binning.counts (Stdlib.List.to_seq xs) 60 | |> Stdlib.List.of_seq 61 | |> List.map ~f:(fun (i, k) -> i, float k /. float n) 62 | |> List.sort ~compare:Poly.compare 63 | 64 | let render_int_histogram xs = 65 | int_histogram xs 66 | |> List.map ~f:(fun (k, f) -> sprintf "%02d %.3f" k f) 67 | |> String.concat ~sep:" | " 68 | |> print_endline 69 | 70 | let nb_events_along_branch rng path_sampler ~start_state ~end_state ~sample_size = 71 | List.init sample_size ~f:(fun _ -> 72 | Phylo_ctmc.Path_sampler.sample_exn 73 | ~rng path_sampler 74 | ~start_state:(Amino_acid.to_int start_state) 75 | ~end_state:(Amino_acid.to_int end_state) 76 | |> Array.length 77 | ) 78 | 79 | let () = 80 | let start_state = alanine in 81 | let end_state = valine in 82 | let sample_size = 10_000 in 83 | let branch_length = 2. in 84 | let process = Staged.unstage (uniformized_process wag_param) ~branch_length in 85 | let uniformized_path_sampler = Phylo_ctmc.Path_sampler.uniformization process in 86 | let rejection_path_sampler = 87 | Phylo_ctmc.Path_sampler.rejection_sampling 88 | ~rates:(Phylo_ctmc.Uniformized_process.transition_rates process) 89 | ~branch_length ~max_tries:10_000 () in 90 | print_endline "Test that rejection sampler and uniformized path sampler coincide" ; 91 | render_int_histogram ( 92 | nb_events_along_branch 93 | rng rejection_path_sampler 94 | ~start_state ~end_state ~sample_size 95 | ) ; 96 | render_int_histogram ( 97 | nb_events_along_branch 98 | rng uniformized_path_sampler 99 | ~start_state ~end_state ~sample_size 100 | ) 101 | 102 | let sample_site tree root = 103 | let rates = transition_rates wag_param in 104 | AASim.site_gillespie_first_reaction rng tree ~root ~rate_matrix:(fun _ -> rates) 105 | 106 | let iter_branches t ~f = 107 | let rec traverse_node = function 108 | | Tree.Leaf _ -> () 109 | | Node n -> 110 | List1.iter n.branches ~f:(traverse_branch n.data) 111 | and traverse_branch parent_data (Tree.Branch b) = 112 | f parent_data b.data (Tree.data b.tip) ; 113 | traverse_node b.tip 114 | in 115 | traverse_node t 116 | 117 | let sufficient_statistics ~nstates tree = 118 | let counts = Array.make_matrix ~dimx:nstates ~dimy:nstates 0 in 119 | let waiting_times = Array.create ~len:nstates 0. in 120 | iter_branches tree ~f:(fun start_state (bl, mapping) _ -> 121 | match mapping with 122 | | [||] -> 123 | waiting_times.(start_state) <- waiting_times.(start_state) +. bl 124 | | _ -> 125 | Array.iteri mapping ~f:(fun k (s_j, t_j) -> 126 | let s_i, t_i = 127 | if k = 0 then start_state, 0. 128 | else mapping.(k - 1) 129 | in 130 | counts.(s_i).(s_j) <- 1 + counts.(s_i).(s_j) ; 131 | waiting_times.(s_i) <- waiting_times.(s_i) +. (t_j -. t_i) 132 | ) ; 133 | let (last_state, last_time) = Array.last mapping in 134 | waiting_times.(last_state) <- waiting_times.(last_state) +. bl -. last_time 135 | ) ; 136 | counts, waiting_times 137 | 138 | let root_state = function 139 | | Tree.Leaf l -> l 140 | | Tree.Node n -> n.data 141 | 142 | let mapping_likelihood ~nstates ~root_frequencies ~transition_rates tree = 143 | let counts, waiting_times = sufficient_statistics ~nstates tree in 144 | let lik = ref 0. in 145 | lik := !lik +. Float.log (Vector.get root_frequencies (root_state tree)) ; 146 | for i = 0 to nstates - 1 do 147 | for j = 0 to nstates - 1 do 148 | let contrib = 149 | if i = j then waiting_times.(i) *. Matrix.get transition_rates i i 150 | else float counts.(i).(j) *. Float.log (Matrix.get transition_rates i j) 151 | in 152 | lik := !lik +. contrib 153 | done ; 154 | done ; 155 | !lik 156 | 157 | let () = 158 | let tree = sample_tree () in 159 | let site = sample_site tree valine in 160 | let nstates = Amino_acid.card in 161 | let leaf_state (_, aa) = Amino_acid.to_int aa in 162 | let root_frequencies = (wag.freqs :> Vector.t) in 163 | let transition_rates = transition_rates wag_param in 164 | let conditional_likelihoods = 165 | let transition_probabilities bl = transition_probabilities_of_rates transition_rates bl in 166 | Phylo_ctmc.conditional_likelihoods site ~nstates ~leaf_state ~transition_probabilities 167 | in 168 | let process = Staged.unstage (uniformized_process wag_param) in 169 | let path_sampler bi = Phylo_ctmc.Path_sampler.uniformization (process ~branch_length:bi) in 170 | let mean_mapping_likelihood = 171 | Array.init 100 ~f:(fun _ -> 172 | Phylo_ctmc.conditional_simulation rng ~root_frequencies conditional_likelihoods 173 | |> Phylo_ctmc.substitution_mapping ~rng ~path_sampler 174 | |> mapping_likelihood ~nstates ~root_frequencies ~transition_rates:(transition_rates :> mat) 175 | ) 176 | |> Gsl.Stats.mean 177 | in 178 | let pruning_likelihood = 179 | let transition_probabilities bl = [`Mat (transition_probabilities_of_rates transition_rates bl)] in 180 | Phylo_ctmc.pruning site ~nstates ~transition_probabilities ~leaf_state ~root_frequencies 181 | in 182 | print_endline "Check that the marginal likelihood is greater than the average complete likelihood" ; 183 | printf "%f >= %f\n" pruning_likelihood mean_mapping_likelihood 184 | -------------------------------------------------------------------------------- /tests/expect/simulator.expected: -------------------------------------------------------------------------------- 1 | Test convergence of Gillespie direct on one branch 2 | 0.192064 0.207 3 | 0.230298 0.215 4 | 0.233056 0.229 5 | 0.00204146 0.001 6 | 6.82864E-07 0 7 | 3.91614E-09 0 8 | 4.50594E-08 0 9 | 1.33304E-09 0 10 | 0.271512 0.282 11 | 0.00111919 0 12 | 1.08756E-10 0 13 | 0.00501554 0.006 14 | 7.22811E-11 0 15 | 9.70161E-11 0 16 | 7.04754E-07 0 17 | 0.000674593 0 18 | 0.0642129 0.06 19 | 2.77005E-10 0 20 | 1.35292E-08 0 21 | 5.28832E-06 0 22 | Test convergence of Gillespie first reaction on one branch 23 | 0.192064 0.188 24 | 0.230298 0.212 25 | 0.233056 0.241 26 | 0.00204146 0.002 27 | 6.82864E-07 0 28 | 3.91614E-09 0 29 | 4.50594E-08 0 30 | 1.33304E-09 0 31 | 0.271512 0.286 32 | 0.00111919 0.001 33 | 1.08756E-10 0 34 | 0.00501554 0.007 35 | 7.22811E-11 0 36 | 9.70161E-11 0 37 | 7.04754E-07 0 38 | 0.000674593 0.002 39 | 0.0642129 0.061 40 | 2.77005E-10 0 41 | 1.35292E-08 0 42 | 5.28832E-06 0 43 | -------------------------------------------------------------------------------- /tests/expect/simulator.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | 4 | module L = Linear_algebra 5 | 6 | module Branch_info = struct 7 | type t = float 8 | let length x = x 9 | end 10 | 11 | module Model = struct 12 | type param = { 13 | stationary_distribution : Amino_acid.vector ; 14 | exchangeability_matrix : Amino_acid.matrix ; 15 | } 16 | 17 | let substitution_rate p i j = 18 | p.exchangeability_matrix.Amino_acid.%{i, j} *. 19 | p.stationary_distribution.Amino_acid.%(j) 20 | 21 | let rate_matrix p = 22 | Rate_matrix.Amino_acid.make (substitution_rate p) 23 | end 24 | 25 | module Sim = Simulator.Make(Amino_acid)(Branch_info) 26 | 27 | let single_branch_tree l = 28 | Tree.node () List1.(cons (Tree.branch l (Tree.leaf ())) []) 29 | 30 | let draw_amino_acid_profile rng alpha = 31 | let theta = Array.create ~len:20 0. in 32 | Gsl.Randist.dirichlet rng ~alpha:(Array.create ~len:20 alpha) ~theta ; 33 | let sampler = 34 | let t = Gsl.Randist.discrete_preproc theta in 35 | fun rng -> 36 | Gsl.Randist.discrete rng t 37 | |> Amino_acid.of_int_exn 38 | in 39 | Amino_acid.Vector.of_array_exn theta, sampler 40 | 41 | let simulation_on_one_branch simulator simulator_name = 42 | let rng = Gsl.Rng.(make (default ())) in 43 | let tree = single_branch_tree 10. in 44 | let root = Amino_acid.of_int_exn 0 in 45 | let profile, _ = draw_amino_acid_profile rng 0.1 in 46 | let exchangeability_matrix = (Wag.from_file_exn "../data/wag.dat").rate_matrix in 47 | let param = { 48 | Model.stationary_distribution = profile ; 49 | exchangeability_matrix ; 50 | } 51 | in 52 | let rates = Model.rate_matrix param in 53 | let empirical_frequencies = 54 | Sequence.init 1000 ~f:(fun _ -> 55 | simulator rng tree ~root ~rate_matrix:(Fn.const rates) 56 | |> Tree.leaves 57 | |> List.map ~f:snd 58 | |> List.hd_exn 59 | ) 60 | |> Amino_acid.counts 61 | |> (fun (k : int Amino_acid.table) -> (k :> int array)) 62 | |> Array.map ~f:Float.of_int 63 | |> Amino_acid.Vector.of_array_exn 64 | |> Amino_acid.Vector.normalize 65 | in 66 | let res = 67 | [| 68 | Amino_acid.Vector.to_array profile ; 69 | Amino_acid.Vector.to_array empirical_frequencies ; 70 | |] 71 | |> L.Matrix.of_arrays_exn 72 | |> L.Matrix.transpose 73 | in 74 | printf "Test convergence of %s on one branch\n" simulator_name ; 75 | L.Matrix.pp Format.std_formatter res ; 76 | Format.print_newline () 77 | 78 | let simulation_branch_length () = 79 | let exchangeability_matrix = (Wag.from_file_exn "../data/wag.dat").rate_matrix in 80 | let rng = Gsl.Rng.(make (default ())) in 81 | let branch_length = 0.5 in 82 | let profile, aa_sampler = draw_amino_acid_profile rng 0.1 in 83 | let param = { 84 | Model.stationary_distribution = profile ; 85 | exchangeability_matrix ; 86 | } 87 | in 88 | let rate_matrix = Rate_matrix.Amino_acid.scaled_rate_matrix profile (Model.rate_matrix param) in 89 | let empirical_freq = 90 | Array.init 100_000 ~f:(fun _ -> 91 | let start_state = aa_sampler rng in 92 | Sim.branch_gillespie_direct 93 | rng ~start_state ~rate_matrix ~branch_length 94 | ~init:0 ~f:(fun acc _ _ -> acc + 1) 95 | |> float 96 | ) 97 | |> Gsl.Stats.mean 98 | in 99 | printf "Test branch length in substitutions\nExpected: %f\nGot: %f\n" branch_length empirical_freq 100 | 101 | let () = 102 | simulation_on_one_branch Sim.site_gillespie_direct "Gillespie direct" ; 103 | simulation_on_one_branch Sim.site_gillespie_first_reaction "Gillespie first reaction" ; 104 | simulation_branch_length () 105 | -------------------------------------------------------------------------------- /tests/test_MCMC.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | 4 | (** {6 Test input parameters} *) 5 | 6 | let my_likelihood v = 7 | Float.exp (MCMC.(M.Felsenstein.felsenstein 2.0 v.tree v.align)) *. 8 | (let newlength = List.nth_exn (Phylogenetic_tree.get_branch_lengths v.tree) 5 in 9 | if Float.(newlength > 0. && newlength < 5.) then 1.0 else 0.0) 10 | 11 | let myalign = MCMC.M.Alignment.of_string_list ["A"; "A"; "A"; "T"] 12 | let mybasetree = Phylogenetic_tree.of_preorder "0.1;0.1;0.1;0.1;0;1;2.5;0.1;2;3" 13 | let my_theta0 = MCMC.{align=my_align; tree=my_basetree} 14 | 15 | 16 | let my_step (v : MCMC.vector) = 17 | let lengths = Phylogenetic_tree.get_branch_lengths v.tree |> List.mapi ~f:( 18 | let range = 5.0 in 19 | fun i x -> if i=5 20 | then x -. (range/.2.) +. (Random.float range) 21 | else x 22 | ) in 23 | let new_tree = Phylogenetic_tree.set_branch_lengths v.tree lengths in 24 | MCMC.{align=v.align; tree=new_tree}, 1. 25 | 26 | 27 | (** {6 Test functions} *) 28 | 29 | let sample amount = 30 | MCMC.run my_theta0 my_step my_likelihood amount 31 | |> List.map ~f:(fun { MCMC.tree ; _ } -> 32 | List.nth_exn (Phylogenetic_tree.get_branch_lengths tree) 5 33 | ) 34 | |> List.filteri ~f:(fun x _ -> x > amount / 5) 35 | 36 | let test_MCMC () = 37 | Test_utils.check_distrib [2.8] (sample 10000) 38 | 39 | 40 | (** {6 Test list} *) 41 | 42 | let tests = [ 43 | "Specific branch length on tiny tree with 10k points.", `Slow, test_MCMC] 44 | -------------------------------------------------------------------------------- /tests/test_alignment.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | open Alcotest 4 | 5 | 6 | (** {6 Test input parameters} *) 7 | 8 | module DNA = Seq.DNA 9 | module Align = Alignment.Make (DNA) 10 | 11 | let mytab = Align.of_assoc_list Nucleotide.[("T0",DNA.of_list [a;t;t]); 12 | ("T1",DNA.of_list [t;g;c]); 13 | ("T2",DNA.of_list [g;t;c])] 14 | 15 | 16 | (** {6 Test functions} *) 17 | 18 | let test_of_string_list () = 19 | Align.of_string_list ["ATT";"TGC";"GTC"] |> 20 | (check @@ testable Align.pp Align.equal) "identical sequence tables" mytab 21 | 22 | let test_of_fasta_with_header () = 23 | Align.of_fasta "../tests/data/tiny1.fasta" |> 24 | (check @@ testable Align.pp Align.equal) "identical sequence tables" mytab 25 | 26 | let test_of_fasta () = 27 | Align.of_fasta "../tests/data/tiny2.fasta" |> 28 | (check @@ testable Align.pp Align.equal) "identical sequence tables" mytab 29 | 30 | 31 | let test_get_base () = 32 | Align.get_base ~seq:"T1" ~pos:2 mytab 33 | |> Nucleotide.to_char 34 | |> check char "get base from sequence" 'C' 35 | 36 | 37 | (** {6 Test list} *) 38 | 39 | let tests = [ 40 | "of_string_list", `Quick, test_of_string_list ; 41 | "of_fasta", `Quick, test_of_fasta ; 42 | "get_base", `Quick, test_get_base 43 | ] 44 | 45 | -------------------------------------------------------------------------------- /tests/test_felsenstein.ml: -------------------------------------------------------------------------------- 1 | (** Tests of the felsenstein implementation. The results are compared to bppml 2 | to check correctness. A variety of models and problem sizes are used to generate 3 | trees and alignments which are submitted to our felsenstein implementation and 4 | bppml.*) 5 | open Phylogenetics 6 | open Core 7 | 8 | 9 | type 'a model = (module Site_evolution_model.Nucleotide_S_with_reduction with type param = 'a) 10 | 11 | type test_case = Test_case : { 12 | model : 'a model ; 13 | param : 'a ; 14 | bpp_spec : Bppsuite.model ; 15 | } -> test_case 16 | 17 | (** {6 Preliminary functions} *) 18 | 19 | (** Generates a random tree, a random sequence (using the provided model), 20 | runs both biocaml felsenstein and bppml, and checks that the results are identical*) 21 | let test_felsenstein ?(treesize=5) ?(seqsize=5) (Test_case c) () = 22 | let module M = (val c.model) in 23 | let module Align = Alignment.Make(Seq.DNA) in 24 | let module F = Felsenstein.Make(Nucleotide)(Align)(M) in 25 | let module SG = Sequence_generation.Make(Nucleotide)(Seq.DNA)(Align)(M) in 26 | let tree = Phylogenetic_tree.make_random treesize in 27 | let align = SG.seqgen_string_list c.param tree seqsize |> Align.of_string_list in 28 | let my_result = F.felsenstein c.param tree align in 29 | let bpp_result = begin 30 | Phylogenetic_tree.to_newick_file tree "tmp.tree" ; (* TODO unique file name *) 31 | Align.to_file align "tmp.seq" ; 32 | try 33 | Test_utils.felsenstein_bpp ~model:c.bpp_spec ~tree:("tmp.tree") "tmp.seq" 34 | with 35 | | Failure s -> Printf.printf "\027[0;31mERROR\027[0;0m(felsenstein_bpp): %s\n" s; 0.0 36 | end in 37 | Test_utils.check_likelihood my_result bpp_result 38 | 39 | 40 | (** {6 Test list} *) 41 | 42 | let models = Site_evolution_model.[ 43 | Test_case { model = (module JC69) ; param = () ; bpp_spec = JC69 } ; 44 | Test_case { model = (module K80) ; param = 2. ; bpp_spec = K80 { kappa = Some 2. } } ; 45 | Test_case { model = (module K80) ; param = 0.5 ; bpp_spec = K80 { kappa = Some 0.5 } } ; 46 | Test_case { model = (module JC69_numerical) ; param = () ; bpp_spec = JC69 } ; 47 | Test_case { model = (module K80_numerical) ; param = 4. ; bpp_spec = K80 { kappa = Some 4. } } ; 48 | ] 49 | 50 | let tree_sizes = [10 ; 250] 51 | let seq_sizes = [1 ; 100 ] 52 | 53 | let tests = 54 | List.cartesian_product tree_sizes seq_sizes 55 | |> List.cartesian_product models 56 | |> List.map ~f:(fun ((Test_case tc as test_case), (treesize, seqsize)) -> 57 | (Printf.sprintf "test against bppml\t%s\ttreesize=%d\tseqsize=%d" (Bppsuite.string_of_model tc.bpp_spec) treesize seqsize, 58 | `Slow, test_felsenstein ~treesize ~seqsize test_case) 59 | ) 60 | -------------------------------------------------------------------------------- /tests/test_newick.ml: -------------------------------------------------------------------------------- 1 | open Phylogenetics 2 | 3 | let tree1 = {|ADH1:0.11[&&NHX:S=human:E=1.1.1.1];|} 4 | let tree2 = {|(((A:0.0707394[&&NHX:Condition=0],(B:0.082502[&&NHX:Condition=1:Transition=1],(C:0.0399704[&&NHX:Condition=0],(D:0.0130784[&&NHX:Condition=0],E:0.0126315[&&NHX:Condition=1:Transition=1]):0.0679028[&&NHX:Condition=0]):0.0355645[&&NHX:Condition=0]):0.0829847[&&NHX:Condition=0]):0.025651[&&NHX:Condition=0],F:0.00179799[&&NHX:Condition=0]):0.0487697[&&NHX:Condition=0],(G:0.0797879[&&NHX:Condition=0],(H:0.0338484[&&NHX:Condition=0],(I:0.0526082[&&NHX:Condition=1],J:0.0366882[&&NHX:Condition=1]):0.0553277[&&NHX:Condition=1:Transition=1]):0.00758827[&&NHX:Condition=0]):0.0913262[&&NHX:Condition=0]);|} 5 | 6 | let just_parse tree () = 7 | ignore (Newick.from_string_exn tree : Newick.t) 8 | 9 | let tests = [ 10 | "just_parse_test1", `Quick, just_parse tree1 ; 11 | "just_parse_test2", `Quick, just_parse tree2 ; 12 | ] 13 | -------------------------------------------------------------------------------- /tests/test_phylo_ctmc.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | 4 | let test_pruning ?(tree_size = 5) ?(seq_size = 10) () = 5 | let module M = Site_evolution_model.JC69 in 6 | let module Align = Alignment.Make(Seq.DNA) in 7 | let module F = Felsenstein.Make(Nucleotide)(Align)(M) in 8 | let module SG = Sequence_generation.Make(Nucleotide)(Seq.DNA)(Align)(M) in 9 | let module CTMC = Phylo_ctmc in 10 | let tree = Phylogenetic_tree.make_random tree_size in 11 | let align = 12 | SG.seqgen_string_list () tree seq_size 13 | |> Align.of_string_list 14 | in 15 | let felsenstein_result = F.felsenstein () tree align in 16 | let ctmc_result = 17 | let tree = Phylogenetic_tree.to_tree tree in 18 | let transition_probabilities l = [`Mat (M.transition_probability_matrix () l :> Linear_algebra.mat)] in 19 | let root_frequencies = (M.stationary_distribution () :> Linear_algebra.vec) in 20 | Array.init (Align.length align) ~f:(fun i -> 21 | let leaf_state (_, index) = Align.get_base align ~seq:index ~pos:i |> Nucleotide.to_int in 22 | CTMC.pruning tree ~nstates:Nucleotide.card ~transition_probabilities ~leaf_state ~root_frequencies 23 | ) 24 | |> Utils.array_sum 25 | in 26 | Test_utils.check_likelihood felsenstein_result ctmc_result 27 | 28 | let vec_of_sv (Phylo_ctmc.SV (v, carry)) = 29 | Linear_algebra.Vector.scal_mul (Float.exp carry) v 30 | 31 | let check_equal_clv v1 v2 = 32 | let open Linear_algebra in 33 | let d2 = Vector.( 34 | let delta = add v1 (scal_mul (-1.) v2) in 35 | sum (mul delta delta) 36 | ) 37 | in 38 | if Float.(sqrt d2 < 1e-6) then Ok () 39 | else Error (`Different_conditional_likelihoods (v1, v2)) 40 | 41 | let rec check_equal_conditional_likelihood cl1 cl2 = 42 | let open Tree in 43 | let open Result.Monad_infix in 44 | match cl1, cl2 with 45 | | Leaf _, Leaf _ -> Ok () 46 | | Node n1, Node n2 -> ( 47 | let v1 = vec_of_sv n1.data and v2 = vec_of_sv n2.data in 48 | check_equal_clv v1 v2 >>= fun () -> 49 | match List1.map2 n1.branches n2.branches ~f:(fun (Branch b1) (Branch b2) -> 50 | check_equal_conditional_likelihood b1.tip b2.tip 51 | ) 52 | with 53 | | Ok results -> Result.all_unit (List1.to_list results) 54 | | Error `Unequal_lengths -> Error `Different_structure 55 | ) 56 | | _ -> Error `Different_structure 57 | 58 | let test_conditional_likelihood_ambiguity () = 59 | let module BI = struct type t = float let length x = x end in 60 | let module Sim = Simulator.Make(Amino_acid)(BI) in 61 | let rng = Gsl.Rng.(make (default ())) in 62 | let bd = Birth_death.make ~birth_rate:1. ~death_rate:0.9 in 63 | let tree = Birth_death.age_ntaxa_simulation bd rng ~age:1. ~ntaxa:20 in 64 | let stationary_distribution = Amino_acid.random_profile rng 0.1 in 65 | let root_state = Amino_acid.of_char_exn 'V' in 66 | let rate_matrix = Rate_matrix.Amino_acid.gtr 67 | ~stationary_distribution 68 | ~exchangeabilities:(Rate_matrix.Amino_acid.make_symetric (fun _ _ -> 1e-3)) 69 | in 70 | let site = Sim.site_gillespie_direct rng ~root:root_state tree ~rate_matrix:(Fun.const rate_matrix) in 71 | let nstates = Amino_acid.card in 72 | let transition_probabilities bl = 73 | (Amino_acid.Matrix.(expm (scal_mul bl rate_matrix)) :> Linear_algebra.mat) 74 | in 75 | let cl = Phylo_ctmc.conditional_likelihoods site ~nstates 76 | ~transition_probabilities 77 | ~leaf_state:(fun (_, aa) -> Amino_acid.to_int aa) in 78 | let cl_amb = Phylo_ctmc.Ambiguous.conditional_likelihoods site ~nstates 79 | ~leaf_state:(fun (_, aa) i -> Amino_acid.to_int aa = i) 80 | ~transition_probabilities 81 | in 82 | match check_equal_conditional_likelihood cl cl_amb with 83 | | Ok () -> () 84 | | Error (`Different_conditional_likelihoods (v1, v2)) -> 85 | let open Linear_algebra in 86 | failwithf 87 | "Different conditional likelihoods:\n%s\n%s\n%s" 88 | ([%show: float array] (Amino_acid.Vector.to_array stationary_distribution)) 89 | ([%show: float array] (Vector.to_array v1)) 90 | ([%show: float array] (Vector.to_array v2)) 91 | () 92 | 93 | | Error `Different_structure -> failwith "Different tree structure" 94 | 95 | let tests = [ 96 | ("Felsenstein vs Phylo_ctmc", `Quick, test_pruning ~tree_size:100 ~seq_size:10) ; 97 | ("Cond. lik. simple vs ambiguous", `Quick, test_conditional_likelihood_ambiguity) ; 98 | ] 99 | -------------------------------------------------------------------------------- /tests/test_rejection_sampling.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | 4 | let rng = Gsl.Rng.(make (default ())) 5 | 6 | (** {6 Test input parameters} *) 7 | 8 | module Align = Alignment.Make(Seq.DNA) 9 | module RS_DNA = Rejection_sampling.Make(Align) 10 | let myalign = Align.of_string_list ["A"; "A"; "A"; "T"] 11 | let mybasetree = Phylogenetic_tree.of_preorder "0.1;0.1;0.1;0.1;0;1;2.5;0.1;2;3" 12 | let mysampler = Stat_tools.sample_branch_lengths ~branchs:(fun i -> i=5) 13 | ~sampler:(fun () -> Gsl.Rng.uniform rng *. 5.) mybasetree 14 | 15 | module K80 = struct 16 | include Site_evolution_model.K80 17 | type base = Nucleotide.t 18 | end 19 | 20 | (** {6 Test functions} *) 21 | module Seqgen = Sequence_generation.Make(Nucleotide)(Seq.DNA)(Align)(K80) 22 | 23 | let sample amount = 24 | let prior_trees = RS_DNA.generate_trees ~sampler:mysampler amount in 25 | let post_trees = RS_DNA.reject Seqgen.seqgen 2.0 myalign prior_trees in 26 | List.map post_trees ~f:(fun t -> 27 | Phylogenetic_tree.get_branch_lengths t 28 | |> fun l -> List.nth_exn l 5 29 | ) 30 | 31 | let test_rejection () = 32 | Test_utils.check_distrib [2.8] (sample 500000) 33 | 34 | 35 | (** {6 Test list} *) 36 | 37 | let tests = [ 38 | "Specific branch length on tiny tree with 500k points.", `Slow, test_rejection] 39 | -------------------------------------------------------------------------------- /tests/test_sequence.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | open Alcotest 4 | 5 | 6 | (** {6 Test input parameters} *) 7 | 8 | module DNA = Seq.DNA 9 | 10 | let myseq = DNA.of_list Nucleotide.[a;g;c;t] 11 | 12 | 13 | (** {6 Test functions} *) 14 | 15 | let test_of_string () = 16 | DNA.of_string_exn "AGCT" |> 17 | (check @@ testable DNA.pp Poly.equal) "identical sequences" myseq 18 | 19 | 20 | (** {6 Test list} *) 21 | 22 | let tests = [ 23 | "of_string", `Quick, test_of_string 24 | ] 25 | -------------------------------------------------------------------------------- /tests/test_site_evolution_model.ml: -------------------------------------------------------------------------------- 1 | open Phylogenetics 2 | open Test_utils 3 | open Site_evolution_model 4 | 5 | 6 | let test_JC69_exponential () = 7 | compare_matrices 8 | (module Nucleotide) 9 | "JC69: numerical vs analytical exponential" 10 | (JC69.transition_probability_matrix () 0.1) 11 | (JC69_numerical.transition_probability_matrix () 0.1) 12 | 13 | let test_K80_exponential () = 14 | compare_matrices 15 | (module Nucleotide) 16 | "K80: numerical vs analytical exponential" 17 | (K80.transition_probability_matrix 2.0 0.1) 18 | (K80_numerical.transition_probability_matrix 2.0 0.1) 19 | 20 | let tests = [ 21 | "JC69 exponential", `Quick, test_JC69_exponential; 22 | "K80 exponential", `Quick, test_K80_exponential; 23 | ] 24 | -------------------------------------------------------------------------------- /tests/test_topotree.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics.Phylogenetic_tree 3 | open Alcotest 4 | 5 | 6 | (** {6 Test input parameters} *) 7 | 8 | let hs = Hashtbl.hash 9 | 10 | let mytree = Node { 11 | left = 0.135, Node { 12 | left = 0.11, Leaf {index = "T0"; meta={routing_no= -1; id=hs "T0"}}; 13 | right = 0.18, Leaf {index = "T1"; meta={routing_no= -1; id=hs "T1"}}; 14 | meta = {routing_no= -1; id=hs (hs "T0" + hs "T1")} 15 | }; 16 | right = 0.23, Leaf {index = "T2"; meta={routing_no= -1; id=hs "T2"}}; 17 | meta = {routing_no= -1; id=hs (hs "T2" + hs (hs "T0" + hs "T1"))} 18 | } 19 | 20 | 21 | (** {6 Test functions} *) 22 | 23 | let test_of_preorder () = 24 | of_preorder "0.135;0.23;0.11;0.18;0;1;2" |> 25 | (check @@ testable Phylogenetics.Phylogenetic_tree.pp Poly.equal) "identical trees" mytree 26 | 27 | 28 | (** {6 Test list} *) 29 | 30 | let tests = [ 31 | "of_preorder", `Quick, test_of_preorder 32 | ] 33 | -------------------------------------------------------------------------------- /tests/test_utils.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Alcotest 3 | open Phylogenetics.Stat_tools 4 | module Bppsuite = Phylogenetics.Bppsuite 5 | 6 | let eps = 0.1 7 | 8 | (* ============= *) 9 | (* COMPARISONS *) 10 | (* ============= *) 11 | 12 | (* Function used to compare floats and tolerate relative imprecision. 13 | Returns true if (1-p)*f1 < f2 < (1+p)*f1 *) 14 | let float_compare p f1 f2 = 15 | let diff = f1-.f2 |> Float.abs in 16 | Float.(diff /. Float.abs f1 <= p) 17 | 18 | let check_likelihood = (check @@ testable 19 | (pp (Alcotest.float eps)) 20 | (float_compare 0.00001) 21 | ) "identical log likelihoods!" 22 | 23 | let check_distrib ref_estim d = 24 | (check @@ list (testable (pp (Alcotest.float eps)) (float_compare 0.05))) 25 | "Distributions with identical characteristics" 26 | ref_estim [sample_list_mean d] 27 | 28 | let compare_matrices (type s) (module A : Phylogenetics.Alphabet.S with type matrix = s) = 29 | check @@ testable A.Matrix.pp (A.Matrix.robust_equal ~tol:0.0001) 30 | 31 | 32 | (* ================ *) 33 | (* BPP INTERFACES *) 34 | (* ================ *) 35 | 36 | (* fails after printing the content of a file with a message*) 37 | let fail_file ?(path="tmp.data") message = 38 | Printf.sprintf "ERROR (bpp_interface): %s:\n%s" 39 | message (In_channel.read_all path) |> prerr_endline; Out_channel.flush stdout; 40 | failwith message 41 | 42 | let felsenstein_bpp ?(alphabet = Bppsuite.DNA) ?(model = Bppsuite.JC69) ?(path=".") ~tree seq = 43 | let call = 44 | Bppsuite.Cmd.bppml 45 | ~alphabet ~model 46 | ~input_tree_file:(Filename.concat path tree) 47 | ~input_sequence_file:(Filename.concat path seq) 48 | () 49 | in 50 | let script = sprintf "%s > tmp.data 2>&1" call in 51 | match 52 | if Stdlib.Sys.command script <> 0 then fail_file "bppml failed" ; 53 | In_channel.read_lines "tmp.data" 54 | |> List.filter ~f:String.(fun l-> equal (prefix l 11) "Initial log") 55 | with (* looking for a very specific line *) 56 | | [l] -> 57 | Scanf.sscanf l "Initial log likelihood.................: %f" (fun x->x) 58 | | _ -> 59 | fail_file "unexpected bppml output" 60 | 61 | let seqgen_bpp ?(alphabet = Bppsuite.DNA) ?(model = Bppsuite.JC69) ?(path=".") ~tree output size = 62 | let call = 63 | Bppsuite.Cmd.bppseqgen 64 | ~alphabet ~model 65 | ~input_tree_file:(Filename.concat path tree) 66 | ~output_sequence_file:(Filename.concat path output) 67 | ~number_of_sites:size 68 | in 69 | let script = sprintf "%s > tmp.data 2>&1" call in 70 | match Stdlib.Sys.command script with 71 | | 0 -> () 72 | | _ -> fail_file "bppseqgen failed" 73 | -------------------------------------------------------------------------------- /tests/test_utils.mli: -------------------------------------------------------------------------------- 1 | (** Various utility functions to be used in tests. 2 | Includes bio++ interfaces and Alcotest comparison functions. *) 3 | 4 | open Phylogenetics 5 | 6 | 7 | (** {6 Comparison functions (including Alcotest testables)} *) 8 | 9 | (** Compares two distributions using estimators TODO *) 10 | val check_distrib: Stat_tools.sample_list -> Stat_tools.sample_list -> unit 11 | 12 | (** Compares two floats (which are supposed to be likelihood results) using the alcotest check *) 13 | val check_likelihood: float -> float -> unit 14 | 15 | val compare_matrices: 16 | (module Alphabet.S with type matrix = 'mat) -> 17 | string -> 18 | 'mat -> 19 | 'mat -> 20 | unit 21 | 22 | 23 | (** {6 Interfaces for external runs of bppml.} 24 | Uses Sys and needs bpp executables in PATH.*) 25 | 26 | (** Runs bppml and extracts initial log likelihood. Needs bppml in PATH. *) 27 | val felsenstein_bpp : 28 | ?alphabet:Bppsuite.alphabet -> 29 | ?model:Bppsuite.model -> 30 | ?path:string -> 31 | tree:string -> 32 | string -> 33 | float 34 | 35 | (** Runs bppseqgen and writes the result in a fasta file. Needs bppseqgen in PATH. *) 36 | val seqgen_bpp : 37 | ?alphabet:Bppsuite.alphabet -> 38 | ?model:Bppsuite.model -> 39 | ?path:string -> 40 | tree:string -> 41 | string -> 42 | int -> 43 | unit 44 | -------------------------------------------------------------------------------- /tests/test_zipper.ml: -------------------------------------------------------------------------------- 1 | open Core 2 | open Phylogenetics 3 | open Zipper 4 | open Alcotest 5 | 6 | let eps = 0.1 7 | 8 | (** {6 Test input parameters} *) 9 | 10 | (*/--0.200--/--0.400-- 11 | | \--0.300-- 12 | \--0.100--*) 13 | let mytree = Phylogenetic_tree.of_preorder "0.2;0.1;0.4;0.3;0;1;2" 14 | let mybranches = Phylogenetic_tree.[of_preorder "0.4;0.3;0;1"; of_preorder "2"] 15 | 16 | (*/--0.200-- 17 | \--0.200--/--0.300-- 18 | \--0.300--*) 19 | let myothertree = Phylogenetic_tree.of_preorder "0.2;0.2;0;0.3;0.3;1;2" 20 | 21 | 22 | (** {6 Test functions} *) 23 | 24 | let test_of_tree_check_subtress () = 25 | of_tree mytree |> (fun z -> [get_tree z Dir0; get_tree z Dir1]) |> 26 | (check @@ slist Phylogenetic_tree.(testable pp equal) Poly.compare) "identical subtrees" mybranches 27 | 28 | let test_of_tree_check_lengths () = 29 | of_tree mytree |> (fun z -> [get_length z Dir0; get_length z Dir1]) |> 30 | (check @@ slist (float eps) Poly.compare) "identical branch lengths" [0.1; 0.2] 31 | 32 | let test_tree_and_back () = 33 | of_tree mytree |> to_tree |> 34 | Phylogenetic_tree.(check @@ testable pp equal) "identical trees" mytree 35 | 36 | let test_of_tree_dir_move_and_back () = 37 | of_tree_dir mytree |> move_left |> unorient |> to_tree |> 38 | Phylogenetic_tree.(check @@ testable pp equal) "identical trees" myothertree 39 | 40 | let test_goto () = 41 | let zipper_goto = of_tree mytree |> init_routing |> fun z -> goto z 2 in 42 | let zipper_move = of_tree mytree |> fun z -> move z Dir0 |> fun z -> move z Dir0 in 43 | Zipper.(check @@ testable pp equal) "identical zippers" zipper_move zipper_goto 44 | 45 | 46 | (** {6 Test list} *) 47 | 48 | let tests = [ 49 | "of_tree (check subtrees)", `Quick, test_of_tree_check_subtress; 50 | "of_tree (check lengths)", `Quick, test_of_tree_check_lengths; 51 | "of_tree and back to_tree", `Quick, test_tree_and_back; 52 | "of_tree_dir, move_left and back to_tree", `Quick, test_of_tree_dir_move_and_back; 53 | "goto vs several moves", `Quick, test_goto 54 | ] 55 | --------------------------------------------------------------------------------