├── .gitattributes ├── Makefile ├── README.md ├── coding_wheel.txt ├── codon_usage_freq_table_human.csv ├── codon_usage_freq_table_yeast.csv ├── gflags.py ├── license.txt ├── lineardesign ├── pic └── baidu_research_logo.jpg ├── src ├── Utils │ ├── base.h │ ├── codon.h │ ├── common.h │ ├── constants.h │ ├── flat.h │ ├── libraries │ │ ├── LinearDesign_Mac_M1.so │ │ ├── LinearDesign_Mac_x86.so │ │ ├── LinearDesign_linux64.so │ │ └── LinearDesign_linux64_old.so │ ├── network.h │ ├── reader.h │ └── utility_v.h ├── backtrace_iter.cc ├── beam_cky_parser.cc ├── beam_cky_parser.h └── linear_design.cpp └── testseq /.gitattributes: -------------------------------------------------------------------------------- 1 | *.cpp linguist-language=c++ 2 | *.h linguist-language=c++ -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | CLA=clang++ 3 | CXX=g++ 4 | CXXFLAGS=-std=c++11 -Ofast -DFINAL_CHECK -DSPECIAL_HP -fpermissive 5 | DEPS=src/beam_cky_parser.cc src/beam_cky_parser.h src/backtrace_iter.cc src/Utils/reader.h src/Utils/network.h src/Utils/codon.h src/Utils/utility_v.h src/Utils/common.h src/Utils/base.h 6 | BIN=bin/LinearDesign_2D 7 | UNAME_S := $(shell uname -s) 8 | UNAME_M := $(shell uname -m) 9 | 10 | lineardesign_2D: $(DEPS) 11 | @echo "Compiling" $@ "from" $< "..." 
12 | chmod +x lineardesign 13 | mkdir -p ./bin 14 | export LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH 15 | 16 | ifeq ($(UNAME_S), Linux) 17 | if $(CXX) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_linux64.so; then \ 18 | echo "Linux system; compiled with g++; finished."; \ 19 | echo "Compilation Succeed!"; \ 20 | else \ 21 | echo "Try another .so file."; \ 22 | if $(CXX) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_linux64_old.so; then \ 23 | echo "Linux system; compiled with g++; finished."; \ 24 | echo "Compilation Succeed!"; \ 25 | else \ 26 | echo "Compilation failed! Make sure it is either Linux-64 or Mac."; \ 27 | fi \ 28 | fi 29 | else 30 | if [[ $(UNAME_M) == 'arm64' ]]; then \ 31 | if $(CLA) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_Mac_M1.so; then \ 32 | echo "Mac M1 system; compiled with clang++; finished."; \ 33 | echo "Compilation Succeed!"; \ 34 | echo "You may encounter a pop-up message at the first run. If so, please go to System Preferences -> Security & Privacy -> General to allow LinearDesign_Mac_M1.so to open. See README.md for details."; \ 35 | else \ 36 | echo "Compilation failed! Make sure it is either Linux-64 or Mac."; \ 37 | fi \ 38 | else \ 39 | if $(CLA) $(CXXFLAGS) src/linear_design.cpp -o bin/LinearDesign_2D src/Utils/libraries/LinearDesign_Mac_x86.so; then \ 40 | echo "Mac x86_64 system; compiled with clang++; finished."; \ 41 | echo "Compilation Succeed!"; \ 42 | echo "You may encounter a pop-up message at the first run. If so, please go to System Preferences -> Security & Privacy -> General to allow LinearDesign_Mac_x86.so to open. See README.md for details."; \ 43 | else \ 44 | echo "Compilation failed! 
Make sure it is either Linux-64 or Mac."; \ 45 | fi \ 46 | fi 47 | endif 48 | 49 | 50 | .PHONY : clean 51 | 52 | clean: 53 | rm -f $(BIN) 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Baidu Research Logo

2 | 3 | # Algorithm for Optimized mRNA Design Improves Stability and Immunogenicity (LinearDesign) 4 | ![GitHub all releases](https://img.shields.io/github/downloads/LinearDesignSoftware/LinearDesign/total) 5 | 6 | 7 | This repository contains the source code for the LinearDesign project. 8 | 9 | He Zhang†, Liang Zhang†, Ang Lin†, Congcong Xu†, Ziyu Li, Kaibo Liu, Boxiang Liu, Xiaopin Ma, Fanfan Zhao, Huiling Jiang, Chunxiu Chen, Haifa Shen, Hangwen Li*, David H. Mathews*, Yujian Zhang*, Liang Huang†*^#. Algorithm for Optimized mRNA Design Improves Stability and Immunogenicity. Nature [https://doi.org/10.1038/s41586-023-06127-z](https://doi.org/10.1038/s41586-023-06127-z) (2023) 10 | 11 | † contributed equally, 12 | \* corresponding authors, 13 | ^# lead corresponding author 14 | 15 | For questions, please contact the lead corresponding author at . 16 | 17 | ## Dependencies 18 | Clang 11.0.0 (or above) or GCC 4.8.5 (or above) 19 | 20 | python2.7 21 | 22 | ## To Compile 23 | ``` 24 | make 25 | ``` 26 | 27 | ## To Run 28 | The LinearDesign program can be run with: 29 | ``` 30 | echo SEQUENCE | ./lineardesign [OPTIONS] 31 | 32 | OR 33 | 34 | cat FASTA_FILE | ./lineardesign [OPTIONS] 35 | ``` 36 | 37 | OPTIONS: 38 | ``` 39 | --lambda LAMBDA or -l LAMBDA 40 | ``` 41 | Set LAMBDA, a hyperparameter balancing MFE and CAI. (default 0.0) 42 | ``` 43 | --codonusage FILE_NAME or -c FILE_NAME 44 | ``` 45 | Import a Codon Usage Frequency Table. See "codon_usage_freq_table_human.csv" for the format. 46 | (default: using human codon usage frequency table) 47 | ``` 48 | --verbose or -v 49 | ``` 50 | Print out more details. (default False) 51 | 52 | On macOS, users may encounter a pop-up message on the first run. 53 | On Mac M1 systems, the message is: 54 | ``` 55 | "LinearDesign_Mac_M1.so" can't be opened because Apple cannot check it for malicious software. 
56 | ``` 57 | On Mac Intel systems, the message is: 58 | ``` 59 | "LinearDesign_Mac_x86.so" cannot be opened because it is from an unidentified developer. 60 | ``` 61 | If so, please go to "System Preferences -> Security & Privacy -> General" to allow LinearDesign_Mac_M1.so (or LinearDesign_Mac_x86.so) to open. 62 | 63 | ## Example: Single Sequence Design 64 | ``` 65 | echo MNDTEAI | ./lineardesign 66 | mRNA sequence: AUGAACGAUACGGAGGCGAUC 67 | mRNA structure: ......(((.((....))))) 68 | mRNA folding free energy: -1.10 kcal/mol; mRNA CAI: 0.695 69 | ``` 70 | 71 | ## Example: Multiple Sequences Design with Option --lambda (-l) 72 | ``` 73 | cat testseq | ./lineardesign --lambda 3 74 | >seq1 75 | mRNA sequence: AUGCCAAACACCCUGGCAUGCCCC 76 | mRNA structure: ((((((.......))))))..... 77 | mRNA folding free energy: -6.00 kcal/mol; mRNA CAI: 0.910 78 | 79 | >seq2 80 | mRNA sequence: AUGCUGGAUCAGGUGAACAAGCUGAAGUACCCAGAGGUGAGCCUGACCUGA 81 | mRNA structure: .....((.((((((..((...(((.......)))..))..))))))))... 82 | mRNA folding free energy: -13.50 kcal/mol; mRNA CAI: 0.979 83 | ``` 84 | 85 | ## Example: Option --codonusage (-c) 86 | ``` 87 | echo MNDTEAI | ./lineardesign -l 0.3 --codonusage codon_usage_freq_table_yeast.csv 88 | mRNA sequence: AUGAAUGAUACGGAAGCGAUC 89 | mRNA structure: ......(((.((....))))) 90 | mRNA folding free energy: -1.10 kcal/mol; mRNA CAI: 0.670 91 | ``` 92 | 93 | ## Example: Option --verbose (-v) 94 | ``` 95 | echo MNDTEAI | ./lineardesign --verbose 96 | Input protein: MNDTEAI 97 | Using lambda = 0; Using codon frequency table = codon_usage_freq_table_human.csv 98 | mRNA sequence: AUGAACGAUACGGAGGCGAUC 99 | mRNA structure: ......(((.((....))))) 100 | mRNA folding free energy: -1.10 kcal/mol; mRNA CAI: 0.695 101 | Runtime: 0.002 seconds 102 | ``` 103 | 104 | 105 | ## Declarations 106 | Baidu Research has filed a patent for the LinearDesign algorithm that lists He Zhang, Liang Zhang, Ziyu Li, Kaibo Liu, Boxiang Liu, and Liang Huang as inventors. 
107 | -------------------------------------------------------------------------------- /coding_wheel.txt: -------------------------------------------------------------------------------- 1 | Phe U U CU 2 | Leu C U GCUA U U GA 3 | Ser U C GCUA A G CU 4 | Tyr U A CU 5 | STOP U A GA U G A 6 | Cys U G CU 7 | Trp U G G 8 | Pro C C GCUA 9 | His C A CU 10 | Gln C A GA 11 | Arg C G GCUA A G GA 12 | Ile A U CUA 13 | Met A U G 14 | Thr A C GCUA 15 | Asn A A CU 16 | Lys A A GA 17 | Val G U GCUA 18 | Asp G A CU 19 | Glu G A GA 20 | Gly G G GCUA 21 | Ala G C GCUA -------------------------------------------------------------------------------- /codon_usage_freq_table_human.csv: -------------------------------------------------------------------------------- 1 | #,, 2 | UAA,*,0.28 3 | UAG,*,0.2 4 | UGA,*,0.52 5 | GCU,A,0.26 6 | GCC,A,0.4 7 | GCA,A,0.23 8 | GCG,A,0.11 9 | UGU,C,0.45 10 | UGC,C,0.55 11 | GAU,D,0.46 12 | GAC,D,0.54 13 | GAA,E,0.42 14 | GAG,E,0.58 15 | UUU,F,0.45 16 | UUC,F,0.55 17 | GGU,G,0.16 18 | GGC,G,0.34 19 | GGA,G,0.25 20 | GGG,G,0.25 21 | CAU,H,0.41 22 | CAC,H,0.59 23 | AUU,I,0.36 24 | AUC,I,0.48 25 | AUA,I,0.16 26 | AAA,K,0.42 27 | AAG,K,0.58 28 | UUA,L,0.07 29 | UUG,L,0.13 30 | CUU,L,0.13 31 | CUC,L,0.2 32 | CUA,L,0.07 33 | CUG,L,0.41 34 | AUG,M,1 35 | AAU,N,0.46 36 | AAC,N,0.54 37 | CCU,P,0.28 38 | CCC,P,0.33 39 | CCA,P,0.27 40 | CCG,P,0.11 41 | CAA,Q,0.25 42 | CAG,Q,0.75 43 | CGU,R,0.08 44 | CGC,R,0.19 45 | CGA,R,0.11 46 | CGG,R,0.21 47 | AGA,R,0.2 48 | AGG,R,0.2 49 | UCU,S,0.18 50 | UCC,S,0.22 51 | UCA,S,0.15 52 | UCG,S,0.06 53 | AGU,S,0.15 54 | AGC,S,0.24 55 | ACU,T,0.24 56 | ACC,T,0.36 57 | ACA,T,0.28 58 | ACG,T,0.12 59 | GUU,V,0.18 60 | GUC,V,0.24 61 | GUA,V,0.11 62 | GUG,V,0.47 63 | UGG,W,1 64 | UAU,Y,0.43 65 | UAC,Y,0.57 66 | -------------------------------------------------------------------------------- /codon_usage_freq_table_yeast.csv: -------------------------------------------------------------------------------- 1 | #,, 2 | UAA,*,0.48 3 | 
UAG,*,0.24 4 | UGA,*,0.29 5 | GCU,A,0.38 6 | GCC,A,0.22 7 | GCA,A,0.29 8 | GCG,A,0.11 9 | UGU,C,0.63 10 | UGC,C,0.37 11 | GAU,D,0.65 12 | GAC,D,0.35 13 | GAA,E,0.71 14 | GAG,E,0.29 15 | UUU,F,0.59 16 | UUC,F,0.41 17 | GGU,G,0.47 18 | GGC,G,0.19 19 | GGA,G,0.22 20 | GGG,G,0.12 21 | CAU,H,0.64 22 | CAC,H,0.36 23 | AUU,I,0.46 24 | AUC,I,0.26 25 | AUA,I,0.27 26 | AAA,K,0.58 27 | AAG,K,0.42 28 | UUA,L,0.28 29 | UUG,L,0.29 30 | CUU,L,0.13 31 | CUC,L,0.06 32 | CUA,L,0.14 33 | CUG,L,0.11 34 | AUG,M,1 35 | AAU,N,0.59 36 | AAC,N,0.41 37 | CCU,P,0.31 38 | CCC,P,0.15 39 | CCA,P,0.41 40 | CCG,P,0.12 41 | CAA,Q,0.69 42 | CAG,Q,0.31 43 | CGU,R,0.15 44 | CGC,R,0.06 45 | CGA,R,0.07 46 | CGG,R,0.04 47 | AGA,R,0.48 48 | AGG,R,0.21 49 | UCU,S,0.26 50 | UCC,S,0.16 51 | UCA,S,0.21 52 | UCG,S,0.1 53 | AGU,S,0.16 54 | AGC,S,0.11 55 | ACU,T,0.35 56 | ACC,T,0.22 57 | ACA,T,0.3 58 | ACG,T,0.13 59 | GUU,V,0.39 60 | GUC,V,0.21 61 | GUA,V,0.21 62 | GUG,V,0.19 63 | UGG,W,1 64 | UAU,Y,0.56 65 | UAC,Y,0.44 66 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | The LinearDesign code is freely accessible to all interested parties. 2 | It is free for academic, non-profit, and research use, and can be licensed for commercial use. 3 | 4 | To use this software for the development of a commercial product, including but not limited to software, service, or pharmaceuticals, please contact the lead corresponding author. 5 | 6 | Redistribution of the code with or without modification is not permitted without explicit written permission by the lead corresponding author. 
7 | -------------------------------------------------------------------------------- /lineardesign: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import gflags as flags 4 | import subprocess 5 | import sys 6 | import os 7 | 8 | FLAGS = flags.FLAGS 9 | 10 | def setgflags(): 11 | flags.DEFINE_float('lambda', 0.0, "set lambda", short_name='l') 12 | flags.DEFINE_boolean('verbose', False, "print out more details", short_name='v') 13 | flags.DEFINE_string('codonusage', 'codon_usage_freq_table_human.csv', "import a Codon Usage Frequency Table", short_name='c') 14 | argv = FLAGS(sys.argv) 15 | 16 | def main(): 17 | 18 | lambda_ = str(FLAGS.l) 19 | verbose_ = '1' if FLAGS.verbose else '0' 20 | codon_usage = str(FLAGS.codonusage) 21 | 22 | path = os.path.dirname(os.path.abspath(__file__)) 23 | cmd = ["%s/%s" % (path, ('bin/LinearDesign_2D')), lambda_, verbose_, codon_usage] 24 | subprocess.call(cmd, stdin=sys.stdin) 25 | 26 | if __name__ == '__main__': 27 | setgflags() 28 | main() 29 | 30 | -------------------------------------------------------------------------------- /pic/baidu_research_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinearDesignSoftware/LinearDesign/f0126ca89a8b853088b4bccfd2cc8c378d3678be/pic/baidu_research_logo.jpg -------------------------------------------------------------------------------- /src/Utils/base.h: -------------------------------------------------------------------------------- 1 | #ifndef base_h 2 | #define base_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #if defined(__GNUC__) || defined(__clang__) 12 | #define LINEAR_DESIGN_DEPRECATED __attribute__((deprecated)) 13 | #elif defined(_MSC_VER) 14 | #define LINEAR_DESIGN_DEPRECATED __declspec(deprecated) 15 | #else 16 | #pragma message("WARNING: function deprecated") 17 | #define 
LINEAR_DESIGN_DEPRECATED 18 | #endif 19 | 20 | #if defined(__GNUC__) || defined(__clang__) 21 | #define LINEAR_DESIGN_INLINE inline __attribute__((always_inline)) 22 | #else 23 | #define LINEAR_DESIGN_INLINE inline 24 | #endif 25 | 26 | #define LINEAR_DESIGN_CACHELINE 64 27 | 28 | template 29 | using enable_if_t = typename std::enable_if::type; 30 | 31 | template ::value, int> = 0> 32 | std::ostream& operator<< (std::ostream& out, const std::pair& rhs) { 33 | out << "(" << rhs.first << ", " << rhs.second << ")"; 34 | return out; 35 | } 36 | 37 | template ::value, int> = 0> 38 | std::ostream& operator<< (std::ostream& out, const std::vector>& rhs) { 39 | out << "["; 40 | for (size_t i = 0; i < rhs.size(); ++i) { 41 | out << rhs[i]; 42 | if (i < rhs.size() - 1) out << ","; 43 | } 44 | out << "]"; 45 | return out; 46 | } 47 | 48 | namespace LinearDesign { 49 | 50 | namespace util { 51 | std::vector split(const std::string &s, char delim) { 52 | std::vector result; 53 | std::stringstream ss(s); 54 | std::string item; 55 | while (getline(ss, item, delim)) 56 | result.push_back(item); 57 | return result; 58 | } 59 | 60 | template 61 | constexpr T value_min() { 62 | static_assert(std::is_integral::value || 63 | std::is_floating_point::value, "Int or float required."); 64 | return std::numeric_limits::lowest(); 65 | } 66 | 67 | template 68 | constexpr T value_max() { 69 | static_assert(std::is_integral::value || 70 | std::is_floating_point::value, "Int or float required."); 71 | return std::numeric_limits::max(); 72 | } 73 | } /* util */ 74 | 75 | 76 | // template struct is_any; 77 | // template <> struct is_any<> : std::false_type {}; 78 | // template struct is_any { 79 | // constexpr static bool value = First || is_any::value; 80 | // }; 81 | 82 | struct hash_pair_pair { 83 | template 84 | size_t operator()(const std::pair, T3>& p) const { 85 | auto hash1 = std::hash{}(p.first.first); 86 | auto hash2 = std::hash{}(p.first.second); 87 | auto hash3 = std::hash{}(p.second); 
88 | return hash1 ^ hash2 ^ hash3; 89 | } 90 | }; 91 | 92 | struct hash_pair { 93 | template 94 | size_t operator()(const std::pair& p) const { 95 | auto hash1 = std::hash{}(p.first); 96 | auto hash2 = std::hash{}(p.second); 97 | return hash1 ^ hash2; 98 | } 99 | }; 100 | 101 | } 102 | 103 | 104 | namespace Hash { 105 | template 106 | LINEAR_DESIGN_INLINE size_t hash_combine(size_t left_seed, const T& right) { 107 | return left_seed ^ (std::hash{}(right) << 1); 108 | } 109 | 110 | template ::value - 1> 111 | struct TupleHashImpl { 112 | static size_t impl(size_t seed, const Tuple& tuple) { 113 | size_t h = hash_combine(seed, std::get(tuple)); 114 | return TupleHashImpl::impl(h, tuple); 115 | } 116 | }; 117 | 118 | template 119 | struct TupleHashImpl { 120 | static size_t impl(size_t seed, const Tuple& tuple) { 121 | return hash_combine(seed, std::get<0>(tuple)); 122 | } 123 | }; 124 | } 125 | 126 | 127 | template 128 | struct std::hash> { 129 | size_t operator()(const std::tuple& ts) const { 130 | return Hash::TupleHashImpl>::impl(0, ts); 131 | } 132 | }; 133 | 134 | template 135 | struct std::hash> { 136 | size_t operator()(const std::pair& p) const { 137 | size_t h = std::hash{}(p.first); 138 | return Hash::hash_combine(h, p.second); 139 | } 140 | }; 141 | 142 | 143 | #endif -------------------------------------------------------------------------------- /src/Utils/codon.h: -------------------------------------------------------------------------------- 1 | #ifndef codon_h 2 | #define codon_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #include "base.h" 16 | #include "constants.h" 17 | 18 | namespace LinearDesign { 19 | 20 | // trim from end (in place) 21 | static inline void rtrim(std::string &s) { 22 | s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { 23 | return !std::isspace(ch); 24 | }).base(), s.end()); 25 | } 26 | 27 | 28 | class Codon { 29 | 
public: 30 | Codon(const std::string& path) : codon_table_(), aa_table_() { 31 | std::ifstream codon_file; 32 | codon_file.open(path); 33 | if (codon_file.is_open()) { 34 | int index = 0; 35 | for (std::string line; getline(codon_file, line);){ 36 | 37 | rtrim(line); 38 | 39 | if(line.size() == 0 or line.empty()) 40 | continue; 41 | 42 | if (index++ == 0) 43 | continue; 44 | 45 | const auto line_split = util::split(line, ','); 46 | if(line_split.size() != 3){ 47 | std::cerr << "Wrong format of codon frequency file!" << std::endl; 48 | exit(1); 49 | } 50 | const std::string codon = line_split[0]; 51 | const std::string aa = line_split[1]; 52 | const float fraction = std::stof(line_split[2]); 53 | 54 | codon_table_[codon] = make_pair(aa, fraction); 55 | aa_table_[aa].push_back(make_pair(codon, fraction)); 56 | if (!max_aa_table_.count(aa)) 57 | max_aa_table_[aa] = fraction; 58 | else 59 | max_aa_table_[aa] = fmax(max_aa_table_[aa], fraction); 60 | } 61 | codon_file.close(); 62 | if (codon_table_.size() != 64){ 63 | std::cerr << "Codon frequency file needs to contain 64 codons!" << std::endl; 64 | exit(1); 65 | } 66 | 67 | } else { 68 | std::cerr << "The input codon frequency file does not exist!" 
<< std::endl; 69 | exit(1); 70 | } 71 | } 72 | 73 | float calc_cai(const std::string& rna_seq) const { 74 | if (rna_seq.length() % 3) 75 | throw std::runtime_error("invalid rna seq"); 76 | 77 | const int protein_length = static_cast(rna_seq.length() / 3); 78 | float cai = 0.0f; 79 | 80 | for (int index = 3; index < rna_seq.length() + 1; index += 3) { 81 | const std::string tri_letter = rna_seq.substr(index - 3, 3); 82 | const auto f_ci_aa = codon_table_.at(tri_letter); 83 | const auto f_c_max = max_aa_table_.at(f_ci_aa.first); 84 | 85 | float w_i = f_ci_aa.second / f_c_max; 86 | cai += log2f(w_i); 87 | } 88 | 89 | return exp2f(cai / protein_length); 90 | } 91 | 92 | std::string find_max_codon(const char aa, 93 | const std::string& match) const { 94 | auto candidate_condons = aa_table_.at(std::string(1, aa)); 95 | 96 | float max_score = 0; 97 | std::string max_condon; 98 | for (auto& candidate : candidate_condons) { 99 | if (std::regex_match(candidate.first, std::regex(match)) && 100 | candidate.second > max_score) { 101 | max_condon = candidate.first; 102 | max_score = candidate.second; 103 | } 104 | } 105 | 106 | if (max_condon.empty()) 107 | throw std::runtime_error("invald search"); 108 | 109 | // assert(codon_table_.at(max_condon).first == std::string(1, aa)); 110 | return max_condon; 111 | } 112 | 113 | std::string cvt_rna_seq_to_aa_seq(const std::string& rna_seq) const { 114 | if (rna_seq.length() % 3) 115 | throw std::runtime_error("invalid rna seq"); 116 | 117 | std::string aa_seq; 118 | aa_seq.reserve(rna_seq.length()); 119 | for (int index = 3; index < rna_seq.length() + 1; index += 3) { 120 | const std::string tri_letter = rna_seq.substr(index - 3, 3); 121 | auto aa = codon_table_.at(tri_letter).first; 122 | if (aa == "STOP") { 123 | aa_seq.append("*"); 124 | return aa_seq; 125 | } 126 | aa_seq.append(codon_table_.at(tri_letter).first); 127 | } 128 | return aa_seq; 129 | } 130 | 131 | float get_weight(const std::string& aa_tri, const std::string& codon) 
const { 132 | 133 | if (k_map_3_1.count(aa_tri)) { 134 | auto codons = aa_table_.at(std::string(1, k_map_3_1[aa_tri])); 135 | auto it = std::find_if(codons.begin(), codons.end(), [codon](const std::pair& e){ 136 | // std::cout << typeid(e).name() << '\n'; 137 | return e.first == codon; 138 | }); 139 | if (it == codons.end()) { 140 | throw std::runtime_error("invalid codon"); 141 | } 142 | return it->second; 143 | } else if (three_prime_aa_table_.count(aa_tri)) { 144 | return three_prime_aa_table_.at(aa_tri).second; 145 | } 146 | 147 | return 0.0f; 148 | } 149 | 150 | 151 | 152 | // private: 153 | std::vector aux_aa_; 154 | std::map> three_prime_codon_table_; 155 | std::map> three_prime_aa_table_; 156 | 157 | 158 | std::map max_aa_table_; 159 | std::map> codon_table_; 160 | std::map>> aa_table_; 161 | }; 162 | 163 | } 164 | 165 | #endif 166 | -------------------------------------------------------------------------------- /src/Utils/common.h: -------------------------------------------------------------------------------- 1 | #ifndef common_h 2 | #define common_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "base.h" 13 | 14 | namespace LinearDesign { 15 | 16 | 17 | using SizeType = size_t; 18 | using ScoreType = int32_t; 19 | using IndexType = int32_t; //if less than 10000, only int16_t is needed here 20 | using NucType = int8_t; 21 | using NumType = int32_t; 22 | using NucPairType = int8_t; 23 | using PairType = int8_t; 24 | using FinalScoreType = double; 25 | using NodeType = std::pair; 26 | using NodeNucType = std::pair; 27 | using NodeNucWType = std::tuple; 28 | using PairType = int8_t; 29 | 30 | 31 | 32 | 33 | enum class Manner : std::uint8_t { 34 | NONE = 0, // 0: empty 35 | H, // 1: hairpin candidate 36 | HAIRPIN, // 2: hairpin 37 | SINGLE, // 3: single 38 | HELIX, // 4: helix 39 | MULTI, // 5: multi = ..M2. 
[30 restriction on the left and jump on the right] 40 | MULTI_eq_MULTI_plus_U, // 6: multi = multi + U 41 | P_eq_MULTI, // 7: P = (multi) 42 | M2_eq_M_plus_P, // 8: M2 = M + P 43 | M_eq_M2, // 9: M = M2 44 | M_eq_M_plus_U, // 10: M = M + U 45 | M_eq_P, // 11: M = P 46 | C_eq_C_plus_U, // 12: C = C + U 47 | C_eq_C_plus_P, // 13: C = C + P 48 | }; 49 | 50 | enum class Beam_type : std::uint8_t { 51 | BEAM_C = 0, 52 | BEAM_P, 53 | BEAM_MULTI, 54 | BEAM_M2, 55 | BEAM_M1 56 | 57 | }; 58 | 59 | 60 | template 61 | struct State { 62 | ScoreType score = util::value_min(); 63 | double cai_score = util::value_min(); 64 | NodeType pre_node; 65 | double pre_left_cai; 66 | }; 67 | 68 | struct BacktraceResult { 69 | std::string seq; 70 | std::string structure; 71 | }; 72 | 73 | template > 76 | struct DecoderResult { 77 | std::string sequence; 78 | std::string structure; 79 | ScoreType score; 80 | ScoreType cai; 81 | ScoreType old_cai; 82 | IndexType num_states; 83 | }; 84 | 85 | template > 88 | struct ScoreInnerDate { 89 | ScoreType newscore; 90 | NodeType j_node; 91 | NodeType i_node; 92 | int nuc_pair; 93 | }; 94 | 95 | 96 | struct NodeNucpair { 97 | IndexType node_first; 98 | NumType node_second; 99 | NucPairType nucpair; 100 | }; 101 | 102 | 103 | } 104 | 105 | #endif /* common_h */ 106 | -------------------------------------------------------------------------------- /src/Utils/constants.h: -------------------------------------------------------------------------------- 1 | #ifndef constants_h 2 | #define constants_h 3 | 4 | #include 5 | 6 | namespace LinearDesign { 7 | 8 | constexpr uint8_t k_void_nuc = 127; 9 | 10 | // static std::map k_map_1_3 = { 11 | // {'F',"Phe"}, 12 | // {'L',"Leu"}, 13 | // {'S',"Ser"}, 14 | // {'Y',"Tyr"}, 15 | // {'*',"STOP"}, 16 | // {'C',"Cys"}, 17 | // {'W',"Trp"}, 18 | // {'P',"Pro"}, 19 | // {'H',"His"}, 20 | // {'Q',"Gln"}, 21 | // {'R',"Arg"}, 22 | // {'I',"Ile"}, 23 | // {'M',"Met"}, 24 | // {'T',"Thr"}, 25 | // {'N',"Asn"}, 26 | // 
{'K',"Lys"}, 27 | // {'V',"Val"}, 28 | // {'D',"Asp"}, 29 | // {'E',"Glu"}, 30 | // {'G',"Gly"}, 31 | // {'A',"Ala"} 32 | // }; 33 | 34 | static std::map k_map_3_1 = { 35 | {"Phe", 'F'}, 36 | {"Leu", 'L'}, 37 | {"Ser", 'S'}, 38 | {"Tyr", 'Y'}, 39 | {"STOP", '*'}, 40 | {"Cys", 'C'}, 41 | {"Trp", 'W'}, 42 | {"Pro", 'P'}, 43 | {"His", 'H'}, 44 | {"Gln", 'Q'}, 45 | {"Arg", 'R'}, 46 | {"Ile", 'I'}, 47 | {"Met", 'M'}, 48 | {"Thr", 'T'}, 49 | {"Asn", 'N'}, 50 | {"Lys", 'K'}, 51 | {"Val", 'V'}, 52 | {"Asp", 'D'}, 53 | {"Glu", 'E'}, 54 | {"Gly", 'G'}, 55 | {"Ala", 'A'} 56 | }; 57 | 58 | } 59 | 60 | #endif /* constants_h */ 61 | -------------------------------------------------------------------------------- /src/Utils/flat.h: -------------------------------------------------------------------------------- 1 | #ifndef flat_h 2 | #define flat_h 3 | 4 | #include 5 | #include 6 | 7 | #include "base.h" 8 | 9 | namespace detail { 10 | template 11 | struct DefaultIndex { 12 | inline size_t operator()(const Key key) const { 13 | return static_cast(key); 14 | } 15 | }; 16 | } 17 | 18 | template > 19 | class Flat { 20 | public: 21 | using self_type = Flat; 22 | using storage_type = std::vector; 23 | using key_type = Key; 24 | using reference = T&; 25 | using const_reference = const T&; 26 | using iterator = typename storage_type::iterator; 27 | 28 | 29 | LINEAR_DESIGN_INLINE iterator begin() { 30 | return data_.begin(); 31 | } 32 | 33 | LINEAR_DESIGN_INLINE iterator end() { 34 | return data_.end(); 35 | } 36 | 37 | LINEAR_DESIGN_INLINE bool empty() const { 38 | return false; 39 | } 40 | 41 | LINEAR_DESIGN_INLINE void reserve(const size_t n) { 42 | data_.reserve(n); 43 | } 44 | 45 | LINEAR_DESIGN_INLINE void resize(const size_t n) { 46 | data_.resize(n); 47 | } 48 | 49 | template ::value, int> = 0> 50 | LINEAR_DESIGN_INLINE reference operator[](size_t index) { 51 | return data_[index]; 52 | } 53 | 54 | template ::value, int> = 0> 55 | LINEAR_DESIGN_INLINE const_reference 
operator[](size_t index) const { 56 | return data_[index]; 57 | } 58 | 59 | LINEAR_DESIGN_INLINE reference operator[](key_type key) { 60 | return data_[index_(key)]; 61 | } 62 | 63 | LINEAR_DESIGN_INLINE const_reference operator[](key_type key) const { 64 | return data_[index_(key)]; 65 | } 66 | 67 | LINEAR_DESIGN_INLINE size_t size() const { 68 | return data_.size(); 69 | } 70 | 71 | private: 72 | IndexFn index_; 73 | storage_type data_; 74 | }; 75 | 76 | #endif /* flat_h */ 77 | -------------------------------------------------------------------------------- /src/Utils/libraries/LinearDesign_Mac_M1.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinearDesignSoftware/LinearDesign/f0126ca89a8b853088b4bccfd2cc8c378d3678be/src/Utils/libraries/LinearDesign_Mac_M1.so -------------------------------------------------------------------------------- /src/Utils/libraries/LinearDesign_Mac_x86.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinearDesignSoftware/LinearDesign/f0126ca89a8b853088b4bccfd2cc8c378d3678be/src/Utils/libraries/LinearDesign_Mac_x86.so -------------------------------------------------------------------------------- /src/Utils/libraries/LinearDesign_linux64.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinearDesignSoftware/LinearDesign/f0126ca89a8b853088b4bccfd2cc8c378d3678be/src/Utils/libraries/LinearDesign_linux64.so -------------------------------------------------------------------------------- /src/Utils/libraries/LinearDesign_linux64_old.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinearDesignSoftware/LinearDesign/f0126ca89a8b853088b4bccfd2cc8c378d3678be/src/Utils/libraries/LinearDesign_linux64_old.so 
-------------------------------------------------------------------------------- /src/Utils/network.h: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "utility_v.h" 19 | #include "common.h" 20 | #include "codon.h" 21 | 22 | using namespace std; 23 | 24 | // #define is_verbose 25 | 26 | namespace LinearDesign { 27 | 28 | template , 30 | typename NodeType = pair, 31 | typename NodeNucWType = tuple> 32 | class Lattice { 33 | public: 34 | unordered_map> nodes; 35 | unordered_map, hash_pair> left_edges; 36 | unordered_map, hash_pair> right_edges; 37 | 38 | Lattice(): nodes(), left_edges(), right_edges() {}; 39 | 40 | void add_edge(NodeType n1, NodeType n2, NucType nuc, double weight = 0.0f){ 41 | right_edges[n1].push_back(make_tuple(n2, nuc, weight)); 42 | left_edges[n2].push_back(make_tuple(n1, nuc, weight)); 43 | } 44 | 45 | void add_node(NodeType n1){ 46 | IndexType pos = get<0>(n1); 47 | nodes[pos].push_back(n1); 48 | } 49 | }; 50 | 51 | template , 53 | typename NodeType = pair, 54 | typename NodeNucWType = tuple> 55 | class DFA { 56 | public: 57 | unordered_map> nodes; 58 | unordered_map, hash_pair> left_edges; 59 | unordered_map, hash_pair> right_edges; 60 | unordered_map, hash_pair>, hash_pair> auxiliary_left_edges; 61 | unordered_map, hash_pair>, hash_pair> auxiliary_right_edges; 62 | unordered_map, hash_pair> node_rightedge_weights; 63 | 64 | DFA(): nodes(), left_edges(), right_edges(), auxiliary_left_edges(), auxiliary_right_edges() {}; 65 | 66 | void add_edge(NodeType n1, NodeType n2, IndexType nuc, double weight = 0.0f){ 67 | right_edges[n1].push_back(make_tuple(n2, nuc, weight)); 68 | left_edges[n2].push_back(make_tuple(n1, nuc, weight)); 69 | 
auxiliary_right_edges[n1][n2].push_back(make_pair(nuc, weight)); 70 | auxiliary_left_edges[n2][n1].push_back(make_pair(nuc, weight)); 71 | node_rightedge_weights[n1][nuc] = weight; 72 | } 73 | 74 | void add_node(NodeType n1){ 75 | IndexType pos = get<0>(n1); 76 | nodes[pos].push_back(n1); 77 | } 78 | }; 79 | 80 | template , 82 | typename LatticeType = Lattice> 83 | unordered_map read_wheel(string const &filename) { 84 | unordered_map aa_graphs; 85 | ifstream inFile; 86 | inFile.open(filename); 87 | if (!inFile) { 88 | printf("Unable to open coding_wheel file\n"); 89 | exit(1); // call system to stop 90 | } 91 | 92 | vector stuff; 93 | vector option_splited; 94 | string aa; 95 | IndexType i; 96 | 97 | for (string line; getline(inFile, line);) { 98 | stuff = util::split(line, '\t'); 99 | 100 | aa = stuff[0]; 101 | LatticeType graph = LatticeType(); 102 | graph.add_node(make_pair(0,0)); // always initialize with node (0,0) 103 | 104 | char last_first = 0; 105 | vector::iterator iter = stuff.begin(); 106 | ++iter; // position 0 is aa name 107 | i = 0; 108 | while(iter != stuff.end()){ 109 | string option = *iter; 110 | option_splited = util::split(option, ' '); 111 | char first = option_splited[0][0]; 112 | char second = option_splited[1][0]; 113 | string thirds = option_splited[2]; 114 | NodeType n2 = make_pair(2, i); 115 | graph.add_node(n2); 116 | NodeType n1; 117 | if (first != last_first) { 118 | n1 = make_pair(1, i); 119 | graph.add_node(n1); 120 | graph.add_edge(make_pair(0, 0), n1, GET_ACGU_NUC(first)); 121 | } 122 | else { 123 | n1 = make_pair(1, i-1); 124 | } 125 | last_first = first; 126 | graph.add_edge(n1, n2, GET_ACGU_NUC(second)); 127 | for (auto& third : thirds) { 128 | graph.add_edge(n2, make_pair(0,0), GET_ACGU_NUC(third)); 129 | } 130 | i++; iter++; 131 | } 132 | aa_graphs[aa] = graph; 133 | #ifdef is_verbose 134 | printf("-----------------Lattice------------------------\n"); 135 | for(IndexType pos = 0; pos <= 2; pos++){ 136 | for(auto &node : 
graph.nodes[pos]){ 137 | IndexType p = get<0>(node); 138 | IndexType num = get<1>(node); 139 | printf("node, (%d, %d)\n", p, num); 140 | for(auto &item : graph.right_edges[node]){ 141 | NodeType n2 = get<0>(item); 142 | IndexType p2 = get<0>(n2); IndexType num2 = get<1>(n2); 143 | IndexType nuc = get<1>(item); 144 | double weight = get<2>(item); 145 | printf(" (%d, %d) -(%d,%lf)-> (%d, %d)\n", p, num, nuc, weight, p2, num2); 146 | } 147 | for(auto &item : graph.left_edges[node]){ 148 | NodeType n1 = get<0>(item); 149 | IndexType p1 = get<0>(n1); IndexType num1 = get<1>(n1); 150 | IndexType nuc = get<1>(item); 151 | double weight = get<2>(item); 152 | printf(" (%d, %d) <-(%d,%lf)- (%d, %d)\n", p1, num1, nuc, weight, p, num); 153 | } 154 | } 155 | } 156 | #endif 157 | } 158 | inFile.close(); 159 | return aa_graphs; 160 | } 161 | 162 | 163 | template , 165 | typename LatticeType = Lattice, 166 | typename NucType = IndexType, 167 | typename NodeNucNodeType = std::tuple> 168 | unordered_map read_wheel_with_weights(const std::string& filename, 169 | std::unordered_map>& nodes_with_best_weight, 170 | std::unordered_map>>& edges_with_best_weight, 171 | const Codon& codon) { 172 | unordered_map aa_graphs; 173 | ifstream inFile; 174 | inFile.open(filename); 175 | if (!inFile) 176 | throw std::runtime_error("Unable to open coding_wheel file\n"); 177 | 178 | vector stuff; 179 | vector option_splited; 180 | string aa; 181 | IndexType i; 182 | 183 | for (string line; getline(inFile, line);) { 184 | stuff = util::split(line, '\t'); 185 | aa = stuff[0]; 186 | LatticeType graph = LatticeType(); 187 | graph.add_node(make_pair(0,0)); // always initialize with node (0,0) 188 | 189 | char last_first = 0; 190 | vector::iterator iter = stuff.begin(); 191 | ++iter; // position 0 is aa name 192 | i = 0; 193 | while(iter != stuff.end()){ 194 | string option = *iter; 195 | option_splited = util::split(option, ' '); 196 | char first = option_splited[0][0]; 197 | char second = 
option_splited[1][0]; 198 | string thirds = option_splited[2]; 199 | NodeType n2 = make_pair(2, i); 200 | graph.add_node(n2); 201 | NodeType n1; 202 | if (first != last_first) { 203 | n1 = make_pair(1, i); 204 | graph.add_node(n1); 205 | auto first_num = GET_ACGU_NUC(first); 206 | 207 | double weight = 0.0f; 208 | if (nodes_with_best_weight[aa].count(make_pair(0, 0))) { 209 | weight = edges_with_best_weight[aa][make_tuple(make_pair(0, 0), first_num, n1)] / nodes_with_best_weight[aa][make_pair(0, 0)]; 210 | } 211 | 212 | graph.add_edge(make_pair(0, 0), n1, first_num, weight); 213 | } 214 | else { 215 | n1 = make_pair(1, i-1); 216 | } 217 | 218 | last_first = first; 219 | 220 | auto second_num = GET_ACGU_NUC(second); 221 | 222 | double weight = 0.0f; 223 | if (nodes_with_best_weight[aa].count(n1)) { 224 | weight = edges_with_best_weight[aa][make_tuple(n1, second_num, n2)] / nodes_with_best_weight[aa][n1]; 225 | } 226 | 227 | graph.add_edge(n1, n2, second_num, weight); 228 | 229 | for (auto& third : thirds) { 230 | 231 | std::string three_nums = std::string(1, first) + std::string(1, second) + std::string(1, third); 232 | 233 | double weight = 0.0f; 234 | if (nodes_with_best_weight[aa].count(n2)) { 235 | weight = codon.get_weight(aa, three_nums) / nodes_with_best_weight[aa][n2]; 236 | } else { 237 | weight = codon.get_weight(aa, three_nums); 238 | } 239 | 240 | graph.add_edge(n2, make_pair(0,0), GET_ACGU_NUC(third), weight); 241 | } 242 | i++; iter++; 243 | } 244 | aa_graphs[aa] = graph; 245 | } 246 | 247 | inFile.close(); 248 | return aa_graphs; 249 | } 250 | 251 | 252 | template , 254 | typename LatticeType = Lattice, 255 | typename NucType = IndexType, 256 | typename NodeNucNodeType = std::tuple> 257 | unordered_map read_wheel_with_weights_log(const std::string& filename, 258 | std::unordered_map>& nodes_with_best_weight, 259 | std::unordered_map>>& edges_with_best_weight, 260 | const Codon& codon, double lambda_) { 261 | unordered_map aa_graphs; 262 | ifstream 
inFile; 263 | inFile.open(filename); 264 | if (!inFile) 265 | throw std::runtime_error("Unable to open coding_wheel file\n"); 266 | 267 | vector stuff; 268 | vector option_splited; 269 | string aa; 270 | IndexType i; 271 | 272 | for (string line; getline(inFile, line);) { 273 | stuff = util::split(line, '\t'); 274 | aa = stuff[0]; 275 | LatticeType graph = LatticeType(); 276 | graph.add_node(make_pair(0,0)); // always initialize with node (0,0) 277 | 278 | char last_first = 0; 279 | vector::iterator iter = stuff.begin(); 280 | ++iter; // position 0 is aa name 281 | i = 0; 282 | while(iter != stuff.end()){ 283 | string option = *iter; 284 | option_splited = util::split(option, ' '); 285 | char first = option_splited[0][0]; 286 | char second = option_splited[1][0]; 287 | string thirds = option_splited[2]; 288 | NodeType n2 = make_pair(2, i); 289 | graph.add_node(n2); 290 | NodeType n1; 291 | if (first != last_first) { 292 | n1 = make_pair(1, i); 293 | graph.add_node(n1); 294 | auto first_num = GET_ACGU_NUC(first); 295 | 296 | double weight = 1.0f; 297 | if (nodes_with_best_weight[aa].count(make_pair(0, 0))) { 298 | weight = lambda_ * log(edges_with_best_weight[aa][make_tuple(make_pair(0, 0), first_num, n1)] / nodes_with_best_weight[aa][make_pair(0, 0)]); 299 | } 300 | 301 | graph.add_edge(make_pair(0, 0), n1, first_num, weight); 302 | } 303 | else { 304 | n1 = make_pair(1, i-1); 305 | } 306 | 307 | last_first = first; 308 | 309 | auto second_num = GET_ACGU_NUC(second); 310 | 311 | double weight = 1.0f; 312 | if (nodes_with_best_weight[aa].count(n1)) { 313 | weight = lambda_ * log(edges_with_best_weight[aa][make_tuple(n1, second_num, n2)] / nodes_with_best_weight[aa][n1]); 314 | } 315 | 316 | graph.add_edge(n1, n2, second_num, weight); 317 | 318 | for (auto& third : thirds) { 319 | 320 | std::string three_nums = std::string(1, first) + std::string(1, second) + std::string(1, third); 321 | 322 | double weight = 1.0f; 323 | if (nodes_with_best_weight[aa].count(n2)) { 
324 | weight = lambda_ * log(codon.get_weight(aa, three_nums) / nodes_with_best_weight[aa][n2]); 325 | } else { 326 | weight = lambda_ * log(codon.get_weight(aa, three_nums)); 327 | } 328 | 329 | graph.add_edge(n2, make_pair(0,0), GET_ACGU_NUC(third), weight); 330 | } 331 | i++; iter++; 332 | } 333 | aa_graphs[aa] = graph; 334 | } 335 | 336 | inFile.close(); 337 | return aa_graphs; 338 | } 339 | 340 | template , 343 | typename NodeNucNodeType = std::tuple, 344 | typename WeightType = double, 345 | typename LatticeType = Lattice> 346 | void prepare_codon_unit_lattice(const std::string& wheel_path, const Codon& codon, 347 | std::unordered_map& aa_graphs_with_ln_weights_ret, 348 | std::unordered_map, std::tuple, std::hash>>>& 349 | best_path_in_one_codon_unit_ret, 350 | std::unordered_map& aa_best_path_in_a_whole_codon_ret, double lambda_) { 351 | 352 | std::unordered_map> nodes_with_best_weight; 353 | std::unordered_map>> edges_with_best_weight; 354 | 355 | unordered_map aa_graphs_with_ln_weights; 356 | unordered_map aa_graphs_with_weights = read_wheel_with_weights(wheel_path, nodes_with_best_weight, edges_with_best_weight, codon); 357 | 358 | for (auto& aa_aa_elem : aa_graphs_with_weights) { 359 | auto& aa = aa_aa_elem.first; 360 | auto& aa_elem = aa_aa_elem.second; 361 | for (auto& node_at_2 : aa_elem.nodes[2]) { 362 | for (auto& node_at_3_nuc_weight : aa_elem.right_edges[node_at_2]) { 363 | auto node_at_3 = std::get<0>(node_at_3_nuc_weight); 364 | auto nuc = std::get<1>(node_at_3_nuc_weight); 365 | auto weight = std::get<2>(node_at_3_nuc_weight); 366 | nodes_with_best_weight[aa][node_at_2] = max(nodes_with_best_weight[aa][node_at_2], weight); 367 | edges_with_best_weight[aa][make_tuple(node_at_2,nuc,node_at_3)] = weight; 368 | } 369 | } 370 | 371 | for (auto& node_at_1 : aa_elem.nodes[1]) { 372 | for (auto& node_at_2_nuc_weight : aa_elem.right_edges[node_at_1]) { 373 | auto node_at_2 = std::get<0>(node_at_2_nuc_weight); 374 | auto nuc = 
std::get<1>(node_at_2_nuc_weight); 375 | nodes_with_best_weight[aa][node_at_1] = max(nodes_with_best_weight[aa][node_at_1], nodes_with_best_weight[aa][node_at_2]); 376 | edges_with_best_weight[aa][make_tuple(node_at_1,nuc,node_at_2)] = nodes_with_best_weight[aa][node_at_2]; 377 | } 378 | } 379 | 380 | for (auto& node_at_0 : aa_elem.nodes[0]) { 381 | for (auto& node_at_1_nuc_weight : aa_elem.right_edges[node_at_0]) { 382 | auto node_at_1 = std::get<0>(node_at_1_nuc_weight); 383 | auto nuc = std::get<1>(node_at_1_nuc_weight); 384 | nodes_with_best_weight[aa][node_at_0] = max(nodes_with_best_weight[aa][node_at_0], nodes_with_best_weight[aa][node_at_1]); 385 | edges_with_best_weight[aa][make_tuple(node_at_0,nuc,node_at_1)] = nodes_with_best_weight[aa][node_at_1]; 386 | } 387 | } 388 | } 389 | 390 | aa_graphs_with_ln_weights = read_wheel_with_weights_log(wheel_path, nodes_with_best_weight, edges_with_best_weight, codon, lambda_); 391 | 392 | std::unordered_map, 394 | std::tuple, 395 | std::hash>>> 396 | best_path_in_one_codon_unit; 397 | 398 | 399 | for (auto& aa_graph : aa_graphs_with_ln_weights) { 400 | auto& aa = aa_graph.first; 401 | auto& graph = aa_graph.second; 402 | for (auto& node_0 : graph.nodes[0]) { 403 | for (auto& node_1_nuc_log_w : graph.right_edges[node_0]) { 404 | auto node_1 = std::get<0>(node_1_nuc_log_w); 405 | auto nuc = std::get<1>(node_1_nuc_log_w); 406 | auto log_weight = std::get<2>(node_1_nuc_log_w); 407 | 408 | if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_0,node_1))) 409 | best_path_in_one_codon_unit[aa][make_tuple(node_0,node_1)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc); 410 | 411 | double current_log_weight = std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_0,node_1)]); 412 | if (current_log_weight < log_weight) { 413 | best_path_in_one_codon_unit[aa][make_tuple(node_0,node_1)] = make_tuple(log_weight,nuc,k_void_nuc); 414 | } 415 | } 416 | } 417 | 418 | for (auto& node_1 : graph.nodes[1]) { 419 | for 
(auto& node_2_nuc_log_w : graph.right_edges[node_1]) { 420 | auto node_2 = std::get<0>(node_2_nuc_log_w); 421 | auto nuc = std::get<1>(node_2_nuc_log_w); 422 | auto log_weight = std::get<2>(node_2_nuc_log_w); 423 | 424 | if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_1,node_2))) 425 | best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc); 426 | 427 | double current_log_weight = std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)]); 428 | if (current_log_weight < log_weight) { 429 | best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)] = make_tuple(log_weight,nuc,k_void_nuc); 430 | } 431 | 432 | auto temp = best_path_in_one_codon_unit[aa][make_tuple(node_1,node_2)]; 433 | } 434 | } 435 | 436 | for (auto& node_2 : graph.nodes[2]) { 437 | for (auto& node_3_nuc_log_w : graph.right_edges[node_2]) { 438 | auto node_3 = std::get<0>(node_3_nuc_log_w); 439 | auto nuc = std::get<1>(node_3_nuc_log_w); 440 | auto log_weight = std::get<2>(node_3_nuc_log_w); 441 | 442 | if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_2,node_3))) 443 | best_path_in_one_codon_unit[aa][make_tuple(node_2,node_3)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc); 444 | 445 | double current_log_weight = std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_2,node_3)]); 446 | if (current_log_weight < log_weight) { 447 | best_path_in_one_codon_unit[aa][make_tuple(node_2,node_3)] = make_tuple(log_weight,nuc,k_void_nuc); 448 | } 449 | } 450 | } 451 | 452 | for (auto& node_0 : graph.nodes[0]) { 453 | for (auto& node_1_nuc_0_log_weight_0 : graph.right_edges[node_0]) { 454 | auto& node_1 = std::get<0>(node_1_nuc_0_log_weight_0); 455 | auto& nuc_0 = std::get<1>(node_1_nuc_0_log_weight_0); 456 | auto log_weight_0 = std::get<2>(node_1_nuc_0_log_weight_0); 457 | for (auto& node_2_nuc_1_log_weight_1 : graph.right_edges[node_1]) { 458 | auto& node_2 = std::get<0>(node_2_nuc_1_log_weight_1); 459 | 
auto& nuc_1 = std::get<1>(node_2_nuc_1_log_weight_1); 460 | auto log_weight_1 = std::get<2>(node_2_nuc_1_log_weight_1); 461 | 462 | if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_0,node_2))) 463 | best_path_in_one_codon_unit[aa][make_tuple(node_0,node_2)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc); 464 | 465 | if (std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_0,node_2)]) < log_weight_0 + log_weight_1) 466 | best_path_in_one_codon_unit[aa][make_tuple(node_0,node_2)] = make_tuple(log_weight_0 + log_weight_1, nuc_0, nuc_1); 467 | } 468 | } 469 | } 470 | 471 | for (auto& node_1 : graph.nodes[1]) { 472 | for (auto& node_2_nuc_1_log_weight_1 : graph.right_edges[node_1]) { 473 | auto& node_2 = std::get<0>(node_2_nuc_1_log_weight_1); 474 | auto& nuc_1 = std::get<1>(node_2_nuc_1_log_weight_1); 475 | auto log_weight_1 = std::get<2>(node_2_nuc_1_log_weight_1); 476 | for (auto& node_3_nuc_2_log_weight_2 : graph.right_edges[node_2]) { 477 | auto& node_3 = std::get<0>(node_3_nuc_2_log_weight_2); 478 | auto& nuc_2 = std::get<1>(node_3_nuc_2_log_weight_2); 479 | auto log_weight_2 = std::get<2>(node_3_nuc_2_log_weight_2); 480 | 481 | if (!best_path_in_one_codon_unit[aa].count(make_tuple(node_1,node_3))) 482 | best_path_in_one_codon_unit[aa][make_tuple(node_1,node_3)] = make_tuple(util::value_min(),k_void_nuc,k_void_nuc); 483 | 484 | if (std::get<0>(best_path_in_one_codon_unit[aa][make_tuple(node_1,node_3)]) < log_weight_1 + log_weight_2) 485 | best_path_in_one_codon_unit[aa][make_tuple(node_1,node_3)] = make_tuple(log_weight_1 + log_weight_2, nuc_1, nuc_2); 486 | } 487 | } 488 | } 489 | } 490 | 491 | std::unordered_map max_path; 492 | std::unordered_map aa_best_path_in_a_whole_codon; 493 | 494 | for (auto& aa_path_weight : codon.aa_table_) { 495 | auto& aa = aa_path_weight.first; // char 496 | for (auto& path_weight : aa_path_weight.second) { 497 | if (max_path[aa] < path_weight.second) { 498 | max_path[aa] = path_weight.second; 499 | 
aa_best_path_in_a_whole_codon[aa] = path_weight.first; 500 | } 501 | } 502 | } 503 | 504 | aa_graphs_with_ln_weights_ret = aa_graphs_with_ln_weights; 505 | best_path_in_one_codon_unit_ret = best_path_in_one_codon_unit; 506 | aa_best_path_in_a_whole_codon_ret = aa_best_path_in_a_whole_codon; 507 | } 508 | 509 | 510 | 511 | template , 513 | typename LatticeType = Lattice, 514 | typename DFAType = DFA> 515 | DFAType get_dfa(unordered_map aa_graphs, vector aa_seq) { 516 | DFAType dfa = DFAType(); 517 | NodeType newnode = make_pair(3 * static_cast(aa_seq.size()), 0); 518 | dfa.add_node(newnode); 519 | IndexType i = 0; 520 | IndexType i3; 521 | string aa; 522 | LatticeType graph; 523 | for(auto& item : aa_seq) { 524 | i3 = i * 3; 525 | aa = aa_seq[i]; 526 | graph = aa_graphs[aa]; 527 | for (IndexType pos = 0; pos <= 2; pos++) { 528 | for(auto& node : graph.nodes[pos]) { 529 | IndexType num = get<1>(node); 530 | newnode = make_pair(i3 + pos, num); 531 | dfa.add_node(newnode); 532 | for (auto& edge : graph.right_edges[node]) { 533 | NodeType n2 = get<0>(edge); 534 | IndexType nuc = get<1>(edge); 535 | num = get<1>(n2); 536 | NodeType newn2 = make_pair(i3 + pos + 1, num); 537 | dfa.add_edge(newnode, newn2, nuc, get<2>(edge)); 538 | } 539 | } 540 | } 541 | i++; 542 | } 543 | #ifdef is_verbose 544 | printf("-----------------DFA------------------------\n"); 545 | for(IndexType pos = 0; pos < 3 * static_cast(aa_seq.size()) + 1; pos++){ 546 | for(auto& node : dfa.nodes[pos]) { 547 | IndexType p = get<0>(node); 548 | IndexType num = get<1>(node); 549 | printf("node, (%d, %d)\n", p, num); 550 | for(auto &n2 : dfa.auxiliary_right_edges[node]){ 551 | IndexType p2 = get<0>(n2.first); 552 | IndexType num2 = get<1>(n2.first); 553 | for(auto nuc : n2.second){ 554 | printf(" (%d, %d) -(%d,%lf)-> (%d, %d)\n", p, num, get<0>(nuc),get<1>(nuc), p2, num2); 555 | } 556 | } 557 | for(auto &n1 : dfa.auxiliary_left_edges[node]){ 558 | IndexType p1 = get<0>(n1.first); IndexType num1 = 
get<1>(n1.first); 559 | for(auto nuc : n1.second){ 560 | printf(" (%d, %d) <-(%d,%lf)- (%d, %d)\n", p1, num1, get<0>(nuc),get<1>(nuc), p, num); 561 | } 562 | } 563 | } 564 | } 565 | #endif 566 | return dfa; 567 | } 568 | 569 | } 570 | -------------------------------------------------------------------------------- /src/Utils/reader.h: -------------------------------------------------------------------------------- 1 | #ifndef fasta_h 2 | #define fasta_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "base.h" 10 | 11 | namespace LinearDesign { 12 | 13 | struct Reader { 14 | static bool cvt_to_seq(const string& from, string& to) { 15 | return false; 16 | } 17 | }; 18 | 19 | struct Fasta : public Reader { 20 | static map map_fasta; 21 | 22 | static bool cvt_to_seq(const string& fasta, string& nucs) { 23 | nucs.reserve(4 * fasta.length()); 24 | for(auto aa : fasta) { 25 | if (map_fasta.count(aa)) { 26 | nucs.append(map_fasta[aa] + " "); 27 | } else { 28 | cerr << "invalid protein sequence!\n" << endl; 29 | return false; 30 | } 31 | } 32 | nucs.pop_back(); 33 | return true; 34 | } 35 | }; 36 | 37 | map Fasta::map_fasta = { 38 | {'F',"Phe"}, 39 | {'L',"Leu"}, 40 | {'S',"Ser"}, 41 | {'Y',"Tyr"}, 42 | {'*',"STOP"}, 43 | {'C',"Cys"}, 44 | {'W',"Trp"}, 45 | {'P',"Pro"}, 46 | {'H',"His"}, 47 | {'Q',"Gln"}, 48 | {'R',"Arg"}, 49 | {'I',"Ile"}, 50 | {'M',"Met"}, 51 | {'T',"Thr"}, 52 | {'N',"Asn"}, 53 | {'K',"Lys"}, 54 | {'V',"Val"}, 55 | {'D',"Asp"}, 56 | {'E',"Glu"}, 57 | {'G',"Gly"}, 58 | {'A',"Ala"} 59 | }; 60 | 61 | template 62 | struct ReaderTraits { 63 | static bool cvt_to_seq(const string& from, string& to) { 64 | return T::cvt_to_seq(from, to); 65 | } 66 | }; 67 | 68 | } 69 | 70 | #endif /* fasta_h */ -------------------------------------------------------------------------------- /src/Utils/utility_v.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef FASTCKY_UTILITY_V_H 3 | #define 
#ifndef FASTCKY_UTILITY_V_H
#define FASTCKY_UTILITY_V_H

// NOTE(review): the names of the four standard headers included here were
// lost when this listing was extracted; the list below is reconstructed from
// the names this header uses -- confirm against the repository copy.
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// All function-like macros below parenthesise their parameters so that an
// argument containing an operator of lower precedence than `==` (for example
// `c ? 1 : 2`) is compared as a whole. Arguments are still evaluated more
// than once, so do not pass expressions with side effects.

// Nucleotide codes (see GET_ACGU): 1 = A, 2 = C, 3 = G, 4 = U.

// Pair type for nucleotides (x, y); 0 means "no pair".
//   CG = 1, GC = 2, GU = 3, UG = 4, AU = 5, UA = 6
#define NTP(x,y) ((x)==1? ((y)==4?5:0) : ((x)==2? ((y)==3?1:0) : ((x)==3 ? ((y)==2?2:((y)==4?3:0)) : ((x)==4 ? ((y)==3?4:((y)==1?6:0)) : 0))))
// Left nucleotide of pair type x (any value other than 1, 2, 3, 5 yields 4).
#define PTLN(x) ((x)==1? 2:(((x)==2 || (x)==3)? 3:((x)==5)? 1:4))
// Right nucleotide of pair type x (any value other than 1, 2, 4, 6 yields 4).
#define PTRN(x) ((x)==2? 2:(((x)==1 || (x)==4)? 3:((x)==6)? 1:4))

#define NOTON 5 // NUM_OF_TYPE_OF_NUCS
#define NOTOND 25
#define NOTONT 125

#define EXPLICIT_MAX_LEN 4
#define SINGLE_MIN_LEN 0
#define SINGLE_MAX_LEN 20  // NOTE: *must* fit in a `char`, otherwise modify State::TraceInfo accordingly

#define HAIRPIN_MAX_LEN 30
#define BULGE_MAX_LEN SINGLE_MAX_LEN
#define INTERNAL_MAX_LEN SINGLE_MAX_LEN
#define SYMMETRIC_MAX_LEN 15
#define ASYMMETRY_MAX_LEN 28
#define SPECIAL_HAIRPIN_SCORE_BASELINE (-10000)

extern bool _allowed_pairs[NOTON][NOTON];

#define MAXLOOP 30

// Nucleotide code -> letter; anything outside 1..4 yields 'X'.
#define GET_ACGU(x) (((x)==1? 'A' : ((x)==2? 'C' : ((x)==3? 'G' : ((x)==4?'U': 'X')))))

// Letter -> nucleotide code; anything other than A/C/G/U yields 0.
#define GET_ACGU_NUC(x) (((x)=='A'? 1 : ((x)=='C'? 2 : ((x)=='G'? 3 : ((x)=='U'?4: 0)))))

// Maps a length of 5, 6 or 8 to 0, 1 or 2; every other value maps to 3.
#define HAIRPINTYPE(x) (((x)==5?0 : ((x)==6?1 : ((x)==8?2 : 3))))

// The functions below are only declared here; no definition is visible in
// this header (the Makefile links a prebuilt LinearDesign_*.so, which
// presumably provides them).
// NOTE(review): the element type of the three vectors taken by func2 was lost
// in extraction; `int` is assumed -- confirm against the repository copy.

extern int func1(std::string& a, int8_t b);

extern void func2(std::string& a, int b, std::vector<int>& c, std::vector<int>& d, std::vector<int>& e);

extern int func3(int a, int b, int c, int d, int e);

extern int func4(int a, int b, int c, int d, int e, int f, int g);

extern int func5(int a, int b, int c);

extern int func6(int a, int b, int c, int d, int e, int f, int g, int h);

extern int func7(int a, int b, int c, int d, int e, int h, int i);

extern int func8(int a, int b);

extern void func9(int a, int b);

extern int func10(int a, int b, int c);

extern int func11(int a, int b, int c, int d, int e, int f, int g, int h);

extern int func12(int a, int b, int c, int d, int e, int f, int g = -1);

extern int func13(int a, int b);

extern int func14(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k, int l);

extern int func15(int a, int b, int c, int d, int e, int f, int g);

#endif
3 : (x=='U'?4: 0))))) 36 | 37 | #define HAIRPINTYPE(x) ((x==5?0 : (x==6?1 : (x==8?2 : 3)))) 38 | 39 | extern int func1(std::string& a, int8_t b); 40 | 41 | extern void func2(std::string& a, int b, std::vector& c, std::vector& d, std::vector& e); 42 | 43 | extern int func3(int a, int b, int c, int d, int e); 44 | 45 | extern int func4(int a, int b, int c, int d, int e, int f, int g); 46 | 47 | extern int func5(int a, int b, int c); 48 | 49 | extern int func6(int a, int b, int c, int d, int e, int f, int g, int h); 50 | 51 | extern int func7(int a, int b, int c, int d, int e, int h, int i); 52 | 53 | extern int func8(int a, int b); 54 | 55 | extern void func9(int a, int b); 56 | 57 | extern int func10(int a, int b, int c); 58 | 59 | extern int func11(int a, int b, int c, int d, int e, int f, int g, int h); 60 | 61 | extern int func12(int a, int b, int c, int d, int e, int f, int g = -1); 62 | 63 | extern int func13(int a, int b); 64 | 65 | extern int func14(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k, int l); 66 | 67 | extern int func15(int a, int b, int c, int d, int e, int f, int g); 68 | 69 | #endif 70 | 71 | -------------------------------------------------------------------------------- /src/backtrace_iter.cc: -------------------------------------------------------------------------------- 1 | 2 | #include "beam_cky_parser.h" 3 | 4 | using namespace std; 5 | 6 | #define tetra_hex_tri -1 7 | 8 | namespace LinearDesign { 9 | 10 | template 11 | BacktraceResult BeamCKYParser::backtrace(DFA_t& dfa, const State_t& state, NodeType end_node){ 12 | 13 | char sequence[seq_length+1]; 14 | memset(sequence, '.', seq_length); 15 | sequence[seq_length] = 0; 16 | 17 | char structure[seq_length+1]; 18 | memset(structure, '.', seq_length); 19 | structure[seq_length] = 0; 20 | 21 | bool no_backpointer; 22 | 23 | stack> stk; 24 | NodeType start_node = make_pair(0, 0); 25 | stk.push(make_tuple(start_node, end_node, state, Beam_type::BEAM_C, -1)); 26 | 
27 | double epsilon = 1e-8; 28 | 29 | while(!stk.empty()) { 30 | tuple top = stk.top(); 31 | NodeType i_node = get<0>(top), j_node = get<1>(top); 32 | State_t& state = get<2>(top); 33 | Beam_type beam_type = get<3>(top); 34 | PairType curr_pair_nuc = get<4>(top); 35 | stk.pop(); 36 | 37 | IndexType i, j, p, q, hairpin_length; 38 | j = j_node.first; 39 | NucType nuci, nucj, nuci1, nucj_1; 40 | no_backpointer = true; 41 | 42 | int left_start, left_end, right_start, right_end; 43 | 44 | switch (beam_type) { 45 | case Beam_type::BEAM_C: 46 | if (j <= 0) continue; 47 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 48 | auto j_1_node = std::get<0>(j_1_node_nucj_1); 49 | auto& c_state = bestC[j_1_node]; 50 | auto weight_nucj_1 = std::get<2>(j_1_node_nucj_1); 51 | auto cai_score = c_state.cai_score + weight_nucj_1; 52 | 53 | if (state.score == c_state.score && abs(state.cai_score - cai_score) < epsilon){ 54 | NucType nucj_1 = std::get<1>(j_1_node_nucj_1); 55 | stk.push(make_tuple(i_node, j_1_node, c_state, Beam_type::BEAM_C, curr_pair_nuc)); 56 | sequence[j-1] = GET_ACGU(nucj_1); 57 | no_backpointer = false; 58 | break; 59 | } 60 | } 61 | 62 | // C = C + P 63 | if(no_backpointer) { 64 | for (size_t c_node_nucpair_ = 0; c_node_nucpair_ < 16 * seq_length; ++c_node_nucpair_){ 65 | auto& p_state = bestP[j_node][c_node_nucpair_]; 66 | 67 | if (p_state.score == util::value_min()) continue; 68 | auto c_node_nucpair = reverse_index(c_node_nucpair_); 69 | 70 | auto c = c_node_nucpair.node_first; 71 | auto c_num = c_node_nucpair.node_second; 72 | auto pair_nuc = c_node_nucpair.nucpair; 73 | auto c_node = make_pair(c, c_num); 74 | 75 | auto nucc = PTLN(pair_nuc); 76 | auto nucj_1 = PTRN(pair_nuc); 77 | 78 | 79 | auto newscore = - func3(c, j-1, nucc, nucj_1, seq_length) + p_state.score; 80 | 81 | if (c > 0){ 82 | auto& c_state = bestC[c_node]; 83 | auto cai_score = c_state.cai_score + p_state.cai_score; 84 | 85 | if (state.score == c_state.score + newscore && 
abs(state.cai_score - cai_score) < epsilon){ 86 | stk.push(make_tuple(i_node, c_node, c_state, Beam_type::BEAM_C, curr_pair_nuc)); 87 | stk.push(make_tuple(c_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); 88 | no_backpointer = false; 89 | break; 90 | } 91 | } else{ 92 | if (state.score == newscore && abs(state.cai_score - p_state.cai_score) < epsilon){ 93 | stk.push(make_tuple(c_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); 94 | no_backpointer = false; 95 | break; 96 | } 97 | } 98 | } 99 | if (!no_backpointer) break; 100 | } 101 | assert(no_backpointer == false); // something wrong if no path matches 102 | break; 103 | 104 | case Beam_type::BEAM_P: 105 | i = i_node.first; 106 | j = j_node.first; 107 | nuci = PTLN(curr_pair_nuc); 108 | nucj_1 = PTRN(curr_pair_nuc); 109 | 110 | 111 | hairpin_length = j - i; 112 | 113 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 114 | NucType new_nucj_1 = std::get<1>(j_1_node_nucj_1); 115 | if (new_nucj_1 != nucj_1) continue; 116 | 117 | auto j_1_node = std::get<0>(j_1_node_nucj_1); 118 | auto weight_nucj_1 = std::get<2>(j_1_node_nucj_1); 119 | 120 | 121 | 122 | #ifdef SPECIAL_HP 123 | if (hairpin_length == 5 or hairpin_length == 6 or hairpin_length == 8){ 124 | for(auto & seq_score_weight : hairpin_seq_score_cai[i_node][j_1_node][NTP(nuci, nucj_1)]){ 125 | auto seq = get<0>(seq_score_weight); 126 | auto pre_cal_score = get<1>(seq_score_weight); 127 | auto pre_cal_cai_score = get<2>(seq_score_weight); 128 | 129 | 130 | if (state.score == pre_cal_score && abs(state.cai_score - pre_cal_cai_score) < epsilon){ 131 | for(int c=0; c(i1_node_nuci); 148 | if (new_nuci != nuci) continue; 149 | auto i1_node = std::get<0>(i1_node_nuci); 150 | auto weight_nuci = std::get<2>(i1_node_nuci); 151 | 152 | // helix 153 | for (auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ 154 | NucType nucj_2 = std::get<1>(j_2_node_nucj_2); 155 | 156 | for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ 157 | NucType nuci1 = 
std::get<1>(i2_node_nuci1); 158 | auto pair_nuc = NTP(nuci1, nucj_2); 159 | 160 | NodeNucpair temp = {i1_node.first, i1_node.second, static_cast(pair_nuc)}; 161 | 162 | auto& p_state = bestP[j_1_node][temp]; 163 | auto newscore = - func14(i, j-1, i+1, j-2, 164 | nuci, nuci1, nucj_2, nucj_1, 165 | nuci, nuci1, nucj_2, nucj_1) 166 | + p_state.score; 167 | 168 | auto cai_score = p_state.cai_score + (weight_nuci + weight_nucj_1); 169 | 170 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 171 | stk.push(make_tuple(i1_node, j_1_node, p_state, Beam_type::BEAM_P, pair_nuc)); 172 | sequence[i] = GET_ACGU(nuci); 173 | sequence[j-1] = GET_ACGU(nucj_1); 174 | structure[i] = '('; 175 | structure[j-1] = ')'; 176 | 177 | no_backpointer = false; 178 | break; 179 | } 180 | }if (!no_backpointer) break; 181 | }if (!no_backpointer) break; 182 | } 183 | if (!no_backpointer) break; 184 | 185 | // hairpin 186 | NodeNucpair temp = {i_node.first, LinearDesign::NumType(i_node.second), curr_pair_nuc}; 187 | 188 | if (state.score == bestH[j_1_node][temp].score){ //no need to check CAI score here 189 | 190 | for (auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ 191 | NucType nucj_2 = std::get<1>(j_2_node_nucj_2); 192 | auto j_2_node = std::get<0>(j_2_node_nucj_2); 193 | auto j_2 = j_2_node.first; 194 | 195 | auto weight_nucj_2 = std::get<2>(j_2_node_nucj_2); 196 | 197 | for (auto& i1_node_nuci : dfa.right_edges[i_node]){ 198 | NucType new_nuci = std::get<1>(i1_node_nuci); 199 | if (new_nuci != nuci) continue; 200 | 201 | auto i1_node = std::get<0>(i1_node_nuci); 202 | auto weight_nuci = get<2>(i1_node_nuci); 203 | 204 | 205 | for(auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ 206 | NucType nuci1 = std::get<1>(i2_node_nuci1); 207 | auto i2_node = std::get<0>(i2_node_nuci1); 208 | auto i2 = i2_node.first; 209 | auto weight_nuci1 = std::get<2>(i2_node_nuci1); 210 | 211 | if (j - 1 - i == 4 and (j_2_node.second != i2_node.second and dfa.nodes[i+2].size() == 
dfa.nodes[j-2].size())) continue; 212 | 213 | auto newscore = - func12(i, j-1, nuci, nuci1, nucj_2, nucj_1, tetra_hex_tri); 214 | auto cai_score = weight_nuci + weight_nuci1 + get_broken_codon_score(i2_node, j_2_node) + weight_nucj_2 + weight_nucj_1; 215 | 216 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 217 | sequence[i] = GET_ACGU(nuci); 218 | sequence[i+1] = GET_ACGU(nuci1); 219 | sequence[j-2] = GET_ACGU(nucj_2); 220 | sequence[j-1] = GET_ACGU(nucj_1); 221 | structure[i] = '('; 222 | structure[j-1] = ')'; 223 | 224 | auto temp_string = get_nuc_from_dfa_cai(dfa, i2_node, j_2_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 225 | int count = i2; 226 | for (auto & nuc : temp_string ){ 227 | sequence[count] = nuc; 228 | count++; 229 | } 230 | assert(count == j_2); 231 | 232 | no_backpointer = false; 233 | break; 234 | } 235 | }if (!no_backpointer) break; 236 | }if (!no_backpointer) break; 237 | }if (!no_backpointer) break; 238 | } 239 | } 240 | 241 | // single branch 242 | if (no_backpointer) { 243 | vector> right_seq; 244 | vector>, int, int, NodeType, NodeType, double, double, double, double, bool, NodeType>> q_node_nucs_list; 245 | for (IndexType q = j-1; q >= std::max(j - SINGLE_MAX_LEN - 1, i + 5); --q){ 246 | int right_start = -1; 247 | int right_end = -1; 248 | q_node_nucs_list.clear(); 249 | 250 | if (q == j-1){ 251 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 252 | if (get<1>(j_1_node_nucj_1) != nucj_1) continue; 253 | auto q_node = get<0>(j_1_node_nucj_1); 254 | auto weight_nucj_1 = get<2>(j_1_node_nucj_1); 255 | for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ 256 | NodeType q_1_node = get<0>(q_1_node_nucq_1); 257 | auto nucq_1 = get<1>(q_1_node_nucq_1); 258 | double weight_nucq_1 = get<2>(q_1_node_nucq_1); 259 | right_seq.push_back(make_pair(j-1, nucj_1)); 260 | q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucj_1, nucq_1, right_seq, right_start, right_end, 
make_pair(-1,0), make_pair(-1,0), weight_nucq_1, 0., 0., weight_nucj_1, true, make_pair(-1,0))); 261 | right_seq.clear(); 262 | } 263 | } 264 | }else if(q == j-2){ 265 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 266 | if (get<1>(j_1_node_nucj_1) != nucj_1) continue; 267 | auto j_1_node = get<0>(j_1_node_nucj_1); 268 | auto weight_nucj_1 = get<2>(j_1_node_nucj_1); 269 | for (auto& q_node_nucq : dfa.left_edges[j_1_node]){ 270 | auto q_node = get<0>(q_node_nucq); 271 | auto nucq = get<1>(q_node_nucq); 272 | auto weight_nucq = get<2>(q_node_nucq); 273 | for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ 274 | NodeType q_1_node = get<0>(q_1_node_nucq_1); 275 | auto nucq_1 = get<1>(q_1_node_nucq_1); 276 | double weight_nucq_1 = get<2>(q_1_node_nucq_1); 277 | right_seq.push_back(make_pair(q, nucq)); 278 | right_seq.push_back(make_pair(j-1, nucj_1)); 279 | q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucq, right_seq, right_start, right_end, make_pair(-1,0), make_pair(-1,0), weight_nucq_1, weight_nucq, 0., weight_nucj_1, false, j_1_node)); 280 | right_seq.clear(); 281 | } 282 | } 283 | } 284 | }else if(q == j-3){ 285 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 286 | if (get<1>(j_1_node_nucj_1) != nucj_1) continue; 287 | auto j_1_node = get<0>(j_1_node_nucj_1); 288 | auto weight_nucj_1 = get<2>(j_1_node_nucj_1); 289 | for(auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ 290 | auto j_2_node = get<0>(j_2_node_nucj_2); 291 | auto nucj_2 = get<1>(j_2_node_nucj_2); 292 | auto weight_nucj_2 = get<2>(j_2_node_nucj_2); 293 | for (auto& q_node_nucq : dfa.left_edges[j_2_node]){ 294 | auto q_node = get<0>(q_node_nucq); 295 | auto nucq = get<1>(q_node_nucq); 296 | auto weight_nucq = get<2>(q_node_nucq); 297 | for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ 298 | NodeType q_1_node = get<0>(q_1_node_nucq_1); 299 | auto nucq_1 = get<1>(q_1_node_nucq_1); 300 | double weight_nucq_1 = get<2>(q_1_node_nucq_1); 301 | 
right_seq.push_back(make_pair(q, nucq)); 302 | right_seq.push_back(make_pair(j-2, nucj_2)); 303 | right_seq.push_back(make_pair(j-1, nucj_1)); 304 | q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucj_2, right_seq, right_start, right_end, make_pair(-1,0), make_pair(-1,0), weight_nucq_1, weight_nucq, weight_nucj_2, weight_nucj_1, false, j_1_node)); 305 | right_seq.clear(); 306 | } 307 | } 308 | } 309 | } 310 | } 311 | else if(q == j-4){ 312 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 313 | if (get<1>(j_1_node_nucj_1) != nucj_1) continue; 314 | auto j_1_node = get<0>(j_1_node_nucj_1); 315 | auto weight_nucj_1 = get<2>(j_1_node_nucj_1); 316 | for(auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ 317 | auto j_2_node = get<0>(j_2_node_nucj_2); 318 | auto nucj_2 = get<1>(j_2_node_nucj_2); 319 | auto weight_nucj_2 = get<2>(j_2_node_nucj_2); 320 | for(auto& j_3_node_nucj_3 : dfa.left_edges[j_2_node]){ 321 | auto j_3_node = get<0>(j_3_node_nucj_3); 322 | for (auto& q_node_nucq : dfa.left_edges[j_3_node]){ 323 | auto q_node = get<0>(q_node_nucq); 324 | auto nucq = get<1>(q_node_nucq); 325 | auto weight_nucq = get<2>(q_node_nucq); 326 | for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ 327 | NodeType q_1_node = get<0>(q_1_node_nucq_1); 328 | auto nucq_1 = get<1>(q_1_node_nucq_1); 329 | double weight_nucq_1 = get<2>(q_1_node_nucq_1); 330 | right_seq.push_back(make_pair(q, nucq)); 331 | right_seq.push_back(make_pair(j-2, nucj_2)); 332 | right_seq.push_back(make_pair(j-1, nucj_1)); 333 | right_start = q+1; 334 | right_end = j-2; 335 | q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucj_2, right_seq, right_start, right_end, j_3_node, j_2_node, weight_nucq_1, weight_nucq, weight_nucj_2, weight_nucj_1, false, j_1_node)); 336 | right_seq.clear(); 337 | } 338 | } 339 | } 340 | } 341 | } 342 | } 343 | else{ 344 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 345 | if (get<1>(j_1_node_nucj_1) != nucj_1) continue; 
346 | auto j_1_node = get<0>(j_1_node_nucj_1); 347 | auto weight_nucj_1 = get<2>(j_1_node_nucj_1); 348 | for(auto& j_2_node_nucj_2 : dfa.left_edges[j_1_node]){ 349 | auto j_2_node = get<0>(j_2_node_nucj_2); 350 | auto nucj_2 = get<1>(j_2_node_nucj_2); 351 | auto weight_nucj_2 = get<2>(j_2_node_nucj_2); 352 | for (auto& q_node : dfa.nodes[q]){ 353 | for (auto& q1_node_nucq : dfa.right_edges[q_node]){ 354 | auto q1_node = get<0>(q1_node_nucq); 355 | auto nucq = get<1>(q1_node_nucq); 356 | auto weight_nucq = get<2>(q1_node_nucq); 357 | for (auto& q_1_node_nucq_1 : dfa.left_edges[q_node]){ 358 | NodeType q_1_node = get<0>(q_1_node_nucq_1); 359 | auto nucq_1 = get<1>(q_1_node_nucq_1); 360 | double weight_nucq_1 = get<2>(q_1_node_nucq_1); 361 | right_seq.push_back(make_pair(q, nucq)); 362 | right_seq.push_back(make_pair(j-2, nucj_2)); 363 | right_seq.push_back(make_pair(j-1, nucj_1)); 364 | right_start = q+1; 365 | right_end = j-2; 366 | q_node_nucs_list.push_back(make_tuple(q_1_node, q_node, nucq_1, nucq, nucj_2, right_seq, right_start, right_end, q1_node, j_2_node, weight_nucq_1, weight_nucq, weight_nucj_2, weight_nucj_1,false, j_1_node)); 367 | right_seq.clear(); 368 | } 369 | } 370 | } 371 | } 372 | } 373 | } 374 | 375 | for (auto& q_node_nucs : q_node_nucs_list){ 376 | auto q_1_node = get<0>(q_node_nucs); 377 | auto q_node = get<1>(q_node_nucs); 378 | auto nucq_1 = get<2>(q_node_nucs); 379 | auto nucq = get<3>(q_node_nucs); 380 | auto nucj_2 = get<4>(q_node_nucs); 381 | auto right_seq = get<5>(q_node_nucs); 382 | auto right_start = get<6>(q_node_nucs); 383 | auto right_end = get<7>(q_node_nucs); 384 | auto right_start_node = get<8>(q_node_nucs); 385 | auto right_end_node = get<9>(q_node_nucs); 386 | auto weight_nucq_1 = get<10>(q_node_nucs); 387 | auto weight_nucq = get<11>(q_node_nucs); 388 | auto weight_nucj_2 = get<12>(q_node_nucs); 389 | auto weight_nucj_1 = get<13>(q_node_nucs); 390 | bool q_equ_j_1 = get<14>(q_node_nucs); 391 | auto j_1_node = 
get<15>(q_node_nucs); 392 | 393 | double weight_right = 0.0; 394 | double weight_left = 0.0; 395 | 396 | if(q_equ_j_1){ 397 | for (auto& i1_node_nuci : dfa.right_edges[i_node]){ 398 | NucType new_nuci = get<1>(i1_node_nuci); 399 | if (new_nuci != nuci) continue; 400 | auto i1_node = get<0>(i1_node_nuci); 401 | auto weight_nuci = get<2>(i1_node_nuci); 402 | 403 | auto p_list = next_list[nucq_1][i1_node]; 404 | for (auto &p_node_nucp : p_list){ 405 | auto p_node = get<0>(p_node_nucp); 406 | auto nucp = get<1>(p_node_nucp); 407 | auto p = p_node.first; 408 | PairType pair_nuc = NTP(nucp, nucq_1); 409 | 410 | if (p == i + 1) continue; // stack 411 | if (p - i + j - q - 2 > SINGLE_MAX_LEN) continue; 412 | 413 | NodeNucpair temp = {p_node.first, p_node.second, static_cast(pair_nuc)}; 414 | 415 | auto& p_state = bestP[q_node][temp]; 416 | 417 | auto newscore = - func14(i, j-1, p, q-1, nuci, nucp, nucq_1, nucj_1, nuci, nucp, nucq_1, nucj_1) + p_state.score; 418 | auto weight_left = weight_nuci + get_broken_codon_score(i1_node, p_node); 419 | auto cai_score = p_state.cai_score + (weight_left + weight_nucj_1); 420 | 421 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 422 | stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); 423 | 424 | sequence[i] = GET_ACGU(nuci); 425 | 426 | auto temp_i1_to_p_nucs = get_nuc_from_dfa_cai(dfa, i1_node, p_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 427 | assert(temp_i1_to_p_nucs.size() == p - (i+1)); 428 | auto count = i+1; 429 | for (auto& nuc : temp_i1_to_p_nucs){ 430 | sequence[count] = nuc; 431 | count++; 432 | } 433 | assert(count == p); 434 | 435 | assert(right_seq.size() == 1); 436 | sequence[j-1] = GET_ACGU(right_seq[0].second); 437 | 438 | structure[i] = '('; 439 | structure[j-1] = ')'; 440 | 441 | no_backpointer = false; 442 | break; 443 | }if (!no_backpointer) break; 444 | }if (!no_backpointer) break; 445 | } 446 | }else{ 447 | for (auto& 
i1_node_nuci : dfa.right_edges[i_node]){ 448 | NucType new_nuci = get<1>(i1_node_nuci); 449 | if (new_nuci != nuci) continue; 450 | auto i1_node = get<0>(i1_node_nuci); 451 | auto weight_nuci = get<2>(i1_node_nuci); 452 | 453 | auto p_list = next_list[nucq_1][i1_node]; 454 | 455 | for (auto &p_node_nucp : p_list){ 456 | auto p_node = get<0>(p_node_nucp); 457 | auto nucp = get<1>(p_node_nucp); 458 | auto weight_nucp = get<2>(p_node_nucp); 459 | auto p = p_node.first; 460 | PairType pair_nuc = NTP(nucp, nucq_1); 461 | 462 | if (p - i + j - q - 2 > SINGLE_MAX_LEN) continue; 463 | 464 | NodeNucpair temp = {p_node.first, p_node.second, static_cast(pair_nuc)}; 465 | 466 | auto& p_state = bestP[q_node][temp]; 467 | auto newscore = 0; 468 | if (p == i+1){ 469 | newscore = - func14(i, j-1, p, q-1, nuci, nucp, nucj_2, nucj_1, nuci, nucp, nucq_1, nucq) 470 | + p_state.score; 471 | 472 | weight_right = get_broken_codon_score(q_node,j_1_node) + weight_nucj_1; 473 | 474 | auto cai_score = p_state.cai_score + (weight_nuci + weight_right); 475 | 476 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 477 | stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); 478 | 479 | sequence[i] = GET_ACGU(nuci); 480 | for(auto& idx_nucidx : right_seq){ 481 | IndexType idx = idx_nucidx.first; 482 | NucType nucidx = idx_nucidx.second; 483 | sequence[idx] = GET_ACGU(nucidx); 484 | } 485 | 486 | structure[i] = '('; 487 | structure[j-1] = ')'; 488 | 489 | auto temp_string = get_nuc_from_dfa_cai(dfa, q_node, j_1_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 490 | int count = q; 491 | for (auto & nuc : temp_string ){ 492 | sequence[count] = nuc; 493 | count++; 494 | } 495 | 496 | assert(count == j-1); 497 | no_backpointer = false; 498 | break; 499 | } 500 | }else if(p == i+2){ 501 | for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ 502 | auto i2_node = get<0>(i2_node_nuci1); 503 | if (p_node != i2_node) continue; 504 | 
505 | NucType nuci1 = get<1>(i2_node_nuci1); 506 | auto weight_nuci1 = get<2>(i2_node_nuci1); 507 | newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nuci1, nucp, nucq_1, nucq) 508 | + p_state.score; 509 | 510 | weight_left = weight_nuci + weight_nuci1; 511 | weight_right = weight_nucq + get_broken_codon_score(right_start_node, right_end_node) + weight_nucj_2 + weight_nucj_1; 512 | 513 | auto cai_score = p_state.cai_score + (weight_left + weight_right); 514 | 515 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 516 | stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); 517 | 518 | sequence[i] = GET_ACGU(nuci); 519 | sequence[i+1] = GET_ACGU(nuci1); 520 | for(auto& idx_nucidx : right_seq){ 521 | IndexType idx = idx_nucidx.first; 522 | NucType nucidx = idx_nucidx.second; 523 | sequence[idx] = GET_ACGU(nucidx); 524 | } 525 | 526 | structure[i] = '('; 527 | structure[j-1] = ')'; 528 | auto temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 529 | int count = right_start; 530 | for (auto & nuc : temp_string ){ 531 | sequence[count] = nuc; 532 | count++; 533 | } 534 | assert(count == right_end); 535 | no_backpointer = false; 536 | break; 537 | } 538 | } 539 | }else if(p == i+3){ 540 | for (auto& p_1_node_nucp_1 : dfa.left_edges[p_node]){ 541 | auto p_1_node = get<0>(p_1_node_nucp_1); 542 | auto nucp_1 = get<1>(p_1_node_nucp_1); 543 | auto weight_nucp_1 = get<2>(p_1_node_nucp_1); 544 | for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ 545 | auto i2_node = get<0>(i2_node_nuci1); 546 | if (p_1_node != i2_node) continue; 547 | 548 | NucType nuci1 = get<1>(i2_node_nuci1); 549 | auto weight_nuci1 = get<2>(i2_node_nuci1); 550 | newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nucp_1, nucp, nucq_1, nucq) 551 | + p_state.score; 552 | 553 | weight_left = weight_nuci + weight_nuci1 + weight_nucp_1; 554 | 
weight_right = weight_nucq + get_broken_codon_score(right_start_node,right_end_node) + weight_nucj_2 + weight_nucj_1; 555 | 556 | auto cai_score = p_state.cai_score + (weight_left + weight_right); 557 | 558 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 559 | stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); 560 | 561 | sequence[i] = GET_ACGU(nuci); 562 | sequence[i+1] = GET_ACGU(nuci1); 563 | sequence[p-1] = GET_ACGU(nucp_1); 564 | 565 | for(auto& idx_nucidx : right_seq){ 566 | IndexType idx = idx_nucidx.first; 567 | NucType nucidx = idx_nucidx.second; 568 | sequence[idx] = GET_ACGU(nucidx); 569 | } 570 | structure[i] = '('; 571 | structure[j-1] = ')'; 572 | auto temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 573 | int count = right_start; 574 | for (auto & nuc : temp_string ){ 575 | sequence[count] = nuc; 576 | count++; 577 | } 578 | assert(count == right_end); 579 | 580 | no_backpointer = false; 581 | break; 582 | } 583 | }if (!no_backpointer) break; 584 | } 585 | }else if(p == i+4){ 586 | for (auto& p_1_node_nucp_1 : dfa.left_edges[p_node]){ 587 | auto p_1_node = get<0>(p_1_node_nucp_1); 588 | auto nucp_1 = get<1>(p_1_node_nucp_1); 589 | auto weight_nucp_1 = get<2>(p_1_node_nucp_1); 590 | for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ 591 | auto i2_node = get<0>(i2_node_nuci1); 592 | NucType nuci1 = get<1>(i2_node_nuci1); 593 | auto weight_nuci1 = get<2>(i2_node_nuci1); 594 | for (auto& i3_node_nuci2 : dfa.right_edges[i2_node]){ 595 | auto i3_node = get<0>(i3_node_nuci2); 596 | if (i3_node != p_1_node) continue; 597 | auto nuci2 = get<1>(i3_node_nuci2); 598 | auto weight_nuci2 = get<2>(i3_node_nuci2); 599 | 600 | newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nucp_1, nucp, nucq_1, nucq) 601 | + p_state.score; 602 | 603 | weight_left = weight_nuci + weight_nuci1 + weight_nuci2 + 
weight_nucp_1; 604 | weight_right = weight_nucq + get_broken_codon_score(right_start_node,right_end_node) + weight_nucj_2 + weight_nucj_1; 605 | 606 | auto cai_score = p_state.cai_score + (weight_left + weight_right); 607 | 608 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 609 | stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); 610 | 611 | sequence[i] = GET_ACGU(nuci); 612 | sequence[i+1] = GET_ACGU(nuci1); 613 | sequence[i+2] = GET_ACGU(nuci2); 614 | sequence[p-1] = GET_ACGU(nucp_1); 615 | 616 | for(auto& idx_nucidx : right_seq){ 617 | IndexType idx = idx_nucidx.first; 618 | NucType nucidx = idx_nucidx.second; 619 | sequence[idx] = GET_ACGU(nucidx); 620 | } 621 | structure[i] = '('; 622 | structure[j-1] = ')'; 623 | auto temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 624 | auto count = right_start; 625 | for (auto & nuc : temp_string ){ 626 | sequence[count] = nuc; 627 | count++; 628 | } 629 | assert(count == right_end); 630 | 631 | no_backpointer = false; 632 | break; 633 | } 634 | }if (!no_backpointer) break; 635 | }if (!no_backpointer) break; 636 | } 637 | }else{ 638 | for (auto& i2_node_nuci1 : dfa.right_edges[i1_node]){ 639 | NucType nuci1 = get<1>(i2_node_nuci1); 640 | auto i2_node = get<0>(i2_node_nuci1); 641 | auto weight_nuci1 = get<2>(i2_node_nuci1); 642 | 643 | for (auto& p_1_node_nucp_1 : dfa.left_edges[p_node]){ 644 | auto nucp_1 = get<1>(p_1_node_nucp_1); 645 | auto p_1_node = get<0>(p_1_node_nucp_1); 646 | auto weight_nucp_1 = get<2>(p_1_node_nucp_1); 647 | 648 | newscore = - func14(i, j-1, p, q-1, nuci, nuci1, nucj_2, nucj_1, nucp_1, nucp, nucq_1, nucq) 649 | + p_state.score; 650 | 651 | weight_left = weight_nuci + weight_nuci1 + get_broken_codon_score(i2_node, p_1_node) + weight_nucp_1; 652 | weight_right = weight_nucq + get_broken_codon_score(right_start_node,right_end_node) + weight_nucj_2 + 
weight_nucj_1; 653 | 654 | auto cai_score = p_state.cai_score + (weight_left + weight_right); 655 | if (state.score == newscore && abs(state.cai_score - cai_score) < epsilon){ 656 | stk.push(make_tuple(p_node, q_node, p_state, Beam_type::BEAM_P, pair_nuc)); 657 | 658 | sequence[i] = GET_ACGU(nuci); 659 | sequence[i+1] = GET_ACGU(nuci1); 660 | sequence[p-1] = GET_ACGU(nucp_1); 661 | auto temp_string = get_nuc_from_dfa_cai(dfa, i2_node, p_1_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 662 | int count = i+2; 663 | for (auto & nuc : temp_string ){ 664 | sequence[count] = nuc; 665 | count++; 666 | } 667 | assert(count == p-1); 668 | 669 | for(auto& idx_nucidx : right_seq){ 670 | IndexType idx = idx_nucidx.first; 671 | NucType nucidx = idx_nucidx.second; 672 | sequence[idx] = GET_ACGU(nucidx); 673 | } 674 | structure[i] = '('; 675 | structure[j-1] = ')'; 676 | 677 | temp_string = get_nuc_from_dfa_cai(dfa, right_start_node, right_end_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 678 | count = right_start; 679 | for (auto & nuc : temp_string ){ 680 | sequence[count] = nuc; 681 | count++; 682 | } 683 | assert(count == right_end); 684 | no_backpointer = false; 685 | break; 686 | } 687 | }if (!no_backpointer) break; 688 | } 689 | }if (!no_backpointer) break; 690 | 691 | }if (!no_backpointer) break; 692 | }if (!no_backpointer) break; 693 | } 694 | }if (!no_backpointer) break; 695 | } 696 | } 697 | 698 | // Multi 699 | if (no_backpointer){ 700 | 701 | NodeNucpair temp = {i_node.first, i_node.second, static_cast(curr_pair_nuc)}; 702 | 703 | 704 | auto& multi_state = bestMulti[j_node][temp]; 705 | auto newscore = multi_state.score - func15(i, j, nuci, -1, -1, nucj_1, seq_length); 706 | 707 | if (state.score == newscore && abs(state.cai_score - multi_state.cai_score) < epsilon){ 708 | stk.push(make_tuple(i_node, j_node, multi_state, Beam_type::BEAM_MULTI, curr_pair_nuc)); 709 | 710 | sequence[i] = GET_ACGU(nuci); 711 | 
sequence[j-1] = GET_ACGU(nucj_1); 712 | structure[i] = '('; 713 | structure[j-1] = ')'; 714 | 715 | no_backpointer = false; 716 | } 717 | } 718 | 719 | assert(no_backpointer == false); 720 | break; 721 | 722 | case Beam_type::BEAM_MULTI: 723 | nuci = PTLN(curr_pair_nuc); 724 | nucj_1 = PTRN(curr_pair_nuc); 725 | j = j_node.first; 726 | i = i_node.first; 727 | 728 | 729 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 730 | auto j_1_node = get<0>(j_1_node_nucj_1); 731 | auto weight_nucj_1 = get<2>(j_1_node_nucj_1); 732 | NodeType q_node = state.pre_node; 733 | q = q_node.first; 734 | 735 | if(q == j - 1 and q_node != j_1_node) continue; 736 | if(q == j - 2 and dfa.nodes[q].size() == dfa.nodes[j-1].size() and q_node.second != j_1_node.second) continue; 737 | 738 | 739 | 740 | for (size_t p_node_ = 0; p_node_ < 2 * q; ++p_node_) { 741 | 742 | auto& temp_state = bestM2[q_node][p_node_]; 743 | if (temp_state.score == util::value_min()) 744 | continue; 745 | 746 | auto p_node = reverse_index2(p_node_); 747 | auto p = p_node.first; 748 | 749 | if(p <= i) continue; 750 | 751 | for (auto& i1_node_nuci : dfa.right_edges[i_node]){ 752 | 753 | auto i1_node = get<0>(i1_node_nuci); 754 | 755 | if(p == i + 1 and p_node != i1_node) continue; 756 | if(p == i + 2 and dfa.nodes[p].size() == dfa.nodes[i+1].size() and p_node.second != i1_node.second) continue; 757 | 758 | 759 | double weight_nuci = double(get<2>(i1_node_nuci)); 760 | 761 | auto& m2_state = bestM2[q_node][p_node]; 762 | 763 | auto cai_score = m2_state.cai_score + (weight_nuci + get_broken_codon_score(i1_node, p_node) + get_broken_codon_score(q_node, j_1_node) + weight_nucj_1); 764 | 765 | if (state.score == m2_state.score && abs(state.cai_score - cai_score) < epsilon){ 766 | stk.push(make_tuple(p_node, q_node, m2_state, Beam_type::BEAM_M2, -1)); 767 | 768 | auto temp_string = get_nuc_from_dfa_cai(dfa, i1_node, p_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 769 | auto count = i+1; 
770 | for (auto & nuc : temp_string ){ 771 | sequence[count] = nuc; 772 | count++; 773 | } 774 | assert(count == p); 775 | 776 | temp_string.clear(); 777 | temp_string = get_nuc_from_dfa_cai(dfa, q_node, j_1_node, protein, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon); 778 | count = q; 779 | for (auto & nuc : temp_string ){ 780 | sequence[count] = nuc; 781 | count++; 782 | } 783 | 784 | assert(count == j-1); 785 | 786 | no_backpointer = false; 787 | }if (!no_backpointer) break; 788 | }if (!no_backpointer) break; 789 | }if (!no_backpointer) break; 790 | } 791 | assert(no_backpointer == false); 792 | break; 793 | 794 | case Beam_type::BEAM_M2: 795 | // M2 = M + P 796 | i = i_node.first; 797 | j = j_node.first; 798 | 799 | for (size_t m_node_nucpair_ = 0; m_node_nucpair_ < 16 * j; ++m_node_nucpair_){ 800 | 801 | auto& p_state = bestP[j_node][m_node_nucpair_]; 802 | if (p_state.score == util::value_min()) 803 | continue; 804 | 805 | auto m_node_nucpair = reverse_index(m_node_nucpair_); 806 | auto m = m_node_nucpair.node_first; 807 | auto m_num = m_node_nucpair.node_second; 808 | 809 | auto m_node = make_pair(m, m_num); 810 | 811 | auto pair_nuc = m_node_nucpair.nucpair; 812 | 813 | if (m <= i+4) continue; // no sharpturn 814 | 815 | auto nucm = PTLN(pair_nuc); 816 | auto nucj_1 = PTRN(pair_nuc); 817 | auto newscore = - func6(-1, -1, -1, -1, nucm, nucj_1, -1, seq_length) + p_state.score; 818 | 819 | auto& m_state = bestM[m_node][i_node]; 820 | auto cai_score = m_state.cai_score + p_state.cai_score; 821 | 822 | if (state.score == m_state.score + newscore && state.cai_score == cai_score){ 823 | stk.push(make_tuple(i_node, m_node, m_state, Beam_type::BEAM_M1, -1)); 824 | stk.push(make_tuple(m_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); 825 | 826 | no_backpointer = false; 827 | break; 828 | } 829 | if (!no_backpointer) break; 830 | } 831 | assert(no_backpointer == false); 832 | break; 833 | 834 | case Beam_type::BEAM_M1: 835 | // M = M + U 836 | for 
(auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 837 | auto j_1_node = std::get<0>(j_1_node_nucj_1); 838 | auto weight_nucj_1 = std::get<2>(j_1_node_nucj_1); 839 | auto& m_state = bestM[j_1_node][i_node]; 840 | auto cai_score = m_state.cai_score + weight_nucj_1; 841 | 842 | if (state.score == m_state.score && abs(state.cai_score - cai_score) < epsilon) { 843 | NucType nucj_1 = std::get<1>(j_1_node_nucj_1); 844 | stk.push(make_tuple(i_node, j_1_node, m_state, Beam_type::BEAM_M1, -1)); 845 | 846 | sequence[j-1] = GET_ACGU(nucj_1); 847 | 848 | no_backpointer = false; 849 | break; 850 | } 851 | } 852 | 853 | // M = P 854 | if(no_backpointer){ 855 | for (auto& j_1_node_nucj_1 : dfa.left_edges[j_node]){ 856 | NucType nucj_1 = std::get<1>(j_1_node_nucj_1); 857 | 858 | for (auto& i1_node_nuci : dfa.right_edges[i_node]){ 859 | NucType nuci = std::get<1>(i1_node_nuci); 860 | PairType pair_nuc = NTP(nuci, nucj_1); 861 | 862 | NodeNucpair temp = {i_node.first, i_node.second, static_cast(pair_nuc)}; 863 | 864 | auto& p_state = bestP[j_node][temp]; 865 | auto newscore = - func6(-1, -1, -1, -1, nuci, nucj_1, -1, seq_length) + p_state.score; 866 | 867 | if (state.score == newscore && abs(state.cai_score - p_state.cai_score) < epsilon) { 868 | stk.push(make_tuple(i_node, j_node, p_state, Beam_type::BEAM_P, pair_nuc)); 869 | no_backpointer = false; 870 | break; 871 | } 872 | }if(!no_backpointer) break; 873 | } 874 | } 875 | 876 | // M = M2 877 | if(no_backpointer){ 878 | auto& m2_state = bestM2[j_node][i_node]; 879 | 880 | 881 | 882 | if (state.score == m2_state.score && state.cai_score == m2_state.cai_score) { 883 | stk.push(make_tuple(i_node, j_node, m2_state, Beam_type::BEAM_M2, -1)); 884 | no_backpointer = false; 885 | } 886 | } 887 | 888 | assert(no_backpointer == false); 889 | break; 890 | default: // MANNER_NONE or other cases 891 | 892 | printf("wrong beam_type at %d, %d\n", i, j); fflush(stdout); 893 | assert(false); 894 | } 895 | 896 | } 897 | 
assert(string(sequence).size() == string(structure).size()); 898 | return {string(sequence), string(structure)}; 899 | } 900 | } 901 | -------------------------------------------------------------------------------- /src/beam_cky_parser.cc: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "beam_cky_parser.h" 18 | #include "Utils/utility_v.h" 19 | #include "backtrace_iter.cc" 20 | #include "Utils/common.h" 21 | 22 | using namespace std; 23 | 24 | using NodeType = std::pair; 25 | 26 | #define tetra_hex_tri -1 27 | 28 | namespace LinearDesign { 29 | 30 | /* get_broken_codon_score: returns the value looked up in best_path_in_one_codon_unit for the stretch between start_node and end_node. Returns 0.0 when start_node.first >= end_node.first. NOTE(review): the left_ln_cai / right_ln_cai names suggest the value is a log-CAI weight -- confirm. */ template 31 | double BeamCKYParser::get_broken_codon_score( 32 | const NodeType& start_node, const NodeType& end_node) { 33 | 34 | IndexType s_index = start_node.first; 35 | IndexType t_index = end_node.first; 36 | 37 | if (s_index >= t_index) 38 | return 0.0; 39 | 40 | auto aa_left = protein[s_index / 3]; // tri letter 41 | 42 | /* aa_right starts as the entry at s_index / 3 and is replaced by the entry at t_index / 3 only when that index is inside protein */ auto aa_right = protein[(int)(s_index / 3)]; 43 | if (t_index / 3 < protein.size()){ 44 | aa_right = protein[(int)(t_index / 3)]; 45 | } 46 | 47 | /* both nodes are re-keyed by their index modulo 3, keeping the node's second component */ auto start_node_re_index = make_pair(s_index % 3, start_node.second); 48 | auto end_node_re_index = make_pair(t_index % 3, end_node.second); 49 | 50 | double ret = 0.0; 51 | 52 | /* span < 3 with equal index / 3: one direct start->end lookup under aa_left. Every other case adds a start->(0,0) lookup under aa_left (skipped when s_index % 3 == 0) and a (0,0)->end lookup under aa_right (skipped when t_index % 3 == 0). NOTE(review): the two branches that compute that sum are textually identical. */ if (t_index - s_index < 3) { 53 | if (s_index / 3 == t_index / 3) { 54 | ret = std::get<0>(best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index,end_node_re_index)]); 55 | }else{ 56 | double left_ln_cai = 0.0, right_ln_cai = 0.0; 57 | if (s_index % 3 != 0) 58 | left_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index,make_pair(0, 0))]); 59 | if (t_index % 3 != 0) 60 | right_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]); 61
| ret = left_ln_cai + right_ln_cai; 62 | } 63 | }else{ 64 | double left_ln_cai = 0.0, right_ln_cai = 0.0; 65 | if (s_index % 3 != 0) 66 | left_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index,make_pair(0, 0))]); 67 | if (t_index % 3 != 0) 68 | right_ln_cai = std::get<0>(best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]); 69 | ret = left_ln_cai + right_ln_cai; 70 | } 71 | return ret; 72 | } 73 | 74 | /* hairpin_beam: works on the node (j, j_num). The first loop nest writes new entries keyed {j, j_num, NTP(nucj, nucjnext)} into bestH[jnext_node] for every jnext_node listed in next_pair[nucj][j4_node], where j4_node ranges over dfa.nodes[j+4]. The second loop nest walks the entries already stored in bestH[j_node], writes entries for further jnext_node values, and finally passes each entry on to bestP. All writes go through update_if_better. */ template 75 | template 76 | void BeamCKYParser::hairpin_beam(IndexType j, DFA_t& dfa) { 77 | 78 | auto j_node = make_pair(j,j_num); 79 | 80 | for (auto &j1_node_nucj : dfa.right_edges[j_node]) { // right_edges[j][j_num][j1_num][nuc]: false/true 81 | 82 | auto j1_node = std::get<0>(j1_node_nucj); 83 | auto nucj = std::get<1>(j1_node_nucj); 84 | 85 | auto weight_nucj = std::get<2>(j1_node_nucj); 86 | 87 | 88 | for (auto &j4_node : dfa.nodes[j+4]){ 89 | const auto& jnext_list = next_pair[nucj][j4_node]; 90 | 91 | if (jnext_list.empty()) 92 | continue; 93 | 94 | for (auto &jnext_node_nucjnext : jnext_list){ 95 | auto jnext_node = std::get<0>(jnext_node_nucjnext); 96 | auto nucjnext = std::get<1>(jnext_node_nucjnext); 97 | auto weight_nucjnext = std::get<2>(jnext_node_nucjnext); 98 | auto jnext = jnext_node.first; 99 | 100 | IndexType hairpin_length = jnext + 1 - j; //special hairpin 101 | NodeNucpair temp = {j, j_num, static_cast(NTP(nucj, nucjnext))}; 102 | 103 | 104 | /* with SPECIAL_HP defined, lengths 5, 6 and 8 take their score and cai score only from the hairpin_seq_score_cai entries; the generic path below is skipped for them */ #ifdef SPECIAL_HP 105 | if (hairpin_length == 5 or hairpin_length == 6 or hairpin_length == 8){ 106 | for(auto & seq_score_weight : hairpin_seq_score_cai[j_node][jnext_node][NTP(nucj, nucjnext)]){ 107 | /* NOTE(review): seq is read but not used in this loop */ auto seq = get<0>(seq_score_weight); 108 | auto pre_cal_score = get<1>(seq_score_weight); 109 | auto pre_cal_cai_score = get<2>(seq_score_weight); 110 | update_if_better(bestH[jnext_node][temp], pre_cal_score, pre_cal_cai_score); 111 | } 112 | 113 | continue; 114 | } 115 | #endif 116 | 117 | for (auto &j2_node_nucj1 :
dfa.right_edges[j1_node]) { 118 | auto j2_node = std::get<0>(j2_node_nucj1); 119 | auto j2_num = j2_node.second; 120 | auto nucj1 = std::get<1>(j2_node_nucj1); 121 | auto weight_nucj1 = std::get<2>(j2_node_nucj1); 122 | 123 | for (auto& jnext_1_node_list : dfa.auxiliary_left_edges[jnext_node]){ 124 | 125 | NodeType jnext_1_node = jnext_1_node_list.first; 126 | NumType jnext_1_num = jnext_1_node.second; 127 | /* NOTE(review): this skip only fires when jnext - j == 4; the node-count comparison looks like a consistency check between the two inner positions -- confirm intent */ if (jnext - j == 4 and (jnext_1_num != j2_num and dfa.nodes[j+2].size() == dfa.nodes[jnext-1].size())) continue; 128 | 129 | for (auto& nucjnext_1_weight : jnext_1_node_list.second){ 130 | 131 | IndexType nucjnext_1 = get<0>(nucjnext_1_weight); 132 | auto weight_nucjnext_1 = get<1>(nucjnext_1_weight); 133 | 134 | /* candidate score is the negated func12 value; cai_score is the sum of the four edge weights plus the codon score of the stretch j2_node .. jnext_1_node */ auto newscore = - func12(j, jnext, nucj, nucj1, nucjnext_1, nucjnext, tetra_hex_tri); 135 | 136 | FinalScoreType cai_score = weight_nucj + weight_nucj1 + weight_nucjnext_1 + weight_nucjnext; //ZL need to add weight_nucjnext 137 | 138 | /* stretches of at most SINGLE_MAX_LEN are read from get_broken_codon_score_map, longer ones are computed by get_broken_codon_score; presumably the map caches the same values -- confirm */ if ((jnext_1_node.first - j2_node.first) <= SINGLE_MAX_LEN) 139 | cai_score += get_broken_codon_score_map[j2_node][jnext_1_node]; 140 | else 141 | cai_score += get_broken_codon_score(j2_node,jnext_1_node); 142 | 143 | update_if_better(bestH[jnext_node][temp], newscore, cai_score); 144 | 145 | } 146 | } 147 | } 148 | } 149 | } 150 | } 151 | 152 | // for every state h in H[j] 153 | // 1. extend h(i, j) to h(i, jnext) 154 | // 2.
generate p(i, j) 155 | for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_) { 156 | 157 | /* slots whose score equals util::value_min() are skipped; the remaining slot indices are decoded by reverse_index into node_first, node_second and nucpair */ if (bestH[j_node][i_node_nucpair_].score == util::value_min()) 158 | continue; 159 | 160 | auto i_node_nucpair = reverse_index(i_node_nucpair_); 161 | 162 | auto i = i_node_nucpair.node_first; 163 | auto i_num = i_node_nucpair.node_second; 164 | auto pair_nuc = i_node_nucpair.nucpair; 165 | auto i_node = make_pair(i,i_num); 166 | 167 | auto nuci = PTLN(pair_nuc); 168 | auto nucj = PTRN(pair_nuc); 169 | 170 | 171 | for (const auto& item : dfa.auxiliary_right_edges[j_node]){ 172 | auto j1_node = item.first; 173 | /* NOTE(review): plain auto makes a copy of the list here, whereas the first loop nest binds the same kind of lookup by const reference */ auto jnext_list = next_pair[nuci][j1_node]; 174 | 175 | if (jnext_list.empty()) continue; 176 | 177 | for (auto &jnext_node_nucjnext : jnext_list){ 178 | auto jnext_node = std::get<0>(jnext_node_nucjnext); 179 | auto nucjnext = std::get<1>(jnext_node_nucjnext); 180 | auto jnext = jnext_node.first; 181 | auto weight_nucjnext = std::get<2>(jnext_node_nucjnext); 182 | auto hairpin_length = jnext + 1 - i; 183 | 184 | NodeNucpair temp = {i, i_num, static_cast(NTP(nuci, nucjnext))}; 185 | 186 | #ifdef SPECIAL_HP 187 | 188 | if (hairpin_length == 5 or hairpin_length == 6 or hairpin_length == 8){ 189 | for(auto & seq_score_weight : hairpin_seq_score_cai[i_node][jnext_node][NTP(nuci, nucjnext)]){ 190 | /* NOTE(review): seq is read but not used in this loop */ auto seq = get<0>(seq_score_weight); 191 | auto pre_cal_score = get<1>(seq_score_weight); 192 | auto pre_cal_cai_score = get<2>(seq_score_weight); 193 | update_if_better(bestH[jnext_node][temp], pre_cal_score, pre_cal_cai_score); 194 | } 195 | 196 | continue; 197 | } 198 | 199 | #endif 200 | 201 | /* only edges out of i_node that carry the nucleotide nuci decoded from the stored pair are followed */ for (auto &i1_node_newnuci : dfa.right_edges[i_node]){ 202 | NucType newnuci = get<1>(i1_node_newnuci); 203 | if (nuci != newnuci) continue; 204 | NodeType i1_node = get<0>(i1_node_newnuci); 205 | double weight_newnuci = get<2>(i1_node_newnuci); 206 | 207 | 208 | for (auto &i2_node_nuci1 : dfa.right_edges[i1_node]) { 209 | auto i2_node = get<0>(i2_node_nuci1);
210 | auto nuci1 = get<1>(i2_node_nuci1); 211 | auto weight_nuci1 = get<2>(i2_node_nuci1); 212 | 213 | for (auto &jnext_1_node_nucjnext_1 : dfa.left_edges[jnext_node]) { 214 | auto jnext_1_node = get<0>(jnext_1_node_nucjnext_1); 215 | auto nucjnext_1 = get<1>(jnext_1_node_nucjnext_1); 216 | auto weight_nucjnext_1 = get<2>(jnext_1_node_nucjnext_1); 217 | 218 | auto newscore = - func12(i, jnext, nuci, nuci1, nucjnext_1, nucjnext, tetra_hex_tri); 219 | 220 | FinalScoreType cai_score = weight_newnuci + weight_nuci1 + weight_nucjnext_1 + weight_nucjnext; //move weight_nucjnext from H to P to here. Since we added SH here, so it must be here. 221 | 222 | if ((jnext_1_node.first - i2_node.first) <= SINGLE_MAX_LEN) 223 | cai_score += get_broken_codon_score_map[i2_node][jnext_1_node]; 224 | else 225 | cai_score += get_broken_codon_score(i2_node,jnext_1_node); 226 | 227 | update_if_better(bestH[jnext_node][temp], newscore, cai_score); 228 | 229 | } 230 | } 231 | } 232 | } 233 | } 234 | 235 | /* second step: the entry is offered, with its score and cai_score unchanged, to bestP under the same slot index at every successor of j_node reached by an edge carrying nucj */ auto& state = bestH[j_node][i_node_nucpair_]; 236 | 237 | for (auto &j1_node_newnucj : dfa.right_edges[j_node]){ 238 | NucType newnucj = get<1>(j1_node_newnucj); 239 | if (nucj != newnucj) continue; 240 | NodeType j1_node = get<0>(j1_node_newnucj); 241 | update_if_better(bestP[j1_node][i_node_nucpair_], state.score, state.cai_score); 242 | } 243 | } 244 | 245 | } 246 | 247 | /* Multi_beam: for every entry of bestMulti[j_node] whose score is not util::value_min(), (1) offers an entry keyed {i, i_num, NTP(nuci, nucjnext)} to bestMulti at the successors of each jnext_node listed in next_pair[nuci][j_node], and (2) offers the entry to bestP[j_node] with func15(...) subtracted from its score. All writes go through update_if_better. */ template 248 | template 249 | void BeamCKYParser::Multi_beam(IndexType j, DFA_t& dfa){ 250 | 251 | NodeType j_node = make_pair(j, j_num); 252 | 253 | for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ 254 | 255 | auto& new_state_score = bestMulti[j_node][i_node_nucpair_]; 256 | 257 | if (new_state_score.score == util::value_min()) 258 | continue; 259 | 260 | auto i_node_nucpair = reverse_index(i_node_nucpair_); 261 | auto i = i_node_nucpair.node_first; 262 | auto i_num = i_node_nucpair.node_second; 263 | auto pair_nuc = i_node_nucpair.nucpair; 264 | auto nuci = PTLN(pair_nuc); 265 | auto
nucj_1 = PTRN(pair_nuc); 266 | 267 | /* step 1 (the source numbers only step 2): walk the jnext_node candidates listed for nuci at j_node */ auto& jnext_list = next_pair[nuci][j_node]; 268 | 269 | if (!jnext_list.empty()){ 270 | 271 | for (auto &jnext_node_nucjnext : jnext_list){ 272 | auto jnext_node = std::get<0>(jnext_node_nucjnext); 273 | auto nucjnext = std::get<1>(jnext_node_nucjnext); 274 | auto weight_nucjnext = std::get<2>(jnext_node_nucjnext); 275 | /* NOTE(review): jnext is not used below; the code reads jnext_node.first directly */ auto jnext = jnext_node.first; 276 | 277 | for (auto &jnext1_node_newnucjnext : dfa.right_edges[jnext_node]){ 278 | auto jnext1_node = std::get<0>(jnext1_node_newnucjnext); 279 | auto newnucjnext = std::get<1>(jnext1_node_newnucjnext); 280 | /* only the outgoing edge of jnext_node that carries the same nucleotide as the next_pair entry is used */ if (newnucjnext == nucjnext){ 281 | double cai_score; 282 | 283 | /* cai_score = pre_left_cai + codon score of the stretch pre_node .. jnext_node + weight_nucjnext; the codon score comes from get_broken_codon_score_map up to SINGLE_MAX_LEN and from get_broken_codon_score beyond it */ if ((jnext_node.first - new_state_score.pre_node.first) <= SINGLE_MAX_LEN) 284 | cai_score = new_state_score.pre_left_cai + (get_broken_codon_score_map[new_state_score.pre_node][jnext_node] + weight_nucjnext); 285 | else 286 | cai_score = new_state_score.pre_left_cai + (get_broken_codon_score(new_state_score.pre_node, jnext_node) + weight_nucjnext); 287 | 288 | NodeNucpair temp = {i, i_num, static_cast(NTP(nuci, nucjnext))}; 289 | 290 | /* score, pre_node and pre_left_cai are passed through unchanged; only cai_score is recomputed */ update_if_better(bestMulti[jnext1_node][temp], new_state_score.score, cai_score, new_state_score.pre_node, new_state_score.pre_left_cai); 291 | } 292 | } 293 | } 294 | } 295 | // 2.
generate multi(i, j) -> p(i, j) 296 | /* the entry offered to bestP keeps the slot index and cai_score; its score is the multi score minus func15(i, j, nuci, -1, -1, nucj_1, seq_length) */ auto newscore = new_state_score.score - func15(i, j, nuci, -1, -1, nucj_1, seq_length); // hzhang: TODO 297 | update_if_better(bestP[j_node][i_node_nucpair_], newscore, new_state_score.cai_score); 298 | } 299 | } 300 | 301 | template 302 | template 303 | void BeamCKYParser::P_beam(IndexType j, DFA_t& dfa){ 304 | 305 | auto j_node = make_pair(j, j_num); 306 | 307 | if (j < seq_length){ 308 | for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ 309 | 310 | auto& state = bestP[j_node][i_node_nucpair_]; 311 | if (state.score == util::value_min()) 312 | continue; 313 | 314 | auto i_node_nucpair = reverse_index(i_node_nucpair_); 315 | auto i = i_node_nucpair.node_first; 316 | 317 | if (i <= 0) continue; 318 | 319 | auto i_num = i_node_nucpair.node_second; 320 | auto pair_nuc = i_node_nucpair.nucpair; 321 | auto i_node = make_pair(i, i_num); 322 | 323 | auto nuci = PTLN(pair_nuc); 324 | auto nucj_1 = PTRN(pair_nuc); 325 | 326 | // stacking 327 | for (auto &j1_node_nucj : dfa.right_edges[j_node]){ 328 | auto j1_node = std::get<0>(j1_node_nucj); 329 | auto nucj = std::get<1>(j1_node_nucj); 330 | auto weight_nucj = std::get<2>(j1_node_nucj); 331 | 332 | for (auto &i_1_node_nuci_1 : dfa.left_edges[i_node]){ 333 | auto i_1_node = std::get<0>(i_1_node_nuci_1); 334 | auto nuci_1 = std::get<1>(i_1_node_nuci_1); 335 | auto weight_nuci_1 = std::get<2>(i_1_node_nuci_1); 336 | auto outer_pair = NTP(nuci_1, nucj); 337 | if (_allowed_pairs[nuci_1][nucj]){ 338 | auto newscore = stacking_score[outer_pair-1][pair_nuc-1] + state.score; 339 | double cai_score = state.cai_score + (weight_nuci_1 + weight_nucj); 340 | NodeNucpair temp = {i_1_node.first, i_1_node.second, static_cast(NTP(nuci_1, nucj))}; 341 | update_if_better(bestP[j1_node][temp], newscore, cai_score); 342 | } 343 | } 344 | } 345 | 346 | // right bulge: ((...)..)
347 | for (auto &j1_node_list : dfa.auxiliary_right_edges[j_node]){ 348 | auto j1_node = j1_node_list.first; 349 | 350 | for (auto &i_1_node_nuci_1 : dfa.left_edges[i_node]){ 351 | auto i_1_node = std::get<0>(i_1_node_nuci_1); 352 | auto nuci_1 = std::get<1>(i_1_node_nuci_1); 353 | auto weight_nuci_1 = std::get<2>(i_1_node_nuci_1); 354 | 355 | auto q_list = next_list[nuci_1][j1_node]; 356 | 357 | for (auto& q_node_nucq : q_list){ 358 | 359 | auto q_node = std::get<0>(q_node_nucq); 360 | 361 | auto q_num = q_node.second; 362 | auto q = q_node.first; 363 | 364 | if (q-j > SINGLE_MAX_LEN) break; 365 | 366 | auto nucq = std::get<1>(q_node_nucq); 367 | auto weight_nucq = std::get<2>(q_node_nucq); 368 | auto outer_pair = NTP(nuci_1, nucq); 369 | 370 | for(auto& q1_node_list : dfa.auxiliary_right_edges[q_node]){ 371 | NodeType q1_node = q1_node_list.first; 372 | if(dfa.nodes[q].size() == 1 and dfa.nodes[q+1].size() == 2 and ((q1_node_list.second)[0]).first != nucq) continue; 373 | 374 | auto newscore = bulge_score[outer_pair-1][pair_nuc-1][q-j-1] 375 | + state.score; 376 | 377 | double cai_score; 378 | if ((q_node.first - j_node.first) <= SINGLE_MAX_LEN) 379 | cai_score = state.cai_score + (weight_nuci_1 + get_broken_codon_score_map[j_node][q_node] + weight_nucq); 380 | else 381 | cai_score = state.cai_score + (weight_nuci_1 + get_broken_codon_score(j_node, q_node) + weight_nucq); 382 | 383 | NodeNucpair temp = {i_1_node.first, i_1_node.second, static_cast(outer_pair)}; 384 | 385 | update_if_better(bestP[q1_node][temp], newscore, cai_score); 386 | break; 387 | } 388 | } 389 | } 390 | } 391 | 392 | // left bulge: (..(...)) 393 | for (auto &j1_node_nucj : dfa.right_edges[j_node]){ 394 | auto j1_node = std::get<0>(j1_node_nucj); 395 | auto nucj = std::get<1>(j1_node_nucj); 396 | auto weight_nucj = std::get<2>(j1_node_nucj); 397 | 398 | for (auto &i_1_node_list : dfa.auxiliary_left_edges[i_node]){ 399 | auto i_1_node = i_1_node_list.first; 400 | auto p_list = 
prev_list[nucj][i_1_node]; 401 | 402 | for (auto &p_node_nucp_1 : p_list){ 403 | 404 | auto p_node = std::get<0>(p_node_nucp_1); 405 | auto p_num = p_node.second; 406 | auto p = p_node.first; 407 | 408 | if (i-p > SINGLE_MAX_LEN) break; 409 | 410 | auto nucp_1 = std::get<1>(p_node_nucp_1); 411 | auto outer_pair = NTP(nucp_1, nucj); 412 | 413 | for(auto& p_1_node_new_nucp_1 : dfa.left_edges[p_node]){ 414 | 415 | NucType new_nucp_1 = std::get<1>(p_1_node_new_nucp_1); 416 | if(nucp_1 != new_nucp_1) continue; 417 | NodeType p_1_node = std::get<0>(p_1_node_new_nucp_1); 418 | auto weight_nucp_1 = std::get<2>(p_1_node_new_nucp_1); 419 | 420 | auto newscore = bulge_score[outer_pair-1][pair_nuc-1][i-p-1] 421 | + state.score; 422 | 423 | double cai_score; 424 | 425 | if ((i_node.first - p_node.first) <= SINGLE_MAX_LEN) 426 | cai_score = state.cai_score + (weight_nucp_1 + get_broken_codon_score_map[p_node][i_node] + weight_nucj); 427 | else 428 | cai_score = state.cai_score + (weight_nucp_1 + get_broken_codon_score(p_node, i_node) + weight_nucj); 429 | 430 | NodeNucpair temp = {p_1_node.first, (NumType)p_1_node.second, static_cast(outer_pair)}; 431 | update_if_better(bestP[j1_node][temp], newscore, cai_score); 432 | } 433 | } 434 | } 435 | } 436 | 437 | // internal loop 438 | for (auto &j1_node_dict : dfa.auxiliary_right_edges[j_node]){ 439 | auto j1_node = j1_node_dict.first; 440 | auto j1_num = j1_node.second; 441 | 442 | for (auto &i_1_node_nuci_1 : dfa.left_edges[i_node]){ 443 | auto i_1_node = std::get<0>(i_1_node_nuci_1); 444 | auto i_1_num = i_1_node.second; 445 | auto nuci_1 = std::get<1>(i_1_node_nuci_1); 446 | auto weight_nuci_1 = std::get<2>(i_1_node_nuci_1); 447 | 448 | for (IndexType p = i-1; p > max(i - SINGLE_MAX_LEN, 0); --p) {//ZL, i-(p-1)<=len => i - len < p 449 | vector> p_node_list; 450 | 451 | if (p == i - 1) 452 | p_node_list.push_back(i_1_node); 453 | else if (p == i - 2) // hzhang: N.B. 
add this p, i-1, i o--o--o 454 | for (auto &p_node_dict : dfa.auxiliary_left_edges[i_1_node]) 455 | p_node_list.push_back(p_node_dict.first); 456 | else 457 | p_node_list = dfa.nodes[p]; 458 | 459 | for (auto &p_node : p_node_list){ 460 | for (auto &p1_node_nucp : dfa.right_edges[p_node]){ 461 | 462 | auto p1_node = std::get<0>(p1_node_nucp); 463 | auto p1_num = p1_node.second; 464 | auto nucp = std::get<1>(p1_node_nucp); 465 | auto weight_nucp = std::get<2>(p1_node_nucp); 466 | 467 | if (p == i - 1 and nucp != nuci_1) continue; 468 | else if (p == i - 2 and p1_num != i_1_num) continue; 469 | else if (p == i - 3 and p1_num != i_1_num and dfa.nodes[p+1].size() == dfa.nodes[i-1].size()) continue; 470 | 471 | for (auto &p_1_node_nucp_1 : dfa.left_edges[p_node]){ 472 | auto p_1_node = std::get<0>(p_1_node_nucp_1); 473 | auto nucp_1 = std::get<1>(p_1_node_nucp_1); 474 | auto weight_nucp_1 = std::get<2>(p_1_node_nucp_1); 475 | 476 | auto q_list = next_list[nucp_1][j1_node]; 477 | 478 | for (auto &q_node_nucq : q_list){ 479 | 480 | auto q_node = std::get<0>(q_node_nucq); 481 | auto q_num = q_node.second; 482 | auto q = q_node.first; 483 | 484 | if (i-p+q-j > SINGLE_MAX_LEN) //check if q is still in the internal loop limit boundary. 
485 | break; 486 | 487 | auto nucq = std::get<1>(q_node_nucq); 488 | auto weight_nucq = std::get<2>(q_node_nucq); 489 | 490 | for(auto& q1_node_list : dfa.auxiliary_right_edges[q_node]){ 491 | NodeType q1_node = q1_node_list.first; 492 | if(dfa.nodes[q].size() == 1 and dfa.nodes[q+1].size() == 2 and ((q1_node_list.second)[0]).first != nucq) continue; 493 | NodeNucpair temp = {p_1_node.first, p_1_node.second, static_cast(NTP(nucp_1, nucq))}; 494 | auto& BestP_val = bestP[q1_node][temp]; 495 | 496 | for(auto & nucj_weightj: j1_node_dict.second){ 497 | auto nucj = nucj_weightj.first; 498 | auto weight_nucj = nucj_weightj.second; 499 | if (q == j+1){ 500 | auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucj, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; 501 | 502 | double weight_left; 503 | if (p == i-1){ 504 | weight_left = weight_nucp_1 + weight_nucp; 505 | } 506 | else{ 507 | if (i_1_node.first - p1_node.first <= SINGLE_MAX_LEN) 508 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; 509 | else 510 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; 511 | } 512 | double cai_score = state.cai_score + (weight_left + weight_nucj + weight_nucq); //j+1 == q 513 | 514 | update_if_better(BestP_val, newscore, cai_score); 515 | }else if (q == j+2){ 516 | for(auto& q_1_node_list : dfa.auxiliary_left_edges[q_node]){ 517 | auto q_1_node = q_1_node_list.first; 518 | NumType q_1_num = q_1_node.second; 519 | if (q_1_num != j1_num) continue; 520 | for(auto & nucq_1_weight : q_1_node_list.second){ 521 | auto nucq_1 = nucq_1_weight.first; 522 | auto weight_nucq_1 = nucq_1_weight.second; 523 | auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucq_1, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; 524 | 525 | double weight_left; 526 | if (p == i-1){ 527 | weight_left = weight_nucp_1 + weight_nucp; 528 | } 529 | else{ 530 | // assert(p < i-1); 531 | if 
(i_1_node.first - p1_node.first <= SINGLE_MAX_LEN) 532 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; 533 | else 534 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; 535 | } 536 | 537 | auto cai_score = state.cai_score + (weight_left + weight_nucj + weight_nucq_1 + weight_nucq); 538 | 539 | update_if_better(BestP_val, newscore, cai_score); 540 | } 541 | if(dfa.nodes[q-1].size() == 2) break; 542 | } 543 | }else if (q == j + 3){ 544 | for(auto& q_1_node_list : dfa.auxiliary_left_edges[q_node]){ 545 | auto q_1_node = q_1_node_list.first; 546 | NumType q_1_num = q_1_node.second; 547 | if (q_1_num != j1_num and dfa.nodes[q-1].size() == dfa.nodes[j+1].size()) continue; 548 | for(auto & nucq_1_weight : q_1_node_list.second){ 549 | auto nucq_1 = nucq_1_weight.first; 550 | auto weight_nucq_1 = nucq_1_weight.second; 551 | auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucq_1, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; 552 | 553 | double weight_left; 554 | if (p == i-1){ 555 | weight_left = weight_nucp_1 + weight_nucp; 556 | } 557 | else{ 558 | // assert(p < i-1); 559 | if (i_1_node.first - p1_node.first <= SINGLE_MAX_LEN) 560 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; 561 | else 562 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; 563 | } 564 | 565 | double cai_score; 566 | if (q_1_node.first - j1_node.first <= SINGLE_MAX_LEN) 567 | cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score_map[j1_node][q_1_node] + weight_nucq_1 + weight_nucq); 568 | else 569 | cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score(j1_node, q_1_node) + weight_nucq_1 + weight_nucq); 570 | 571 | update_if_better(BestP_val, newscore, cai_score); 572 | } 573 | if(dfa.nodes[q-1].size() == 2) break; 
574 | } 575 | }else{ 576 | for(auto& q_1_node_list : dfa.auxiliary_left_edges[q_node]){ 577 | auto q_1_node = q_1_node_list.first; 578 | for(auto & nucq_1_weight : q_1_node_list.second){ 579 | auto nucq_1 = nucq_1_weight.first; 580 | auto weight_nucq_1 = nucq_1_weight.second; 581 | auto newscore = - func14(p-1, q, i, j-1, nucp_1, nucp, nucq_1, nucq, nuci_1, nuci, nucj_1, nucj) + state.score; 582 | 583 | 584 | double weight_left; 585 | if (p == i-1){ 586 | weight_left = weight_nucp_1 + weight_nucp; 587 | } 588 | else{ 589 | // assert(p < i-1); 590 | if (i_1_node.first - p1_node.first <= SINGLE_MAX_LEN){ 591 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score_map[p1_node][i_1_node] + weight_nuci_1; 592 | } 593 | else 594 | weight_left = weight_nucp_1 + weight_nucp + get_broken_codon_score(p1_node, i_1_node) + weight_nuci_1; 595 | } 596 | 597 | double cai_score; 598 | if (q_1_node.first - j1_node.first <= SINGLE_MAX_LEN){ 599 | cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score_map[j1_node][q_1_node] + weight_nucq_1 + weight_nucq); 600 | } 601 | else 602 | cai_score = state.cai_score + (weight_left + weight_nucj + get_broken_codon_score(j1_node, q_1_node) + weight_nucq_1 + weight_nucq); 603 | 604 | update_if_better(BestP_val, newscore, cai_score); 605 | } 606 | if(dfa.nodes[q-1].size() == 2) break; 607 | } 608 | } 609 | } 610 | } 611 | } 612 | } 613 | } 614 | } 615 | } 616 | } 617 | } 618 | } 619 | } 620 | // M = P and M_P = P 621 | for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ 622 | auto& state = bestP[j_node][i_node_nucpair_]; 623 | if (state.score == util::value_min()) 624 | continue; 625 | 626 | auto i_node_nucpair = reverse_index(i_node_nucpair_); 627 | auto i = i_node_nucpair.node_first; 628 | auto i_num = i_node_nucpair.node_second; 629 | auto pair_nuc = i_node_nucpair.nucpair; 630 | auto i_node = make_pair(i, i_num); 631 | 632 | auto nuci = PTLN(pair_nuc); 633 | auto nucj_1 = 
PTRN(pair_nuc); 634 | 635 | if (i > 0 and j < seq_length){ 636 | 637 | auto M1_score = - func6(i, j-1, j-1, -1, nuci, nucj_1, -1, seq_length) + state.score; 638 | 639 | update_if_better(bestM[j_node][i_node], M1_score, state.cai_score); 640 | update_if_better(bestM_P[j_node][i_node], M1_score, state.cai_score); 641 | } 642 | } 643 | 644 | // M2 = M + M_P 645 | for (size_t i_node_ = 0; i_node_ < 2 * j; ++i_node_) { 646 | auto& state = bestM_P[j_node][i_node_]; 647 | auto i_node = reverse_index2(i_node_); 648 | auto i = i_node.first; 649 | 650 | if (state.score == util::value_min()) 651 | continue; 652 | 653 | if (i > 0 and j < seq_length){ 654 | for (size_t m_node = 0; m_node < 2 * i; ++m_node){ 655 | auto& m_new_state_score = bestM[i_node][m_node]; 656 | 657 | if (m_new_state_score.score == util::value_min()) 658 | continue; 659 | 660 | auto newscore = m_new_state_score.score + state.score; 661 | auto cai_score = m_new_state_score.cai_score + state.cai_score; 662 | update_if_better(bestM2[j_node][m_node], newscore, cai_score); 663 | } 664 | } 665 | } 666 | 667 | // C = C + P 668 | for (size_t i_node_nucpair_ = 0; i_node_nucpair_ < 16 * j; ++i_node_nucpair_){ 669 | auto& state = bestP[j_node][i_node_nucpair_]; 670 | if (state.score == util::value_min()) 671 | continue; 672 | 673 | auto i_node_nucpair = reverse_index(i_node_nucpair_); 674 | auto i = i_node_nucpair.node_first; 675 | auto i_num = i_node_nucpair.node_second; 676 | auto pair_nuc = i_node_nucpair.nucpair; 677 | auto i_node = make_pair(i, i_num); 678 | 679 | auto nuci = PTLN(pair_nuc); 680 | auto nucj_1 = PTRN(pair_nuc); 681 | 682 | if (i > 0){ 683 | auto& prefix_C = bestC[i_node]; 684 | 685 | if (prefix_C.score != util::value_min()){ 686 | auto newscore = - func3(i, j-1, nuci, nucj_1, seq_length) + prefix_C.score + state.score; 687 | 688 | auto cai_score = prefix_C.cai_score + state.cai_score; 689 | update_if_better(bestC[j_node], newscore, cai_score); 690 | } 691 | } 692 | else{ 693 | auto newscore = - 
func3(0, j-1, nuci, nucj_1, seq_length) + state.score; 694 | 695 | update_if_better(bestC[j_node], newscore, state.cai_score); 696 | } 697 | } 698 | } 699 | 700 | template 701 | template 702 | void BeamCKYParser::M2_beam(IndexType j, DFA_t& dfa){ 703 | 704 | auto j_node = make_pair(j, j_num); 705 | for (size_t i_node_ = 0; i_node_ < 2 * j; ++i_node_) { 706 | auto& state = bestM2[j_node][i_node_]; 707 | if (state.score == util::value_min()) 708 | continue; 709 | 710 | auto i_node = reverse_index2(i_node_); 711 | auto i = i_node.first; 712 | 713 | // 1. multi-loop 714 | for (IndexType p = i-1; p >= max(i - SINGLE_MAX_LEN, 0); --p){ 715 | vector> p_node_list; 716 | if (p == i - 1) 717 | for(auto& p_node_dict : dfa.auxiliary_left_edges[i_node]) 718 | p_node_list.push_back(p_node_dict.first); 719 | else p_node_list = dfa.nodes[p]; 720 | 721 | for (auto &p_node : p_node_list){ 722 | for (auto &p1_node_nucp : dfa.right_edges[p_node]){ 723 | auto p1_node = std::get<0>(p1_node_nucp); 724 | auto nucp = std::get<1>(p1_node_nucp); 725 | auto weight_nucp = std::get<2>(p1_node_nucp); 726 | 727 | if(p == i - 1 and p1_node != i_node) continue; 728 | if(p == i - 2 and dfa.nodes[p+1].size() == dfa.nodes[i].size() and p1_node.second != i_node.second) continue; 729 | 730 | auto q_list = next_pair[nucp][j_node]; 731 | 732 | for (auto &q_node_nucq : q_list){ 733 | auto q_node = std::get<0>(q_node_nucq); 734 | auto nucq = std::get<1>(q_node_nucq); 735 | auto weight_nucq = std::get<2>(q_node_nucq); 736 | auto q = q_node.first; 737 | 738 | if (i - p + q - j - 1 > SINGLE_MAX_LEN) continue; //ZL, i-p-1+q-j 739 | auto outer_pair = NTP(nucp, nucq); 740 | for (auto &q1_node_newnucq : dfa.right_edges[q_node]){ 741 | auto newnucq = std::get<1>(q1_node_newnucq); 742 | if (newnucq == nucq) { 743 | auto q1_node = std::get<0>(q1_node_newnucq); 744 | 745 | double cai_score = state.cai_score + (weight_nucp + get_broken_codon_score_map[p1_node][i_node] + get_broken_codon_score_map[j_node][q_node] + 
weight_nucq); 746 | double temp_left_cai = state.cai_score + (weight_nucp + get_broken_codon_score_map[p1_node][i_node]); 747 | NodeNucpair temp = {p_node.first, p_node.second, static_cast(NTP(nucp, nucq))}; 748 | update_if_better(bestMulti[q1_node][temp], state.score, cai_score, j_node, temp_left_cai); 749 | break; 750 | } 751 | } 752 | } 753 | } 754 | } 755 | } 756 | // 2. M = M2 757 | update_if_better(bestM[j_node][i_node], state.score, state.cai_score); 758 | } 759 | } 760 | 761 | template 762 | template 763 | void BeamCKYParser::M_beam(IndexType j, DFA_t& dfa) 764 | { 765 | 766 | auto j_node = make_pair(j, j_num); 767 | 768 | for (size_t i_node_ = 0; i_node_ < 2 * j; ++i_node_) { 769 | auto& state = bestM[j_node][i_node_]; 770 | 771 | if (state.score == util::value_min()) 772 | continue; 773 | 774 | auto i_node = reverse_index2(i_node_); 775 | for (auto &j1_node_nucj : dfa.right_edges[j_node]){ 776 | auto j1_node = std::get<0>(j1_node_nucj); 777 | auto nucj = std::get<1>(j1_node_nucj); 778 | auto weight_nucj = std::get<2>(j1_node_nucj); 779 | 780 | double cai_score = state.cai_score + weight_nucj; 781 | update_if_better(bestM[j1_node][i_node], state.score, cai_score); 782 | } 783 | } 784 | } 785 | 786 | template 787 | template 788 | void BeamCKYParser::C_beam(IndexType j, DFA_t& dfa) 789 | { 790 | // beam of C 791 | // C = C + U 792 | auto j_node = make_pair(j, j_num); 793 | 794 | auto& state = bestC[j_node]; 795 | 796 | 797 | for (auto &j1_node_nucj : dfa.right_edges[j_node]){ 798 | NodeType j1_node = std::get<0>(j1_node_nucj); 799 | IndexType nucj = std::get<1>(j1_node_nucj); 800 | auto weight_nucj = std::get<2>(j1_node_nucj); 801 | 802 | double cai_score = state.cai_score + (double)weight_nucj; 803 | update_if_better(bestC[j1_node], state.score, cai_score); 804 | } 805 | } 806 | 807 | template 808 | void BeamCKYParser::get_next_pair(DFA_t& dfa) { 809 | vector> temp_vector; 810 | for (NucType nuci = 0; nuci < NOTON; nuci++) { 811 | for (IndexType j = 
seq_length; j > 0; j--) { 812 | 813 | for (auto& j_node : dfa.nodes[j]) { 814 | for (auto& item : dfa.auxiliary_left_edges[j_node]) { 815 | NodeType j_1_node = item.first; 816 | temp_vector.clear(); 817 | for (auto& nuc_weight : item.second){ 818 | auto nuc = std::get<0>(nuc_weight); 819 | auto weight_nuc = std::get<1>(nuc_weight); 820 | if (_allowed_pairs[nuci][nuc]) 821 | temp_vector.push_back(make_tuple(j_1_node, nuc, weight_nuc)); 822 | } 823 | if(temp_vector.size() == 0){ 824 | if (next_pair[nuci][j_1_node].size() > 0 and next_pair[nuci][j_node].size() > 0) { 825 | // merge 826 | IndexType index1 = std::get<0>(next_pair[nuci][j_1_node][0]).first; 827 | IndexType index2 = std::get<0>(next_pair[nuci][j_node][0]).first; 828 | if(index1/3 == index2/3) 829 | next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), 830 | next_pair[nuci][j_node].begin(), 831 | next_pair[nuci][j_node].end()); 832 | else if(index1 > index2){ 833 | next_pair[nuci][j_1_node].clear(); 834 | next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), 835 | next_pair[nuci][j_node].begin(), 836 | next_pair[nuci][j_node].end()); 837 | } 838 | }else if (next_pair[nuci][j_node].size() > 0) 839 | next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), 840 | next_pair[nuci][j_node].begin(), 841 | next_pair[nuci][j_node].end()); 842 | } 843 | else 844 | next_pair[nuci][j_1_node].insert(next_pair[nuci][j_1_node].end(), 845 | temp_vector.begin(), 846 | temp_vector.end()); 847 | } 848 | } 849 | } 850 | } 851 | } 852 | 853 | template 854 | void BeamCKYParser::get_next_pair_set() { 855 | 856 | for(NucType nuci=0; nuci<5; nuci++){ 857 | for (auto& j_node_vnuc : next_pair[nuci]) { 858 | NodeType j_node = j_node_vnuc.first; 859 | next_pair_set[nuci][j_node] = set>(j_node_vnuc.second.begin(), j_node_vnuc.second.end()); 860 | } 861 | } 862 | for(NucType nuci=0; nuci<5; nuci++){ 863 | for (auto& j_node_vnuc : next_pair_set[nuci]) { 864 | NodeType j_node = j_node_vnuc.first; 865 | 
next_pair[nuci][j_node].clear(); 866 | for(auto& item : next_pair_set[nuci][j_node]){ 867 | next_pair[nuci][j_node].push_back(item); 868 | } 869 | } 870 | } 871 | } 872 | 873 | template 874 | void BeamCKYParser::get_prev_pair(DFA_t& dfa) { 875 | vector> temp_vector; 876 | for (NucType nuci = 0; nuci < NOTON; nuci++) { 877 | for (IndexType j = 0; j < seq_length; j++) { 878 | for (auto& j_node : dfa.nodes[j]) { 879 | for (auto& item : dfa.auxiliary_right_edges[j_node]) { 880 | NodeType j1_node = item.first; 881 | temp_vector.clear(); 882 | for (auto& nuc_weight : item.second){ 883 | auto nuc = std::get<0>(nuc_weight); 884 | auto weight_nuc = std::get<1>(nuc_weight); 885 | if (_allowed_pairs[nuci][nuc]) 886 | temp_vector.push_back(make_tuple(j1_node, nuc, weight_nuc)); 887 | } 888 | if(temp_vector.size() == 0){ 889 | if (prev_pair[nuci][j1_node].size() > 0 and prev_pair[nuci][j_node].size() > 0) { 890 | // merge 891 | IndexType index1 = std::get<0>(prev_pair[nuci][j1_node][0]).first-1; 892 | IndexType index2 = std::get<0>(prev_pair[nuci][j_node][0]).first-1; 893 | if(index1/3 == index2/3) 894 | prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), 895 | prev_pair[nuci][j_node].begin(), 896 | prev_pair[nuci][j_node].end()); 897 | else if(index1 < index2){ 898 | prev_pair[nuci][j1_node].clear(); 899 | prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), 900 | prev_pair[nuci][j_node].begin(), 901 | prev_pair[nuci][j_node].end()); 902 | } 903 | }else if (prev_pair[nuci][j_node].size() > 0) 904 | prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), 905 | prev_pair[nuci][j_node].begin(), 906 | prev_pair[nuci][j_node].end()); 907 | } 908 | else 909 | prev_pair[nuci][j1_node].insert(prev_pair[nuci][j1_node].end(), 910 | temp_vector.begin(), 911 | temp_vector.end()); 912 | } 913 | } 914 | } 915 | } 916 | } 917 | 918 | template 919 | void BeamCKYParser::get_prev_pair_set() { 920 | 921 | for(NucType nuci=0; nuci<5; nuci++){ 922 | for (auto& 
j_node_vnuc : prev_pair[nuci]) { 923 | NodeType j_node = j_node_vnuc.first; 924 | prev_pair_set[nuci][j_node] = 925 | set>(j_node_vnuc.second.begin(), j_node_vnuc.second.end()); 926 | } 927 | } 928 | for(NucType nuci=0; nuci<5; nuci++){ 929 | for (auto& j_node_vnuc : prev_pair_set[nuci]) { 930 | NodeType j_node = j_node_vnuc.first; 931 | prev_pair[nuci][j_node].clear(); 932 | for(auto& item : prev_pair_set[nuci][j_node]){ 933 | prev_pair[nuci][j_node].push_back(item); 934 | } 935 | } 936 | } 937 | } 938 | 939 | #ifdef SPECIAL_HP 940 | template 941 | void BeamCKYParser::special_hp(DFA_t& dfa, int8_t hairpin_length) { 942 | int8_t hairpin_type = HAIRPINTYPE(hairpin_length); 943 | vector> queue; 944 | vector> frontier; 945 | // vector 946 | for(IndexType i=0; i<=seq_length - hairpin_length; i++){ 947 | for(NodeType i_node : dfa.nodes[i]){ 948 | int count = hairpin_length; 949 | queue.clear(); 950 | queue.push_back(make_tuple(i_node, "", double(0.), i_node)); 951 | while(count > 0){ 952 | count --; 953 | frontier.clear(); 954 | for(auto& node_str : queue){ 955 | NodeType cur_node = std::get<0>(node_str); 956 | string cur_str = std::get<1>(node_str); 957 | double cur_lncai = std::get<2>(node_str); 958 | for(auto& node_nuc : dfa.right_edges[cur_node]){ 959 | NodeType new_node = std::get<0>(node_nuc); 960 | string new_str = cur_str + GET_ACGU(std::get<1>(node_nuc)); 961 | double new_total_lncai = cur_lncai + std::get<2>(node_nuc); 962 | frontier.push_back(make_tuple(new_node, new_str, new_total_lncai, cur_node)); 963 | } 964 | } 965 | queue.swap(frontier); 966 | } 967 | for(auto node_str : queue){ 968 | auto j_node = std::get<3>(node_str); 969 | auto temp_seq = std::get<1>(node_str); 970 | auto cai_score = std::get<2>(node_str); 971 | auto hairpin_length = temp_seq.size(); 972 | int8_t hairpin_type = HAIRPINTYPE(hairpin_length); 973 | NucType nuci = GET_ACGU_NUC(temp_seq[0]); 974 | NucType nucj = GET_ACGU_NUC(temp_seq[temp_seq.size() - 1]); 975 | auto temp_nucpair = 
NTP(nuci, nucj); 976 | 977 | ScoreType special_hairpin_score = func1(temp_seq, hairpin_type); 978 | if(special_hairpin_score == SPECIAL_HAIRPIN_SCORE_BASELINE){ 979 | 980 | auto newscore = - func12(0, hairpin_length - 1, GET_ACGU_NUC(temp_seq[0]), GET_ACGU_NUC(temp_seq[1]), GET_ACGU_NUC(temp_seq[hairpin_length-2]), GET_ACGU_NUC(temp_seq[hairpin_length-1]), tetra_hex_tri); 981 | hairpin_seq_score_cai[i_node][j_node][temp_nucpair].push_back(make_tuple(temp_seq, newscore, cai_score)); 982 | } 983 | 984 | else{ 985 | hairpin_seq_score_cai[i_node][j_node][temp_nucpair].push_back(make_tuple(temp_seq, special_hairpin_score, cai_score)); 986 | } 987 | } 988 | } 989 | } 990 | } 991 | #endif 992 | 993 | template 994 | void BeamCKYParser::preprocess(DFA_t& dfa) { 995 | 996 | vector> new_q_list, new_p_list; 997 | set visited; 998 | 999 | // next_list 1000 | NodeType init_node = make_pair(0, 0); 1001 | for (NucType nuci=1; nuci(q_node_nucq); 1011 | auto q_num = q_node.second; 1012 | auto q = q_node.first; 1013 | auto nucq = std::get<1>(q_node_nucq); 1014 | 1015 | // q_node 1016 | next_list[nuci][q_node].push_back(q_node_nucq); 1017 | // q-1 is special 1018 | for(auto& q_1_node_dict : dfa.auxiliary_left_edges[q_node]){ 1019 | NodeType q_1_node = q_1_node_dict.first; 1020 | next_list[nuci][q_1_node].push_back(q_node_nucq); 1021 | } 1022 | for(IndexType j=q-2; j>=max(0, q-SINGLE_MAX_LEN-1); j--) 1023 | for(NodeType j_node : dfa.nodes[j]) 1024 | next_list[nuci][j_node].push_back(q_node_nucq); 1025 | 1026 | for(auto& q1_node_list : dfa.auxiliary_right_edges[q_node]){ 1027 | NodeType q1_node = q1_node_list.first; 1028 | 1029 | if(dfa.nodes[q].size() == 1 and dfa.nodes[q+1].size() == 2 and ((q1_node_list.second)[0]).first != nucq) continue; 1030 | 1031 | if(visited.find(q1_node) == visited.end()){ 1032 | visited.insert(q1_node); 1033 | new_q_list.insert(new_q_list.end(), next_pair[nuci][q1_node].cbegin(), next_pair[nuci][q1_node].cend()); 1034 | } 1035 | break; 1036 | } 1037 | } 1038 
| q_list.swap(new_q_list); 1039 | } 1040 | } 1041 | 1042 | // prev_list 1043 | init_node = make_pair(seq_length, 0); 1044 | for (NucType nucj=1; nucj(p_node_nucp_1); 1053 | auto p_num = p_node.second; 1054 | auto p = p_node.first; 1055 | auto nucp_1 = std::get<1>(p_node_nucp_1); 1056 | 1057 | // p_node 1058 | prev_list[nucj][p_node].push_back(p_node_nucp_1); 1059 | // p+1 is special 1060 | for(auto& p1_node_dict : dfa.auxiliary_right_edges[p_node]){ 1061 | NodeType p1_node = p1_node_dict.first; 1062 | prev_list[nucj][p1_node].push_back(p_node_nucp_1); 1063 | } 1064 | for(IndexType i=p+2; i<=min(seq_length, p+SINGLE_MAX_LEN+1); i++) 1065 | for(NodeType i_node : dfa.nodes[i]) 1066 | prev_list[nucj][i_node].push_back(p_node_nucp_1); 1067 | 1068 | for(auto& p_1_node_new_nucp_1 : dfa.left_edges[p_node]){ 1069 | 1070 | NucType new_nucp_1 = std::get<1>(p_1_node_new_nucp_1); 1071 | if(nucp_1 != new_nucp_1) continue; 1072 | NodeType p_1_node = std::get<0>(p_1_node_new_nucp_1); 1073 | 1074 | if(visited.find(p_1_node) == visited.end()){ 1075 | visited.insert(p_1_node); 1076 | new_p_list.insert(new_p_list.end(), prev_pair[nucj][p_1_node].cbegin(), prev_pair[nucj][p_1_node].cend()); 1077 | } 1078 | } 1079 | } 1080 | p_list.swap(new_p_list); 1081 | } 1082 | } 1083 | 1084 | // stacking energy computation 1085 | int newscore; 1086 | for(int8_t outer_pair=1; outer_pair<=6; outer_pair++){ 1087 | auto nuci_1 = PTLN(outer_pair); 1088 | auto nucq = PTRN(outer_pair); 1089 | for(int8_t inner_pair=1; inner_pair<=6; inner_pair++){ 1090 | auto nuci = PTLN(inner_pair); 1091 | auto nucj_1 = PTRN(inner_pair); 1092 | newscore = - func14(0, 1, 1, 0, 1093 | nuci_1, nuci, nucj_1, nucq, 1094 | nuci_1, nuci, nucj_1, nucq); 1095 | stacking_score[outer_pair-1][inner_pair-1] = newscore; 1096 | 1097 | for (IndexType l=0; l<=SINGLE_MAX_LEN; l++){ 1098 | newscore = - func14(0, l+2, 1, 0, 1099 | nuci_1, nuci, nucj_1, nucq, 1100 | nuci_1, nuci, nucj_1, nucq); 1101 | 1102 | 
bulge_score[outer_pair-1][inner_pair-1][l] = newscore; 1103 | } 1104 | } 1105 | } 1106 | 1107 | #ifdef SPECIAL_HP 1108 | // Triloops 1109 | special_hp(dfa, 5); 1110 | // Tetraloop37 1111 | special_hp(dfa, 6); 1112 | // Hexaloops 1113 | special_hp(dfa, 8); 1114 | #endif 1115 | } 1116 | 1117 | template 1118 | DecoderResult BeamCKYParser::parse( 1119 | DFA_t& dfa, Codon& codon, std::string& aa_seq, std::vector& p, 1120 | std::unordered_map& aa_best_in_codon, 1121 | std::unordered_map, std::tuple, 1122 | std::hash>>>& best_path_in_one_codon, 1123 | std::unordered_map>& aa_graphs_with_ln_weights) { 1124 | 1125 | 1126 | protein = p; 1127 | aa_graphs_with_ln_w = aa_graphs_with_ln_weights; 1128 | aa_best_path_in_a_whole_codon = aa_best_in_codon; 1129 | best_path_in_one_codon_unit = best_path_in_one_codon; 1130 | 1131 | seq_length = 3 * static_cast(aa_seq.size()); 1132 | next_pair.resize(5); 1133 | next_pair_set.resize(5); 1134 | get_next_pair(dfa); 1135 | get_next_pair_set(); 1136 | 1137 | prev_pair.resize(5); 1138 | prev_pair_set.resize(5); 1139 | get_prev_pair(dfa); 1140 | get_prev_pair_set(); 1141 | 1142 | next_list.resize(5); 1143 | prev_list.resize(5); 1144 | stacking_score.resize(6, vector(6)); 1145 | bulge_score.resize(6, vector>(6, vector(SINGLE_MAX_LEN+1))); 1146 | 1147 | preprocess(dfa); 1148 | 1149 | 1150 | int reserved_size = (seq_length + 1) * 16; //node,nucpair 1151 | int reserved_size2 = (seq_length + 1) * 2; //node 1152 | 1153 | bestH.resize(reserved_size2); 1154 | bestP.resize(reserved_size2); 1155 | bestM2.resize(reserved_size2); //slim signature, Liang Zhang 1156 | bestMulti.resize(reserved_size2); 1157 | bestM.resize(reserved_size2); //slim signature, Liang Zhang 1158 | bestM_P.resize(reserved_size2); // hzhang: inter-state: P -> M 1159 | 1160 | bestC.resize(reserved_size2); //slim signature, Liang Zhang 1161 | 1162 | get_broken_codon_score_map.resize(reserved_size2); 1163 | for (auto& e : get_broken_codon_score_map){ //slim signature, Liang Zhang 1164 
| e.resize(reserved_size2); 1165 | for (auto& ee : e){ 1166 | ee = util::value_min(); 1167 | } 1168 | } 1169 | 1170 | for (auto& ee : bestC){ 1171 | ee.score = util::value_min(); 1172 | ee.cai_score = util::value_min(); 1173 | } 1174 | 1175 | 1176 | for (auto& e : bestH){ 1177 | e.resize(reserved_size); 1178 | for (auto& ee : e){ 1179 | ee.score = util::value_min(); 1180 | ee.cai_score = util::value_min(); 1181 | } 1182 | 1183 | } 1184 | 1185 | for (auto& e : bestP){ 1186 | e.resize(reserved_size); 1187 | for (auto& ee : e){ 1188 | ee.score = util::value_min(); 1189 | ee.cai_score = util::value_min(); 1190 | } 1191 | } 1192 | 1193 | for (auto& e : bestMulti){ //slim signature, Liang Zhang 1194 | e.resize(reserved_size); 1195 | for (auto& ee : e){ 1196 | ee.score = util::value_min(); 1197 | ee.cai_score = util::value_min(); 1198 | } 1199 | } 1200 | 1201 | for (auto& e : bestM2){ //slim signature, Liang Zhang 1202 | e.resize(reserved_size2); 1203 | for (auto& ee : e){ 1204 | ee.score = util::value_min(); 1205 | ee.cai_score = util::value_min(); 1206 | } 1207 | } 1208 | 1209 | for (auto& e : bestM){ //slim signature, Liang Zhang 1210 | e.resize(reserved_size2); 1211 | for (auto& ee : e){ 1212 | ee.score = util::value_min(); 1213 | ee.cai_score = util::value_min(); 1214 | } 1215 | } 1216 | 1217 | for (auto& e : bestM_P){ // hzhang 1218 | e.resize(reserved_size2); 1219 | for (auto& ee : e){ 1220 | ee.score = util::value_min(); 1221 | ee.cai_score = util::value_min(); 1222 | } 1223 | } 1224 | 1225 | for (IndexType i = 0; i <= seq_length; ++i) { 1226 | for (auto & node_i : dfa.nodes[i]){ 1227 | for (IndexType l = 0; l <= SINGLE_MAX_LEN; ++l){ 1228 | auto j = i + l; 1229 | if (j > seq_length) 1230 | break; 1231 | 1232 | for (auto & node_j : dfa.nodes[j]){ 1233 | get_broken_codon_score_map[node_i][node_j] = get_broken_codon_score(node_i, node_j); 1234 | } 1235 | 1236 | } 1237 | } 1238 | 1239 | } 1240 | 1241 | bestC[make_pair(0,0)].score = 0; 1242 | 
bestC[make_pair(0,0)].cai_score = double(0.); 1243 | 1244 | for(const auto& node_nue_weight : dfa.right_edges[make_pair(0,0)]) { 1245 | auto node = std::get<0>(node_nue_weight); 1246 | auto weight_nue = std::get<2>(node_nue_weight); 1247 | update_if_better(bestC[node], 0, weight_nue); 1248 | } 1249 | 1250 | for (IndexType j = 0; j <= seq_length; ++j) { 1251 | cout << "j=" << j << "\r" << flush; 1252 | 1253 | hairpin_beam<0>(j, dfa); 1254 | hairpin_beam<1>(j, dfa); 1255 | 1256 | if (j == 0) 1257 | continue; 1258 | 1259 | Multi_beam<0>(j, dfa); 1260 | Multi_beam<1>(j, dfa); 1261 | P_beam<0>(j, dfa); 1262 | P_beam<1>(j, dfa); 1263 | M2_beam<0>(j, dfa); 1264 | M2_beam<1>(j, dfa); 1265 | 1266 | if (j < seq_length) { 1267 | M_beam<0>(j, dfa); 1268 | M_beam<1>(j, dfa); 1269 | C_beam<0>(j, dfa); 1270 | C_beam<1>(j, dfa); 1271 | } 1272 | } 1273 | 1274 | auto end_node = make_pair(seq_length, 0); 1275 | auto viterbi = bestC[end_node]; 1276 | 1277 | auto backtrace_result = backtrace(dfa, viterbi, end_node); 1278 | 1279 | return DecoderResult{backtrace_result.seq, backtrace_result.structure, viterbi.score / -100.0, 0., viterbi.cai_score, seq_length}; 1280 | } 1281 | 1282 | template 1283 | BeamCKYParser::BeamCKYParser(const double lambda_value, const bool verbose) 1284 | : lambda(lambda_value), is_verbose(verbose) { 1285 | func9(0, 0); 1286 | } 1287 | 1288 | } -------------------------------------------------------------------------------- /src/beam_cky_parser.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "Utils/network.h" 14 | #include "Utils/codon.h" 15 | #include "Utils/flat.h" 16 | 17 | 18 | namespace LinearDesign { 19 | 20 | namespace detail { 21 | 22 | struct NodeNucIndex { 23 | LINEAR_DESIGN_INLINE size_t operator()(const NodeNucpair& node_nucpair) const { 24 | return 
(node_nucpair.node_first << 4) | (node_nucpair.node_second << 3) | node_nucpair.nucpair; 25 | } 26 | }; 27 | 28 | struct NodeNucReverseIndex { 29 | LINEAR_DESIGN_INLINE NodeNucpair operator()(const size_t index) const { 30 | NodeNucpair node_nucpair = {IndexType(index >> 4), NumType((index & 0xf) >> 3), NucPairType(index & 0x7)}; 31 | return node_nucpair; 32 | } 33 | }; 34 | 35 | struct NodeIndex { 36 | LINEAR_DESIGN_INLINE size_t operator()(const NodeType& node) const { 37 | return (node.first << 1) | node.second; 38 | } 39 | }; 40 | 41 | struct NodeNucReverseIndex2 { 42 | LINEAR_DESIGN_INLINE NodeType operator()(const size_t index) const { 43 | return {index >> 1, (index & 0x1)}; 44 | } 45 | }; 46 | 47 | } /* detail */ 48 | 49 | template , 52 | typename DFAType = DFA> 53 | string get_nuc_from_dfa_cai(DFAType& dfa, const NodeType& start_node, const NodeType& end_node, 54 | const std::vector& protein, std::unordered_map, 55 | std::tuple, std::hash>>>& 56 | best_path_in_one_codon_unit, std::unordered_map& aa_best_path_in_a_whole_codon) { 57 | 58 | IndexType s_index = start_node.first; 59 | IndexType t_index = end_node.first; 60 | 61 | if (s_index >= t_index) 62 | return ""; 63 | 64 | auto aa_left = protein[s_index / 3]; // tri letter 65 | auto aa_right = protein[t_index / 3]; 66 | auto start_node_re_index = make_pair(s_index % 3, start_node.second); 67 | auto end_node_re_index = make_pair(t_index % 3, end_node.second); 68 | if (t_index - s_index < 3) { 69 | if (s_index / 3 == t_index / 3) { 70 | std::string temp_seq = ""; 71 | auto& nucs = best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index, end_node_re_index)]; 72 | temp_seq.append(1, GET_ACGU(std::get<1>(nucs))); 73 | if (std::get<2>(nucs) != k_void_nuc) 74 | temp_seq.append(1, GET_ACGU(std::get<2>(nucs))); 75 | 76 | if (temp_seq.length() != end_node.first - start_node.first) { 77 | assert(false); 78 | } 79 | return temp_seq; 80 | } else { 81 | std::string temp_left = ""; 82 | std::string 
temp_right = ""; 83 | if (s_index % 3 != 0) { 84 | auto& nucs = best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index, make_pair(0, 0))]; 85 | temp_left.append(1, GET_ACGU(std::get<1>(nucs))); 86 | if (std::get<2>(nucs) != k_void_nuc) 87 | temp_left.append(1, GET_ACGU(std::get<2>(nucs))); 88 | } 89 | 90 | if (t_index % 3 != 0) { 91 | auto& nucs = best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]; 92 | temp_right.append(1, GET_ACGU(std::get<1>(nucs))); 93 | if (std::get<2>(nucs) != k_void_nuc) 94 | temp_right.append(1, GET_ACGU(std::get<2>(nucs))); 95 | } 96 | 97 | assert((temp_left + temp_right).length() == end_node.first - start_node.first); 98 | 99 | return temp_left + temp_right; 100 | } 101 | 102 | } else { 103 | 104 | std::string temp_left = ""; 105 | std::string temp_mid = ""; 106 | std::string temp_right = ""; 107 | 108 | if (s_index % 3 != 0) { 109 | auto& nucs = best_path_in_one_codon_unit[aa_left][make_tuple(start_node_re_index, make_pair(0, 0))]; 110 | temp_left.append(1, GET_ACGU(std::get<1>(nucs))); 111 | if (std::get<2>(nucs) != k_void_nuc) 112 | temp_left.append(1, GET_ACGU(std::get<2>(nucs))); 113 | } 114 | 115 | IndexType protein_start_index = s_index / 3; 116 | if (s_index % 3 != 0) 117 | protein_start_index++; 118 | 119 | IndexType protein_end_index = t_index / 3; 120 | 121 | if (protein_start_index != protein_end_index) { 122 | for (IndexType protein_index = protein_start_index; protein_index < protein_end_index; ++protein_index) { 123 | 124 | std::string nucs; 125 | auto aa_tri = protein[protein_index]; 126 | if (k_map_3_1.count(aa_tri)) { 127 | nucs = aa_best_path_in_a_whole_codon[std::string(1, k_map_3_1[aa_tri])]; 128 | } else if (aa_best_path_in_a_whole_codon.count(aa_tri)) { 129 | nucs = aa_best_path_in_a_whole_codon[aa_tri]; 130 | } else { 131 | assert(false); 132 | } 133 | 134 | for (auto nuc : nucs) { 135 | temp_mid.append(1, nuc); 136 | } 137 | } 138 | } 139 | 140 | if (t_index % 3 != 
0) { 141 | auto& nucs = best_path_in_one_codon_unit[aa_right][make_tuple(make_pair(0, 0), end_node_re_index)]; 142 | temp_right.append(1, GET_ACGU(std::get<1>(nucs))); 143 | if (std::get<2>(nucs) != k_void_nuc) 144 | temp_right.append(1, GET_ACGU(std::get<2>(nucs))); 145 | } 146 | 147 | assert((temp_left + temp_mid + temp_right).length() == end_node.first - start_node.first); 148 | 149 | return temp_left + temp_mid + temp_right; 150 | } 151 | } 152 | 153 | template > 156 | class BeamCKYParser { 157 | public: 158 | using State_t = State; 159 | using DFA_t = DFA; 160 | using ScoreInnerDate_t = ScoreInnerDate; 161 | using NextPair_t = vector>, hash_pair>>; 162 | using NextPairSet_t = vector>, hash_pair>>; 163 | using PrefixScore_t = unordered_map; 164 | using BestX_t_CAI = Flat, detail::NodeIndex>; 165 | using BestM_t_CAI = Flat, detail::NodeIndex>; 166 | using BestC_t_CAI = Flat; 167 | using Broken_codon_t_CAI = Flat, detail::NodeIndex>; 168 | 169 | BeamCKYParser(const double lambda_value, const bool verbose); 170 | 171 | DecoderResult parse(DFA_t& dfa, 172 | Codon& codon, 173 | std::string& aa_seq, 174 | std::vector& p, 175 | std::unordered_map& aa_best_in_codon, 176 | std::unordered_map, 177 | std::tuple, std::hash>>>& best_path_in_one_codon, 178 | std::unordered_map>& aa_graphs_with_ln_weights); 179 | 180 | private: 181 | 182 | template 183 | void hairpin_beam(IndexType j, DFA_t& dfa); 184 | 185 | template 186 | void Multi_beam(IndexType j, DFA_t& dfa); 187 | 188 | template 189 | void P_beam(IndexType j, DFA_t& dfa); 190 | 191 | template 192 | void M2_beam(IndexType j, DFA_t& dfa); 193 | 194 | template 195 | void M_beam(IndexType j, DFA_t& dfa); 196 | 197 | template 198 | void C_beam(IndexType j, DFA_t& dfa); 199 | 200 | void update_if_better(State_t &state, const ScoreType newscore, const double cai_score) { 201 | if (state.score + state.cai_score < newscore + cai_score) { 202 | state.score = newscore; 203 | state.cai_score = cai_score; 204 | } 205 | } 206 | 207 
| void update_if_better(State_t &state, const ScoreType newscore, const double cai_score, const NodeType pre_node, const double pre_left_cai) { 208 | if (state.score + state.cai_score < newscore + cai_score) { 209 | state.score = newscore; 210 | state.cai_score = cai_score; 211 | state.pre_node = pre_node; 212 | state.pre_left_cai = pre_left_cai; 213 | } 214 | } 215 | 216 | 217 | void get_next_pair(DFA_t& dfa); 218 | void get_next_pair_set(); 219 | 220 | void get_prev_pair(DFA_t& dfa); 221 | void get_prev_pair_set(); 222 | 223 | void preprocess(DFA_t& dfa); 224 | 225 | BacktraceResult backtrace(DFA_t& dfa, const State_t& state, NodeType end_node); 226 | 227 | ScoreType quickselect_partition(std::vector& scores, 228 | ScoreType lower, ScoreType upper); 229 | 230 | ScoreType quickselect(std::vector& scores, 231 | const ScoreType lower, const ScoreType upper, const IndexType k); 232 | 233 | double get_broken_codon_score(const NodeType& start_node, const NodeType& end_node); 234 | 235 | double lambda; 236 | bool is_verbose; 237 | 238 | IndexType seq_length; 239 | 240 | BestX_t_CAI bestH, bestP, bestMulti; 241 | BestM_t_CAI bestM2, bestM, bestM_P; // hzhang: bestM_P 242 | BestC_t_CAI bestC; 243 | 244 | detail::NodeNucReverseIndex reverse_index; 245 | detail::NodeNucReverseIndex2 reverse_index2; 246 | 247 | NextPair_t next_pair; 248 | NextPairSet_t next_pair_set; 249 | 250 | NextPair_t prev_pair; 251 | NextPairSet_t prev_pair_set; 252 | 253 | NextPair_t next_list; 254 | NextPair_t prev_list; 255 | 256 | vector>> bulge_score; 257 | vector> stacking_score; 258 | 259 | std::unordered_map> aa_graphs_with_ln_w; 260 | 261 | std::vector protein; 262 | std::unordered_map aa_best_path_in_a_whole_codon; 263 | std::unordered_map, std::tuple, 265 | std::hash>>> best_path_in_one_codon_unit; 266 | 267 | Broken_codon_t_CAI get_broken_codon_score_map; 268 | 269 | #ifdef SPECIAL_HP 270 | unordered_map>>, hash_pair>, hash_pair> hairpin_seq_score_cai; 271 | void special_hp(DFA_t& dfa, 
int8_t hairpin_length); 272 | #endif 273 | }; 274 | 275 | } 276 | -------------------------------------------------------------------------------- /src/linear_design.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "beam_cky_parser.h" 3 | #include "beam_cky_parser.cc" 4 | #include "Utils/reader.h" 5 | #include "Utils/common.h" 6 | #include "Utils/codon.h" 7 | 8 | // #ifndef CODON_TABLE 9 | // #define CODON_TABLE "./codon_usage_freq_table_human.csv" 10 | // #endif 11 | 12 | #ifndef CODING_WHEEL 13 | #define CODING_WHEEL "./coding_wheel.txt" 14 | #endif 15 | 16 | using namespace LinearDesign; 17 | 18 | template 19 | bool output_result(const DecoderResult& result, 20 | const double duration, const double lambda, const bool is_verbose, 21 | const Codon& codon, string& CODON_TABLE) { 22 | 23 | stringstream ss; 24 | if (is_verbose) 25 | ss << "Using lambda = " << (lambda / 100.) << "; Using codon frequency table = " << CODON_TABLE << endl; 26 | ss << "mRNA sequence: " << result.sequence << endl; 27 | ss << "mRNA structure: " << result.structure << endl; 28 | ss << "mRNA folding free energy: " << std::setprecision(2) << fixed << result.score 29 | << " kcal/mol; mRNA CAI: " << std::setprecision(3) 30 | << fixed << codon.calc_cai(result.sequence) << endl; 31 | if (is_verbose) 32 | ss << "Runtime: " << duration << " seconds" << endl; 33 | cout << ss.str() << endl; 34 | 35 | return true; 36 | } 37 | 38 | void show_usage() { 39 | cerr << "echo SEQUENCE | ./lineardesign -l [LAMBDA]" << endl; 40 | cerr << "OR" << endl; 41 | cerr << "cat SEQ_FILE_OR_FASTA_FILE | ./lineardesign -l [LAMBDA]" << endl; 42 | } 43 | 44 | 45 | int main(int argc, char** argv) { 46 | 47 | // default args 48 | double lambda = 0.0f; 49 | bool is_verbose = false; 50 | string CODON_TABLE = "./codon_usage_freq_table_human.csv"; 51 | 52 | // parse args 53 | if (argc != 4) { 54 | show_usage(); 55 | return 1; 56 | }else{ 57 | lambda = atof(argv[1]); 
58 | is_verbose = atoi(argv[2]) == 1; 59 | if (string(argv[3]) != ""){ 60 | CODON_TABLE = argv[3]; 61 | } 62 | } 63 | lambda *= 100.; 64 | 65 | // load codon table and coding wheel 66 | Codon codon(CODON_TABLE); 67 | std::unordered_map> aa_graphs_with_ln_weights; 68 | std::unordered_map, std::tuple, std::hash>>> best_path_in_one_codon_unit; 69 | std::unordered_map aa_best_path_in_a_whole_codon; 70 | prepare_codon_unit_lattice(CODING_WHEEL, codon, aa_graphs_with_ln_weights, best_path_in_one_codon_unit, aa_best_path_in_a_whole_codon, lambda); 71 | 72 | // main loop 73 | string aa_seq, aa_tri_seq; 74 | vector aa_seq_list, aa_name_list; 75 | // load input 76 | for (string seq; getline(cin, seq);){ 77 | if (seq.empty()) continue; 78 | if (seq[0] == '>'){ 79 | aa_name_list.push_back(seq); // sequence name 80 | if (!aa_seq.empty()) 81 | aa_seq_list.push_back(aa_seq); 82 | aa_seq.clear(); 83 | continue; 84 | }else{ 85 | rtrim(seq); 86 | aa_seq += seq; 87 | } 88 | } 89 | if (!aa_seq.empty()) 90 | aa_seq_list.push_back(aa_seq); 91 | 92 | // start design 93 | for(int i = 0; i < aa_seq_list.size(); i++){ 94 | if (aa_name_list.size() > i) 95 | cout << aa_name_list[i] << endl; 96 | auto& aa_seq = aa_seq_list[i]; 97 | // convert to uppercase 98 | transform(aa_seq.begin(), aa_seq.end(), aa_seq.begin(), ::toupper); 99 | aa_tri_seq.clear(); 100 | if (is_verbose) 101 | cout << "Input protein: " << aa_seq << endl; 102 | if (!ReaderTraits::cvt_to_seq(aa_seq, aa_tri_seq)) 103 | continue; 104 | 105 | // init parser 106 | BeamCKYParser parser(lambda, is_verbose); 107 | 108 | auto protein = util::split(aa_tri_seq, ' '); 109 | // parse 110 | auto system_start = chrono::system_clock::now(); 111 | auto dfa = get_dfa(aa_graphs_with_ln_weights, util::split(aa_tri_seq, ' ')); 112 | auto result = parser.parse(dfa, codon, aa_seq, protein, aa_best_path_in_a_whole_codon, best_path_in_one_codon_unit, aa_graphs_with_ln_weights); 113 | auto system_diff = chrono::system_clock::now() - system_start; 114 
| auto system_duration = chrono::duration(system_diff).count(); 115 | 116 | // output 117 | output_result(result, system_duration, lambda, is_verbose, codon, CODON_TABLE); 118 | 119 | #ifdef FINAL_CHECK 120 | if (codon.cvt_rna_seq_to_aa_seq(result.sequence) != aa_seq) { 121 | std::cerr << "Final Check Failed:" << std::endl; 122 | std::cerr << codon.cvt_rna_seq_to_aa_seq(result.sequence) << std::endl; 123 | std::cerr << aa_seq << std::endl; 124 | assert(false); 125 | } 126 | #endif 127 | } 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /testseq: -------------------------------------------------------------------------------- 1 | >seq1 2 | MPNTLACP 3 | >seq2 4 | MLDQVNKLKYPEVSLT* 5 | --------------------------------------------------------------------------------