├── nt_database └── empty_db.fasta ├── utils ├── seqkit ├── __pycache__ │ └── utils.cpython-36.pyc ├── FreeKnot │ ├── COPYRIGHT │ ├── COPYRIGHT.txt │ ├── BpseqWriter.pm │ ├── BracketPairs.pm │ ├── ScoringFunctions.pm │ ├── DPWriter.pm │ ├── BpseqParser.pm │ ├── README │ ├── ChordModel.pm │ ├── README.txt │ ├── MIS.pm │ ├── CircleGraph.pm │ ├── DPParser.pm │ ├── PrimitivePseudoknotExtractor.pm │ ├── VertexSubset.pm │ ├── MWIS.pm │ └── remove_pseudoknot.pl ├── bpseq2dbn.py ├── getpssm.pl ├── SPOT-RNA2.py ├── utils.py └── parse_blastn_local.pl ├── requirements.txt ├── sample_run ├── 6ufj.fasta ├── sample_seq.fasta ├── sample_seq_features │ ├── sample_seq.db │ ├── sample_seq.fasta │ ├── temp.txt │ ├── sample_seq.tfrecords │ ├── sample_seq.dbn │ ├── sample_seq.aln │ ├── temp.sto │ ├── sample_seq.sto │ ├── sample_seq.bpseq.unknotted │ ├── sample_seq.bpseq │ ├── sample_seq.ct │ ├── sample_seq.bla │ ├── sample_seq.pssm │ ├── sample_seq.prob │ ├── temp.a2m │ ├── sample_seq.a2m │ ├── sample_seq.log_gremlin │ └── sample_seq.msa └── sample_seq_outputs │ ├── sample_seq.bpseq │ ├── sample_seq.st │ └── sample_seq.ct ├── docs ├── SPOTRNA2_pipeline.png └── benchmark_results.png ├── __pycache__ └── utils.cpython-36.pyc ├── Dockerfile ├── README.md └── run_spotrna2.sh /nt_database/empty_db.fasta: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/seqkit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/utils/seqkit -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.14.0 2 | pandas 3 | numpy==1.16.4 4 | argparse 5 | tqdm 6 | six 7 | -------------------------------------------------------------------------------- 
/sample_run/6ufj.fasta: -------------------------------------------------------------------------------- 1 | >6ufj: chain A,B 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 3 | -------------------------------------------------------------------------------- /sample_run/sample_seq.fasta: -------------------------------------------------------------------------------- 1 | 2 | >6ufj_A_B 3 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG -------------------------------------------------------------------------------- /docs/SPOTRNA2_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/docs/SPOTRNA2_pipeline.png -------------------------------------------------------------------------------- /docs/benchmark_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/docs/benchmark_results.png -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.db: -------------------------------------------------------------------------------- 1 | ((((((....))))))..........................((((((((..)))))))). 
2 | -------------------------------------------------------------------------------- /__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.fasta: -------------------------------------------------------------------------------- 1 | >sample_seq 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG -------------------------------------------------------------------------------- /sample_run/sample_seq_features/temp.txt: -------------------------------------------------------------------------------- 1 | #=GC SS_cons ((((((....))))))..........................((((((((..)))))))). 2 | -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.tfrecords: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/sample_run/sample_seq_features/sample_seq.tfrecords -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.dbn: -------------------------------------------------------------------------------- 1 | >single_seq 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 3 | ((((((....))))))..........................((((((((..)))))))). 
4 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.aln: -------------------------------------------------------------------------------- 1 | >sample_seq E=0.0 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 3 | >6UFJ_A(1-51:51) Chain A, RNA (50-MER) 6UFJ_C Chain C, RNA (50-MER) 6UFK_A Chain A, RNA (50-MER) 6UFK_C Chain C, RNA (50-MER) E=2e-16 s/c=1.87 id=98% cov=85% 4 | ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA---------- 5 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/temp.sto: -------------------------------------------------------------------------------- 1 | # STOCKHOLM 1.0 2 | 3 | #=GF DE E=0.0 4 | #=GC RF ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 5 | sample_seq ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 6 | 6UFJ_A(1-51:51) ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA---------- 7 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.sto: -------------------------------------------------------------------------------- 1 | # STOCKHOLM 1.0 2 | 3 | #=GF DE E=0.0 4 | #=GC RF ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 5 | sample_seq ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 6 | 6UFJ_A(1-51:51) ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA---------- 7 | #=GC SS_cons ((((((....))))))..........................((((((((..)))))))). 
8 | // 9 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.bpseq.unknotted: -------------------------------------------------------------------------------- 1 | 1 A 16 2 | 2 C 15 3 | 3 U 14 4 | 4 C 13 5 | 5 G 12 6 | 6 U 11 7 | 7 U 0 8 | 8 U 0 9 | 9 G 0 10 | 10 A 0 11 | 11 G 6 12 | 12 C 5 13 | 13 G 4 14 | 14 A 3 15 | 15 G 2 16 | 16 U 1 17 | 17 A 0 18 | 18 U 0 19 | 19 A 0 20 | 20 A 0 21 | 21 A 0 22 | 22 C 0 23 | 23 A 0 24 | 24 G 0 25 | 25 C 0 26 | 26 U 0 27 | 27 G 0 28 | 28 G 0 29 | 29 U 0 30 | 30 U 0 31 | 31 A 0 32 | 32 A 0 33 | 33 G 0 34 | 34 C 0 35 | 35 U 0 36 | 36 C 0 37 | 37 A 0 38 | 38 A 0 39 | 39 A 0 40 | 40 G 0 41 | 41 C 0 42 | 42 G 0 43 | 43 G 60 44 | 44 A 59 45 | 45 G 58 46 | 46 A 57 47 | 47 G 56 48 | 48 C 55 49 | 49 A 54 50 | 50 G 53 51 | 51 A 0 52 | 52 U 0 53 | 53 C 50 54 | 54 U 49 55 | 55 G 48 56 | 56 C 47 57 | 57 U 46 58 | 58 C 45 59 | 59 U 44 60 | 60 C 43 61 | 61 G 0 62 | -------------------------------------------------------------------------------- /utils/FreeKnot/COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright (C) 2012 Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen 2 | 3 | This file is part of FreeKnot. 4 | 5 | FreeKnot is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | FreeKnot is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with FreeKnot. If not, see . 
17 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.bpseq: -------------------------------------------------------------------------------- 1 | #sample_seq 2 | 1 A 16 3 | 2 C 15 4 | 3 U 14 5 | 4 C 13 6 | 5 G 12 7 | 6 U 11 8 | 7 U 0 9 | 8 U 37 10 | 9 G 36 11 | 10 A 35 12 | 11 G 6 13 | 12 C 5 14 | 13 G 4 15 | 14 A 3 16 | 15 G 2 17 | 16 U 1 18 | 17 A 0 19 | 18 U 0 20 | 19 A 0 21 | 20 A 0 22 | 21 A 0 23 | 22 C 0 24 | 23 A 0 25 | 24 G 0 26 | 25 C 0 27 | 26 U 0 28 | 27 G 0 29 | 28 G 0 30 | 29 U 0 31 | 30 U 0 32 | 31 A 0 33 | 32 A 0 34 | 33 G 0 35 | 34 C 0 36 | 35 U 10 37 | 36 C 9 38 | 37 A 8 39 | 38 A 0 40 | 39 A 0 41 | 40 G 0 42 | 41 C 0 43 | 42 G 0 44 | 43 G 60 45 | 44 A 59 46 | 45 G 58 47 | 46 A 57 48 | 47 G 56 49 | 48 C 55 50 | 49 A 54 51 | 50 G 53 52 | 51 A 0 53 | 52 U 0 54 | 53 C 50 55 | 54 U 49 56 | 55 G 48 57 | 56 C 47 58 | 57 U 46 59 | 58 C 45 60 | 59 U 44 61 | 60 C 43 62 | 61 G 0 63 | -------------------------------------------------------------------------------- /sample_run/sample_seq_outputs/sample_seq.bpseq: -------------------------------------------------------------------------------- 1 | #sample_seq 2 | 1 A 16 3 | 2 C 15 4 | 3 U 14 5 | 4 C 13 6 | 5 G 0 7 | 6 U 39 8 | 7 U 38 9 | 8 U 37 10 | 9 G 36 11 | 10 A 35 12 | 11 G 34 13 | 12 C 33 14 | 13 G 4 15 | 14 A 3 16 | 15 G 2 17 | 16 U 1 18 | 17 A 0 19 | 18 U 0 20 | 19 A 0 21 | 20 A 0 22 | 21 A 0 23 | 22 C 0 24 | 23 A 0 25 | 24 G 0 26 | 25 C 0 27 | 26 U 0 28 | 27 G 0 29 | 28 G 0 30 | 29 U 0 31 | 30 U 0 32 | 31 A 0 33 | 32 A 0 34 | 33 G 12 35 | 34 C 11 36 | 35 U 10 37 | 36 C 9 38 | 37 A 8 39 | 38 A 7 40 | 39 A 6 41 | 40 G 0 42 | 41 C 0 43 | 42 G 61 44 | 43 G 60 45 | 44 A 59 46 | 45 G 58 47 | 46 A 57 48 | 47 G 56 49 | 48 C 55 50 | 49 A 54 51 | 50 G 53 52 | 51 A 0 53 | 52 U 0 54 | 53 C 50 55 | 54 U 49 56 | 55 G 48 57 | 56 C 47 58 | 57 U 46 59 | 58 C 45 60 | 59 U 44 61 | 60 C 43 62 | 61 G 42 63 | 
-------------------------------------------------------------------------------- /utils/FreeKnot/COPYRIGHT.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2012 Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen 2 | 3 | This file is part of FreeKnot. 4 | 5 | FreeKnot is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | FreeKnot is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with FreeKnot. If not, see . 17 | -------------------------------------------------------------------------------- /utils/FreeKnot/BpseqWriter.pm: -------------------------------------------------------------------------------- 1 | #Writer for BPSEQ format 2 | 3 | package BpseqWriter; 4 | 5 | use strict; 6 | 7 | sub output_results { 8 | my (undef, $combined_base_pair_removal_pos, $base_seq, $paired_pos_ptrs, $base_count) = @_; 9 | 10 | if (@{$combined_base_pair_removal_pos} == 0) { 11 | for (my $i = 1; $i <= $base_count; $i++) { 12 | print $i . ' ' . $base_seq->[$i - 1] . ' ' . $paired_pos_ptrs->[$i] . "\n"; 13 | } 14 | } 15 | 16 | foreach (@{$combined_base_pair_removal_pos}) { 17 | for (my $i = 1; $i <= $base_count; $i++) { 18 | print $i . ' ' . $base_seq->[$i - 1] . ' '; 19 | if (exists($_->{$i})) { 20 | print "0\n"; 21 | } 22 | else { 23 | print $paired_pos_ptrs->[$i] . 
"\n"; 24 | } 25 | } 26 | } 27 | } 28 | 29 | 1; 30 | -------------------------------------------------------------------------------- /sample_run/sample_seq_outputs/sample_seq.st: -------------------------------------------------------------------------------- 1 | #Name: sample_seq 2 | #Length: 61 3 | #PageNumber: 2 4 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 5 | [[[[.(((((((]]]]................)))))))..(((((((((..))))))))) 6 | EEEEESSSSSSSHHHHHHHHHHHHHHHHHHHHSSSSSSSXXSSSSSSSSSHHSSSSSSSSS 7 | KKKKNNNNNNNNKKKKNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 8 | S1 6..12 "UUUGAGC" 33..39 "GCUCAAA" 9 | S2 42..50 "GGAGAGCAG" 53..61 "CUGCUCUCG" 10 | H1 13..32 "GAGUAUAAACAGCUGGUUAA" (12,33) C:G PK{1} 11 | H2 51..52 "AU" (50,53) G:C 12 | X1 40..41 "GC" (39,6) A:U (42,61) G:G 13 | E1 1..5 "ACUCG" PK{1} 14 | PK1 4bp 1..4 13..16 E1 1..5 H1 13..32 15 | PK1.1 1 A 16 U 16 | PK1.2 2 C 15 G 17 | PK1.3 3 U 14 A 18 | PK1.4 4 C 13 G 19 | NCBP1 42 G 61 G S2 20 | segment1 7bp 6..12 UUUGAGC 33..39 GCUCAAA 21 | segment2 9bp 42..50 GGAGAGCAG 53..61 CUGCUCUCG 22 | -------------------------------------------------------------------------------- /utils/FreeKnot/BracketPairs.pm: -------------------------------------------------------------------------------- 1 | #Bracket handler for DPParser 2 | 3 | package BracketPairs; 4 | use strict; 5 | 6 | my $open_bracket_map = {")" => "(", "]" => "[", "}" => "{", ">" => "<"}; 7 | 8 | #Check whether a symbol (in dot-parentheses format) is an open bracket 9 | sub is_open_bracket { 10 | my (undef, $symbol) = @_; 11 | 12 | if ($symbol =~ /^[\(\[{]$/) { 24 | return $open_bracket_map->{$close_bracket}; 25 | } 26 | elsif ($close_bracket =~ /^[a-z]$/) { 27 | return uc $close_bracket; 28 | } 29 | else { 30 | die "Unknown closing bracket\n"; 31 | } 32 | } 33 | 34 | 1; 35 | -------------------------------------------------------------------------------- /utils/bpseq2dbn.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import argparse 4 | import os 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--inputs', default='inputs', type=str, help='Path to input file in fasta format, accept multiple sequences as well in fasta format; default = ''inputs/2zzm-1-B.fasta''\n', metavar='') 8 | parser.add_argument('--outputs',default='inputs', type=str, help='Path to output files; SPOT-RNA outputs at least three files .ct, .bpseq, and .prob files; default = ''inputs/\n', metavar='') 9 | parser.add_argument('--rna_id', default='sample_seq', type=str, help='Name of the input sequence file\n') 10 | 11 | args = parser.parse_args() 12 | 13 | with open(os.path.join(args.inputs, args.rna_id + ".bpseq.unknotted")) as f: 14 | temp = pd.read_csv(f,comment='#', delim_whitespace=True, header=None, usecols=[0,1,2]).values 15 | seq = temp[:,1] 16 | 17 | pairs = [[i,j] for i,j in zip(temp[:,0], temp[:,2]) if i!=0 and j!=0 and i6UFJ_A Chain A, RNA (50-MER) 6UFJ_C Chain C, RNA (50-MER) 6UFK_A 25 | Chain A, RNA (50-MER) 6UFK_C Chain C, RNA (50-MER) 26 | Length=51 27 | 28 | Score = 95.3 bits (51), Expect = 2e-16 29 | Identities = 51/51 (100%), Gaps = 0/51 (0%) 30 | Strand=Plus/Plus 31 | 32 | Query 1 ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA 51 33 | ||||||||||||||||||||||||||||||||||||||||||||||||||| 34 | Sbjct 1 ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA 51 35 | 36 | 37 | 38 | Lambda K H 39 | 1.33 0.621 1.12 40 | 41 | Gapped 42 | Lambda K H 43 | 1.28 0.460 0.850 44 | 45 | Effective search space used: 7769692438560 46 | 47 | 48 | Database: /nt_database/nt 49 | Posted date: May 30, 2020 5:58 AM 50 | Number of letters in database: 260,722,916,040 51 | Number of sequences in database: 55,908,648 52 | 53 | 54 | 55 | Matrix: blastn matrix 1 -2 56 | Gap Penalties: Existence: 0, Extension: 2.5 57 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | MAINTAINER Jaswinder Singh (jaswinder.singh3@griffithuni.edu.au) 3 | 4 | RUN rm /bin/sh && ln -s /bin/bash /bin/sh 5 | RUN apt-get update && apt-get install -y build-essential wget virtualenv git python-minimal cpanminus gawk 6 | RUN cpanm Graph 7 | 8 | RUN wget 'https://www.dropbox.com/s/h6j53u7wjyj6uir/SPOT-RNA2.tar.xz' || wget 'https://app.nihaocloud.com/f/3e826caf8efc43adaaa0/?dl=1' && tar -xvf SPOT-RNA2.tar.xz && rm SPOT-RNA2.tar.xz 9 | WORKDIR SPOT-RNA2 10 | 11 | RUN wget -O utils/models_ckps.tar.xz 'https://www.dropbox.com/s/udzcsva76lh5wvq/models_ckps.tar.xz' || wget -O utils/models_ckps.tar.xz 'https://app.nihaocloud.com/f/586acb2658d74ccb92b8/?dl=1' && tar -xvf utils/models_ckps.tar.xz -C utils/ && rm utils/models_ckps.tar.xz 12 | RUN virtualenv -p python3.6 venv && source ./venv/bin/activate && pip install tensorflow==1.14.0 && pip install -r requirements.txt && deactivate 13 | 14 | RUN wget 'eddylab.org/infernal/infernal-1.1.3-linux-intel-gcc.tar.gz' && tar -xvzf infernal-*.tar.gz && rm infernal-*.tar.gz 15 | RUN wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*+-x64-linux.tar.gz' && tar -xvzf ncbi-blast-*+-x64-linux.tar.gz && rm ncbi-blast-*+-x64-linux.tar.gz 16 | RUN git clone https://github.com/jaswindersingh2/SPOT-RNA.git && cd SPOT-RNA && wget 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1' && tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz && cd ../ 17 | RUN git clone "https://github.com/sokrypton/GREMLIN_CPP" && cd GREMLIN_CPP && g++ -O3 -std=c++0x -o gremlin_cpp gremlin_cpp.cpp -fopenmp && cd ../ 18 | RUN git clone 'https://github.com/LinearFold/LinearPartition.git' && cd LinearPartition/ && make 
&& cd ../ 19 | -------------------------------------------------------------------------------- /utils/FreeKnot/ScoringFunctions.pm: -------------------------------------------------------------------------------- 1 | package ScoringFunctions; 2 | 3 | use strict; 4 | 5 | #my $free_energy_params; 6 | #my $canonical_base_pairs = {'AU' => 0, 'CG' => 0, 'GC' => 0, 'UA' => 0, 'GU' => 0, 'UG' => 0}; 7 | 8 | #Return a scoring function according to the choice selected 9 | sub get_scoring_function { 10 | my (undef, $option) = @_; 11 | 12 | if ($option eq 'bp') { 13 | return \&_base_pair_score, 'max', 0; 14 | } 15 | elsif ($option eq 'stem') { 16 | return \&_stem_score, 'max', 0; 17 | } 18 | # elsif ($option eq 'sstab') { 19 | # $free_energy_params = _init_free_energy_parameters(); 20 | # return \&_stem_bp_stability, 'min', 1; 21 | # } 22 | elsif ($option eq 'hb') { 23 | return \&_hydrogen_bond, 'max', 0; 24 | } 25 | elsif ($option eq 'fe') { 26 | return \&_overall_stability, 'min', 1; 27 | } 28 | else { 29 | return undef, undef, undef; 30 | } 31 | } 32 | 33 | #Number of base pairs in a stem as the stem score 34 | sub _base_pair_score { 35 | my $chord_attrs = shift; 36 | 37 | my $stem_pair_count = $chord_attrs->{pair_count}; 38 | if (defined($stem_pair_count)) { 39 | return $stem_pair_count; 40 | } 41 | 42 | return 0; 43 | } 44 | 45 | #Each stem scores equally as 1 46 | sub _stem_score { 47 | return 1; 48 | } 49 | 50 | #GC and CG bonds = 3, other canonical of GU pairs = 2 51 | sub _hydrogen_bond { 52 | my ($chord_attrs, $base_seq) = @_; 53 | 54 | my $stem_base_pairs = $chord_attrs->{base_pairs}; 55 | my $total_score = 0; 56 | 57 | foreach (@{$stem_base_pairs}) { 58 | my $base_pair_type = uc($base_seq->[$_->[0] - 1] . 
$base_seq->[$_->[1] - 1]); 59 | if ($base_pair_type eq 'GC' || $base_pair_type eq 'CG') { 60 | $total_score += 3; 61 | } 62 | elsif ($base_pair_type eq 'AU' || $base_pair_type eq 'UA' || 63 | $base_pair_type eq 'GU' || $base_pair_type eq 'UG') { 64 | $total_score += 2; 65 | } 66 | } 67 | 68 | return $total_score; 69 | } 70 | 71 | #This allows all MISs to be reported as MWISs and they will be converted to all possible 72 | #de-knotted structures to determine the minimum free energy (MFE) 73 | sub _overall_stability { 74 | return 0; 75 | } 76 | 77 | 1; 78 | -------------------------------------------------------------------------------- /utils/FreeKnot/DPWriter.pm: -------------------------------------------------------------------------------- 1 | #Writer for dot-parentheses format 2 | 3 | package DPWriter; 4 | 5 | use strict; 6 | 7 | use constant DOT => '.'; 8 | use constant OPEN_BRACKET => '('; 9 | use constant CLOSE_BRACKET => ')'; 10 | use constant TEMP_DP_FILE => 'MWIS_temp.dp'; 11 | 12 | sub output_results { 13 | my (undef, $combined_base_pair_removal_pos, $structure_symbols, $base_seq_str) = @_; 14 | 15 | if (@{$combined_base_pair_removal_pos} == 0) { 16 | my $output_structure = join('', @{$structure_symbols}); 17 | print "$base_seq_str\n$output_structure\n"; 18 | } 19 | 20 | foreach (@{$combined_base_pair_removal_pos}) { 21 | my $output_structure = ''; 22 | for (my $i = 0; $i < @{$structure_symbols}; $i++) { 23 | if (exists($_->{$i + 1})) { 24 | $output_structure = $output_structure . DOT; 25 | } 26 | else { 27 | $output_structure = $output_structure . 
$structure_symbols->[$i]; 28 | } 29 | } 30 | 31 | print "$base_seq_str\n$output_structure\n"; 32 | } 33 | } 34 | 35 | sub output_mfe_candidate { 36 | my (undef, $base_pair_removal_pos, $paired_pos_ptrs, $structure_symbols, $base_seq_str) = @_; 37 | 38 | my $base_seq_len = length($base_seq_str); 39 | my $output_structure = ''; 40 | if (defined($paired_pos_ptrs)) { 41 | for (my $i = 1; $i <= $base_seq_len; $i++) { 42 | if (exists($base_pair_removal_pos->{$i})) { 43 | $output_structure = $output_structure . DOT; 44 | } 45 | else { 46 | my $paired_pos = $paired_pos_ptrs->[$i]; 47 | if ($paired_pos == 0) { 48 | $output_structure = $output_structure . DOT; 49 | } 50 | elsif ($i < $paired_pos) { 51 | $output_structure = $output_structure . OPEN_BRACKET; 52 | } 53 | else { 54 | $output_structure = $output_structure . CLOSE_BRACKET; 55 | } 56 | } 57 | } 58 | } 59 | elsif (defined($structure_symbols)) { 60 | for (my $i = 1; $i <= $base_seq_len; $i++) { 61 | if (exists($base_pair_removal_pos->{$i})) { 62 | $output_structure = $output_structure . DOT; 63 | } 64 | else { 65 | $output_structure = $output_structure . $structure_symbols->[$i - 1]; 66 | } 67 | } 68 | } 69 | 70 | $output_structure =~ s/[\[\{a-z]/\)/g; 72 | 73 | open (DP, ">" . TEMP_DP_FILE) or die "Cannot open file at " . TEMP_DP_FILE; 74 | print DP "$base_seq_str\n$output_structure\n"; 75 | close DP or die "Cannot close file at " . 
TEMP_DP_FILE; 76 | } 77 | 78 | 1; 79 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.pssm: -------------------------------------------------------------------------------- 1 | A 23 3 5 2 0 2 | C 0 3 8 22 0 3 | U 0 31 0 2 0 4 | C 0 3 1 29 0 5 | G 1 0 32 0 0 6 | U 3 24 0 6 0 7 | U 0 13 9 11 0 8 | U 0 31 1 1 0 9 | G 1 0 32 0 0 10 | A 26 0 6 1 0 11 | G 1 0 32 0 0 12 | C 0 2 0 31 0 13 | G 3 0 29 1 0 14 | A 31 0 2 0 0 15 | G 3 0 22 8 0 16 | U 3 24 4 2 0 17 | A 27 3 3 0 0 18 | U 3 28 0 2 0 19 | A 32 0 0 1 0 20 | A 33 0 0 0 0 21 | A 32 0 0 1 0 22 | C 3 6 3 21 0 23 | A 32 1 0 0 0 24 | G 0 0 32 1 0 25 | C 7 2 14 10 0 26 | U 2 14 0 16 1 27 | G 2 1 9 3 18 28 | G 8 14 5 4 2 29 | U 3 29 0 0 1 30 | U 0 29 0 4 0 31 | A 32 0 1 0 0 32 | A 18 3 11 0 1 33 | G 3 0 29 0 1 34 | C 3 0 0 30 0 35 | U 0 26 3 4 0 36 | C 1 1 0 31 0 37 | A 31 0 1 0 1 38 | A 13 0 10 9 1 39 | A 31 0 1 0 1 40 | G 0 0 31 1 1 41 | C 4 0 0 28 1 42 | G 0 1 30 1 1 43 | G 2 24 6 1 0 44 | A 8 8 1 16 0 45 | G 3 5 15 10 0 46 | A 13 7 3 9 1 47 | G 2 3 23 4 1 48 | C 7 2 9 14 1 49 | A 11 5 12 3 2 50 | G 2 4 21 0 6 51 | A 12 0 2 2 17 52 | U 2 10 1 0 20 53 | C 3 2 3 16 9 54 | U 5 9 3 12 4 55 | G 2 8 12 6 5 56 | C 2 3 4 18 6 57 | U 6 12 7 4 4 58 | C 6 2 10 12 3 59 | U 6 6 17 1 3 60 | C 11 2 12 4 4 61 | G 2 1 28 0 2 62 | -------------------------------------------------------------------------------- /utils/FreeKnot/BpseqParser.pm: -------------------------------------------------------------------------------- 1 | #Parser for BPSEQ format 2 | #It returns primitive pseudoknot objects, base sequence and paired positions 3 | 4 | package BpseqParser; 5 | 6 | use strict; 7 | 8 | sub parse { 9 | my (undef, $bpseq_file_path) = @_; 10 | 11 | my ($base_seq, $paired_pos_ptrs) = ([], []); 12 | my ($next_paired_pos, $prev_paired_pos) = ({}, {}); 13 | my $matched_pos = {}; 14 | my $last_paired_pos = 0; 15 | my $base_count = 0; 16 | 17 | open (BPSEQ, 
"<$bpseq_file_path") or die "Cannot open file at $bpseq_file_path"; 18 | 19 | while () { 20 | if ($_ =~ /^([0-9]+) ([A-Za-z]{1}) ([0-9]+)[\r\n]*$/) { 21 | my ($pos, $base, $paired_pos) = ($1, $2, $3); 22 | if ($pos != ++$base_count) { 23 | die "Base position $base_count is missing"; 24 | } 25 | 26 | if ($paired_pos > 0) { 27 | if ($pos < $paired_pos) { 28 | $matched_pos->{$pos} = $paired_pos; 29 | } 30 | else { 31 | if ($matched_pos->{$paired_pos} != $pos) { 32 | die "Unmatched pair position $pos and $paired_pos"; 33 | } 34 | } 35 | 36 | $next_paired_pos->{$last_paired_pos} = $pos; 37 | $prev_paired_pos->{$pos} = $last_paired_pos; 38 | $last_paired_pos = $pos; 39 | } 40 | 41 | $paired_pos_ptrs->[$pos] = $paired_pos; 42 | $base_seq->[$pos - 1] = $base; 43 | } 44 | elsif ($_ !~ /^#.*/ && $_ !~ /^\s+/) { 45 | die "Unknown input: $_"; 46 | } 47 | } 48 | 49 | $next_paired_pos->{$last_paired_pos} = 0; 50 | $prev_paired_pos->{0} = $last_paired_pos; 51 | 52 | close BPSEQ or die "Cannot close file at $bpseq_file_path"; 53 | 54 | #Group the base pairs into base pair stems 55 | my ($stem_outermost_pairs, $stems) = _group_to_stems($next_paired_pos, $prev_paired_pos, $paired_pos_ptrs); 56 | #Extract primitive pseudoknots from the base pair stems 57 | my $primitive_pseudoknots = PrimitivePseudoknotExtractor->extract($stem_outermost_pairs, $stems, $paired_pos_ptrs); 58 | 59 | return ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count); 60 | } 61 | 62 | sub _group_to_stems { 63 | my ($next_paired_pos, $prev_paired_pos, $paired_pos_ptrs) = @_; 64 | 65 | my $stems = {}; 66 | my $stem_outermost_pairs = []; 67 | my $stem; 68 | my $last_pair; 69 | 70 | my $curr_pos = $next_paired_pos->{0}; 71 | while ($curr_pos > 0) { 72 | my $paired_pos = $paired_pos_ptrs->[$curr_pos]; 73 | if ($paired_pos < $curr_pos) { 74 | undef $last_pair; 75 | $curr_pos = $next_paired_pos->{$curr_pos}; 76 | next; 77 | } 78 | 79 | my $curr_pair = [$curr_pos, $paired_pos]; 80 | 81 | if 
(defined($last_pair) && $prev_paired_pos->{$last_pair->[1]} == $paired_pos) { 82 | push @{$stem}, $curr_pair; 83 | } 84 | else { 85 | $stem = [$curr_pair]; 86 | $stems->{$curr_pos} = $stem; 87 | push @{$stem_outermost_pairs}, $curr_pair; 88 | } 89 | 90 | $last_pair = $curr_pair; 91 | $curr_pos = $next_paired_pos->{$curr_pos}; 92 | } 93 | 94 | return ($stem_outermost_pairs, $stems); 95 | } 96 | 97 | 1; 98 | -------------------------------------------------------------------------------- /utils/FreeKnot/README: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------- 2 | FreeKnot 3 | ------------------------------------------------------------------------- 4 | Authors: Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen 5 | Last updated on 15 Apr 2014 6 | 7 | ------------------------------------------------------------------------- 8 | Purpose 9 | 10 | FreeKnot is a tool for RNA pseudoknot removal. It converts any pseudoknot 11 | into nested substructures in RNA secondary structures. It removes some 12 | crossing stems to eliminate crossings based on certain scoring functions 13 | (details will be provided later in this README file) and reports one or 14 | more optimized pseudoknot-free structures. 15 | 16 | ------------------------------------------------------------------------- 17 | Platform and pre-requisites 18 | 19 | FreeKnot has been tested on various platforms including Linux (Ubuntu), 20 | Mac OS X and Windows. Perl (v5.14 or later) is recommended. Earlier 21 | versions might work but without guarantee. Windows users can download 22 | various Perl distributions for Windows. ViennaRNA package 2.1 is required 23 | for the free energy scoring function. 
24 | 25 | ------------------------------------------------------------------------- 26 | Program/Module Description 27 | 28 | BpseqParser.pm, DPParser.pm - parser to accept bpseq or 29 | dot-parentheses formats as input 30 | BpseqWriter.pm, DPWriter.pm - writer to output converted results in 31 | bpseq or dot-parentheses formats 32 | ChordModel.pm, CircleGraph.pm - graphical object for primitive 33 | pseudoknot representation 34 | MIS.pm - MIS algorithm (for free energy scoring 35 | function) 36 | MWIS.pm - MWIS algorithm 37 | ScoringFunctions.pm - scoring functions 38 | remove_pseudoknot.pl - main program for pseudoknot removal 39 | PrimitivePseudoknotExtractor.pm - primitive pseudoknot extraction from 40 | the input secondary structure 41 | BracketPairs.pm - processing brackets in input secondary 42 | structure 43 | VertexSubset.pm - subset object for storing graph 44 | vertices in the MIS algorithm 45 | 46 | ------------------------------------------------------------------------- 47 | Usage 48 | 49 | FreeKnot is executed from the console. The command is: 50 | 51 | perl remove_pseudoknot.pl -i <secondary structure format> 52 | -s <scoring function> <input file> 53 | 54 | Secondary structure format available: dp (dot-parentheses) / bpseq 55 | The secondary structure format for the output file follows that of the 56 | input file. So, if the input file is in bpseq format then the output 57 | file is also in bpseq format. Note that every line of data must end with 58 | a newline character (i.e. \n). 59 | 60 | Scoring function options: bp (# of base pairs) / stem (# of base pair 61 | stems) / hb (# of hydrogen bonds) / fe (structure overall free energy) 62 | 63 | The results are written to the console (stdout) by default. They can be 64 | redirected to a file. 
For example, 65 | 66 | perl remove_pseudoknot.pl -i bpseq -s bp input.bpseq > output.bpseq 67 | 68 | ------------------------------------------------------------------------- 69 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.prob: -------------------------------------------------------------------------------- 1 | 1 16 2.4607e-01 2 | 2 11 4.8058e-04 3 | 2 15 2.7600e-01 4 | 2 40 2.7181e-04 5 | 2 42 5.0162e-03 6 | 2 43 7.0046e-04 7 | 2 61 3.1390e-04 8 | 3 10 4.9712e-04 9 | 3 14 2.7639e-01 10 | 3 39 2.2049e-04 11 | 3 42 2.1906e-03 12 | 3 43 3.6608e-03 13 | 4 9 4.9913e-04 14 | 4 13 2.7651e-01 15 | 4 40 4.7134e-03 16 | 4 42 6.8810e-01 17 | 5 12 2.7567e-01 18 | 5 34 3.2105e-03 19 | 5 41 7.0013e-01 20 | 6 10 1.6400e-04 21 | 6 11 2.4958e-01 22 | 6 33 3.2126e-03 23 | 6 39 1.8319e-01 24 | 6 40 5.2746e-01 25 | 7 11 2.1717e-04 26 | 7 32 3.2300e-03 27 | 7 37 2.5131e-05 28 | 7 38 3.5702e-01 29 | 7 39 3.4129e-01 30 | 8 31 3.0849e-03 31 | 8 37 5.4291e-01 32 | 8 38 1.6981e-01 33 | 9 30 2.3436e-03 34 | 9 36 7.1718e-01 35 | 10 26 8.7135e-05 36 | 10 29 1.9660e-03 37 | 10 30 1.5846e-03 38 | 10 35 7.1728e-01 39 | 11 25 1.2568e-04 40 | 11 29 5.5166e-03 41 | 11 34 7.1395e-01 42 | 12 24 1.3000e-04 43 | 12 27 1.1131e-03 44 | 12 28 1.5587e-02 45 | 12 33 7.0364e-01 46 | 12 40 1.8068e-05 47 | 13 22 1.9136e-04 48 | 13 25 1.9429e-04 49 | 13 26 4.2317e-04 50 | 13 29 9.7416e-04 51 | 13 30 9.5602e-02 52 | 13 36 1.4656e-03 53 | 14 26 3.0481e-02 54 | 14 29 1.1306e-01 55 | 14 30 7.3809e-03 56 | 14 35 1.5024e-03 57 | 15 22 3.7622e-03 58 | 15 25 3.5921e-02 59 | 15 26 1.5459e-04 60 | 15 29 5.8125e-03 61 | 15 30 2.9255e-02 62 | 15 34 1.5186e-03 63 | 15 41 4.7121e-05 64 | 16 21 3.4426e-03 65 | 16 23 2.3995e-04 66 | 16 24 3.4999e-02 67 | 16 27 1.3027e-01 68 | 16 28 2.1053e-03 69 | 16 31 1.0100e-02 70 | 16 32 1.1925e-03 71 | 16 33 1.4888e-03 72 | 16 40 6.5565e-05 73 | 17 26 1.2737e-01 74 | 17 29 4.0552e-02 75 | 17 30 1.0248e-02 76 
| 18 23 8.8966e-04 77 | 18 24 2.7892e-03 78 | 18 27 1.9342e-01 79 | 18 28 3.8211e-02 80 | 18 31 7.2033e-03 81 | 18 32 3.3442e-03 82 | 18 37 8.8863e-05 83 | 18 38 1.8289e-04 84 | 18 39 3.4985e-04 85 | 18 40 5.0928e-04 86 | 18 42 5.8368e-05 87 | 19 26 1.7940e-01 88 | 19 29 3.9621e-04 89 | 19 30 8.6332e-03 90 | 20 26 1.0309e-02 91 | 20 29 5.0768e-03 92 | 20 30 2.9850e-02 93 | 21 26 2.6013e-03 94 | 21 29 3.8919e-02 95 | 22 27 1.2742e-02 96 | 22 28 3.9957e-02 97 | 22 40 1.4106e-03 98 | 22 42 4.3392e-03 99 | 22 61 1.7007e-04 100 | 23 29 1.2407e-04 101 | 23 30 1.3139e-04 102 | 23 35 2.1023e-01 103 | 23 57 2.0713e-04 104 | 24 29 8.2146e-05 105 | 24 34 2.4004e-01 106 | 24 41 2.7964e-02 107 | 24 56 2.2795e-04 108 | 25 33 2.4021e-01 109 | 25 40 2.8015e-02 110 | 25 55 2.2719e-04 111 | 26 31 1.3356e-04 112 | 26 32 2.3811e-01 113 | 26 37 2.0999e-03 114 | 26 38 1.5837e-04 115 | 26 39 2.5461e-02 116 | 26 42 1.5538e-04 117 | 27 34 4.0980e-04 118 | 27 35 8.8677e-03 119 | 27 36 4.1981e-03 120 | 27 41 1.8599e-04 121 | 27 53 2.1974e-04 122 | 28 34 1.0404e-02 123 | 28 35 1.9487e-03 124 | 28 36 2.1486e-03 125 | 28 41 4.1504e-04 126 | 28 52 2.2328e-04 127 | 29 33 9.0913e-03 128 | 29 37 6.7256e-05 129 | 29 38 1.8006e-04 130 | 29 39 1.7555e-04 131 | 29 40 4.1642e-04 132 | 29 51 2.2158e-04 133 | 30 37 1.8760e-04 134 | 30 38 1.7507e-04 135 | 30 39 3.9742e-04 136 | 30 50 2.1004e-04 137 | 33 41 6.4184e-03 138 | 33 48 2.2673e-04 139 | 34 40 6.4329e-03 140 | 34 47 2.2717e-04 141 | 35 39 5.2554e-03 142 | 35 46 2.2709e-04 143 | 36 40 1.2025e-04 144 | 36 42 2.8760e-03 145 | 36 45 2.2703e-04 146 | 39 57 2.1183e-04 147 | 40 56 2.3156e-04 148 | 41 55 2.3165e-04 149 | 41 61 5.8108e-03 150 | 42 54 2.3119e-04 151 | 42 60 3.9038e-03 152 | 43 53 2.3140e-04 153 | 43 59 7.1042e-04 154 | 43 60 9.8885e-01 155 | 44 52 2.2817e-04 156 | 44 59 9.9764e-01 157 | 45 58 9.9940e-01 158 | 46 57 9.9913e-01 159 | 47 56 9.9941e-01 160 | 48 55 9.9908e-01 161 | 49 54 8.1076e-01 162 | 50 54 1.6715e-04 163 | 164 | 
-------------------------------------------------------------------------------- /utils/getpssm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | my $ecut=0.001; 6 | my @AA=qw(A U G C -); 7 | my %AA2index = ('A'=>'1', 'U'=>'2', 'G'=>'3', 'C'=>'4', '-'=>'5'); 8 | 9 | my $seq=$ARGV[0]; 10 | my $aln=$ARGV[1]; 11 | my $outfile=$ARGV[2]; 12 | 13 | my @seq=`cat $seq`; chomp(@seq); 14 | my $len=length $seq[1]; 15 | 16 | #print "parse ...\n"; 17 | my %freq=&wfreq($len, $aln); 18 | 19 | my @nn=split(//, $seq[1]); 20 | open(PRO, ">$outfile"); 21 | for(my $i=1; $i<=$len; $i++) 22 | { 23 | print PRO "$nn[$i-1] "; 24 | foreach my $A(@AA) 25 | { 26 | printf PRO "%6d ", $freq{$i, $A}; 27 | } 28 | 29 | printf PRO "\n"; 30 | } 31 | close(PRO); 32 | 33 | # wfreq($len, $file): read the '>'-headed alignment in $file and return a hash of base counts 34 | # keyed by (1-based query position, base); all counts are zero when the alignment has no entries. 35 | sub wfreq 36 | { 37 | my ($len, $file)=@_; 38 | 39 | my %ALN=(); 40 | my $Pcount=0; 41 | open(ALN,"$file") || die "Cant open $file"; 42 | while(my $line=<ALN>) 43 | { 44 | chomp($line); 45 | if($line =~ /^>(\S+)/) 46 | { 47 | my $Pname=$1; 48 | # my $Evalue= $1 if($line =~ /E=(\S+)/); 49 | # last if($Evalue>$ecut); 50 | $Pcount++; 51 | $ALN{$Pcount, 0}=$Pname; 52 | # $ALN{$Pcount, 1}=$Evalue; 53 | } 54 | else 55 | { 56 | $line =~ s/T/U/g; ###replace T by U 57 | $ALN{$Pcount, 2}=$line; 58 | } 59 | } 60 | close(ALN); 61 | 62 | my %freq=(); 63 | $Pcount=50000 if($Pcount>50000); 64 | printf "%d sequences\n", $Pcount; 65 | if($Pcount >= 1) 66 | { 67 | %freq = &frquency(\%ALN, $Pcount, \%AA2index); 68 | } 69 | else 70 | { 71 | # no aligned entry exists here, so size the zero table from the query length $len 72 | for(my $j=0; $j<$len; $j++) 73 | { 74 | foreach my $key (@AA) 75 | { 76 | $freq{$j+1, $key}=0; 77 | } 78 | } 79 | } 80 | 81 | return %freq; 82 | } 83 | 84 | 85 | sub frquency 86 | { 87 | my ($ALN_ref, $Nseq, $AA_ref)=@_; 88 | my %align = %$ALN_ref; 89 | my %AA2in = %$AA_ref; 90 | 91 | my @Qres = split(//, $align{1, 2}); 92 | my $Ncol = $#Qres; 93 | my %res_count=(); 94 | 95 | 96 | my
$Qresno=0; 97 | my %Qmapping=(); 98 | for(my $j=0; $j<=$#Qres; $j++) 99 | { 100 | $res_count{$j}=0; 101 | if($Qres[$j] ne '-') 102 | { 103 | $Qresno++; 104 | $Qmapping{$Qresno}=$j; 105 | } 106 | } 107 | 108 | 109 | my @ARR=(); 110 | for(my $i=1; $i<=$Nseq; $i++) 111 | { 112 | my @res=split(//, $align{$i, 2}); 113 | for(my $j=0; $j<=$#res; $j++) 114 | { 115 | $ARR[$i][$j]=$res[$j]; 116 | } 117 | } 118 | my $AAcount = keys %AA2in; 119 | my %AA_freq=(); 120 | my %sum_seq_weights=(); 121 | my $k=0; 122 | 123 | for(my $j=0; $j<=$Ncol; $j++) 124 | { 125 | if($Qres[$j] eq '-') 126 | { 127 | next; 128 | } 129 | $k++; 130 | foreach my $key (@AA) 131 | { 132 | $AA_freq{$k, $key}=0; 133 | } 134 | my $w=0; 135 | for(my $i=1; $i<=$Nseq; $i++) 136 | { 137 | my $AAN=""; 138 | 139 | if(!exists $AA2in{$ARR[$i][$j]}) 140 | { 141 | print "replace $ARR[$i][$j] by $ARR[1][$j]\n"; 142 | $AAN=$ARR[1][$j]; #replace nonstandard base in templates by query base 143 | } 144 | else 145 | { 146 | $AAN=$ARR[$i][$j]; 147 | } 148 | 149 | # print "$AAN "; 150 | $AA_freq{$k, $AAN} += 1; ##weighted frequency in clolumn $j 151 | } 152 | #print "\n"; 153 | 154 | } 155 | return %AA_freq; 156 | } 157 | -------------------------------------------------------------------------------- /utils/FreeKnot/ChordModel.pm: -------------------------------------------------------------------------------- 1 | #Chord model of the circle graph representing a primitive pseudoknot. Each chord denotes a unique 2 | #crossing base pair stem in the primitive pseudoknot. If two stems cross, then their corresponding 3 | #chords also cross. Each chord is associated with its underlying base pairs. 
4 | 5 | package ChordModel; 6 | 7 | use strict; 8 | 9 | sub new { 10 | my (undef, $primitive_pseudoknot) = @_; 11 | 12 | my $prim_pseudoknot_stems = $primitive_pseudoknot->[0]; 13 | my $chord_end_point_num_map = _get_chord_end_point_num_map($prim_pseudoknot_stems); 14 | 15 | my ($chord_edges, $all_chord_base_pairs) = ({}, {}); 16 | my ($chord_end_point_nums, $end_point_to_edge_map, $is_left_end_points) = ([], [], []); 17 | 18 | foreach (@{$prim_pseudoknot_stems}) { 19 | my $chord_left_end_point_num = $chord_end_point_num_map->{$_->[0][0]}; 20 | my $chord_right_end_point_num = $chord_end_point_num_map->{$_->[0][1]}; 21 | push @{$chord_end_point_nums}, ($chord_left_end_point_num, $chord_right_end_point_num); 22 | $is_left_end_points->[$chord_left_end_point_num] = 1; 23 | $is_left_end_points->[$chord_right_end_point_num] = 0; 24 | $all_chord_base_pairs->{$chord_left_end_point_num . '-' . $chord_right_end_point_num} = $_; 25 | 26 | my $chord_edge = [$chord_left_end_point_num, $chord_right_end_point_num]; 27 | $chord_edges->{$chord_left_end_point_num . '-' . 
$chord_right_end_point_num} = $chord_edge; 28 | $end_point_to_edge_map->[$chord_left_end_point_num] = $chord_edge; 29 | $end_point_to_edge_map->[$chord_right_end_point_num] = $chord_edge; 30 | } 31 | 32 | my @sorted_chord_end_point_nums = sort {$b <=> $a} @{$chord_end_point_nums}; 33 | 34 | my $self = {}; 35 | $self->{chord_end_point_nums} = \@sorted_chord_end_point_nums; 36 | $self->{chord_edges} = $chord_edges; 37 | $self->{end_point_to_edge_map} = $end_point_to_edge_map; 38 | $self->{is_left_end_points} = $is_left_end_points; 39 | $self->{all_chord_base_pairs} = $all_chord_base_pairs; 40 | 41 | bless $self; 42 | 43 | return $self; 44 | } 45 | 46 | sub _get_chord_end_point_num_map { 47 | my $prim_pseudoknot_stems = shift; 48 | 49 | my $stem_end_points = []; 50 | 51 | foreach (@{$prim_pseudoknot_stems}) { 52 | push @{$stem_end_points}, $_->[0][0]; 53 | push @{$stem_end_points}, $_->[0][1]; 54 | } 55 | 56 | my @sorted_stem_end_points = sort {$a <=> $b} @{$stem_end_points}; 57 | 58 | my $chord_end_point_num_map = {}; 59 | for (my $i = 0; $i < @sorted_stem_end_points; $i++) { 60 | $chord_end_point_num_map->{$sorted_stem_end_points[$i]} = $i + 1; 61 | } 62 | 63 | return $chord_end_point_num_map; 64 | } 65 | 66 | sub get_chord_end_point_nums { 67 | my $self = shift; 68 | 69 | return $self->{chord_end_point_nums}; 70 | } 71 | 72 | sub get_chord_edges { 73 | my $self = shift; 74 | 75 | return $self->{chord_edges}; 76 | } 77 | 78 | sub get_chord_edge_count { 79 | my $self = shift; 80 | 81 | return scalar(keys %{$self->{chord_edges}}); 82 | } 83 | 84 | sub get_chord_edge_by_end_point { 85 | my ($self, $end_point_num) = @_; 86 | 87 | my $end_point_to_edge_map = $self->{end_point_to_edge_map}; 88 | 89 | return $end_point_to_edge_map->[$end_point_num]; 90 | } 91 | 92 | sub is_left_end_point { 93 | my ($self, $end_point_num) = @_; 94 | 95 | my $is_left_end_points = $self->{is_left_end_points}; 96 | 97 | return $is_left_end_points->[$end_point_num]; 98 | } 99 | 100 | sub 
get_chord_base_pairs { 101 | my ($self, $chord_left_end_point, $chord_right_end_point) = @_; 102 | 103 | my $all_chord_base_pairs = $self->{all_chord_base_pairs}; 104 | 105 | return $all_chord_base_pairs->{$chord_left_end_point . '-' . $chord_right_end_point}; 106 | } 107 | 108 | 1; 109 | -------------------------------------------------------------------------------- /utils/FreeKnot/README.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------- 2 | 3 | FreeKnot 4 | 5 | ------------------------------------------------------------------------- 6 | 7 | Authors: Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen 8 | 9 | Last updated on 15 Apr 2014 10 | 11 | 12 | 13 | ------------------------------------------------------------------------- 14 | 15 | Purpose 16 | 17 | 18 | 19 | FreeKnot is a tool for RNA pseudoknot removal. It converts any pseudoknot 20 | 21 | into nested substructures in RNA secondary structures. It removes some 22 | 23 | crossing stems to eliminate crossings based on certain scoring functions 24 | 25 | (details will be provided later in this README file) and reports one or 26 | 27 | more optimized pseudoknot-free structures. 28 | 29 | 30 | 31 | ------------------------------------------------------------------------- 32 | 33 | Platform and pre-requisites 34 | 35 | 36 | 37 | FreeKnot has been tested on various platforms including Linux (Ubuntu), 38 | 39 | Mac OS X and Windows. Perl (v5.14 or later) is recommended. Earlier 40 | 41 | versions might work but without guarantee. Windows users can download 42 | 43 | various Perl distributions for Windows. ViennaRNA package 2.1 is required 44 | 45 | for the free energy scoring function. 
46 | 47 | 48 | 49 | ------------------------------------------------------------------------- 50 | 51 | Program/Module Description 52 | 53 | 54 | 55 | BpseqParser.pm, DPParser.pm - parser to accept bpseq or 56 | 57 | dot-parentheses formats as input 58 | 59 | BpseqWriter.pm, DPWriter.pm - writer to output converted results in 60 | 61 | bpseq or dot-parentheses formats 62 | 63 | ChordModel.pm, CircleGraph.pm - graphical object for primitive 64 | 65 | pseudoknot representation 66 | 67 | MIS.pm - MIS algorithm (for free energy scoring 68 | 69 | function) 70 | MWIS.pm - MWIS algorithm 71 | 72 | ScoringFunctions.pm - scoring functions 73 | 74 | remove_pseudoknot.pl - main program for pseudoknot removal 75 | 76 | PrimitivePseudoknotExtractor.pm - primitive pseudoknot extraction from 77 | 78 | the input secondary structure 79 | 80 | BracketPairs.pm - processing brackets in input secondary 81 | 82 | structure 83 | 84 | 85 | VertexSubset.pm - subset objects for storing graph 86 | vertices in the MIS algorithm 87 | 88 | ------------------------------------------------------------------------- 89 | 90 | Usage 91 | 92 | 93 | 94 | FreeKnot is executed in console. The command is: 95 | 96 | 97 | 98 | perl remove_pseudoknot.pl -i <secondary structure format> 99 | 100 | -s <scoring function> <input file> 101 | 102 | 103 | 104 | Secondary structure format available: dp (dot-parentheses) / bpseq 105 | 106 | The secondary structure format for the output file follows that of the 107 | 108 | input file. So, if the input file is in bpseq format then the output 109 | 110 | file is also in bpseq format. Note that every line of data must end with 111 | 112 | a newline character (i.e. \n). 113 | 114 | 115 | 116 | Scoring function options: bp (# of base pairs) / stem (# of base pair 117 | 118 | stems) / hb (# of hydrogen bonds) / fe (structure overall free energy) 119 | 120 | 121 | 122 | The results are outputted to the console (stdout) by default. They can be 123 | 124 | directed to a file.
For example, 125 | 126 | 127 | 128 | perl remove_pseudoknot.pl -i bpseq -s bp input.bpseq > output.bpseq 129 | 130 | 131 | 132 | ------------------------------------------------------------------------- 133 | -------------------------------------------------------------------------------- /utils/SPOT-RNA2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | from tqdm import tqdm 5 | import argparse 6 | from utils import create_tfr_files, prob_to_secondary_structure 7 | import time 8 | start = time.time() 9 | from argparse import RawTextHelpFormatter 10 | from pathlib import Path 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--inputs', default='inputs/single_seq.fasta', type=str, help='Path to input file in fasta format, accept multiple sequences as well in fasta format; default = ''inputs/2zzm-1-B.fasta''\n', metavar='') 14 | parser.add_argument('--outputs',default='outputs/', type=str, help='Path to output files; SPOT-RNA outputs at least three files .ct, .bpseq, and .prob files; default = ''outputs/\n', metavar='') 15 | parser.add_argument('--gpu', default=1, type=int, help='To run on GPU, specifiy GPU number. 
If only one GPU in computer specifiy 0; default = -1 (no GPU)\n', metavar='') 16 | parser.add_argument('--plots',default=False, type=bool, help='Set this to "True" to get the 2D plots of predicted secondary structure by SPOT-RNA; default = False\n', metavar='') 17 | parser.add_argument('--motifs',default=False, type=bool, help='Set this to "True" to get the motifs of predicted secondary structure by SPOT-RNA; default = False\n', metavar='') 18 | #parser.add_argument('--NC',default=True, type=bool, help='Set this to "False" to predict only canonical pairs; default = True\n', metavar='') 19 | args = parser.parse_args() 20 | 21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 22 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 23 | 24 | base_path = os.path.dirname(os.path.realpath(__file__)) 25 | 26 | create_tfr_files(args) 27 | 28 | with open(args.inputs) as file: 29 | input_data = [line.strip() for line in file.read().splitlines() if line.strip()] 30 | 31 | count = int(len(input_data)/2) 32 | 33 | ids = [input_data[2*i].replace(">", "") for i in range(count)] 34 | sequences = {} 35 | for i,I in enumerate(ids): 36 | sequences[I] = input_data[2*i+1].replace(" ", "").replace("T", "U").upper() 37 | 38 | os.environ["CUDA_VISIBLE_DEVICES"]= str(args.gpu) 39 | #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 40 | NUM_MODELS = 4 41 | 42 | test_loc = [os.path.splitext(args.inputs)[0] + ".tfrecords"] 43 | 44 | outputs = {} 45 | mask = {} 46 | def sigmoid(x): 47 | return 1/(1+np.exp(-np.array(x, dtype=np.float128))) 48 | 49 | #for MODEL in range(NUM_MODELS): 50 | for MODEL in [0, 1, 2, 3]: 51 | #for MODEL in [0, 1, 2, 3]: 52 | print(MODEL) 53 | config = tf.ConfigProto() 54 | #config.gpu_options.allow_growth = True 55 | config.allow_soft_placement=True 56 | config.log_device_placement=False 57 | print('\nPredicting for SPOT-RNA2 model '+str(MODEL)) 58 | with tf.Session(config=config) as sess: 59 | saver = tf.train.import_meta_graph(os.path.join(base_path, 
'models_ckps'+'/model_'+str(MODEL)+'.meta')) 60 | saver.restore(sess, os.path.join(base_path, 'models_ckps'+'/model_'+str(MODEL))) 61 | graph = tf.get_default_graph() 62 | init_test = graph.get_operation_by_name('make_initializer_1') 63 | tmp_out = graph.get_tensor_by_name('output_FC/fully_connected/BiasAdd:0') 64 | name_tensor = graph.get_tensor_by_name('tensors_1/component_0:0') 65 | RNA_name = graph.get_tensor_by_name('IteratorGetNext:0') 66 | label_mask = graph.get_tensor_by_name('IteratorGetNext:4') 67 | sess.run([init_test], feed_dict={name_tensor:test_loc}) 68 | 69 | pbar = tqdm(total = count) 70 | for rna in ids: 71 | out = sess.run([tmp_out,RNA_name,label_mask],feed_dict={'dropout:0':1}) 72 | out[1] = rna 73 | 74 | mask[out[1]] = out[2] 75 | 76 | if MODEL == 0: 77 | outputs[out[1]] = [sigmoid(out[0])] 78 | else: 79 | outputs[out[1]].append(sigmoid(out[0])) 80 | pbar.update(1) 81 | pbar.close() 82 | tf.reset_default_graph() 83 | 84 | 85 | RNA_ids = [i for i in list(outputs.keys())] 86 | ensemble_outputs = {} 87 | 88 | print('\nPost Processing and Saving Output') 89 | for i in RNA_ids: 90 | #print(i, mask[i].shape, len(sequences[i])) 91 | ensemble_outputs[i] = np.mean(outputs[i],0) 92 | prob_to_secondary_structure(ensemble_outputs[i], mask[i], sequences[i], i, args) 93 | 94 | print('\nFinished!') 95 | end = time.time() 96 | print('\nProcesssing Time {} seconds'.format(end - start)) 97 | -------------------------------------------------------------------------------- /utils/FreeKnot/MIS.pm: -------------------------------------------------------------------------------- 1 | #The MIS algorithm module. 
It is an extension of the k-MIS algorithm proposed by Byskov (Byskov, J., 2004) 2 | 3 | package MIS; 4 | 5 | use strict; 6 | 7 | use constant D => 3; 8 | 9 | my $miss; 10 | my $checked_sets; 11 | #get_mis: run the branching search on $circle_graph and return an array ref of the independent sets found (the first argument is ignored) 12 | sub get_mis { 13 | my (undef, $circle_graph) = @_; 14 | 15 | $miss = []; 16 | $checked_sets = {}; 17 | 18 | #Build the initial vertex subset from the circle graph 19 | my $vertex_set = VertexSubset->new($circle_graph); 20 | #Call the branching algorithm _search_mis; every independent set it accepts is pushed onto $miss 21 | _search_mis($vertex_set, [], $circle_graph); 22 | 23 | undef $checked_sets; 24 | 25 | return $miss; 26 | } 27 | #_search_mis: recursive branching step; $candidate_set holds the vertices chosen so far 28 | sub _search_mis { 29 | my ($vertex_subset, $candidate_set, $circle_graph) = @_; 30 | 31 | if ($vertex_subset->get_size() == 0) { 32 | #If the vertex subset is empty, check whether $candidate_set is an independent set. If so, it is 33 | #recorded in $miss; no vertex weight is evaluated here, every distinct independent candidate is 34 | #kept. Since the same subset may appear more than once, $checked_sets stores every subset already 35 | #accepted so that it is neither verified again nor stored twice.
36 | @{$candidate_set} = sort {$a <=> $b} @{$candidate_set}; 37 | my $candidate_set_id = join('-', @{$candidate_set}); 38 | if (!exists($checked_sets->{$candidate_set_id}) && _is_independent_set($candidate_set, $circle_graph)) { 39 | push @{$miss}, $candidate_set; 40 | $checked_sets->{$candidate_set_id} = $candidate_set; 41 | } 42 | } 43 | else { 44 | my ($highest_degree_vertices, $highest_vertex_degree) = $vertex_subset->get_highest_degree_vertex_info(); 45 | #If the highest vertex degree is at least D, select a vertex with such degree to branch 46 | if ($highest_vertex_degree >= D) { 47 | my @self_adj_vertices = (@{$vertex_subset->get_adjacent_vertices_at($highest_degree_vertices->[0])}, $highest_degree_vertices->[0]); 48 | my @expanded_candidate_set = (@{$candidate_set}, $highest_degree_vertices->[0]); 49 | #Branch on by including the selected vertex in $candidate_set 50 | _search_mis($vertex_subset->get_subset(\@self_adj_vertices), \@expanded_candidate_set, $circle_graph); 51 | 52 | #Branch on by just excluding the selected vertex in $candidate_set 53 | _search_mis($vertex_subset->get_subset([$highest_degree_vertices->[0]]), $candidate_set, $circle_graph); 54 | } 55 | #If the highest vertex degree is lower than D, select a vertex with the lowest vertex degree to branch instead 56 | else { 57 | my ($lowest_degree_vertices, undef) = $vertex_subset->get_lowest_degree_vertex_info(); 58 | my $adj_vertices = $vertex_subset->get_adjacent_vertices_at($lowest_degree_vertices->[0]); 59 | my @self_adj_vertices1 = (@{$adj_vertices}, $lowest_degree_vertices->[0]); 60 | my @expanded_candidate_set1 = (@{$candidate_set}, $lowest_degree_vertices->[0]); 61 | #Branch on by including the selected vertex in $candidate_set 62 | _search_mis($vertex_subset->get_subset(\@self_adj_vertices1), \@expanded_candidate_set1, $circle_graph); 63 | 64 | #Branch on by enumerating and including each adjacent vertex of the selected vertex in $candidate_set 65 | foreach (@{$adj_vertices}) { 66 | my 
@expanded_candidate_set2 = (@{$candidate_set}, $_); 67 | my @self_adj_vertices2 = (@{$vertex_subset->get_adjacent_vertices_at($_)}, $_); 68 | _search_mis($vertex_subset->get_subset(\@self_adj_vertices2), \@expanded_candidate_set2, $circle_graph); 69 | } 70 | } 71 | } 72 | } 73 | 74 | sub _is_independent_set { 75 | my ($candidate_set, $circle_graph) = @_; 76 | 77 | my ($all_non_adj_vertex_mask, $candidate_set_bitstrings) = ([], []); 78 | 79 | for (my $i = @{$candidate_set} - 1; $i >= 0; $i--) { 80 | my $non_adj_vertex_mask = $circle_graph->get_non_adj_vertex_mask_at($candidate_set->[$i]); 81 | for (my $j = 0; $j < @{$candidate_set_bitstrings}; $j++) { 82 | if (($candidate_set_bitstrings->[$j] & $non_adj_vertex_mask->[$j]) != $candidate_set_bitstrings->[$j]) { 83 | return 0; 84 | } 85 | } 86 | 87 | my ($vertex_bitstring_segment_num, $vertex_bitstring) = @{$circle_graph->get_vertex_bitstring_segment_at($candidate_set->[$i])}; 88 | $candidate_set_bitstrings->[$vertex_bitstring_segment_num] = $candidate_set_bitstrings->[$vertex_bitstring_segment_num] | $vertex_bitstring; 89 | } 90 | 91 | return 1; 92 | } 93 | 94 | 1; 95 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/temp.a2m: -------------------------------------------------------------------------------- 1 | >6UFJ_A/1-51 Chain A, RNA (50-MER) 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGA---------- 3 | >6UEY_A/1-50 Chain A, RNA (50-MER) 4 | ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCGGAGAGCAG----------- 5 | >HE577054.1/3246821-3246757 Paenibacillus polymyxa M1 main chromosome, complete genome 6 | ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCGUUCACCG----CGGUGAGG 7 | >MF288922.1/150528-150592 Bacillus phage Janet, complete genome 8 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGUAGAGAGG--CCUCUCUAG 9 | >CP033464.1/4485719-4485655 Brevibacillus laterosporus strain 1821L chromosome, complete genome 10 | 
ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCGUUGAGAAG--CUUCUCAGG 11 | >KT307976.1/157679-157741 Bacillus phage AvesoBmore, complete genome 12 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGCGGG 13 | >CP032410.1/870062-870126 Brevibacillus laterosporus strain E7593-50 chromosome, complete genome 14 | ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCGUUGAGGAG--CUUCUCAGG 15 | >MK892513.1/27480-27550 Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome 16 | AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGGCGUAUAG--CUAUACGCG 17 | >MF288921.1/151458-151522 Bacillus phage OTooleKemple52, complete genome 18 | ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGUAGAGAGG--CCUCUCUAG 19 | >KJ489397.1/151758-151822 Bacillus phage CAM003, complete genome 20 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGUAGGGAGG--CCUCUCUAG 21 | >KF669647.1/155754-155816 Bacillus phage BigBertha, complete genome 22 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGUGGG 23 | >CP009278.1/2800251-2800310 Sphingobacterium sp. 
ML3W, complete genome 24 | AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCGCCCCGAUAAUAAUCGGGAG 25 | >CP045298.1/5377890-5377826 Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome 26 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACCGGAUCCGGUGAGG 27 | >KF669662.1/155100-155162 Bacillus phage Spock, complete genome 28 | ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCGUCGCGGAGAUCUCCGCGGG 29 | >KR063281.1/60079-60028 Gordonia phage GMA2, complete genome 30 | ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCGGCC------------GGCG 31 | >KJ489402.1/153758-153819 Bacillus phage Riley, complete genome 32 | ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCGUCGCGGG----C--CCGCG 33 | >CP000154.2/3364238-3364174 Paenibacillus polymyxa E681, complete genome 34 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACUGGA-CCAGUGAGA 35 | >LN852800.1/7754-7693 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110 36 | GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCGUUCCGGUUAUGAUCGGAGG 37 | >CP019039.1/7984-8046 Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence 38 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCGUCCUUCC----GGAAGGGG 39 | >LN852940.1/1904-1844 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268 40 | GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGUCCAGGAUG-AUCCUGGAG 41 | >JN790865.1/35681-35620 Bacillus phage B4, complete genome 42 | AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--UCCCCCGUG 43 | >KY888882.1/156410-156472 Bacillus phage Flapjack, complete genome 44 | ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--CCCUGCG-G 45 | >CP014843.1/29638-29697 Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence 46 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCGUUUCCCUUCUAGGGGAGGU 47 | >CP045906.1/14639513-14639571 Caligus rogercresseyi isolate FCH chromosome 17 48 | UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAGUCUUGCCU--GAGCAAGAG 49 | >HG916826.1/843085-843030 Pseudomonas pseudoalcaligenes CECT 5344 complete genome 50 | 
CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCGAAAUUCGGGGCCG-----G 51 | >XM_028713395.1/30-87 PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA 52 | ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCACAGCGGCAGCACUGCCGCAG 53 | >AC100771.2/133706-133648 Homo sapiens chromosome 11, clone RP11-159H22, complete sequence 54 | GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCUUUAAAAAAAAUUUUUUUAA 55 | >CP022654.2/63818-63880 Bacillus velezensis strain SCDB 291 chromosome, complete genome 56 | AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCGUCCUUCC----GGAAGGGG 57 | >CP045899.1/5107513-5107456 Caligus rogercresseyi isolate FCH chromosome 10 58 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGCUG---AGCAAGAG 59 | >CP010557.1/4528803-4528858 Raoultella ornithinolytica strain S12, complete genome 60 | CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------UCAGGCAGAUCUGCCAGAG 61 | >MH153801.1/58164-58217 Microbacterium phage Count, complete genome 62 | AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCGGAUAGAUG------UAUUG 63 | >CP045896.1/486401-486459 Caligus rogercresseyi isolate FCH chromosome 7 64 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGC--AUGAGCAAGAG 65 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.a2m: -------------------------------------------------------------------------------- 1 | >sample_seq 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 3 | >6UFJ_A/1-51 Chain A, RNA (50-MER) 4 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGA---------- 5 | >6UEY_A/1-50 Chain A, RNA (50-MER) 6 | ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCGGAGAGCAG----------- 7 | >HE577054.1/3246821-3246757 Paenibacillus polymyxa M1 main chromosome, complete genome 8 | ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCGUUCACCG----CGGUGAGG 9 | >MF288922.1/150528-150592 Bacillus phage Janet, complete genome 10 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGUAGAGAGG--CCUCUCUAG 11 | >CP033464.1/4485719-4485655 Brevibacillus 
laterosporus strain 1821L chromosome, complete genome 12 | ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCGUUGAGAAG--CUUCUCAGG 13 | >KT307976.1/157679-157741 Bacillus phage AvesoBmore, complete genome 14 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGCGGG 15 | >CP032410.1/870062-870126 Brevibacillus laterosporus strain E7593-50 chromosome, complete genome 16 | ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCGUUGAGGAG--CUUCUCAGG 17 | >MK892513.1/27480-27550 Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome 18 | AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGGCGUAUAG--CUAUACGCG 19 | >MF288921.1/151458-151522 Bacillus phage OTooleKemple52, complete genome 20 | ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGUAGAGAGG--CCUCUCUAG 21 | >KJ489397.1/151758-151822 Bacillus phage CAM003, complete genome 22 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGUAGGGAGG--CCUCUCUAG 23 | >KF669647.1/155754-155816 Bacillus phage BigBertha, complete genome 24 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGUGGG 25 | >CP009278.1/2800251-2800310 Sphingobacterium sp. 
ML3W, complete genome 26 | AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCGCCCCGAUAAUAAUCGGGAG 27 | >CP045298.1/5377890-5377826 Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome 28 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACCGGAUCCGGUGAGG 29 | >KF669662.1/155100-155162 Bacillus phage Spock, complete genome 30 | ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCGUCGCGGAGAUCUCCGCGGG 31 | >KR063281.1/60079-60028 Gordonia phage GMA2, complete genome 32 | ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCGGCC------------GGCG 33 | >KJ489402.1/153758-153819 Bacillus phage Riley, complete genome 34 | ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCGUCGCGGG----C--CCGCG 35 | >CP000154.2/3364238-3364174 Paenibacillus polymyxa E681, complete genome 36 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACUGGA-CCAGUGAGA 37 | >LN852800.1/7754-7693 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110 38 | GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCGUUCCGGUUAUGAUCGGAGG 39 | >CP019039.1/7984-8046 Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence 40 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCGUCCUUCC----GGAAGGGG 41 | >LN852940.1/1904-1844 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268 42 | GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGUCCAGGAUG-AUCCUGGAG 43 | >JN790865.1/35681-35620 Bacillus phage B4, complete genome 44 | AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--UCCCCCGUG 45 | >KY888882.1/156410-156472 Bacillus phage Flapjack, complete genome 46 | ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--CCCUGCG-G 47 | >CP014843.1/29638-29697 Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence 48 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCGUUUCCCUUCUAGGGGAGGU 49 | >CP045906.1/14639513-14639571 Caligus rogercresseyi isolate FCH chromosome 17 50 | UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAGUCUUGCCU--GAGCAAGAG 51 | >HG916826.1/843085-843030 Pseudomonas pseudoalcaligenes CECT 5344 complete genome 52 | 
CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCGAAAUUCGGGGCCG-----G 53 | >XM_028713395.1/30-87 PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA 54 | ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCACAGCGGCAGCACUGCCGCAG 55 | >AC100771.2/133706-133648 Homo sapiens chromosome 11, clone RP11-159H22, complete sequence 56 | GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCUUUAAAAAAAAUUUUUUUAA 57 | >CP022654.2/63818-63880 Bacillus velezensis strain SCDB 291 chromosome, complete genome 58 | AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCGUCCUUCC----GGAAGGGG 59 | >CP045899.1/5107513-5107456 Caligus rogercresseyi isolate FCH chromosome 10 60 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGCUG---AGCAAGAG 61 | >CP010557.1/4528803-4528858 Raoultella ornithinolytica strain S12, complete genome 62 | CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------UCAGGCAGAUCUGCCAGAG 63 | >MH153801.1/58164-58217 Microbacterium phage Count, complete genome 64 | AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCGGAUAGAUG------UAUUG 65 | >CP045896.1/486401-486459 Caligus rogercresseyi isolate FCH chromosome 7 66 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGC--AUGAGCAAGAG 67 | -------------------------------------------------------------------------------- /utils/FreeKnot/CircleGraph.pm: -------------------------------------------------------------------------------- 1 | #Circle graph is the graphical model for a primitive pseudoknot. Each vertex represents a crossing stem of 2 | #the pseudoknot, and each edge represents a crossing between two stems. The vertex attributes store information 3 | #such as number of base pairs, paired positions. 4 | # 5 | #Every vertex is represented by a unique bitstring and its adjacent vertices are represented by a bitstring 6 | #mask. The least significant bit (LSB) represents the most preceding vertex and the most significant bit (MSB) 7 | #represents the least preceding vertex of the knot-stem graph. However, the no. of vertices may exceed the 8 | #length of one bitstring. 
#To solve this problem, multiple bitstrings are required to form a bitstring long
#enough for each bit position to uniquely identify a vertex. This 'long' bitstring is disassembled into an
#array of bitstrings and each array element is called a bitstring segment. Every bit position of the 'long'
#bitstring is then transformed by a (segment no., segment bitstring) pair.

package CircleGraph;

use strict;

#Constructor.
#  $primitive_pseudoknot - array ref of two array refs: the stems of the pseudoknot (one
#                          vertex per stem) and, per stem, the list of stem ids it crosses
#  $os_bit               - number of vertices packed into one bitstring segment
#Returns a blessed hash ref holding the vertex count, per-vertex attributes, the edge
#lists, the (segment no., segment bitstring) pair of each vertex and, per vertex, the
#mask of the later vertices that are NOT adjacent to it.
sub new {
    my (undef, $primitive_pseudoknot, $os_bit) = @_;

    my $vertex_attrs = [];
#    my ($stem_pair_counts, $gains) = ([], []);
    my ($vertex_bitstring_segments, $non_adj_vertex_masks) = ([], []);

    my ($prim_pseudoknot_stems, $prim_pseudoknot_stem_crossings) = @{$primitive_pseudoknot};
    my $vertex_count = @{$prim_pseudoknot_stems};

    my ($bitstring_segment_num, $vertex_bit) = (0, 0);

    #Vertices are visited from the last to the first, so the last vertex receives bit 0 of
    #segment 0 and every vertex $j > $i already has its segment assigned when $i is handled
    for (my $i = $vertex_count - 1; $i >= 0; $i--) {
        my $prim_pseudoknot_stem = $prim_pseudoknot_stems->[$i];
#        $stem_pair_counts->[$i] = @{$prim_pseudoknot_stem};
#        $gains->[$i] = $stem_pair_counts->[$i];

        $vertex_bitstring_segments->[$i] = [$bitstring_segment_num, 1 << $vertex_bit];
        my $non_adj_vertex_mask_bitstrings = [];

        #The crossing list is consumed from its last element while $j runs downwards.
        #NOTE(review): this only recognises every crossing if the list is in ascending
        #order of stem id - confirm against the code that builds the crossings
        my $stem_crossings = $prim_pseudoknot_stem_crossings->[$i];
        my $next_crossing_index = @{$stem_crossings} - 1;
        my $next_crossing_stem_id;
        if ($next_crossing_index >= 0) {
            $next_crossing_stem_id = $stem_crossings->[$next_crossing_index];
        }

        for (my $j = $vertex_count - 1; $j > $i; $j--) {
            if ($next_crossing_index >= 0 && $j == $next_crossing_stem_id) {
                #Vertex $j crosses vertex $i: leave it out of the mask and move on to
                #the preceding entry of the crossing list
#                $gains->[$i] -= $stem_pair_counts->[$j];
#                $gains->[$j] -= $stem_pair_counts->[$i];
                if (--$next_crossing_index >= 0) {
                    $next_crossing_stem_id = $stem_crossings->[$next_crossing_index];
                }
            }
            else {
                #Vertex $j does not cross vertex $i: set its bit in the mask segment
                #that vertex $j belongs to
                my $non_adj_vertex_bitstring_segment_num = $vertex_bitstring_segments->[$j][0];
                $non_adj_vertex_mask_bitstrings->[$non_adj_vertex_bitstring_segment_num] = $non_adj_vertex_mask_bitstrings->[$non_adj_vertex_bitstring_segment_num] | $vertex_bitstring_segments->[$j][1];
            }
        }

        $non_adj_vertex_masks->[$i] = $non_adj_vertex_mask_bitstrings;

        #Start a new segment once the current one holds $os_bit vertices
        if (++$vertex_bit == $os_bit) {
            $bitstring_segment_num++;
            $vertex_bit = 0;
        }
    }

    #Each vertex keeps the base pairs of its stem under the key 'stem_pairs'
    for (my $i = 0; $i < $vertex_count; $i++) {
        my $attrs = {};
#        $attrs->{pair_count} = $stem_pair_counts->[$i];
#        $attrs->{gain} = $gains->[$i];
        $attrs->{stem_pairs} = $prim_pseudoknot_stems->[$i];
        $vertex_attrs->[$i] = $attrs;
    }

    my $self = {};
    $self->{vertex_count} = $vertex_count;
    $self->{vertex_attrs} = $vertex_attrs;
    $self->{edges} = $prim_pseudoknot_stem_crossings;
    $self->{vertex_bitstring_segments} = $vertex_bitstring_segments;
    $self->{non_adj_vertex_masks} = $non_adj_vertex_masks;

    bless $self;

    return $self;
}

#Return the number of vertices (stems) in the graph
sub get_vertex_count {
    my $self = shift;

    return $self->{vertex_count};
}

#Return the attribute hash ref of the vertex (key 'stem_pairs'), or an empty array ref
#when the vertex number is not below the vertex count
sub get_vertex_attrs_at {
    my ($self, $vertex_num) = @_;

    if ($vertex_num >= $self->{vertex_count}) {
        return [];
    }

    my $vertex_attrs = $self->{vertex_attrs};

    return $vertex_attrs->[$vertex_num];
}

#Return the crossing list passed to the constructor for the vertex, or an empty array
#ref when the vertex number is not below the vertex count
sub get_edges_at {
    my ($self, $vertex_num) = @_;

    if ($vertex_num >= $self->{vertex_count}) {
        return [];
    }

    my $edges = $self->{edges};

    return $edges->[$vertex_num];
}

#Return the bitstring segment of the vertex. Each bitstring segment is a (segment no.,
#segment bitstring) pair.
sub get_vertex_bitstring_segment_at {
    my ($self, $vertex_num) = @_;

    #An out-of-range vertex number yields an empty array ref
    if ($vertex_num >= $self->{vertex_count}) {
        return [];
    }

    my $bitstring_segments = $self->{vertex_bitstring_segments};

    return $bitstring_segments->[$vertex_num];
}

#Returns bitstring segments that filter all the subsequent adjacent vertices
#(an array ref indexed by segment no.), or 0 when the vertex number is out of range
sub get_non_adj_vertex_mask_at {
    my ($self, $vertex_num) = @_;

    if ($vertex_num >= $self->{vertex_count}) {
        return 0;
    }

    my $non_adj_vertex_mask_bitstrings = $self->{non_adj_vertex_masks};

    return $non_adj_vertex_mask_bitstrings->[$vertex_num];
}

1;
--------------------------------------------------------------------------------
/utils/FreeKnot/DPParser.pm:
--------------------------------------------------------------------------------
#Parser for dot-parentheses format
#It returns primitive pseudoknot objects, base sequence and dot-parentheses array

package DPParser;

use strict;

use constant DOT => '.';

#Parse a dot-parentheses file.
#  $dp_file_path - path of the file to read
#Lines made only of letters are appended to the base sequence, lines made of bracket
#symbols, dots and letters are appended to the secondary structure, lines starting with
#'#' or whitespace are skipped and anything else is fatal.
#Returns (primitive pseudoknots, array ref of bases, array ref of structure symbols,
#base sequence string). Dies when the file cannot be opened/closed, when the sequence or
#the structure is missing, or when their lengths differ.
sub parse{
    my (undef, $dp_file_path) = @_;

    my ($base_seq_str, $secondary_structure) = ('', '');

    #Three-argument open with a lexical handle, so the path is never interpreted as a mode
    open(my $dp_fh, '<', $dp_file_path) or die "Cannot open file at $dp_file_path";
    while (<$dp_fh>) {
        if ($_ =~ /^([A-Za-z]+)[\r\n]*$/) {
            $base_seq_str = $base_seq_str . $1;
        }
        elsif ($_ =~ /^([\.\(\)\[\]\{\}<>A-Za-z]+)[\r\n]*$/) {
            $secondary_structure = $secondary_structure . $1;
        }
        elsif ($_ !~ /^#.*/ && $_ !~ /^\s+/) {
            die "Unknown input: $_";
        }
    }

    close $dp_fh or die "Cannot close file at $dp_file_path";

    if ($base_seq_str eq '') {
        die 'Base sequence is missing';
    }

    if ($secondary_structure eq '') {
        die 'Secondary structure is missing';
    }

    if (length($base_seq_str) != length($secondary_structure)) {
        die 'Base sequence length not equal to secondary structure length';
    }

    #Group the base pairs into base pair stems
    my ($stem_outermost_pairs, $stems, $paired_pos_ptrs, $structure_symbols) = _group_to_stems($secondary_structure);
    #Extract primitive pseudoknots from the base pair stems
    my $primitive_pseudoknots = PrimitivePseudoknotExtractor->extract($stem_outermost_pairs, $stems, $paired_pos_ptrs);
    my @base_seq = split(//, $base_seq_str);

    return $primitive_pseudoknots, \@base_seq, $structure_symbols, $base_seq_str;
}

#Walk the structure string once and group its base pairs into stems.
#Positions are 1-based. Returns (array ref of the outermost pair of every stem sorted by
#upstream position, hash ref from outermost upstream position to the stem's pairs,
#array ref mapping each paired position to its partner, array ref of the symbols).
#Dies on a closing bracket without partner or on an open bracket left unpaired.
sub _group_to_stems {
    my $secondary_structure = shift;
    my $stems = {};
    my ($stem_outermost_pairs, $stem, $outermost_base_pair) = ([], [], []);
    my $paired_pos_ptrs = [];
    #One stack of still-open upstream positions per kind of open bracket
    my $unsettled_bracket_upstream_pos = {};
    #Chains every paired position to the paired position that follows it
    my $next_paired_pos = {};
    my $last_paired_pos = 0;

    my @structure_symbols = split(//, $secondary_structure);
    my $structure_length = scalar @structure_symbols;

    for (my $i = 0; $i < $structure_length; $i++) {
        my $symbol = $structure_symbols[$i];
        if ($symbol eq DOT) {
            next;
        }
        elsif (BracketPairs->is_open_bracket($symbol)) {
            my $unsettled_upstream_pos = $unsettled_bracket_upstream_pos->{$symbol};
            if (!defined($unsettled_upstream_pos)) {
                $unsettled_upstream_pos = [];
                $unsettled_bracket_upstream_pos->{$symbol} = $unsettled_upstream_pos;
            }

            my $curr_upstream_pos = $i + 1;
            push @{$unsettled_upstream_pos}, $curr_upstream_pos;

            #An open bracket ends the stem that was being collected
            if (defined($outermost_base_pair->[0])) {
                ($stem_outermost_pairs, $stems, $outermost_base_pair, $stem) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
            }

            $next_paired_pos->{$last_paired_pos} = $curr_upstream_pos;
            $last_paired_pos = $curr_upstream_pos;
        }
        else {
            my $pair_open_bracket = BracketPairs->get_open_bracket($symbol);
            my $unsettled_upstream_pos = $unsettled_bracket_upstream_pos->{$pair_open_bracket};
            if (defined($unsettled_upstream_pos) && defined($unsettled_upstream_pos->[0])) {
                my $paired_upstream_pos = pop @{$unsettled_upstream_pos};
                my $curr_downstream_pos = $i + 1;

                if (defined($outermost_base_pair->[0])) {
                    #The new pair only extends the current stem when the paired position
                    #right after its upstream end is the current outermost upstream end
                    if ($next_paired_pos->{$paired_upstream_pos} != $outermost_base_pair->[0]) {
                        ($stem_outermost_pairs, $stems, $outermost_base_pair, $stem) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
                    }

                    $outermost_base_pair = [$paired_upstream_pos, $curr_downstream_pos];
                    unshift @{$stem}, $outermost_base_pair;
                }
                else {
                    $outermost_base_pair = [$paired_upstream_pos, $curr_downstream_pos];
                    $stem = [$outermost_base_pair];
                }

                $paired_pos_ptrs->[$paired_upstream_pos] = $curr_downstream_pos;
                $paired_pos_ptrs->[$curr_downstream_pos] = $paired_upstream_pos;

                $next_paired_pos->{$last_paired_pos} = $curr_downstream_pos;
                $last_paired_pos = $curr_downstream_pos;
            }
            else {
                die "Closing bracket $symbol not paired\n";
            }
        }
    }

    if (!_is_all_open_bracket_settled($unsettled_bracket_upstream_pos)) {
        die "Unpaired open bracket remains\n";
    }

    #Store the stem still being collected when the structure ends
    if (defined($outermost_base_pair->[0])) {
        ($stem_outermost_pairs, $stems, undef, undef) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
    }

    my @sorted_outermost_pairs = sort {$a->[0] <=> $b->[0]} @{$stem_outermost_pairs};

    return (\@sorted_outermost_pairs, $stems, $paired_pos_ptrs, \@structure_symbols);
}

#Register a finished stem under the upstream position of its outermost pair and hand
#back fresh, empty "current pair" and "current stem" containers
sub _add_to_stems {
    my ($stem_outermost_pairs, $stems, $stem_outermost_pair, $stem) = @_;

    $stems->{$stem_outermost_pair->[0]} = $stem;
    push @{$stem_outermost_pairs}, $stem_outermost_pair;

    return ($stem_outermost_pairs, $stems, [], []);
}

#Return 1 when every stack of open bracket positions is empty, 0 otherwise
sub _is_all_open_bracket_settled {
    my $unsettled_open_bracket_pos = shift;

    foreach (values %{$unsettled_open_bracket_pos}) {
        if (defined($_->[0])) {
            return 0;
        }
    }

    return 1;
}

1;
--------------------------------------------------------------------------------
/utils/FreeKnot/PrimitivePseudoknotExtractor.pm:
--------------------------------------------------------------------------------
#Module that extracts primitive pseudoknots from all the base pair stems of the RNA secondary structure

package PrimitivePseudoknotExtractor;

use strict;

#Entry point.
#  $stem_outermost_pairs - array ref of [upstream, downstream] outermost pairs
#  $stems                - hash ref from outermost upstream position to the stem's pairs
#  $paired_pos_ptrs      - array ref mapping each paired position to its partner
#Returns an array ref with one [stems, stem crossings] pair per primitive pseudoknot.
sub extract {
    my (undef, $stem_outermost_pairs, $stems, $paired_pos_ptrs) = @_;

    #Group together the crossing stems of a pseudoknot
    my ($knotted_pair_pos_groups, $outermost_pair_crossings) = _group_knotted_outermost_pairs($stem_outermost_pairs);
    #Create the pseudoknot objects
    my $primitive_pseudoknots = _get_prim_pseudoknots($stems, $knotted_pair_pos_groups, $outermost_pair_crossings, $paired_pos_ptrs);

    return $primitive_pseudoknots;
}

#Compare every outermost pair with the pairs listed after it and collect the ones that
#cross each other into groups.
#Returns (array ref of groups, each a flat list of paired positions; hash ref from the
#upstream position of a pair to the upstream positions of the later pairs crossing it).
sub _group_knotted_outermost_pairs {
    my $stem_outermost_pairs = shift;

    my $knotted_pair_pos_groups = [];
    my $outermost_pair_crossings = {};
    my $paired_pos_to_group_id = {};
    #Next unused group id; initialised explicitly instead of relying on undef++ being 0
    my $max_group_id = 0;

    my $outermost_pair_count = @{$stem_outermost_pairs};

    for (my $i = 0; $i < $outermost_pair_count; $i++) {
        my ($curr_pair_upstream_pos, $curr_pair_downstream_pos) = @{$stem_outermost_pairs->[$i]};
        my $curr_pair_group_id = $paired_pos_to_group_id->{$curr_pair_upstream_pos};

        my $succ_pair_crossings = [];

        for (my $j = $i + 1; $j < $outermost_pair_count; $j++) {
            my ($candidate_pair_upstream_pos, $candidate_pair_downstream_pos) = @{$stem_outermost_pairs->[$j]};
            #The scan stops at the first pair that opens after the current pair closes
            if ($candidate_pair_upstream_pos > $curr_pair_downstream_pos) {
                last;
            }

            #The candidate opens inside the current pair and closes outside it: a crossing
            if ($candidate_pair_downstream_pos > $curr_pair_downstream_pos) {
                my $crossing_pair_group_id = $paired_pos_to_group_id->{$candidate_pair_upstream_pos};
                if (defined($curr_pair_group_id)) {
                    if (!defined($crossing_pair_group_id)) {
                        #Candidate joins the group of the current pair
                        push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $candidate_pair_upstream_pos;
                        push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $candidate_pair_downstream_pos;
                        $paired_pos_to_group_id->{$candidate_pair_upstream_pos} = $curr_pair_group_id;
                    }
                    elsif ($crossing_pair_group_id != $curr_pair_group_id) {
                        #Both pairs already belong to different groups: merge the
                        #candidate's group into the current one and drop it
                        my @merged_pos_group = (@{$knotted_pair_pos_groups->[$curr_pair_group_id]}, @{$knotted_pair_pos_groups->[$crossing_pair_group_id]});
                        $knotted_pair_pos_groups->[$curr_pair_group_id] = \@merged_pos_group;

                        foreach (@{$knotted_pair_pos_groups->[$crossing_pair_group_id]}) {
                            if (exists($paired_pos_to_group_id->{$_})) {
                                $paired_pos_to_group_id->{$_} = $curr_pair_group_id;
                            }
                        }

                        delete $knotted_pair_pos_groups->[$crossing_pair_group_id];
                    }
                }
                else {
                    if (defined($crossing_pair_group_id)) {
                        #Current pair joins the group of the candidate
                        $curr_pair_group_id = $crossing_pair_group_id;
                        push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $curr_pair_upstream_pos;
                        push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $curr_pair_downstream_pos;
                    }
                    else {
                        #Neither pair is grouped yet: open a new group with both
                        $curr_pair_group_id = $max_group_id++;
                        $knotted_pair_pos_groups->[$curr_pair_group_id] = [$curr_pair_upstream_pos, $curr_pair_downstream_pos, $candidate_pair_upstream_pos, $candidate_pair_downstream_pos];
                        $paired_pos_to_group_id->{$candidate_pair_upstream_pos} = $curr_pair_group_id;
                    }
                }

                push @{$succ_pair_crossings}, $candidate_pair_upstream_pos;
            }
        }

        $outermost_pair_crossings->{$curr_pair_upstream_pos} = $succ_pair_crossings;
    }

    return ($knotted_pair_pos_groups, $outermost_pair_crossings);
}

#Turn every group of knotted positions into a [stems, stem crossings] pair. Stems are
#numbered in ascending order of upstream position; the crossings of a stem list the ids
#of the later stems that cross it.
sub _get_prim_pseudoknots {
    my ($stems, $knotted_pair_pos_groups, $outermost_pair_crossings, $paired_pos_ptrs) = @_;

    my $primitive_pseudoknots = [];

    for (my $i = 0; $i < @{$knotted_pair_pos_groups}; $i++) {
        #Groups merged into another group were deleted and are skipped
        if (!defined($knotted_pair_pos_groups->[$i])) {
            next;
        }

        my @sorted_knot_pair_pos = sort {$a <=> $b} @{$knotted_pair_pos_groups->[$i]};
        my $prev_knot_pair_pos = {};
        for (my $j = 1; $j < @sorted_knot_pair_pos; $j++) {
            $prev_knot_pair_pos->{$sorted_knot_pair_pos[$j]} = $sorted_knot_pair_pos[$j - 1];
        }

        my ($prim_pseudoknot_stems, $prim_pseudoknot_stem) = ([], []);
        my $knot_pair_pos_to_stem_id = {};
        my $max_stem_id = 0;

        for (my $j = 0; $j < (@sorted_knot_pair_pos - 1); $j++) {
            my $curr_pos = $sorted_knot_pair_pos[$j];
            my $curr_paired_pos = $paired_pos_ptrs->[$curr_pos];
            #Only upstream positions start a stem
            if ($curr_pos > $curr_paired_pos) {
                next;
            }

            my @merged_stem = (@{$prim_pseudoknot_stem}, @{$stems->{$curr_pos}});
            $prim_pseudoknot_stem = \@merged_stem;

            #The stem is closed unless the next knotted position pairs with the knotted
            #position right before the partner of the current one
            my $next_pos = $sorted_knot_pair_pos[$j + 1];
            my $next_paired_pos = $paired_pos_ptrs->[$next_pos];
            if ($prev_knot_pair_pos->{$curr_paired_pos} != $next_paired_pos) {
                push @{$prim_pseudoknot_stems}, $prim_pseudoknot_stem;
                $knot_pair_pos_to_stem_id->{$curr_pos} = $max_stem_id++;
                $prim_pseudoknot_stem = [];
            }
        }

        my $prim_pseudoknot_stem_crossings = [];
        while (my ($knot_pair_upstream_pos, $stem_id) = each %{$knot_pair_pos_to_stem_id}) {
            my $stem_crossings = [];
            my $knot_pair_crossings = $outermost_pair_crossings->{$knot_pair_upstream_pos};
            foreach (@{$knot_pair_crossings}) {
                if (exists($knot_pair_pos_to_stem_id->{$_})) {
                    push @{$stem_crossings}, $knot_pair_pos_to_stem_id->{$_};
                }
            }

            $prim_pseudoknot_stem_crossings->[$stem_id] = $stem_crossings;
        }

        push @{$primitive_pseudoknots}, [$prim_pseudoknot_stems, $prim_pseudoknot_stem_crossings];
    }

    return $primitive_pseudoknots;
}

1;
--------------------------------------------------------------------------------
/utils/FreeKnot/VertexSubset.pm:
--------------------------------------------------------------------------------
#Module that represents the vertex subset in the MWIS algorithm. All the vertices of the knot-stem
#graph are added to this subset (with the goal opposing vertices filtered) at initialization. When
#the MWIS algorithm proceeds, vertices are gradually removed from this subset and the algorithm
#stops when this subset is empty.
#
#This subset also keeps the adjacent vertices for each vertex in it, as well as the vertex degrees.
#It enables the MWIS algorithm to select the highest degree and lowest degree vertices, and to
#further generate a new subset of it while updating the adjacent vertices and vertex degrees.

package VertexSubset;

use strict;

#Create the initial subset, which contains every vertex of the circle graph.
#  $circle_graph - object answering get_vertex_count() and get_edges_at($vertex)
#Every edge reported by get_edges_at is recorded in both directions and counted in the
#degree of both of its ends.
sub new {
    my (undef, $circle_graph) = @_;

    my (%degree_of, %neighbours_of);
    my $vertex_total = 0;

    #Vertices are visited from the last one down to vertex 0
    foreach my $vertex (reverse 0 .. $circle_graph->get_vertex_count() - 1) {
        $degree_of{$vertex} = 0;

        foreach my $neighbour (@{$circle_graph->get_edges_at($vertex)}) {
            $degree_of{$vertex}++;
            $degree_of{$neighbour}++;
            $neighbours_of{$vertex}{$neighbour} = 1;
            $neighbours_of{$neighbour}{$vertex} = 1;
        }

        $vertex_total++;
    }

    return _build_instance($vertex_total, \%degree_of, \%neighbours_of);
}

#Generate a new subset instance by removing the vertices specified in the input
#  $vertices_to_remove - array ref of vertex numbers to leave out
#Degrees and adjacency sets of the new instance only count edges between kept vertices.
sub get_subset {
    my ($self, $vertices_to_remove) = @_;

    my %is_removed = map {$_ => 1} @{$vertices_to_remove};

    my (%kept_degree_of, %kept_neighbours_of);
    my $kept_total = 0;

    #Every surviving vertex starts with degree 0 and an empty adjacency set
    foreach my $vertex (keys %{$self->{vertex_degrees}}) {
        next if exists($is_removed{$vertex});

        $kept_degree_of{$vertex} = 0;
        $kept_neighbours_of{$vertex} = {};
        $kept_total++;
    }

    #Copy each surviving edge once, taking it from its lower numbered end
    my $all_neighbours = $self->{adj_vertex_sets};
    foreach my $vertex (keys %{$all_neighbours}) {
        next if exists($is_removed{$vertex});

        foreach my $neighbour (keys %{$all_neighbours->{$vertex}}) {
            next unless $vertex < $neighbour && !exists($is_removed{$neighbour});

            $kept_neighbours_of{$vertex}{$neighbour} = 1;
            $kept_neighbours_of{$neighbour}{$vertex} = 1;
            $kept_degree_of{$vertex}++;
            $kept_degree_of{$neighbour}++;
        }
    }

    return _build_instance($kept_total, \%kept_degree_of, \%kept_neighbours_of);
}

#Assemble and bless an instance from its size, degree table and adjacency sets
sub _build_instance {
    my ($size, $degree_of, $neighbours_of) = @_;

    my ($highest_vertices, $lowest_vertices, $highest_degree, $lowest_degree) = _get_highest_and_lowest_degree_vertices($degree_of);

    my $instance = {
        subset_size             => $size,
        vertex_degrees          => $degree_of,
        adj_vertex_sets         => $neighbours_of,
        highest_degree_vertices => $highest_vertices,
        lowest_degree_vertices  => $lowest_vertices,
        highest_vertex_degree   => $highest_degree,
        lowest_vertex_degree    => $lowest_degree,
    };

    return bless $instance, __PACKAGE__;
}

#Find the vertices of highest and of lowest degree in a degree table.
#Returns (sorted highest degree vertices, sorted lowest degree vertices, highest degree,
#lowest degree); both degrees are -1 and both lists empty for an empty table.
sub _get_highest_and_lowest_degree_vertices {
    my $degree_of = shift;

    my (@top_vertices, @bottom_vertices);
    my $top_degree = -1;
    my $bottom_degree = -1;

    foreach my $vertex (keys %{$degree_of}) {
        my $degree = $degree_of->{$vertex};

        if ($degree > $top_degree) {
            @top_vertices = ($vertex);
            $top_degree = $degree;
        }
        elsif ($degree == $top_degree) {
            push @top_vertices, $vertex;
        }

        #A negative lowest degree means that no vertex has been seen yet
        if ($degree < $bottom_degree || $bottom_degree < 0) {
            @bottom_vertices = ($vertex);
            $bottom_degree = $degree;
        }
        elsif ($degree == $bottom_degree) {
            push @bottom_vertices, $vertex;
        }
    }

    return [sort {$a <=> $b} @top_vertices], [sort {$a <=> $b} @bottom_vertices], $top_degree, $bottom_degree;
}

#Number of vertices in the subset
sub get_size {
    my ($self) = @_;

    return $self->{subset_size};
}

#Vertices of the subset in ascending numeric order
sub get_vertices {
    my ($self) = @_;

    return [sort {$a <=> $b} keys %{$self->{vertex_degrees}}];
}

#Vertices adjacent to the given vertex in ascending numeric order; an empty list when
#the vertex has no adjacency set
sub get_adjacent_vertices_at {
    my ($self, $vertex) = @_;

    my $neighbours = $self->{adj_vertex_sets}{$vertex};
    return [] unless exists($self->{adj_vertex_sets}{$vertex});

    return [sort {$a <=> $b} keys %{$neighbours}];
}

#(sorted highest degree vertices, highest degree)
sub get_highest_degree_vertex_info {
    my ($self) = @_;

    return $self->{highest_degree_vertices}, $self->{highest_vertex_degree};
}

#(sorted lowest degree vertices, lowest degree)
sub get_lowest_degree_vertex_info {
    my ($self) = @_;

    return $self->{lowest_degree_vertices}, $self->{lowest_vertex_degree};
}

1;
--------------------------------------------------------------------------------
/utils/FreeKnot/MWIS.pm:
--------------------------------------------------------------------------------
#Modified circle graph MWIS algorithm based on that proposed by Valiente (Valiente, G., 2003), with
#enhancement suggested by Nash et al. (Nash, N., Lelait, S., and Gregg, D., 2009). It operates with
#the chord model and reports either single solution or all solutions according to the user option.

package MWIS;

use strict;

#Compute the optimal weighted independent set(s) of chords.
#  $chord_model      - object answering get_chord_edge_count, is_left_end_point,
#                      get_chord_edge_by_end_point, get_chord_end_point_nums,
#                      get_chord_edges and get_chord_base_pairs
#  $base_seq         - passed through unchanged to the scoring function
#  $scoring_function - code ref called as $scoring_function->($chord_attrs, $base_seq)
#  $criteria         - 'max' or 'min': whether a larger or a smaller total score wins
#  $is_report_all    - when true, candidates that tie with the best score are kept too
#Returns an array ref of solutions, each an array ref of chord edges.
sub get_mwis {
    my (undef, $chord_model, $base_seq, $scoring_function, $criteria, $is_report_all) = @_;

    my $chord_weights = _get_chord_weights($chord_model, $base_seq, $scoring_function);

    #Every chord has two end points
    my $end_point_count = $chord_model->get_chord_edge_count() * 2;

    #Enhancement by Nash et al. to get MWISs (in variable c) and the scores (in variable cmis) in
    #every region bounded by the endpoints of each chord.
    my ($m, $p) = ([], []);
    my ($cmis, $c) = ({}, {});

    #$m holds a score and $p a list of right end points per position; [0] marks "no chord"
    for (my $i = 1; $i <= $end_point_count + 1; $i++) {
        $m->[$i] = 0;
        $p->[$i] = [0];
    }

    my $last = 1;

    for (my $i = 1; $i <= $end_point_count; $i++) {
        #Only right end points trigger work
        if ($chord_model->is_left_end_point($i)) {
            next;
        }

        my ($left_end_point, $right_end_point) = @{$chord_model->get_chord_edge_by_end_point($i)};

        for (my $j = $last; $j > $left_end_point; $j--) {
            #Default: inherit score and choice from the position to the right
            $m->[$j] = $m->[$j + 1];
            $p->[$j] = $p->[$j + 1];

            if ($chord_model->is_left_end_point($j)) {
                #Candidate: take the chord starting at $j together with what follows it
                my (undef, $inner_right_end_point) = @{$chord_model->get_chord_edge_by_end_point($j)};
                my $candidate_m = $m->[$inner_right_end_point + 1] + $cmis->{$j . '-' . $inner_right_end_point};

                if (($criteria eq 'max' && $candidate_m > $m->[$j]) ||
                    ($criteria eq 'min' && $candidate_m < $m->[$j])) {
                    $m->[$j] = $candidate_m;
                    $p->[$j] = [$inner_right_end_point];
                }
                elsif ($is_report_all && $candidate_m == $m->[$j]) {
                    #Tie: keep the new choice in front of a copy of the inherited ones
                    my @arr_clone = @{$p->[$j + 1]};
                    $p->[$j] = [$inner_right_end_point];
                    push @{$p->[$j]}, @arr_clone;
                }
            }
        }

        #Score and chord sets of the region enclosed by the current chord, keyed "left-right"
        $cmis->{$left_end_point . '-' . $right_end_point} = $m->[$left_end_point + 1] + $chord_weights->{$left_end_point . '-' . $right_end_point};
        $c->{$left_end_point . '-' . $right_end_point} = _add_front($p, $left_end_point + 1, $chord_model, []);
        $last = $left_end_point;
    }

    #Algorithm proposed by Valiente to obtain MWISs starting at each endpoint. Only those chords
    #in the MWIS that are not bounded by other chords in the same MWIS set are stored.
    my ($t_structures, $t_struct_weights) = ([], []);

    #NOTE(review): entries at $_ + 1 are read below, so get_chord_end_point_nums presumably
    #yields the end points in descending order - confirm against ChordModel
    foreach (@{$chord_model->get_chord_end_point_nums()}) {
        $t_structures->[$_] = [[]];

        if (!$chord_model->is_left_end_point($_)) {
            #A right end point inherits from the next end point, or starts at weight 0
            if ($_ < $end_point_count) {
                @{$t_structures->[$_]} = @{$t_structures->[$_ + 1]};
                $t_struct_weights->[$_] = $t_struct_weights->[$_ + 1];
            }
            else {
                $t_struct_weights->[$_] = 0;
            }
        }
        else {
            #Weight obtained by taking the chord starting here plus what follows its right end
            my $chord_edge = $chord_model->get_chord_edge_by_end_point($_);
            my $candidate_total_chord_weight = $cmis->{$chord_edge->[0] . '-' . $chord_edge->[1]};

            if ($chord_edge->[1] < $end_point_count) {
                $candidate_total_chord_weight += $t_struct_weights->[$chord_edge->[1] + 1];
            }

            if (($criteria eq 'max' && $candidate_total_chord_weight > $t_struct_weights->[$_ + 1]) ||
                ($criteria eq 'min' && $candidate_total_chord_weight < $t_struct_weights->[$_ + 1]) ||
                ($candidate_total_chord_weight == $t_struct_weights->[$_ + 1] && $is_report_all)) {
                my $generated_new_t_structures;

                #On a tie the structures that skip this chord are kept as well
                if ($candidate_total_chord_weight == $t_struct_weights->[$_ + 1]) {
                    @{$generated_new_t_structures} = @{$t_structures->[$_ + 1]};
                }
                else {
                    $generated_new_t_structures = [];
                }

                #Put the chord in front of every structure that starts after its right end
                if ($chord_edge->[1] < $end_point_count) {
                    foreach my $t_structure (@{$t_structures->[$chord_edge->[1] + 1]}) {
                        my @new_t_structure = @{$t_structure};
                        unshift @new_t_structure, $chord_edge;
                        push @{$generated_new_t_structures}, \@new_t_structure;
                    }
                }
                else {
                    push @{$generated_new_t_structures}, [$chord_edge];
                }

                $t_structures->[$_] = $generated_new_t_structures;
                $t_struct_weights->[$_] = $candidate_total_chord_weight;
            }
            else {
                $t_structures->[$_] = $t_structures->[$_ + 1];
                $t_struct_weights->[$_] = $t_struct_weights->[$_ + 1];
            }
        }
    }

    #Expand the outer chords found from end point 1 with the chord sets stored in $c
    my $mwiss = _restore_chord_mwiss($t_structures->[1], $c);

    return $mwiss;
}

#Generate all the MWISs in the region bounded by the endpoints of a single chord
#  $p             - per-position lists of chosen right end points ([0] means none)
#  $start_pos     - position whose choices are followed
#  $org_c_element - array ref of the chord sets built so far
#Follows the choices recursively and returns the expanded array ref of chord sets.
sub _add_front {
    my ($p, $start_pos, $chord_model, $org_c_element) = @_;

    my $p_element = $p->[$start_pos];

    if ($p_element->[0] > 0) {
        my $new_c_element = [];

        foreach (@{$p_element}) {
            my $chord_edge = $chord_model->get_chord_edge_by_end_point($_);
            my $expanded_c_element = [];

            if (!defined($org_c_element->[0])) {
                #Nothing built yet: start one set made of this chord
                push @{$expanded_c_element}, [$chord_edge];
            }
            else {
                #Append the chord to a copy of every set built so far
                foreach my $element_value (@{$org_c_element}) {
                    my @arr_clone = @{$element_value};
                    push @arr_clone, $chord_edge;
                    push @{$expanded_c_element}, \@arr_clone;
                }
            }

            my $new_values = _add_front($p, $_, $chord_model, $expanded_c_element);
            push @{$new_c_element}, @{$new_values};
        }

        return $new_c_element;
    }

    return $org_c_element;
}

#Score every chord of the model with the scoring function.
#Returns a hash ref keyed "left-right" (the two end points of the chord).
sub _get_chord_weights {
    my ($chord_model, $base_seq, $scoring_function) = @_;

    my $chord_weights = {};

    foreach (values %{$chord_model->get_chord_edges()}) {
        my $chord_base_pairs = $chord_model->get_chord_base_pairs($_->[0], $_->[1]);
        #The scoring function receives the base pairs of the chord and their count
        my $chord_attrs = {};
        $chord_attrs->{base_pairs} = $chord_base_pairs;
        $chord_attrs->{pair_count} = @{$chord_base_pairs};
        $chord_weights->{$_->[0] . '-' . $_->[1]} = $scoring_function->($chord_attrs, $base_seq);
    }

    return $chord_weights;
}

#Recover the MWISs from the chord sets in variable c
#  $chord_edge_sets - array ref of chord sets to expand
#  $c               - hash ref keyed "left-right" holding the chord sets stored per chord
#Each chord that has stored sets is combined with every one of them (recursively), so a
#set with alternatives yields one result per combination.
sub _restore_chord_mwiss {
    my ($chord_edge_sets, $c) = @_;

    my $chord_mwiss = [];

    foreach my $chord_edge_set (@{$chord_edge_sets}) {
        my $single_chord_edge_set_mwiss = [$chord_edge_set];

        foreach my $chord_edge (@{$chord_edge_set}) {
            my $inner_chord_edge_sets = $c->{$chord_edge->[0] . '-' . $chord_edge->[1]};
            #Nothing stored for this chord: nothing to add
            if (!defined($inner_chord_edge_sets->[0])) {
                next;
            }

            my $inner_chord_mwiss = _restore_chord_mwiss($inner_chord_edge_sets, $c);
            my @org_single_chord_edge_set_mwiss = @{$single_chord_edge_set_mwiss};
            $single_chord_edge_set_mwiss = [];

            #Cross product of the results so far with the inner alternatives
            foreach my $single_chord_edge_set_mwis (@org_single_chord_edge_set_mwiss) {
                foreach my $inner_chord_mwis (@{$inner_chord_mwiss}) {
                    my @merged_mwis = (@{$single_chord_edge_set_mwis}, @{$inner_chord_mwis});
                    push @{$single_chord_edge_set_mwiss}, \@merged_mwis;
                }
            }
        }

        push @{$chord_mwiss}, @{$single_chord_edge_set_mwiss};

    }

    return $chord_mwiss;
}

1;
--------------------------------------------------------------------------------
/utils/FreeKnot/remove_pseudoknot.pl:
--------------------------------------------------------------------------------
#Main program for pseudoknot removal
#It accepts input RNA secondary structure as BPSEQ format or dot-parentheses format
#There are four choices of scoring functions: No. of base pairs, no. of stems, no. of hydrogen
#bonds, and Turner free energy (Turner, D. H. & Mathews, D. H., NAR 2009)). The optimization goal
#for the first three options is to maximize the score as all the choices only give positive values.
#For the last option, the goal is to minimize the score (i.e. free energy).

#!/usr/bin/perl

use BpseqParser;
use BpseqWriter;
use BracketPairs;
use ChordModel;
use CircleGraph;
use DPParser;
use DPWriter;
use MIS;
use MWIS;
use PrimitivePseudoknotExtractor;
use ScoringFunctions;
use VertexSubset;
use strict;

#OS_BIT specifies the length of a bitstring used in the circle graph
use constant OS_BIT => 32;

#Format flag, format, scoring flag, scoring option and input path make five arguments
if (@ARGV < 5) {
    exit_with_error("Usage: perl $0 -i <bpseq|dp> -s <scoring function> <input file path> [-a : report all optimal solutions]\n");
}

my ($input_file_path, $input_file_format, $scoring_fx_option);
my $is_report_all = 0;

for (my $i = 0; $i < @ARGV; $i++) {
    if ($ARGV[$i] eq '-i') {
        if (defined($input_file_format)) {
            exit_with_error("Duplicated input file format specification\n");
        }
        else {
            $input_file_format = $ARGV[++$i];
        }
    }
    elsif ($ARGV[$i] eq '-s') {
        if (defined($scoring_fx_option)) {
            exit_with_error("Duplicated scoring function specification\n");
        }
        else {
            $scoring_fx_option = $ARGV[++$i];
        }
    }
    elsif ($ARGV[$i] eq '-a') {
        $is_report_all = 1;
    }
    elsif (substr($ARGV[$i], 0, 1) eq '-') {
        exit_with_error("Unknown parameter $ARGV[$i]\n");
    }
    #The first argument that is not an option is the input path; later ones are ignored
    elsif (!defined($input_file_path)) {
        $input_file_path = $ARGV[$i];
    }
}

if (!defined($input_file_path)) {
    exit_with_error("No input file path specified\n");
}

#Select the scoring function according to the user option. It will be used to calculate the score of
#each stem in the MWIS algorithm
my ($scoring_function, $criteria, $is_fe) = ScoringFunctions->get_scoring_function($scoring_fx_option);
if (!defined($scoring_function)) {
    exit_with_error("Unknown scoring function specified: $scoring_fx_option\n");
}

my ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count, $structure_symbols, $base_seq_str);

#Parse the input structure file to generate pseudoknot objects
if ($input_file_format eq 'bpseq') {
    ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count) = BpseqParser->parse($input_file_path);
}
elsif ($input_file_format eq 'dp') {
    ($primitive_pseudoknots, $base_seq, $structure_symbols, $base_seq_str) = DPParser->parse($input_file_path);
}
else {
    exit_with_error("Unknown input file format: $input_file_format\n");
}

my $pseudoknot_base_pair_removal_pos = [];
my $prim_pseudoknot_count = 0;

#If free energy is selected as the scoring function, then MIS algorithm is applied to generate
#all MISs of the circle graph, and evaluated the free energy for each of them
if ($is_fe) {
    foreach (@{$primitive_pseudoknots}) {
        my $circle_graph = CircleGraph->new($_, OS_BIT);
        my $miss = MIS->get_mis($circle_graph, $criteria);
        my $base_pair_removal_pos = convert_to_base_pair_removal_pos_circle_graph($circle_graph, $miss);
        push @{$pseudoknot_base_pair_removal_pos}, $base_pair_removal_pos;
        $prim_pseudoknot_count++;
    }
}
#For other scoring function options, MWIS algorithm is applied to generate one/all MWISs from
#the chord model of the circle graph
else{
    foreach (@{$primitive_pseudoknots}) {
        my $chord_model = ChordModel->new($_);
        my $mwiss = MWIS->get_mwis($chord_model, $base_seq, $scoring_function, $criteria, $is_report_all);
        my $base_pair_removal_pos = convert_to_base_pair_removal_pos($chord_model, $mwiss);
        push @{$pseudoknot_base_pair_removal_pos}, $base_pair_removal_pos;
        $prim_pseudoknot_count++;
    }
}

#Combine the possible removal positions sets for all primitive pseudoknots
my $combined_base_pair_removal_pos = combine_base_pair_removal_pos($pseudoknot_base_pair_removal_pos, []);

#Determine the free energy of every structure converted from the MISs combinations of different
#primitive pseudoknots in the structure. It writes the structure to a temporary file and call
#RNAeval in ViennaRNA package to calculate its free energy
if ($is_fe) {
    my $mfe;
    my $mfe_base_pair_models = [];

    if (!defined($base_seq_str)) {
        $base_seq_str = join('', @{$base_seq});
    }

    foreach (@{$combined_base_pair_removal_pos}) {
        DPWriter->output_mfe_candidate($_, $paired_pos_ptrs, $structure_symbols, $base_seq_str);
        my $rna_eval_output = `RNAeval < MWIS_temp.dp`;
        #Without a number in the output (RNAeval missing or failed) no energy can be
        #compared, so stop instead of silently picking an arbitrary candidate
        if ($rna_eval_output !~ /(-?\d+\.\d+)/) {
            die "Cannot read a free energy from the RNAeval output\n";
        }

        my $free_energy = $1;
        if (!defined($mfe) || $free_energy < $mfe) {
            $mfe_base_pair_models = [$_];
            $mfe = $free_energy;
        }
        elsif ($free_energy == $mfe) {
            push @{$mfe_base_pair_models}, $_;
        }
    }

    $combined_base_pair_removal_pos = $mfe_base_pair_models;
}

if ($input_file_format eq 'bpseq') {
    BpseqWriter->output_results($combined_base_pair_removal_pos, $base_seq, $paired_pos_ptrs, $base_count);
}
elsif ($input_file_format eq 'dp') {
    DPWriter->output_results($combined_base_pair_removal_pos, $structure_symbols, $base_seq_str);
}

#Report a usage or input error on STDERR and stop with a non-zero exit status, so that
#the message cannot end up in a redirected result file and callers can detect the failure
sub exit_with_error {
    my $message = shift;

    print STDERR $message;
    exit 1;
}

#Convert every MIS (an ascending list of kept vertex numbers) of a circle graph into the
#set of positions whose base pairs have to be removed.
#Returns an array ref with one hash ref (position => 1) per MIS.
sub convert_to_base_pair_removal_pos_circle_graph {
    my ($circle_graph, $miss) = @_;

    my $base_pair_removal_pos = [];

    foreach my $mis (@{$miss}) {
        #Every vertex that is not listed in the MIS is removed: those before its first
        #entry, those between consecutive entries and those after its last entry
        my $removed_vertex_nums = [];
        for (my $i = 0; $i < $mis->[0]; $i++) {
            push @{$removed_vertex_nums}, $i;
        }

        for (my $i = 1; $i < @{$mis}; $i++) {
            for (my $j = $mis->[$i - 1] + 1; $j < $mis->[$i]; $j++) {
                push @{$removed_vertex_nums}, $j;
            }
        }

        for (my $i = $mis->[-1] + 1; $i < $circle_graph->get_vertex_count(); $i++) {
            push @{$removed_vertex_nums}, $i;
        }

        my $removal_pos = {};
        foreach (@{$removed_vertex_nums}) {
            my $vertex_attrs = $circle_graph->get_vertex_attrs_at($_);
            my $stem_pairs = $vertex_attrs->{stem_pairs};
            foreach (@{$stem_pairs}) {
                my ($pair_upstream_pos, $pair_downstream_pos) = @{$_};
                $removal_pos->{$pair_upstream_pos} = 1;
                $removal_pos->{$pair_downstream_pos} = 1;
            }
        }

        push @{$base_pair_removal_pos}, $removal_pos;
    }

    return $base_pair_removal_pos;
}

#Convert every MWIS (a list of kept chord edges) of a chord model into the set of
#positions whose base pairs have to be removed.
#Returns an array ref with one hash ref (position => 1) per MWIS.
sub convert_to_base_pair_removal_pos {
    my ($chord_model, $mwiss) = @_;

    my $base_pair_removal_pos = [];

    foreach my $mwis (@{$mwiss}) {
        #Start from a copy of all chords and drop the ones kept by the MWIS
        my %removed_chord_edges = %{$chord_model->get_chord_edges()};
        foreach (@{$mwis}) {
            delete $removed_chord_edges{$_->[0] . '-' . $_->[1]};
        }

        my $removal_pos = {};
        foreach my $removed_chord_edge (values %removed_chord_edges) {
            my $removed_chord_base_pairs = $chord_model->get_chord_base_pairs($removed_chord_edge->[0], $removed_chord_edge->[1]);
            foreach (@{$removed_chord_base_pairs}) {
                $removal_pos->{$_->[0]} = 1;
                $removal_pos->{$_->[1]} = 1;
            }
        }

        push @{$base_pair_removal_pos}, $removal_pos;
    }

    return $base_pair_removal_pos;
}

#Build every combination that takes one removal set from each primitive pseudoknot.
#  $pseudoknot_base_pair_removal_pos - array ref with the alternative removal sets of
#                                      each pseudoknot; it is emptied by this function
#  $combined_base_pair_removal_pos   - combinations built so far ([] on the first call)
#Returns an array ref of merged removal sets.
sub combine_base_pair_removal_pos {
    my ($pseudoknot_base_pair_removal_pos, $combined_base_pair_removal_pos) = @_;

    my $expanded_base_pair_removal_pos = [];
    my $base_pair_removal_pos = pop @{$pseudoknot_base_pair_removal_pos};
    foreach my $removal_pos (@{$base_pair_removal_pos}) {
        if (defined($combined_base_pair_removal_pos->[0])) {
            foreach (@{$combined_base_pair_removal_pos}) {
                my %expanded_removal_pos = (%{$removal_pos}, %{$_});
                push @{$expanded_base_pair_removal_pos}, \%expanded_removal_pos;
            }
        }
        else {
            push @{$expanded_base_pair_removal_pos}, $removal_pos;
        }
    }

    #Recurse until the alternatives of every pseudoknot have been consumed
    if (defined($pseudoknot_base_pair_removal_pos->[0])) {
        $expanded_base_pair_removal_pos = combine_base_pair_removal_pos($pseudoknot_base_pair_removal_pos, $expanded_base_pair_removal_pos);
    }

    return $expanded_base_pair_removal_pos;
}
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.log_gremlin:
--------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------
# GREMLIN_CPP v1.0
# ---------------------------------------------------------------------------------------------
# -i /home/jaswinder/github/SPOT-RNA2/sample_run/sample_seq_features/sample_seq.a2m
# -o 
/home/jaswinder/github/SPOT-RNA2/sample_run/sample_seq_features/sample_seq.dca 6 | # --------------------------------------------------------------------------------------------- 7 | # -only_neff 0 8 | # -only_v 0 9 | # -gap_cutoff 0.5 10 | # -alphabet rna 11 | # -eff_cutoff 0.8 12 | # -lambda 0.01 13 | # --------------------------------------------------------------------------------------------- 14 | # -min_type lbfgs 15 | # -max_iter 100 16 | # --------------------------------------------------------------------------------------------- 17 | # removing 3 out of 61 positions with >= 50% gaps! 18 | # SEQ ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG 19 | # CUT ACUCGUUUGAGCGAGUAUAAACAGCU-GUUAAGCUCAAAGCGGAGAGCAG--CUGCUCUCG 20 | # NC 58 21 | # NEFF 16.9 22 | # learning MRF ... 23 | # lbfgs::iter S_S fx: 1577.57 gnorm: 79.1061 24 | # lbfgs::iter 0_1 fx: 1569.68 gnorm: 78.7645 25 | # lbfgs::iter 1_1 fx: 867.926 gnorm: 14.9869 26 | # lbfgs::iter 2_1 fx: 835.903 gnorm: 7.66255 27 | # lbfgs::iter 3_1 fx: 823.585 gnorm: 6.19229 28 | # lbfgs::iter 4_1 fx: 814.088 gnorm: 5.88803 29 | # lbfgs::iter 5_1 fx: 808.359 gnorm: 3.65714 30 | # lbfgs::iter 6_1 fx: 804.819 gnorm: 2.33437 31 | # lbfgs::iter 7_1 fx: 801.784 gnorm: 2.29087 32 | # lbfgs::iter 8_1 fx: 801.007 gnorm: 3.56033 33 | # lbfgs::iter 9_1 fx: 800.087 gnorm: 0.639645 34 | # lbfgs::iter 10_1 fx: 800.041 gnorm: 0.287188 35 | # lbfgs::iter 11_1 fx: 800.021 gnorm: 0.235963 36 | # lbfgs::iter 12_1 fx: 799.996 gnorm: 0.228575 37 | # lbfgs::iter 13_1 fx: 799.98 gnorm: 0.446223 38 | # lbfgs::iter 14_1 fx: 799.957 gnorm: 0.200702 39 | # lbfgs::iter 15_1 fx: 799.942 gnorm: 0.176329 40 | # lbfgs::iter 16_1 fx: 799.93 gnorm: 0.171215 41 | # lbfgs::iter 17_1 fx: 799.93 gnorm: 0.42462 42 | # lbfgs::iter 18_1 fx: 799.916 gnorm: 0.110312 43 | # lbfgs::iter 19_1 fx: 799.914 gnorm: 0.0797637 44 | # lbfgs::iter 20_1 fx: 799.91 gnorm: 0.0900997 45 | # lbfgs::iter 21_1 fx: 799.907 gnorm: 0.112634 46 | # lbfgs::iter 22_1 
fx: 799.906 gnorm: 0.106627 47 | # lbfgs::iter 23_1 fx: 799.905 gnorm: 0.0325551 48 | # lbfgs::iter 24_1 fx: 799.905 gnorm: 0.0234513 49 | # lbfgs::iter 25_1 fx: 799.905 gnorm: 0.0211741 50 | # lbfgs::iter 26_1 fx: 799.905 gnorm: 0.0347908 51 | # lbfgs::iter 27_1 fx: 799.905 gnorm: 0.00985535 52 | # lbfgs::iter 28_1 fx: 799.905 gnorm: 0.00879822 53 | # lbfgs::iter 29_1 fx: 799.904 gnorm: 0.00837304 54 | # lbfgs::iter 30_1 fx: 799.904 gnorm: 0.0161944 55 | # lbfgs::iter 31_1 fx: 799.904 gnorm: 0.00567908 56 | # lbfgs::iter 32_1 fx: 799.904 gnorm: 0.00464024 57 | # lbfgs::iter 33_1 fx: 799.904 gnorm: 0.00515186 58 | # lbfgs::iter 34_1 fx: 799.904 gnorm: 0.00769589 59 | # lbfgs::iter 35_1 fx: 799.904 gnorm: 0.00417999 60 | # lbfgs::iter 36_1 fx: 799.904 gnorm: 0.00196562 61 | # lbfgs::iter 37_1 fx: 799.904 gnorm: 0.00138669 62 | # lbfgs::iter 38_1 fx: 799.904 gnorm: 0.00217301 63 | # lbfgs::iter 39_1 fx: 799.904 gnorm: 0.00122576 64 | # lbfgs::iter 40_1 fx: 799.904 gnorm: 0.000992969 65 | # lbfgs::iter 41_1 fx: 799.904 gnorm: 0.00110079 66 | # lbfgs::iter 42_1 fx: 799.904 gnorm: 0.00177581 67 | # lbfgs::iter 43_1 fx: 799.904 gnorm: 0.000827648 68 | # lbfgs::iter 44_1 fx: 799.904 gnorm: 0.000548524 69 | # lbfgs::iter 45_1 fx: 799.904 gnorm: 0.000489395 70 | # lbfgs::iter 46_1 fx: 799.904 gnorm: 0.000693037 71 | # lbfgs::iter 47_1 fx: 799.904 gnorm: 0.00017789 72 | # lbfgs::iter 48_1 fx: 799.904 gnorm: 0.000143922 73 | # lbfgs::iter 49_1 fx: 799.904 gnorm: 0.000167766 74 | # lbfgs::iter 50_1 fx: 799.904 gnorm: 0.000338155 75 | # lbfgs::iter 51_1 fx: 799.904 gnorm: 0.000158961 76 | # lbfgs::iter 52_1 fx: 799.904 gnorm: 0.000106748 77 | # lbfgs::iter 53_1 fx: 799.904 gnorm: 0.000105824 78 | # lbfgs::iter 54_1 fx: 799.904 gnorm: 0.000298927 79 | # lbfgs::iter 55_1 fx: 799.904 gnorm: 7.52617e-05 80 | # lbfgs::iter 56_1 fx: 799.904 gnorm: 5.87916e-05 81 | # lbfgs::iter 57_1 fx: 799.904 gnorm: 6.59898e-05 82 | # lbfgs::iter 58_1 fx: 799.904 gnorm: 0.000150251 83 | # 
lbfgs::iter 59_1 fx: 799.904 gnorm: 4.73333e-05 84 | # lbfgs::iter 60_1 fx: 799.904 gnorm: 3.42972e-05 85 | # lbfgs::iter 61_1 fx: 799.904 gnorm: 3.77292e-05 86 | # lbfgs::iter 62_1 fx: 799.904 gnorm: 3.80927e-05 87 | # lbfgs::iter 63_1 fx: 799.904 gnorm: 9.36524e-05 88 | # lbfgs::iter 64_1 fx: 799.904 gnorm: 1.61026e-05 89 | # lbfgs::iter 65_1 fx: 799.904 gnorm: 1.07761e-05 90 | # lbfgs::iter 66_1 fx: 799.904 gnorm: 8.87304e-06 91 | # lbfgs::iter 67_1 fx: 799.904 gnorm: 1.4495e-05 92 | # lbfgs::iter 68_1 fx: 799.904 gnorm: 7.46466e-06 93 | # lbfgs::iter 69_1 fx: 799.904 gnorm: 6.47399e-06 94 | # lbfgs::iter 70_1 fx: 799.904 gnorm: 7.70032e-06 95 | # lbfgs::iter 71_1 fx: 799.904 gnorm: 1.43926e-05 96 | # lbfgs::iter 72_1 fx: 799.904 gnorm: 3.22099e-06 97 | # lbfgs::iter 73_1 fx: 799.904 gnorm: 2.48525e-06 98 | # lbfgs::iter 74_1 fx: 799.904 gnorm: 2.60497e-06 99 | # lbfgs::iter 75_1 fx: 799.904 gnorm: 3.34597e-06 100 | # lbfgs::iter 76_1 fx: 799.904 gnorm: 7.18017e-06 101 | # lbfgs::iter 77_1 fx: 799.904 gnorm: 1.53372e-06 102 | # lbfgs::iter 78_1 fx: 799.904 gnorm: 1.04848e-06 103 | # lbfgs::iter 79_1 fx: 799.904 gnorm: 9.07634e-07 104 | # lbfgs::iter 80_1 fx: 799.904 gnorm: 1.613e-06 105 | # lbfgs::iter 81_1 fx: 799.904 gnorm: 7.12023e-07 106 | # lbfgs::iter 82_1 fx: 799.904 gnorm: 5.6185e-07 107 | # lbfgs::iter 83_1 fx: 799.904 gnorm: 5.75372e-07 108 | # lbfgs::iter 84_1 fx: 799.904 gnorm: 1.19944e-06 109 | # lbfgs::iter 85_1 fx: 799.904 gnorm: 4.0767e-07 110 | # lbfgs::iter 86_1 fx: 799.904 gnorm: 2.73253e-07 111 | # lbfgs::iter 87_1 fx: 799.904 gnorm: 2.46659e-07 112 | # lbfgs::iter 88_1 fx: 799.904 gnorm: 4.38814e-07 113 | # lbfgs::iter 89_1 fx: 799.904 gnorm: 2.32778e-07 114 | # lbfgs::iter 90_1 fx: 799.904 gnorm: 1.52613e-07 115 | # lbfgs::iter 91_1 fx: 799.904 gnorm: 1.24444e-07 116 | # lbfgs::iter 92_1 fx: 799.904 gnorm: 1.97241e-07 117 | # lbfgs::iter 93_1 fx: 799.904 gnorm: 8.6033e-08 118 | # lbfgs::iter 94_1 fx: 799.904 gnorm: 7.09053e-08 119 | # 
lbfgs::iter 95_1 fx: 799.904 gnorm: 6.83742e-08 120 | # lbfgs::iter 96_1 fx: 799.904 gnorm: 1.44261e-07 121 | # lbfgs::iter 97_1 fx: 799.904 gnorm: 3.08042e-08 122 | # lbfgs::iter 98_1 fx: 799.904 gnorm: 2.62992e-08 123 | # lbfgs::iter 99_1 fx: 799.904 gnorm: 2.98886e-08 124 | # lbfgs::iter S_S fx: 799.904 gnorm: 152.499 125 | # lbfgs::iter 0_1 fx: 784.797 gnorm: 149.682 126 | # lbfgs::iter 1_1 fx: 395.893 gnorm: 133.288 127 | # lbfgs::iter 2_2 fx: 364.519 gnorm: 99.8387 128 | # lbfgs::iter 3_1 fx: 298.457 gnorm: 29.6813 129 | # lbfgs::iter 4_1 fx: 287.945 gnorm: 19.0757 130 | # lbfgs::iter 5_1 fx: 282.982 gnorm: 11.4838 131 | # lbfgs::iter 6_1 fx: 280.631 gnorm: 10.3144 132 | # lbfgs::iter 7_1 fx: 279.448 gnorm: 6.3234 133 | # lbfgs::iter 8_1 fx: 279.008 gnorm: 3.7695 134 | # lbfgs::iter 9_1 fx: 278.802 gnorm: 2.4342 135 | # lbfgs::iter 10_1 fx: 278.677 gnorm: 2.46252 136 | # lbfgs::iter 11_1 fx: 278.606 gnorm: 2.49599 137 | # lbfgs::iter 12_1 fx: 278.556 gnorm: 1.34479 138 | # lbfgs::iter 13_1 fx: 278.511 gnorm: 1.29845 139 | # lbfgs::iter 14_1 fx: 278.466 gnorm: 1.43844 140 | # lbfgs::iter 15_1 fx: 278.428 gnorm: 3.48674 141 | # lbfgs::iter 16_1 fx: 278.349 gnorm: 1.66639 142 | # lbfgs::iter 17_1 fx: 278.274 gnorm: 1.79046 143 | # lbfgs::iter 18_1 fx: 278.147 gnorm: 2.82733 144 | # lbfgs::iter 19_1 fx: 277.925 gnorm: 4.0464 145 | # lbfgs::iter 20_1 fx: 277.712 gnorm: 7.464 146 | # lbfgs::iter 21_1 fx: 277.334 gnorm: 3.37071 147 | # lbfgs::iter 22_1 fx: 277.034 gnorm: 3.46683 148 | # lbfgs::iter 23_1 fx: 276.762 gnorm: 3.62724 149 | # lbfgs::iter 24_2 fx: 276.697 gnorm: 3.55826 150 | # lbfgs::iter 25_1 fx: 276.443 gnorm: 2.13056 151 | # lbfgs::iter 26_2 fx: 276.428 gnorm: 1.77109 152 | # lbfgs::iter 27_1 fx: 276.376 gnorm: 1.13688 153 | # lbfgs::iter 28_1 fx: 276.358 gnorm: 0.928894 154 | # lbfgs::iter 29_1 fx: 276.357 gnorm: 1.96084 155 | # lbfgs::iter 30_1 fx: 276.342 gnorm: 0.534038 156 | # lbfgs::iter 31_1 fx: 276.338 gnorm: 0.423256 157 | # lbfgs::iter 32_1 
fx: 276.33 gnorm: 0.588926 158 | # lbfgs::iter 33_1 fx: 276.323 gnorm: 0.630135 159 | # lbfgs::iter 34_1 fx: 276.312 gnorm: 1.41765 160 | # lbfgs::iter 35_1 fx: 276.293 gnorm: 0.710723 161 | # lbfgs::iter 36_1 fx: 276.277 gnorm: 0.761555 162 | # lbfgs::iter 37_1 fx: 276.25 gnorm: 1.13114 163 | # lbfgs::iter 38_1 fx: 276.24 gnorm: 2.68521 164 | # lbfgs::iter 39_1 fx: 276.209 gnorm: 1.1845 165 | # lbfgs::iter 40_1 fx: 276.183 gnorm: 0.777926 166 | # lbfgs::iter 41_1 fx: 276.163 gnorm: 0.93568 167 | # lbfgs::iter 42_1 fx: 276.145 gnorm: 1.4975 168 | # lbfgs::iter 43_1 fx: 276.134 gnorm: 0.83518 169 | # lbfgs::iter 44_1 fx: 276.129 gnorm: 0.427227 170 | # lbfgs::iter 45_1 fx: 276.125 gnorm: 0.378219 171 | # lbfgs::iter 46_1 fx: 276.119 gnorm: 0.444098 172 | # lbfgs::iter 47_1 fx: 276.115 gnorm: 0.874621 173 | # lbfgs::iter 48_1 fx: 276.109 gnorm: 0.501375 174 | # lbfgs::iter 49_1 fx: 276.102 gnorm: 0.515069 175 | # lbfgs::iter 50_1 fx: 276.094 gnorm: 0.662954 176 | # lbfgs::iter 51_1 fx: 276.079 gnorm: 1.44839 177 | # lbfgs::iter 52_1 fx: 276.055 gnorm: 1.06933 178 | # lbfgs::iter 53_1 fx: 276.023 gnorm: 0.856106 179 | # lbfgs::iter 54_1 fx: 275.997 gnorm: 1.6488 180 | # lbfgs::iter 55_1 fx: 275.976 gnorm: 0.904621 181 | # lbfgs::iter 56_1 fx: 275.967 gnorm: 0.634115 182 | # lbfgs::iter 57_1 fx: 275.956 gnorm: 0.598754 183 | # lbfgs::iter 58_1 fx: 275.949 gnorm: 0.767746 184 | # lbfgs::iter 59_1 fx: 275.942 gnorm: 0.506698 185 | # lbfgs::iter 60_1 fx: 275.935 gnorm: 0.478146 186 | # lbfgs::iter 61_1 fx: 275.93 gnorm: 0.839587 187 | # lbfgs::iter 62_1 fx: 275.922 gnorm: 0.640863 188 | # lbfgs::iter 63_1 fx: 275.895 gnorm: 0.832216 189 | # lbfgs::iter 64_1 fx: 275.881 gnorm: 1.28579 190 | # lbfgs::iter 65_1 fx: 275.865 gnorm: 0.934848 191 | # lbfgs::iter 66_1 fx: 275.833 gnorm: 0.930706 192 | # lbfgs::iter 67_1 fx: 275.815 gnorm: 1.23511 193 | # lbfgs::iter 68_1 fx: 275.793 gnorm: 0.828008 194 | # lbfgs::iter 69_1 fx: 275.777 gnorm: 0.945493 195 | # lbfgs::iter 70_1 fx: 
275.769 gnorm: 0.724122 196 | # lbfgs::iter 71_1 fx: 275.765 gnorm: 0.422678 197 | # lbfgs::iter 72_1 fx: 275.761 gnorm: 0.263507 198 | # lbfgs::iter 73_1 fx: 275.759 gnorm: 0.253831 199 | # lbfgs::iter 74_1 fx: 275.758 gnorm: 0.341821 200 | # lbfgs::iter 75_1 fx: 275.758 gnorm: 0.102256 201 | # lbfgs::iter 76_1 fx: 275.758 gnorm: 0.0740558 202 | # lbfgs::iter 77_1 fx: 275.758 gnorm: 0.0552766 203 | # lbfgs::iter 78_1 fx: 275.758 gnorm: 0.170435 204 | # lbfgs::iter 79_1 fx: 275.757 gnorm: 0.0399481 205 | # lbfgs::iter 80_1 fx: 275.757 gnorm: 0.0298459 206 | # lbfgs::iter 81_1 fx: 275.757 gnorm: 0.0316006 207 | # lbfgs::iter 82_1 fx: 275.757 gnorm: 0.0479131 208 | # lbfgs::iter 83_1 fx: 275.757 gnorm: 0.0287621 209 | # lbfgs::iter 84_1 fx: 275.757 gnorm: 0.0368384 210 | # lbfgs::iter 85_1 fx: 275.757 gnorm: 0.0438442 211 | # lbfgs::iter 86_1 fx: 275.757 gnorm: 0.146457 212 | # lbfgs::iter 87_1 fx: 275.757 gnorm: 0.05263 213 | # lbfgs::iter 88_1 fx: 275.757 gnorm: 0.0397041 214 | # lbfgs::iter 89_1 fx: 275.757 gnorm: 0.0548509 215 | # lbfgs::iter 90_1 fx: 275.757 gnorm: 0.0667815 216 | # lbfgs::iter 91_1 fx: 275.757 gnorm: 0.182811 217 | # lbfgs::iter 92_1 fx: 275.757 gnorm: 0.0391792 218 | # lbfgs::iter 93_1 fx: 275.757 gnorm: 0.0289307 219 | # lbfgs::iter 94_1 fx: 275.757 gnorm: 0.0297646 220 | # lbfgs::iter 95_1 fx: 275.757 gnorm: 0.0807598 221 | # lbfgs::iter 96_1 fx: 275.757 gnorm: 0.0366649 222 | # lbfgs::iter 97_1 fx: 275.757 gnorm: 0.0289475 223 | # lbfgs::iter 98_1 fx: 275.757 gnorm: 0.0366467 224 | # lbfgs::iter 99_1 fx: 275.757 gnorm: 0.0515997 225 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SPOT-RNA2 2 | Improved RNA Secondary Structure and Tertiary Base-pairing Prediction using Evolutionary Profile, Mutational Coupling and Two-dimensional Transfer Learning. 
3 | 4 | ## Contents 5 | 6 | * [Introduction](#introduction) 7 | * [Results](#results) 8 | * [System Requirments](#system-requirments) 9 | * [Installation](#installation) 10 | * [Usage](#Usage) 11 | * [Datasets](#datasets) 12 | * [Citation guide](#citation-guide) 13 | * [Licence](#licence) 14 | * [Contact](#contact) 15 | 16 | ## Introduction 17 | 18 | The recent discovery of numerous non-coding RNAs (long non-coding RNAs, in particular) has transformed our perception about the roles of RNAs in living organisms. Our ability to understand them, however, is hampered by our inability to solve their secondary and tertiary structures in high resolution efficiently by existing experimental techniques. Computational prediction of RNA secondary structure, on the other hand, has received much-needed improvement, recently, through deep learning of a large approximate data, followed by transfer learning with gold-standard base-pairing structures from high-resolution 3-D structures. Here, we expand this single-sequence-based learning to the use of evolutionary profiles and mutational coupling. 19 | 20 | |![](./docs/SPOTRNA2_pipeline.png) 21 | |----| 22 | |

Figure 1: (A) Inputted one dimensional (1-D) and two dimensional (2-D) features employed in SPOT-RNA2 (L is the RNA sequence length; BP is base-pair; CSS is consensus secondary structure). (B) An example of the model architecture of SPOT-RNA2. (C) The schematic diagram for model pre-training by the bpRNA data set (TR0) and transfer learning by PDB data set (TR1).| 23 | 24 | Results 25 | ---- 26 | The new method allows large improvement not only in canonical base-pairs (RNA secondary structures) but more so in base-pairing associated with tertiary interactions such as pseudoknots, noncanonical and lone base-pairs. In particular, it is highly accurate for those RNAs of more than 1000 homologous sequences by achieving > 0.8 F1-score (harmonic mean of sensitivity and precision) for 14/16 RNAs tested. The method can also significantly improve base-pairing prediction by incorporating artificial but functional homologous sequences generated from deep mutational scanning without any modification. The fully automatic method should provide the scientific community a new powerful tool to capture not only the secondary structure but also tertiary base-pairing information for building three-dimensional models. It also highlights the future of accurately solving the base-pairing structure by using a large number of natural and/or artificial homologous sequences. 27 | 28 | 29 | |![](./docs/benchmark_results.png) 30 | |----| 31 | |

Figure 2: Distribution of F1-scores for individual RNAs on the combined test sets TS1, TS2, and TS3 given by various methods as labeled. On each box, the central mark indicates the median, and the bottom and top edges of the box indicate the 25th and 75th percentiles, respectively. The outliers are plotted individually by using the “+” symbol.| 32 | 33 | 34 | ## System Requirments 35 | 36 | **Hardware Requirements:** 37 | It is recommended that your system should have 32 GB RAM, 500 GB disk space to support the in-memory operations for RNA sequence length less than 500. Multiple CPU threads are also recommended as the MSA generating process is computationally expensive. 38 | 39 | **Software Requirements:** 40 | * [Python3.6](https://docs.python-guide.org/starting/install3/linux/) 41 | * [Perl-5.4 or later](https://www.perl.org/get.html) 42 | * [virtualenv](https://virtualenv.pypa.io/en/latest/installation/) or [Anaconda](https://anaconda.org/anaconda/virtualenv) 43 | * [CUDA 10.0](https://developer.nvidia.com/cuda-10.0-download-archive) (Optional if using GPU) 44 | * [cuDNN (>= 7.4.1)](https://developer.nvidia.com/cudnn) (Optional if using GPU) 45 | * [Docker](https://docs.docker.com/engine/install/) (Optional if running SPOT-RNA2 through docker image) 46 | 47 | SPOT-RNA2 has been tested on Ubuntu 14.04, 16.04, and 18.04 operating systems. 48 | 49 | 50 | ## Installation 51 | 52 | ### Installation using Docker image: 53 | 54 | The following command can be used to install SPOT-RNA2 and its dependencies: 55 | 56 | 1. `git clone https://github.com/jaswindersingh2/SPOT-RNA2.git && cd SPOT-RNA2` 57 | 58 | 2. `docker image build -t spot_rna2 .` 59 | 60 | ### Manual installation: 61 | 62 | To install SPOT-RNA2 and its dependencies, the following commands can be used in the terminal: 63 | 64 | 65 | 1. `git clone https://github.com/jaswindersingh2/SPOT-RNA2.git && cd SPOT-RNA2` 66 | 2. 
`wget -O utils/models_ckps.tar.xz 'https://www.dropbox.com/s/udzcsva76lh5wvq/models_ckps.tar.xz' || wget -O utils/models_ckps.tar.xz 'https://app.nihaocloud.com/f/586acb2658d74ccb92b8/?dl=1'` 67 | 3. `tar -xvf utils/models_ckps.tar.xz -C utils/ && rm utils/models_ckps.tar.xz` 68 | 4. `sudo apt install cpanminus && sudo cpanm Graph && sudo apt install gawk` 69 | 70 | Based on the virtual environment package manager you have (**virtualenv** or **conda**), follow the steps below:
71 | 72 | | |                       virtualenv |                                      conda | 73 | | :- | :-------- | :--- | 74 | | 5. | `virtualenv -p python3.6 venv` | `conda create -n venv python=3.6` | 75 | | 6. | `source ./venv/bin/activate` | `conda activate venv` | 76 | | 7. | `pip install -r requirements.txt && deactivate` | `while read p; do conda install --yes $p; done < requirements.txt && conda deactivate` | 77 | 78 | If you have **Infernal** already installed, please set `binaries/` directory path of **Infernal** installation in line 12 of the `run_spotrna2.sh`. Otherwise, follow commands below to install **Infernal** tool. If you run into issue, please follow the [link](http://eddylab.org/infernal/) for more info. 79 | 80 | 8. `wget 'eddylab.org/infernal/infernal-1.1.3-linux-intel-gcc.tar.gz' && tar -xvzf infernal-*.tar.gz && rm infernal-*.tar.gz` 81 | 82 | If you have **BLASTN** already installed, please set `bin/` directory path of **BLASTN** installation in line 10 of the `run_spotrna2.sh`. Otherwise, follow commands below to install **BLASTN** tool. If you run into issue, please follow the [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download) for more info. 83 | 84 | 9. `wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*+-x64-linux.tar.gz' && tar -xvzf ncbi-blast-*+-x64-linux.tar.gz && rm ncbi-blast-*+-x64-linux.tar.gz` 85 | 86 | To install the **SPOT-RNA** predictor, follow the commands below:
87 | 88 | 10. `git clone https://github.com/jaswindersingh2/SPOT-RNA.git && cd SPOT-RNA` 89 | 11. `wget 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1'` 90 | 12. `tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz && cd ../` 91 | 92 | To install the DCA predictor, follow the commands below:
93 | 94 | 13. `git clone "https://github.com/sokrypton/GREMLIN_CPP" && cd GREMLIN_CPP && g++ -O3 -std=c++0x -o gremlin_cpp gremlin_cpp.cpp -fopenmp && cd ../` 95 | 96 | To install the LinearPartition, follow the commands below:
97 | 98 | 14. `git clone 'https://github.com/LinearFold/LinearPartition.git' && cd LinearPartition/ && make && cd ../` 99 | 100 | If NCBI's nt database is already available on your system, please set the path to the database directory in lines 11 and 13 of the `run_spotrna2.sh` file. Otherwise, use the following command to download it. It can take a few hours for the download to finish, depending on your internet speed. If you run into an issue, please follow the [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download) for more info. 101 | 102 | 15. `wget -c "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" -O ./nt_database/nt.gz && gunzip ./nt_database/nt.gz` 103 | 104 | The database needs to be formatted before it can be used with **BLASTN**. Please use the command below to format the database.
105 | 106 | 16. `./ncbi-blast-*+/bin/makeblastdb -in ./nt_database/nt -dbtype nucl` 107 | 108 | 109 | ## Usage 110 | 111 | ### Run SPOT-RNA2 using docker: 112 | 113 | `docker run --rm -ti -v $(pwd)/sample_run:/SPOT-RNA2/sample_run -v $(pwd)/nt_database/:/SPOT-RNA2/nt_database spot_rna2:latest ./run_spotrna2.sh sample_run/6ufj.fasta` 114 | 115 | ### Run SPOT-RNA2 using Manual installation: 116 | ``` 117 | ./run_spotrna2.sh sample_run/6ufj.fasta 118 | ``` 119 | 120 | Both commands create two folders, `6ufj_features` and `6ufj_outputs`, in the input file directory (`sample_run/` in this case). `6ufj_features/` contains all the alignments (MSA-1, MSA-2) and features (PSSM, DCA, bps probability) generated by the SPOT-RNA2 pipeline. `6ufj_outputs/` contains the predicted secondary structure in bpseq format (`6ufj.bpseq`), ct format (`6ufj.ct`), dbn format (`6ufj.st`) with secondary structure motifs, and base-pair probability (`6ufj.prob`). To verify the results, compare them with the contents of the `sample_seq_features` and `sample_seq_outputs` folders; they should be the same because both sequences (`sample_seq.fasta` and `6ufj.fasta`) are the same. 
121 | 122 | ## Datasets 123 | 124 | The following datasets were used for Initial Training: 125 | * bpRNA[1]: Initial Learning (Training TR0, validation VL0, and test TS0) 126 | [Dropbox](https://www.dropbox.com/s/sg1p1otsqnaqev8/bpRNA_dataset.tar.xz) or [Nihao Cloud](https://app.nihaocloud.com/f/6f7b456d874c4842b8ac/) 127 | 128 | 129 | The following datasets were used for Transfer Learning: 130 | * PDB[2]: Transfer Learning (Training TR1, validation VL1, and testsets TS1, TS2, TS3, and TS_hard) 131 | [Dropbox](https://www.dropbox.com/s/apqrsl7hm1091ie/PDB_dataset.tar.xz) or [Nihao Cloud](https://app.nihaocloud.com/f/f301baed4dac4474a185/) 132 | 133 | ## Citation guide 134 | 135 | **If you use SPOT-RNA2 for your research please cite the following papers:** 136 | 137 | Jaswinder Singh, Kuldip Paliwal, Tongchuan Zhang, Jaspreet Singh, Thomas Litfin, Yaoqi Zhou, Improved RNA Secondary Structure and Tertiary Base-pairing Prediction Using Evolutionary Profile, Mutational Coupling and Two-dimensional Transfer Learning, Bioinformatics, 2021;, btab165, https://doi.org/10.1093/bioinformatics/btab165 138 | 139 | **If you use SPOT-RNA2 data sets and/or input feature pipeline, please consider citing the following papers:** 140 | 141 | [1] Zhang, T., Singh, J., Litfin, T., Zhan, J., Paliwal, K. and Zhou, Y., 2020. RNAcmap: A Fully Automatic Method for Predicting Contact Maps of RNAs by Evolutionary Coupling Analysis. bioRxiv. 142 | 143 | [2] Zhang, H., Zhang, L., Mathews, D.H. and Huang, L., 2020. LinearPartition: linear-time approximation of RNA folding partition function and base-pairing probabilities. Bioinformatics, 36(Supplement_1), pp.i258-i267. 144 | 145 | [3] Singh, J., Hanson, J., Paliwal, K. and Zhou, Y., 2019. RNA secondary structure prediction using an ensemble of two-dimensional deep neural networks and transfer learning. Nature communications, 10(1), pp.1-13. 146 | 147 | [4] Nawrocki, E.P. and Eddy, S.R., 2013. Infernal 1.1: 100-fold faster RNA homology searches. 
# ------------- one hot encoding of RNA sequences -----------------#
def one_hot(seq):
    """Return a one-hot encoding of an RNA sequence.

    Args:
        seq: String (or iterable of single-character symbols) of bases;
            matching is case-insensitive.

    Returns:
        Integer array of shape ``(len(seq), 4)`` with columns ordered
        ``A, U, C, G``. The row of any symbol outside that alphabet is
        filled with ``-1``. An empty sequence gives a ``(0, 4)`` array
        instead of raising.
    """
    alphabet = 'AUCG'
    column_of = {base: col for col, base in enumerate(alphabet)}

    feat = np.zeros((len(seq), len(alphabet)), dtype=int)
    for row, base in enumerate(seq):
        col = column_of.get(str(base).upper())
        if col is None:
            feat[row, :] = -1
        else:
            feat[row, col] = 1
    return feat


def z_mask(seq_len):
    """Return a ``(seq_len, seq_len)`` mask that is 1 where ``j - i >= 2``."""
    return np.triu(np.ones((seq_len, seq_len)), 2)


def l_mask(inp, seq_len):
    """Return a ``(seq_len, seq_len)`` mask that is 1 strictly above the diagonal.

    ``inp`` is not used; the parameter is kept so existing callers keep working.
    """
    return np.triu(np.ones((seq_len, seq_len)), 1)


def get_data(seq, rna_id, args):
    """Assemble the pairwise input features for one RNA.

    Reads ``<stem>.pssm`` (required), ``<stem>.prob`` and ``<stem>.dca``
    (both optional), where ``<stem>`` is ``args.inputs`` without its extension.

    Args:
        seq: RNA sequence string.
        rna_id: Identifier, only used in the diagnostic messages.
        args: Object with an ``inputs`` attribute holding the FASTA path.

    Returns:
        Tuple ``(seq_len, feature, zero_mask, label_mask, true_label)`` where
        the last four items are flat lists. ``true_label`` is a second copy
        of ``label_mask``.
    """
    seq_len = len(seq)
    one_hot_feat = one_hot(seq)
    stem = os.path.splitext(args.inputs)[0]

    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    with open(stem + '.pssm') as f:
        temp = pd.read_csv(f, comment='#', sep=r'\s+', header=None).values
    # From here on `seq` is the sequence as written in the PSSM file.
    seq = ['U' if k == 'T' else k for k in temp[:, 0]]
    profile = temp[:, 1:5].astype(float)
    off_set = np.zeros((len(seq), profile.shape[1])) + 0.3
    for k, K in enumerate(seq):
        # NOTE(review): BASES is not defined in the part of this module that
        # is visible here (it is a local of one_hot). If it is not defined
        # elsewhere at module level, this lookup raises NameError on every
        # iteration and the 8.7 offset is never applied. Left unchanged on
        # purpose because it affects the model inputs - confirm against the
        # training pipeline before fixing.
        try:
            off_set[k, BASES.index(K)] = 8.7
        except Exception:
            pass
    profile += off_set
    profile /= np.sum(profile, axis=1, keepdims=True)
    profile = -np.log(profile)

    profile_one_hot = np.concatenate([profile, one_hot_feat], axis=1)

    ############ load base-pair prob form linearpartition ##############################
    # Best effort: any failure (missing file, parse error, bad index) falls
    # back to an all-zero matrix.
    try:
        with open(stem + '.prob', 'r') as f:
            prob = pd.read_csv(f, sep=r'\s+', header=None, skiprows=[0]).values
        bp_prob_lp = np.zeros((len(seq), len(seq)))
        for i in prob:
            # The first two columns are used as 1-based positions.
            bp_prob_lp[int(i[0])-1, int(i[1])-1] = i[2]
        bp_prob_lp = bp_prob_lp + np.transpose(bp_prob_lp)
    except Exception:
        print("linearpartition output missing",rna_id)
        bp_prob_lp = np.zeros((len(seq), len(seq)))

    ############ load dca obtained from gremlin ##############################
    try:
        with open(stem + '.dca') as f:
            temp4 = pd.read_csv(f, comment='#', sep=r'\s+', header=None, skiprows=[0], usecols=[0,1,2]).values
        dca_output = np.zeros((len(seq), len(seq)))
        for k in temp4:
            # Indices are used as they are (no -1), unlike the .prob file;
            # presumably GREMLIN writes 0-based positions - TODO confirm.
            # Couplings between positions less than 4 apart are scaled down.
            if abs(int(k[0]) - int(k[1])) < 4:
                dca_output[int(k[0]), int(k[1])] = 0.01*k[2]
            else:
                dca_output[int(k[0]), int(k[1])] = k[2]
        dca_output = dca_output + np.transpose(dca_output)
    except Exception:
        print("dca missing", rna_id)
        dca_output = np.zeros((len(seq), len(seq)))

    zero_mask = z_mask(seq_len)[None, :, :, None]
    label_mask = l_mask(profile_one_hot, seq_len)

    # Pair every position's 1-D features with every other position's.
    temp = profile_one_hot[None, :, :]
    temp = np.tile(temp, (temp.shape[1], 1, 1))
    feature = np.concatenate([temp, np.transpose(temp, [1, 0, 2])], 2)
    feature = np.concatenate([feature, np.expand_dims(dca_output, axis=2)], axis=2)
    feature = np.concatenate([feature, np.expand_dims(bp_prob_lp, axis=2)], axis=2)

    assert feature.shape==(seq_len,seq_len, 18)

    flat_label_mask = list(label_mask.flatten())
    return (seq_len,
            list(feature.astype(float).flatten()),
            list(zero_mask.flatten()),
            flat_label_mask,
            list(label_mask.flatten()))


def _int64_feature(value):
    """Wrap a scalar, list or ndarray of ints in a ``tf.train.Feature``."""
    if not isinstance(value, (list, np.ndarray)):
        value = [value]

    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def _float_feature(value):
    """Wrap a scalar, list or ndarray of floats in a ``tf.train.Feature``."""
    if not isinstance(value, (list, np.ndarray)):
        value = [value]

    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def _bytes_feature(value):
    """Wrapper for inserting bytes features into Example proto."""
    if isinstance(value, six.string_types):
        value = six.binary_type(value, encoding='utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def create_tfr_files(args):
    """Write ``<stem>.tfrecords`` for the sequences in ``args.inputs``.

    The input is read as alternating header / sequence lines (blank lines are
    dropped); a trailing unpaired line is ignored. One ``tf.train.Example``
    is written per record.
    """
    print('\nPreparing tfr records file for SPOT-RNA2:')
    path_tfrecords = os.path.splitext(args.inputs)[0] + ".tfrecords"
    with open(args.inputs) as file:
        input_data = [line.strip() for line in file.read().splitlines() if line.strip()]

    count = len(input_data) // 2

    # The context manager closes the writer; no explicit close() is needed.
    with tf.io.TFRecordWriter(path_tfrecords) as writer:
        for i in tqdm(range(count)):
            name = input_data[2*i].replace(">", "")
            # Upper-case first so that a lowercase 't' is converted to 'U' too.
            sequence = input_data[2*i+1].replace(" ", "").upper().replace("T", "U")

            seq_len, feature, zero_mask, label_mask, true_label = get_data(sequence, name, args)

            example = tf.train.Example(features=tf.train.Features(feature={'rna_name': _bytes_feature(name),
                                                                           'seq_len': _int64_feature(seq_len),
                                                                           'feature': _float_feature(feature),
                                                                           'zero_mask': _float_feature(zero_mask),
                                                                           'label_mask': _float_feature(label_mask),
                                                                           'true_label': _float_feature(true_label)}))

            writer.write(example.SerializeToString())


# ----------------------- hair pin loop assumption --------------------------------#
def hair_pin_assumption(pred_pairs):
    """Return the pairs whose two positions are fewer than 3 apart.

    Only the first two items of each entry are looked at and returned.
    """
    return [pair[:2] for pair in pred_pairs if abs(pair[0] - pair[1]) < 3]


def flatten(x):
    """Recursively flatten nested iterables into one list (strings stay whole)."""
    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, str):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result


def type_pairs(pairs, sequence):
    """Split base pairs by the bases they join.

    Args:
        pairs: Iterable of index pairs into ``sequence``.
        sequence: Sequence of bases; compared case-insensitively.

    Returns:
        ``(watson_crick, wobble, other)``: A-U pairs followed by G-C pairs,
        then G-U pairs, then everything else.
    """
    sequence = [base.upper() for base in sequence]

    au_pairs, gc_pairs, gu_pairs, other_pairs = [], [], [], []
    bucket_of = {
        ('A', 'U'): au_pairs, ('U', 'A'): au_pairs,
        ('G', 'C'): gc_pairs, ('C', 'G'): gc_pairs,
        ('G', 'U'): gu_pairs, ('U', 'G'): gu_pairs,
    }
    for pair in pairs:
        bases = (sequence[pair[0]], sequence[pair[1]])
        bucket_of.get(bases, other_pairs).append(pair)

    return au_pairs + gc_pairs, gu_pairs, other_pairs


# ----------------------- find multiplets pairs--------------------------------#
def multiplets_pairs(pred_pairs):
    """Group the pairs that share a position with another pair.

    Returns:
        One list per position that occurs in more than one pair (positions in
        ascending order), each holding the pairs (first two items only) that
        contain that position, in input order.
    """
    pairs = [entry[:2] for entry in pred_pairs]

    # Single counting pass instead of list.count() per position.
    occurrences = {}
    for position in flatten(pairs):
        occurrences[position] = occurrences.get(position, 0) + 1
    shared_positions = sorted(pos for pos, seen in occurrences.items() if seen > 1)

    return [[pair for pair in pairs if pos in pair] for pos in shared_positions]


def multiplets_free_bp(pred_pairs, y_pred):
    """Remove the weakest pairs until no position is paired more than once.

    Args:
        pred_pairs: List of predicted pairs; the first two items of each entry
            are the positions.
        y_pred: 2-D array indexed as ``y_pred[i, j]`` giving each pair's score.

    Returns:
        ``(kept_pairs, removed_pairs)``; ``removed_pairs`` lists each removed
        pair once, in the order of removal.
    """
    L = len(pred_pairs)
    multiplets_bp = multiplets_pairs(pred_pairs)
    save_multiplets = []
    while multiplets_bp:
        # From every conflicting group drop the lowest-scoring pair (the
        # first one on ties).
        remove_pairs = [min(group, key=lambda pair: y_pred[pair[0], pair[1]])
                        for group in multiplets_bp]
        save_multiplets.extend(remove_pairs)

        # Compare on the two positions only, so entries that carry extra
        # items are still matched and the loop always makes progress.
        removed = {tuple(pair) for pair in remove_pairs}
        pred_pairs = [k for k in pred_pairs if tuple(k[:2]) not in removed]
        multiplets_bp = multiplets_pairs(pred_pairs)

    # Order-preserving de-duplication (a pair can be chosen by two groups).
    save_multiplets = [list(pair) for pair in dict.fromkeys(tuple(p) for p in save_multiplets)]
    assert L == len(pred_pairs)+len(save_multiplets)
    return pred_pairs, save_multiplets
str(J) in include_pairs: 238 | mask[i, j] = 1 239 | return mask 240 | 241 | def ct_file_output(pairs, seq, id, save_result_path): 242 | 243 | col1 = np.arange(1, len(seq) + 1, 1) 244 | col2 = np.array([i for i in seq]) 245 | col3 = np.arange(0, len(seq), 1) 246 | col4 = np.append(np.delete(col1, 0), [0]) 247 | col5 = np.zeros(len(seq), dtype=int) 248 | 249 | for i, I in enumerate(pairs): 250 | col5[I[0]] = int(I[1]) + 1 251 | col5[I[1]] = int(I[0]) + 1 252 | col6 = np.arange(1, len(seq) + 1, 1) 253 | temp = np.vstack((np.char.mod('%d', col1), col2, np.char.mod('%d', col3), np.char.mod('%d', col4), 254 | np.char.mod('%d', col5), np.char.mod('%d', col6))).T 255 | 256 | np.savetxt(os.path.join(save_result_path, str(id))+'.ct', (temp), delimiter='\t\t', fmt="%s", header=str(len(seq)) + '\t\t' + str(id) + '\t\t' + 'SPOT-RNA2 output\n' , comments='') 257 | 258 | return 259 | 260 | def bpseq_file_output(pairs, seq, id, save_result_path): 261 | 262 | col1 = np.arange(1, len(seq) + 1, 1) 263 | col2 = np.array([i for i in seq]) 264 | #col3 = np.arange(0, len(seq), 1) 265 | #col4 = np.append(np.delete(col1, 0), [0]) 266 | col5 = np.zeros(len(seq), dtype=int) 267 | 268 | for i, I in enumerate(pairs): 269 | col5[I[0]] = int(I[1]) + 1 270 | col5[I[1]] = int(I[0]) + 1 271 | #col6 = np.arange(1, len(seq) + 1, 1) 272 | temp = np.vstack((np.char.mod('%d', col1), col2, np.char.mod('%d', col5))).T 273 | #os.chdir(save_result_path) 274 | #print(os.path.join(save_result_path, str(id[0:-1]))+'.spotrna') 275 | np.savetxt(os.path.join(save_result_path, str(id))+'.bpseq', (temp), delimiter=' ', fmt="%s", header='#' + str(id) , comments='') 276 | 277 | return 278 | 279 | def lone_pair(pairs): 280 | lone_pairs = [] 281 | pairs.sort() 282 | for i, I in enumerate(pairs): 283 | if ([I[0] - 1, I[1] + 1] not in pairs) and ([I[0] + 1, I[1] - 1] not in pairs): 284 | lone_pairs.append(I) 285 | 286 | return lone_pairs 287 | 288 | def prob_to_secondary_structure(ensemble_outputs, label_mask, seq, name, 
args): 289 | #save_result_path = 'outputs' 290 | Threshold = 0.795 291 | label_mask = np.triu(np.ones((len(seq), len(seq))),1) 292 | test_output = ensemble_outputs 293 | mask = output_mask(seq) 294 | inds = np.where(label_mask == 1) 295 | y_pred = np.zeros(label_mask.shape) 296 | for i in range(test_output.shape[0]): 297 | y_pred[inds[0][i], inds[1][i]] = test_output[i] 298 | #y_pred = np.multiply(y_pred, mask) 299 | 300 | tri_inds = np.triu_indices(y_pred.shape[0], k=1) 301 | 302 | out_pred = y_pred[tri_inds] 303 | outputs = out_pred[:, None] 304 | seq_pairs = [[tri_inds[0][j], tri_inds[1][j], ''.join([seq[tri_inds[0][j]], seq[tri_inds[1][j]]])] for j in 305 | range(tri_inds[0].shape[0])] 306 | 307 | outputs_T = np.greater_equal(outputs, Threshold) 308 | pred_pairs = [i for I, i in enumerate(seq_pairs) if outputs_T[I]] 309 | pred_pairs = [i[:2] for i in pred_pairs] 310 | pred_pairs, save_multiplets = multiplets_free_bp(pred_pairs, y_pred) 311 | 312 | if args.outputs=='outputs/': 313 | output_path = os.path.join(Path(os.path.dirname(os.path.realpath(__file__))).parent, args.outputs) 314 | else: 315 | output_path = args.outputs 316 | 317 | ct_file_output(pred_pairs, seq, name, output_path) 318 | bpseq_file_output(pred_pairs, seq, name, output_path) 319 | np.savetxt(output_path + '/'+ name +'.prob', y_pred, delimiter='\t') 320 | 321 | if args.motifs: 322 | try: 323 | os.chdir(args.outputs) 324 | p = subprocess.Popen(['perl', os.path.join(Path(os.path.dirname(os.path.realpath(__file__))).parent, 'utils/bpRNA.pl'), os.path.join(args.outputs, name + '.bpseq')]) 325 | except: 326 | print('\nUnable to run bpRNA script;\nplease refer to "https://github.com/hendrixlab/bpRNA/" for system requirments to use bpRNA') 327 | #os.chdir('../') 328 | 329 | return 330 | -------------------------------------------------------------------------------- /sample_run/sample_seq_features/sample_seq.msa: -------------------------------------------------------------------------------- 1 | # 
STOCKHOLM 1.0 2 | #=GF AU Infernal 1.1.3 3 | 4 | #=GS 6UFJ_A/1-51 DE Chain A, RNA (50-MER) 5 | #=GS 6UEY_A/1-50 DE Chain A, RNA (50-MER) 6 | #=GS HE577054.1/3246821-3246757 DE Paenibacillus polymyxa M1 main chromosome, complete genome 7 | #=GS MF288922.1/150528-150592 DE Bacillus phage Janet, complete genome 8 | #=GS CP033464.1/4485719-4485655 DE Brevibacillus laterosporus strain 1821L chromosome, complete genome 9 | #=GS KT307976.1/157679-157741 DE Bacillus phage AvesoBmore, complete genome 10 | #=GS CP032410.1/870062-870126 DE Brevibacillus laterosporus strain E7593-50 chromosome, complete genome 11 | #=GS MK892513.1/27480-27550 DE Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome 12 | #=GS MK892777.1/32264-32334 DE Prokaryotic dsDNA virus sp. isolate Tp1_39_SUR_34326_1, partial genome 13 | #=GS MF288921.1/151458-151522 DE Bacillus phage OTooleKemple52, complete genome 14 | #=GS MH638310.1/151443-151507 DE Bacillus phage Kamfam, complete genome 15 | #=GS KJ489397.1/151758-151822 DE Bacillus phage CAM003, complete genome 16 | #=GS KJ489398.1/150857-150921 DE Bacillus phage Evoli, complete genome 17 | #=GS KJ489400.1/150952-151016 DE Bacillus phage Hoody T, complete genome 18 | #=GS KU737346.1/152020-152084 DE Bacillus phage Vinny, complete genome 19 | #=GS KF669647.1/155754-155816 DE Bacillus phage BigBertha, complete genome 20 | #=GS KU737345.1/154884-154946 DE Bacillus phage Juglone, complete genome 21 | #=GS KU737347.1/155734-155796 DE Bacillus phage Phrodo, complete genome 22 | #=GS MN038178.1/155190-155252 DE Bacillus phage Beyonphe, complete genome 23 | #=GS KF208639.2/156075-156137 DE Bacillus phage Troll, complete genome 24 | #=GS CP009278.1/2800251-2800310 DE Sphingobacterium sp. 
ML3W, complete genome 25 | #=GS CP045298.1/5377890-5377826 DE Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome 26 | #=GS KF669662.1/155100-155162 DE Bacillus phage Spock, complete genome 27 | #=GS KR063281.1/60079-60028 DE Gordonia phage GMA2, complete genome 28 | #=GS KJ489402.1/153758-153819 DE Bacillus phage Riley, complete genome 29 | #=GS MF765814.1/155980-156041 DE Bacillus phage Taffo16, complete genome 30 | #=GS CP000154.2/3364238-3364174 DE Paenibacillus polymyxa E681, complete genome 31 | #=GS LN852800.1/7754-7693 DE Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110 32 | #=GS CP019039.1/7984-8046 DE Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence 33 | #=GS LN852940.1/1904-1844 DE Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268 34 | #=GS JN790865.1/35681-35620 DE Bacillus phage B4, complete genome 35 | #=GS JN797796.1/35736-35675 DE Bacillus phage B5S, complete genome 36 | #=GS KY888882.1/156410-156472 DE Bacillus phage Flapjack, complete genome 37 | #=GS CP014843.1/29638-29697 DE Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence 38 | #=GS CP021670.1/37922-37863 DE Bacillus licheniformis strain SRCM100141 plasmid pBL141-2 sequence 39 | #=GS CP035189.1/167253-167194 DE Bacillus licheniformis strain SRCM103914 plasmid unnamed1, complete sequence 40 | #=GS CP045906.1/14639513-14639571 DE Caligus rogercresseyi isolate FCH chromosome 17 41 | #=GS HG916826.1/843085-843030 DE Pseudomonas pseudoalcaligenes CECT 5344 complete genome 42 | #=GS LK391695.1/845304-845249 DE Pseudomonas pseudoalcaligenes genome assembly Ppseudo_Pac, chromosome : I 43 | #=GS XM_028713395.1/30-87 DE PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA 44 | #=GS AC100771.2/133706-133648 DE Homo sapiens chromosome 11, clone RP11-159H22, complete sequence 45 | #=GS CP022654.2/63818-63880 DE Bacillus velezensis strain SCDB 291 chromosome, 
complete genome 46 | #=GS CP023320.1/44833-44771 DE Bacillus velezensis strain SCGB 1 chromosome, complete genome 47 | #=GS CP045899.1/5107513-5107456 DE Caligus rogercresseyi isolate FCH chromosome 10 48 | #=GS CP045890.1/2686952-2687009 DE Caligus rogercresseyi isolate FCH chromosome 1 49 | #=GS CP010557.1/4528803-4528858 DE Raoultella ornithinolytica strain S12, complete genome 50 | #=GS LR134253.1/1479651-1479596 DE Klebsiella aerogenes strain NCTC9997 genome assembly, chromosome: 3 51 | #=GS MH153801.1/58164-58217 DE Microbacterium phage Count, complete genome 52 | #=GS CP045896.1/486401-486459 DE Caligus rogercresseyi isolate FCH chromosome 7 53 | #=GS CP045901.1/8022709-8022767 DE Caligus rogercresseyi isolate FCH chromosome 12 54 | 55 | 6UFJ_A/1-51 ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCG.GAGAGCAG...A-...............--------- 56 | #=GR 6UFJ_A/1-51 PP ******************************************.********...8......................... 57 | 6UEY_A/1-50 ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCG.GAGAGCAG...--...............--------- 58 | #=GR 6UEY_A/1-50 PP ******************************************.********............................. 
59 | HE577054.1/3246821-3246757 ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCG.UUCACCG-...--ggau....caug...-CGGUGAGG 60 | #=GR HE577054.1/3246821-3246757 PP ******************************************.******8......5666....6665....8******* 61 | MF288922.1/150528-150592 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG 62 | #=GR MF288922.1/150528-150592 PP *************************7.59*************88******9.....577......777...9******** 63 | CP033464.1/4485719-4485655 ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCG.UUGAGAAG...--caa.....aaag...CUUCUCAGG 64 | #=GR CP033464.1/4485719-4485655 PP ************************76.59*************.*******9.....677.....7777...9******** 65 | KT307976.1/157679-157741 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGCGGG 66 | #=GR KT307976.1/157679-157741 PP ***********************854.499************.********76666...............********* 67 | CP032410.1/870062-870126 ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCG.UUGAGGAG...--cga.....ucag...CUUCUCAGG 68 | #=GR CP032410.1/870062-870126 PP ************************76.59*************.*******9.....677.....7777...9******** 69 | MK892513.1/27480-27550 AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGuGCGUAUAG...--cuaggucaagug...CUAUACGCG 70 | #=GR MK892513.1/27480-27550 PP ************************9.89**********************9.....8***********...9******** 71 | MK892777.1/32264-32334 AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGuGCGUAUAG...--cuaggucaagug...CUAUACGCG 72 | #=GR MK892777.1/32264-32334 PP ************************9.89**********************9.....8***********...9******** 73 | MF288921.1/151458-151522 ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG 74 | #=GR MF288921.1/151458-151522 PP ************************75.59*************88******9.....577......777...9******** 75 | MH638310.1/151443-151507 ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG 76 | #=GR MH638310.1/151443-151507 PP 
************************75.59*************88******9.....577......777...9******** 77 | KJ489397.1/151758-151822 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG 78 | #=GR KJ489397.1/151758-151822 PP *************************7.59*************889*99999.....577......777...999****9* 79 | KJ489398.1/150857-150921 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG 80 | #=GR KJ489398.1/150857-150921 PP *************************7.59*************889*99999.....577......777...999****9* 81 | KJ489400.1/150952-151016 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG 82 | #=GR KJ489400.1/150952-151016 PP *************************7.59*************889*99999.....577......777...999****9* 83 | KU737346.1/152020-152084 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG 84 | #=GR KU737346.1/152020-152084 PP *************************7.59*************889*99999.....577......777...999****9* 85 | KF669647.1/155754-155816 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG 86 | #=GR KF669647.1/155754-155816 PP ***********************854.499************.********76666...............********* 87 | KU737345.1/154884-154946 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG 88 | #=GR KU737345.1/154884-154946 PP ***********************854.499************.********76666...............********* 89 | KU737347.1/155734-155796 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG 90 | #=GR KU737347.1/155734-155796 PP ***********************854.499************.********76666...............********* 91 | MN038178.1/155190-155252 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG 92 | #=GR MN038178.1/155190-155252 PP ***********************854.499************.********76666...............********* 93 | KF208639.2/156075-156137 
ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGcuuAU...............CCCCGUGGG 94 | #=GR KF208639.2/156075-156137 PP ***********************854.499************.********76666...............********* 95 | CP009278.1/2800251-2800310 AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCG.CCCCGAUA...AU...............AAUCGGGAG 96 | #=GR CP009278.1/2800251-2800310 PP ************************98.499************.********...**...............********* 97 | CP045298.1/5377890-5377826 GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCG.UUCACCGG..gAU............cauCCGGUGAGG 98 | #=GR CP045298.1/5377890-5377826 PP ******************************************.*******9..643............334********* 99 | KF669662.1/155100-155162 ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCG.UCGCGGAGuuuAU...............CUCCGCGGG 100 | #=GR KF669662.1/155100-155162 PP *********************99843.499************.********76666...............********* 101 | KR063281.1/60079-60028 ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCG.GCC-----...--ga........ga...-----GGCG 102 | #=GR KR063281.1/60079-60028 PP ************************88.59*************.985..........67........76........589* 103 | KJ489402.1/153758-153819 ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCG.UCGCGGG-...--guuu....aucu...-C--CCGCG 104 | #=GR KJ489402.1/153758-153819 PP ***********************854.49*************.6665555......4566....6654....4..5555* 105 | MF765814.1/155980-156041 ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCG.UCGCGGG-...--guuu....aucu...-C--CCGCG 106 | #=GR MF765814.1/155980-156041 PP ***********************854.49*************.6665555......4566....6654....4..5555* 107 | CP000154.2/3364238-3364174 GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCG.UUCACUGG...A-uu.......cgu...CCAGUGAGA 108 | #=GR CP000154.2/3364238-3364174 PP ******************************************.********...8.55.......555...********* 109 | LN852800.1/7754-7693 GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCG.UUCCGGUU...AU............a.uGAUCGGAGG 110 | #=GR LN852800.1/7754-7693 PP 
**************************5.9*************.*****998...64............3.3789****** 111 | CP019039.1/7984-8046 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCG.UCCUUCC-...--ugc.....uucg...-GGAAGGGG 112 | #=GR CP019039.1/7984-8046 PP ***********************975.69*************.***9997......688.....8886....7999**** 113 | LN852940.1/1904-1844 GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGuUCCAGGAU...G-...............AUCCUGGAG 114 | #=GR LN852940.1/1904-1844 PP ******************************************889******...9................********* 115 | JN790865.1/35681-35620 AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uu........ua...UCCCCCGUG 116 | #=GR JN790865.1/35681-35620 PP ***********************854.499************.99977665.....33........33...34555888* 117 | JN797796.1/35736-35675 AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uu........ua...UCCCCCGUG 118 | #=GR JN797796.1/35736-35675 PP ***********************854.499************.99977665.....33........33...34555888* 119 | KY888882.1/156410-156472 ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uuu......auc...CCCUGCG-G 120 | #=GR KY888882.1/156410-156472 PP ***********************854.499************.99999999.....455......555...8899999.* 121 | CP014843.1/29638-29697 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU 122 | #=GR CP014843.1/29638-29697 PP ***********************975.69*************.********...**...............********* 123 | CP021670.1/37922-37863 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU 124 | #=GR CP021670.1/37922-37863 PP ***********************975.69*************.********...**...............********* 125 | CP035189.1/167253-167194 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU 126 | #=GR CP035189.1/167253-167194 PP ***********************975.69*************.********...**...............********* 127 | CP045906.1/14639513-14639571 
UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAG.UCUUGCCU...--...............GAGCAAGAG 128 | #=GR CP045906.1/14639513-14639571 PP ***************************999999999******.*****943....................569****** 129 | HG916826.1/843085-843030 CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCG.AAAUUCGG...GG...............CCG-----G 130 | #=GR HG916826.1/843085-843030 PP ******************************************.********...**...............***.....* 131 | LK391695.1/845304-845249 CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCG.AAAUUCGG...GG...............CCG-----G 132 | #=GR LK391695.1/845304-845249 PP ******************************************.********...**...............***.....* 133 | XM_028713395.1/30-87 ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCAC.AGCGGCAG...CA...............CUGCCGCAG 134 | #=GR XM_028713395.1/30-87 PP *************************7...669****998888.9*******...99...............********* 135 | AC100771.2/133706-133648 GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCU.UUAAAAAA...AA...............UUUUUUUAA 136 | #=GR AC100771.2/133706-133648 PP ******************************9..9********.98888888...88...............********* 137 | CP022654.2/63818-63880 AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCG.UCCUUCC-...--ugc.....uacg...-GGAAGGGG 138 | #=GR CP022654.2/63818-63880 PP ************************86.69*************.****997......678.....8886....899***** 139 | CP023320.1/44833-44771 AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCG.UCCUUCC-...--ugc.....uacg...-GGAAGGGG 140 | #=GR CP023320.1/44833-44771 PP ************************86.69*************.****997......678.....8886....899***** 141 | CP045899.1/5107513-5107456 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGCUG...--...............-AGCAAGAG 142 | #=GR CP045899.1/5107513-5107456 PP ***************************999999999******.******85.....................59****** 143 | CP045890.1/2686952-2687009 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGCUG...--...............-AGCAAGAG 144 | #=GR CP045890.1/2686952-2687009 PP 
***************************999999999******.******85.....................59****** 145 | CP010557.1/4528803-4528858 CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------.UCAGGCAG...AU..............uCUGCCAGAG 146 | #=GR CP010557.1/4528803-4528858 PP **********************************96.......8889****...96..............6********* 147 | LR134253.1/1479651-1479596 CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------.UCAGGCAG...AU..............uCUGCCAGAG 148 | #=GR LR134253.1/1479651-1479596 PP **********************************96.......8889****...96..............6********* 149 | MH153801.1/58164-58217 AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCG.GAUAGAUG...--...............----UAUUG 150 | #=GR MH153801.1/58164-58217 PP ************************98.49*************.*9985433........................4566* 151 | CP045896.1/486401-486459 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGC--...AU...............GAGCAAGAG 152 | #=GR CP045896.1/486401-486459 PP ***************************999999999******.*****9.....77...............78******* 153 | CP045901.1/8022709-8022767 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGC--...AU...............GAGCAAGAG 154 | #=GR CP045901.1/8022709-8022767 PP ***************************999999999******.*****9.....77...............78******* 155 | #=GC SS_cons <<<<<<____>>>>>>--------------------------.<<<<<<<<...__~~~~~~~~~~~~...>>>>>>>>: 156 | #=GC RF ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCG.GAGAGCAG...AU~~~~~~~~~~~~...CUGCUCUCG 157 | // 158 | -------------------------------------------------------------------------------- /utils/parse_blastn_local.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | 5 | 6 | # Default options 7 | my $E_max=10; 8 | my $E_min=-1.0; 9 | my $P_max=1; 10 | my $cov_thrshd=0; 11 | my $sc_thrshd=-10; 12 | my $qid_thrshd=0; 13 | my $bl=-10; # minimum per-residue bit score with query at ends of HSP for loose end pruning 14 | my $bs=-10; # minimum per-residue bit score with 
query at ends of HSP for strict end pruning 15 | my $bg=30; # below this number of end gaps the loose HSP pruning score is used 16 | my $outformat="fas"; 17 | my $append=0; 18 | my $query_file=""; 19 | my $infile; 20 | my $outfile; 21 | my $v=2; 22 | 23 | # Variable declarations 24 | my $i; # residue index 25 | my $j; # residue index 26 | my $k; # sequence index 27 | my $options=""; 28 | my $line; # line read in from file 29 | my $query_length=0; # number of residues in query sequence 30 | my $query_match=0; # number of upper-case residues (=match states) in query sequence 31 | my $capitalize=0; # capitalize query 32 | my $nameline; # >template_name 33 | my $Evalue; # e-value of hit 34 | my $score; # bit score of hit 35 | my $hit_length; # number of residues in HSP 36 | my $coverage; # hit-length/$query_length 37 | my $score_col; # score per column 38 | my $score_min=0; # $score_min=-3*log($P_max)/log(2); 39 | 40 | my $query_name; # name of query file 41 | my $queryseq; # residues of query read in with -q or -q2m option 42 | my $qfirst; # index of first query residue in pairwise alignment 43 | my $qlast; # index of last query residue in pairwise alignment 44 | my $tfirst; # index of first template residue in pairwise alignment 45 | my $tlast; # index of last template residue in pairwise alignment 46 | my $tlen=0; # length of template in pairwise alignment 47 | my @query_res; # query residues from current pairwise alignment 48 | my @template_res; # template residues from current pairwise alignment 49 | my $query_res; # query residues from current pairwise alignment 50 | my $template_res; # template residues from current pairwise alignment 51 | my $line_number=0; 52 | my $new_hit=""; # new sequence record; is only written if coverage threshold is exceeded 53 | my $nhit=0; # counts the number of sequences already in alignment 54 | my @hitnames; # $hitnames[$nhit] is the nameline of the ihit'th hit 55 | my @hitseqs; # $hitseqs[$nhit] contains the residues of the 
ihit'th hit 56 | my @match; # for -q option: $match[$i]=1 if $i'th query residue is capital letter in query, else 0 57 | my $qid; # $qid is sequence identity with query (for -q option: CONSIDER ONLY MATCH STATES) 58 | my $len; # $len is sequence number of residues of seq k aligned with a match state in query 59 | my $b; # minimum per-residue bit score with query at ends of HSP 60 | my $pfile=""; # alignment file used to calculate PSSM for -p and s/c options 61 | my $bfile=""; # alignment file used to calculate PSSM for -b option 62 | my $GAP=11.0/3.0; # gap opening penalty in bits (for BLOSUM62: 11 bits/3) 63 | my $EXTEND=1.0/3.0; # gap extension penalty in bits (for BLOSUM62: 1 bits/3) 64 | my @queryseq; 65 | my $skip=0; # skip this template sequence because it might be a synthetic fusion protein 66 | my $best=0; # extract only the best HSP per sequence 67 | my $rescaled_Gonnet=0; # Gonnet matrix not yet rescaled to bits 68 | my @qp=(); # $qb[$i][$a] is PSSM from alignment read in with -B option 69 | my @qb=(); # $qp[$i][$a] is PSSM from alignment read in with -P option 70 | 71 | 72 | $infile=$ARGV[0]; 73 | $query_file= $ARGV[1]; 74 | $outfile=$ARGV[2]; 75 | 76 | #Include query sequence as first sequence in alignment? 
77 | if ($query_file) { 78 | open(QUERYFILE,"<$query_file") or die ("ERROR: Cannot open $query_file: $!\n"); 79 | while($line=) # Read name line 80 | { 81 | if ($line=~/^>(.*)/) 82 | { 83 | $query_name=$1; 84 | last; 85 | } 86 | } 87 | $hitseqs[0]=""; 88 | while($line=) # Read residues 89 | { 90 | if ($line=~/^>/) {last;} 91 | chomp($line); 92 | $line=~s/\s+//g; # remove white space 93 | $hitseqs[0].=$line; 94 | } 95 | close(QUERYFILE); 96 | 97 | # Prepare name line of hit 98 | if ($outformat eq "psi") { 99 | $query_name=~/^(\S{1,20})\S*\s*(.*)/; # delete everything after first block 100 | $line=sprintf("%s",$1); 101 | $line=~ tr/ /_/; 102 | $hitnames[0] = sprintf("%-31.31s ",$line); 103 | } else { 104 | $hitnames[0] = sprintf(">%s E=0.0",$query_name); 105 | } 106 | $hitseqs[0] =~ tr/-.//d; # delete all gaps from query 107 | $queryseq = $hitseqs[0]; 108 | $hitseqs[0] =~ tr/a-z/A-Z/d; # capitalize hitseq[0] and delete gaps 109 | # $hitseqs[0] =~ tr/Uu/Cc/; # nicht mehr noetig in blast. Kann aber alignhits.pl zum abschmieren bringen. 110 | $nhit=1; 111 | 112 | # Capitalize query? 113 | if ($capitalize) {$queryseq =~ tr/a-z/A-Z/;} 114 | $query_match = ($queryseq=~tr/A-Z/A-Z/); # count number of match states in query 115 | 116 | # Determine match columns as those with upper case residue in query 117 | @queryseq=unpack("C*",$queryseq); 118 | for ($j=0; $j<@queryseq; $j++) { 119 | if ($queryseq[$j]>=65 && $queryseq[$j]<=90) {$match[$j]=1;} else {$match[$j]=0;} 120 | } 121 | } 122 | 123 | 124 | 125 | 126 | # Scan Blast output file for query length (needed for coverage) 127 | open(INFILE,"<$infile") or die ("Error: cannot open $infile: $!\n"); 128 | $line_number++; 129 | while ($line=) 130 | { 131 | if ($line=~/^Length\s*=\s*(\d+)/) {$query_length = $1; last;} 132 | $line_number++; 133 | } 134 | #print("Query length = $query_length\n"); 135 | 136 | while ($line = ) #scan through PsiBlast-output line by line 137 | { 138 | # New nameline found? 
139 | #print "$line"; 140 | #if ($line=~/^Length\s*=\s*(\d+)/) { print "length=$1\n\n\n\n";} 141 | 142 | if ($line=~s/^>//) 143 | { 144 | #print "$line"; 145 | $line=~s/\s+/ /g; 146 | chomp($line); 147 | $nameline=$line; 148 | while ($line=) 149 | { 150 | if ($line=~/^Length\s*=\s*(\d+)/) {last;} 151 | chomp($line); 152 | $nameline.=$line; 153 | } 154 | $line=~/^Length\s*=\s*(\d+)/; 155 | $tlen=$1; 156 | $nameline=~s/\s+/ /g; 157 | $nameline=~s/\s+gi\|/ gi\|/g; 158 | # Is sequence a synthetic fusion protein ? 159 | #if ($nameline=~/(\[synthetic| synthetic|construct|cloning|vector|chimeric|fusion)/i) {$skip=1;} else {$skip=0;} 160 | 161 | #print "$nameline\n"; 162 | } 163 | 164 | # New HSP found? 165 | elsif (!$skip && $line=~/^ Score =/) 166 | { 167 | if($best) {$skip=1;} # skip all following hits with same sequence? 168 | 169 | # First check whether E-value is small enough 170 | if($line =~ /^ Score =\s*(\S+)\s*bits\s*\S*\s*Expect =\s*(\S+)/) 171 | { 172 | $score=$1; 173 | $Evalue=$2; 174 | 175 | #print "$score, $Evalue\n"; 176 | } 177 | else 178 | { 179 | print("\nWARNING: wrong format in blast output. Expecting Score = ... 
Expect = ..\n$line\n"); 180 | } 181 | $Evalue=~s/^(e|E)/1$1/; # Expect = e-123 -> 1e-123 182 | $Evalue=~tr/,//d; 183 | if ($Evalue>$E_max || $Evalue<$E_min) {$new_hit=""; next;} # reject hit 184 | 185 | # Record sequence identity 186 | # (not needed, qid calculated afterwards WITHOUT counting template residues aligned to gaps in query) 187 | $line=; 188 | if ($line =~ /^ Identities =\s*\S+\/(\S+)\s+\((\S+)%\)/) 189 | { 190 | $qid=$2; 191 | #print "$qid\n"; 192 | $line=; 193 | } 194 | else 195 | { 196 | $qid=0.0; # if match is too poor then no identities are given 197 | } 198 | 199 | # Skip another line and read following line 200 | 201 | $line=; 202 | $line=; 203 | 204 | # Read pairwise alignment 205 | $qfirst=""; 206 | $tfirst=""; 207 | $query_res=""; 208 | $template_res=""; 209 | while ($line=~/^Query\s+\d+\s+\S+\s+\d*/) # Cycle in this loop until no new "Query:" lines are found 210 | { 211 | if ($line!~/^Query\s+(\d+)\s+(\S+)\s+(\d*)/) 212 | { 213 | print("WARNING 1: wrong format of blast output in $infile, line $.\n"); 214 | last; 215 | } 216 | if ($3 eq "") { 217 | ; ; ; $line=; 218 | print("WARNING 2: wrong format of blast output in $infile, line $. 
Skipping alignment block.\n"); 219 | next; 220 | } 221 | if ($qfirst eq "") {$qfirst=$1;} 222 | $query_res .= $2; 223 | $qlast=$3; 224 | ; $line=; 225 | if ($line!~/^Sbjct\s+(\d+)\s+(\S+)\s+(\d+)/) 226 | { 227 | print("WARNING 3: wrong format of blast output in $infile, line $.\n"); 228 | last; 229 | } 230 | if ($tfirst eq "") {$tfirst=$1;} 231 | $template_res .= $2; 232 | $tlast=$3; 233 | ; $line=; 234 | } # end while(1) 235 | # Check lengths 236 | $query_res = uc($query_res); 237 | $template_res = uc($template_res); 238 | if (length($template_res)!=length($query_res)) { 239 | print("WARNING: Query and template lines do not have the same length in $infile, line $.\n"); 240 | print("Q: $query_res\n"); 241 | print("T: $template_res\n"); 242 | next; 243 | } 244 | 245 | 246 | #print "$query_res\n"; 247 | #print "$template_res\n"; 248 | 249 | # Check whether hit has sufficient score per column 250 | $hit_length=($template_res=~tr/a-zA-Z/a-zA-Z/); 251 | if ($hit_length==0) {next;} # Reject hit? 
252 | $score_col=$score/$hit_length; 253 | 254 | @query_res =unpack("C*",$query_res); 255 | @template_res=unpack("C*",$template_res); 256 | 257 | # Prune ends of HSP which are not reliably homologous 258 | #if (($bs>-9 || $bl>-9) && !&PruneHSP()) {next;} # if entire HSP is pruned away, goto next alignment 259 | 260 | # Check whether hit has sufficient sequence identity and coverage with query 261 | if (!$query_file) 262 | { 263 | $len=0; $qid=0; 264 | for ($i=0; $i-9 || $score_min>0) { 297 | if (!&CheckScorePerColumn()) {next;} 298 | } 299 | 300 | if ($v>=3) {printf("nhit=%-2i qid=%-3i qlen=%-3i qid=%-3i%% s/c=%-6.3f\n",$nhit,$qid,$len,100*$qid/$len,$score_col);} 301 | 302 | # Record residues 303 | $new_hit = "-"x($qfirst-1); # Print gaps at beginning of sequence 304 | if ($outformat eq "psi") { 305 | for ($i=0; $i%s(%i-%i:%i) %s E=%g s/c=%4.2f id=%.0f%% cov=%.0f%%", 335 | $1,$tfirst,$tlast,$tlen,$2,$Evalue,$score_col,100*$qid/$len,$coverage); 336 | } 337 | 338 | $nhit++; 339 | 340 | #print "$nhit\n" if($nhit%100 ==0); 341 | } # end elseif new HSP found 342 | } # end while ($line) 343 | 344 | close(INFILE); 345 | 346 | 347 | 348 | # If output format is fasta or a2m we have to insert gaps: 349 | if ($outformat ne "psi") 350 | { 351 | my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $j. 352 | my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $k after the $j'th match state 353 | my $insert; 354 | my $ngap; 355 | 356 | # For each match state determine length of LONGEST insert after this match state and store in @len_ins 357 | for ($k=0; $k<$nhit; $k++) { 358 | # split into list of single match states and variable-length inserts 359 | # ([A-Z]|-) is the split pattern. 
The parenthesis indicate that split patterns are to be included as list elements 360 | # The '#' symbol is prepended to get rid of a perl bug in split 361 | $j=0; 362 | @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#"); 363 | # printf("%3i: %12.12s %s\n",$k,$hitnames[$k],$hitseqs[$k]); 364 | # printf("Sequence $k: @inserts\n"); 365 | foreach $insert (@inserts) { 366 | if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) { 367 | $len_ins[$j]=length($insert); 368 | } 369 | $j++; 370 | # printf("$insert|"); 371 | } 372 | # for (my $i=0; $i<@inserts; $i++) {printf("%s%-2i ",$inserts[$i],$len_ins[$i]);} 373 | # printf("\n"); 374 | } 375 | 376 | # After each match state insert residues and fill up with gaps to $len_ins[$i] characters 377 | for ($k=0; $k<$nhit; $k++) { 378 | # split into list of single match states and variable-length inserts 379 | @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#"); 380 | $j=0; 381 | 382 | # append the missing number of gaps after each match state 383 | foreach $insert (@inserts) { 384 | if($outformat eq "fas") { 385 | for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.="-";} 386 | } 387 | else { 388 | for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";} 389 | } 390 | $j++; 391 | } 392 | $hitseqs[$k] = join("",@inserts); 393 | $hitseqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end 394 | } 395 | } 396 | 397 | 398 | if ($query_file) { 399 | # Determine match states 400 | my @qa2m = unpack("C*",$hitseqs[0]); # $hitseq[0] is query sequence WITH INSERTS 401 | my @matchali=(); 402 | my $L=scalar(@qa2m); 403 | $j=0; 404 | for ($i=0; $i<@match; $i++) { 405 | while ($j<$L && !($qa2m[$j]>=65 && $qa2m[$j]<=90)) {$matchali[$j++]=0;} #move to column with next upper case residue 406 | $matchali[$j++]=$match[$i]; #is next query residue upper-case or not? 
407 | } 408 | 409 | # Set all match states to upper case, non-match states to lower case 410 | my @res; 411 | for ($k=0; $k<$nhit; $k++) { 412 | @res = unpack("C*",$hitseqs[$k]); 413 | # printf("Q: %s\n",$hitseqs[0]); 414 | # printf("T: %s\n",$hitseqs[$k]); 415 | for ($i=0; $i<@res; $i++) { 416 | if ($matchali[$i]) { 417 | if ($res[$i]>=97 && $res[$i]<=122) {$res[$i]-=32;} #convert to upper case 418 | } else { 419 | if ($res[$i]>=65 && $res[$i]<=90) {$res[$i]+=32;} # convert to lower case 420 | elsif ($res[$i]==45) {$res[$i]=46;} # convert '-' to '.' 421 | } 422 | # printf("%3i Q:%s T:%s match=%i len=%i\n",$i,chr($qa2m[$i]),chr($res[$i]),$qid[$k],$len); 423 | } 424 | $hitseqs[$k] = pack("C*",@res); 425 | } 426 | } 427 | 428 | 429 | # Remove gaps? Captialize? 430 | if ($outformat eq "ufas") { 431 | for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z.-/A-Z/d;} # Transform to upper case and remove all gaps 432 | } elsif ($outformat eq "fas") { 433 | for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z./A-Z-/;} # Transform to upper case 434 | } elsif ($outformat eq "a3m") { 435 | for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/.//d;} # Remove gaps aligned to inserts 436 | } 437 | 438 | # Write sequences into output file 439 | open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n"); 440 | if ($outformat eq "psi") { 441 | for ($k=0; $k<$nhit; $k++) { 442 | $hitseqs[$k] =~ tr/./-/; 443 | printf(OUTFILE "%s %s\n",$hitnames[$k],$hitseqs[$k]); 444 | } 445 | } 446 | else { 447 | for ($k=0; $k<$nhit; $k++) { 448 | printf(OUTFILE "%s\n%s\n",$hitnames[$k],$hitseqs[$k]); 449 | } 450 | } 451 | close OUTFILE; 452 | 453 | if ($v>=1) {printf("$nhit sequences extracted from $infile and written to $outfile\n");} 454 | exit(0); 455 | 456 | 457 | 458 | 459 | -------------------------------------------------------------------------------- /run_spotrna2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | 
# -----------------------------------------------------------------------------
# SPOT-RNA2 pipeline body.
#
# Usage: run_spotrna2.sh <query.fasta>
#
# For the query sequence in <query.fasta> this script
#   1. creates <dir>/<id>_features and <dir>/<id>_outputs next to the input,
#   2. makes sure the NCBI nt database is present and formatted for BLAST,
#   3. builds an alignment (BLASTN -> SPOT-RNA consensus structure -> Infernal
#      cmbuild/cmcalibrate/cmsearch -> esl-reformat -> seqkit rmdup),
#   4. derives PSSM, LinearPartition and GREMLIN (DCA) features,
#   5. runs utils/SPOT-RNA2.py on the query.
# Every stage prints a banner and the script exits with status 1 on the first
# stage that reports failure.
#
# NOTE: `start` (used for the run-time report at the end) is set right after
# the shebang, before this section.
# -----------------------------------------------------------------------------

# Refuse to run without a readable input file: everything below derives its
# paths from $1, and the awk call rewrites that file in place.
if [ "$#" -lt 1 ] || [ ! -f "$1" ]; then
    echo "Usage: $0 <query_sequence.fasta>" >&2
    exit 1
fi

input="$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
input_dir=$(dirname "$input")
seq_id=$(basename "$input" | cut -d. -f1)
program_dir=$(dirname "$(readlink -f "$0")")

# NOTE: path_blastn and path_infernal contain a glob (*) so that any installed
# version matches. They must stay UNQUOTED where they are used as commands,
# otherwise the shell will not expand the pattern.
path_blastn=$program_dir/ncbi-blast-*+/bin                        # set path to the folder contains executable binary files of Blast package
path_blastn_database=$program_dir/nt_database/nt                  # set path to the formatted NCBI's database file without extension
path_infernal=$program_dir/infernal-*-linux-intel-gcc/binaries    # set path to the folder contains executable binary files Infernal package
path_infernal_database=$program_dir/nt_database/nt                # set path to the NCBI's database database file

feature_dir=$input_dir/${seq_id}_features
output_dir=$input_dir/${seq_id}_outputs

mkdir -p "$feature_dir" && mkdir -p "$output_dir"

# Build the query FASTA: a header made from the file name, then the sequence.
# The awk call rewrites the user's input file IN PLACE so that each record's
# sequence sits on a single line (requires gawk with the inplace extension);
# the last line of the rewritten file is then taken as the query sequence.
echo ">$seq_id" > "$feature_dir/$seq_id.fasta"
awk -i inplace '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' "$input"
# $(...) strips any trailing newline and printf adds exactly one back, so the
# query FASTA can later be concatenated with other files safely.
printf '%s\n' "$(tail -n1 "$input")" >> "$feature_dir/$seq_id.fasta"

#exit 1

############ make sure the raw nt database is available ############
if [ ! -f "$path_blastn_database" ]; then
    echo ""
    echo "========================================================================================"
    echo " Looks like nt database doesn't exists in the path $path_blastn_database. "
    echo " If you want to download the database now, please make sure you have enough "
    echo " space in mounted directory and internet connection have enough bandwidth as "
    echo " file is of size 270 GBs after unzip. It may take forever to download if "
    echo " internet is slow! "
    echo "========================================================================================"
    echo ""

    echo -n "Type 'y' for download or any other key to exit: "
    read -r userinput

    if [[ $(echo "$userinput" | tr '[A-Z]' '[a-z]') == 'y' ]]; then

        echo ""
        echo "=============================================================================================="
        echo " Downloading NCBI's database form ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz link. "
        echo " May take few hours to download. "
        echo "=============================================================================================="
        echo ""
        wget -c "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" -O "$program_dir/nt_database/nt.gz"

        if [[ $? -eq 0 ]]; then
            echo ""
            echo "======================================================================="
            echo " nt database is completed successfully. "
            echo "======================================================================="
            echo ""
        else
            echo ""
            echo "======================================================================="
            echo " Error! Unable to download database sucessfully. "
            echo " Check wget command or internet connection. "
            echo "======================================================================="
            echo ""
            exit 1
        fi

        echo ""
        echo "======================================================================"
        echo " Unziping the downloaded nt database. "
        echo " May take few hours as size of unzipped file is around 270 GBs. "
        echo "======================================================================"
        echo ""

        ############ unzip the nt data base file ############
        gunzip "$program_dir/nt_database/nt.gz"

        if [[ $? -eq 0 ]]; then
            echo ""
            echo "======================================================================="
            echo " nt database unzip completed successfully. "
            echo "======================================================================="
            echo ""
        else
            echo ""
            echo "======================================================================="
            echo " Error! unable to unzip database sucessfully. "
            echo " Please check if gunzip program exists! "
            echo "======================================================================="
            echo ""
            exit 1
        fi

    else
        echo ""
        echo "==========================================================="
        echo " Exiting the program because nt database is missing! "
        echo "==========================================================="
        echo ""
        exit 1
    fi

fi


###### check if aligned homologous sequences file already exists ############
if [ -f "$feature_dir/$seq_id.a2m" ]; then
    echo ""
    echo "======================================================================"
    echo " MSA file $feature_dir/$seq_id.a2m from Infernal Pipeline already "
    echo " exists for query sequence $feature_dir/$seq_id.fasta. "
    echo " "
    echo " Delete existing $feature_dir/$seq_id.a2m if want to generate new "
    echo " alignment file "
    echo "======================================================================"
    echo ""
else

    #### check if formatted nt database exists or not #####
    if [[ ! -f "$path_blastn_database.nal" ]]; then
        echo ""
        echo "====================================================================="
        echo " Nucleotide database file $path_blastn_database need to formated "
        echo " formated to use with 'makeblastdb' program in BLAST-N program. "
        echo ""
        echo " Formatting may take 2-3 hours as size of file is around 270 GBs. "
        echo "====================================================================="
        echo ""
        # Format the same file that blastn is later pointed at with -db.
        $path_blastn/makeblastdb -in "$path_blastn_database" -dbtype nucl

        if [[ $? -eq 0 ]]; then
            echo ""
            echo "======================================================="
            echo " nt database formatted successfully. "
            echo "======================================================="
            echo ""
        else
            echo ""
            echo "=================================================================="
            echo " Error occured while formatting the nt database. "
            echo ""
            echo " Check for '$path_blastn/makeblastdb' program in BLAST package "
            echo "=================================================================="
            echo ""
            exit 1
        fi
    fi


    #################### check if blastn alignment file ready exists ######################
    if [ -f "$feature_dir/$seq_id.bla" ]; then
        echo ""
        echo "======================================================================="
        echo " MSA-1 file $feature_dir/$seq_id.bla from Infernal Pipeline already "
        echo " exists for query sequence $feature_dir/$seq_id.fasta. "
        echo " "
        echo " Delete existing $feature_dir/$seq_id.bla if want to generate new "
        echo " alignment file. "
        echo "======================================================================="
        echo ""
    else
        echo ""
        echo "==========================================================================================================================="
        echo " Running BLASTN for first round of homologous sequence search for query sequence $feature_dir/$seq_id.fasta. "
        echo " May take 5 mins to few hours depending on sequence length and no. of homologous sequences in database. "
        echo "==========================================================================================================================="
        echo ""
        $path_blastn/blastn -db "$path_blastn_database" -query "$feature_dir/$seq_id.fasta" -out "$feature_dir/$seq_id.bla" -evalue 0.001 -num_descriptions 1 -num_threads 8 -line_length 1000 -num_alignments 50000
    fi

    # $? here is the status of the last command run inside the if/else above:
    # blastn when it was executed, otherwise the final echo of the banner.
    if [ $? -eq 0 ]; then
        echo ""
        echo "==========================================================="
        echo " First round of MSA-1 search completed successfully. "
        echo "==========================================================="
        echo ""
    else
        echo ""
        echo "=================================================================="
        echo " Error occured while running BLASTN. "
        echo ""
        echo " Check for '$path_blastn/blastn' program in BLAST package "
        echo "=================================================================="
        echo ""
        exit 1
    fi

    ######## reformat the output ################
    echo ""
    echo "========================================================================================"
    echo " Converting $feature_dir/$seq_id.bla from BLASTN to $feature_dir/$seq_id.sto. "
    echo "========================================================================================"
    echo ""
    # Chain the two converters so that a failure of the first one is reported
    # as well, instead of only the status of reformat.pl.
    "$program_dir/utils/parse_blastn_local.pl" "$feature_dir/$seq_id.bla" "$feature_dir/$seq_id.fasta" "$feature_dir/$seq_id.aln" \
        && "$program_dir/utils/reformat.pl" fas sto "$feature_dir/$seq_id.aln" "$feature_dir/$seq_id.sto"

    if [ $? -eq 0 ]; then
        echo ""
        echo "=========================================="
        echo " Converison completed successfully. "
        echo "=========================================="
        echo ""
    else
        echo ""
        echo "============================================================================================="
        echo " Error occured while Converting $feature_dir/$seq_id.bla to $feature_dir/$seq_id.sto "
        echo " "
        echo " Check for $program_dir/utils/parse_blastn_local.pl and $program_dir/utils/reformat.pl file."
        echo "============================================================================================="
        echo ""
        exit 1
    fi

    ######## predict secondary structure from SPOT-RNA ################
    echo ""
    echo "==============================================================================================================================="
    echo " Predicting Consensus Secondary Structure (CSS) of query sequence $feature_dir/$seq_id.fasta using SPOT-RNA predictor. "
    echo "==============================================================================================================================="
    echo ""
    # Collect the status of the steps that produce the structure so the check
    # below reflects them rather than the final echo.
    css_status=0

    source "$program_dir/venv/bin/activate" || conda activate venv
    # Run inside a subshell so the working directory of this script is never
    # changed and the predictor is not started if the cd fails.
    ( cd "$program_dir/SPOT-RNA" && python3 SPOT-RNA.py --inputs "$feature_dir/$seq_id.fasta" --outputs "$feature_dir" ) || css_status=$?

    export PERL5LIB="$program_dir/utils/FreeKnot"
    perl "$program_dir/utils/FreeKnot/remove_pseudoknot.pl" -i bpseq -s bp "$feature_dir/$seq_id.bpseq" > "$feature_dir/$seq_id.bpseq.unknotted"
    python3 "$program_dir/utils/bpseq2dbn.py" --inputs "$feature_dir" --outputs "$feature_dir" --rna_id "$seq_id" || css_status=$?
    # Keep the .dbn file from its third line onwards as the structure string.
    tail -n +3 "$feature_dir/$seq_id.dbn" > "$feature_dir/$seq_id.db" || css_status=$?

    deactivate || conda deactivate

    ################ reformat ss with according to gaps in reference sequence of .sto file from blastn ################
    # Take the 2nd column of the 5th line of the .sto file, list the byte
    # offset of every '-' in it (grep -b -o prints "offset:-", sed drops the
    # trailing ":-") and insert a '-' into the structure after that many
    # characters.
    for i in $(awk '{print $2}' "$feature_dir/$seq_id.sto" | head -n5 | tail -n1 | grep -b -o - | sed 's/..$//'); do
        sed -i "s/./&-/$i" "$feature_dir/$seq_id.db"
    done

    ######### add reformated ss from last step to .sto file of blastn ##############
    head -n -1 "$feature_dir/$seq_id.sto" > "$feature_dir/temp.sto"
    # The command substitution is left unquoted on purpose, as in the original
    # script: the structure is emitted as a single line.
    echo "#=GC SS_cons "$(cat "$feature_dir/$seq_id.db") > "$feature_dir/temp.txt"
    cat "$feature_dir/temp.sto" "$feature_dir/temp.txt" > "$feature_dir/$seq_id.sto" || css_status=$?
    echo "//" >> "$feature_dir/$seq_id.sto" || css_status=$?

    if [ "$css_status" -eq 0 ]; then
        echo ""
        echo "=================================================================="
        echo " Consensus Secondary Structure (CSS) generated successfully. "
        echo "=================================================================="
        echo ""
    else
        echo ""
        echo "=============================================================================="
        echo " Error occured while generating structure from SPOT-RNA. "
        echo " "
        echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA2/issues'."
        echo "=============================================================================="
        echo ""
        exit 1
    fi

    ######## run infernal ################
    echo ""
    echo "=============================================================================================================="
    echo " Building Covariance Model from BLASTN alignment (with SS from SPOT-RNA) from $feature_dir/$seq_id.sto file. "
    echo "=============================================================================================================="
    echo ""
    $path_infernal/cmbuild --hand -F "$feature_dir/$seq_id.cm" "$feature_dir/$seq_id.sto"

    if [ $? -eq 0 ]; then
        echo ""
        echo "============================================================================"
        echo " Covariance Model (CM) built successfully from $feature_dir/$seq_id.sto. "
        echo "============================================================================"
        echo ""
    else
        echo ""
        echo "==============================================================================================="
        echo " Error occured while building Covariance Model (CM) from $path_infernal/cmbuild. "
        echo " "
        echo " Please check for $path_infernal/cmbuild program. "
        echo "==============================================================================================="
        echo ""
        exit 1
    fi

    echo ""
    echo "===================================================================="
    echo " Calibrating the Covariance Model $feature_dir/$seq_id.cm. "
    echo "===================================================================="
    echo ""
    $path_infernal/cmcalibrate "$feature_dir/$seq_id.cm"

    if [ $? -eq 0 ]; then
        echo ""
        echo "==========================================================="
        echo " CM calibrated $feature_dir/$seq_id.cm successfully. "
        echo "==========================================================="
        echo ""
    else
        echo ""
        echo "==============================================================="
        echo " Error occured while calibrating $feature_dir/$seq_id.cm. "
        echo " "
        echo " Please check for $path_infernal/cmcalibrate program. "
        echo "==============================================================="
        echo ""
        exit 1
    fi

    echo ""
    echo "======================================================================================================================"
    echo " Second round of homologous sequences search using the calibrated covariance model $feature_dir/$seq_id.cm. "
    echo " May take 15 mins to few hours for this step. "
    echo "======================================================================================================================"
    echo ""
    $path_infernal/cmsearch -o "$feature_dir/$seq_id.out" -A "$feature_dir/$seq_id.msa" --cpu 24 --incE 10.0 "$feature_dir/$seq_id.cm" "$path_infernal_database"

    if [ $? -eq 0 ]; then
        echo ""
        echo "==========================================================="
        echo " Second round of MSA-2 search completed successfully. "
        echo "==========================================================="
        echo ""
    else
        echo ""
        echo "===================================================================================="
        echo " Error occured during the second round search using CM $feature_dir/$seq_id.cm. "
        echo " "
        echo " Please check for $path_infernal/cmsearch program. "
        echo "===================================================================================="
        echo ""
        exit 1
    fi

    ######### reformat the alignment without gaps and dashes ###############
    echo ""
    echo "======================================================================="
    echo " Reformatting the output alignment $feature_dir/$seq_id.msa "
    echo " for PSSM and DCA features by removing the gaps and dashes. "
    echo "======================================================================="
    echo ""

    ##### check if .msa is not empty #########
    if [[ -s "$feature_dir/$seq_id.msa" ]]
    then
        $path_infernal/esl-reformat --replace acgturyswkmbdhvn:................ a2m "$feature_dir/$seq_id.msa" > "$feature_dir/temp.a2m"
    else
        # No hits from cmsearch: fall back to an alignment made of the query
        # twice, with the last character of the second copy replaced by '.'.
        cat "$feature_dir/$seq_id.fasta" > "$feature_dir/temp.a2m"
        cat "$feature_dir/$seq_id.fasta" >> "$feature_dir/temp.a2m"
        sed -i '$ s/.$/./' "$feature_dir/temp.a2m"
    fi

    # $? is the status of the last command of whichever branch ran above.
    if [ $? -eq 0 ]; then
        echo ""
        echo "==========================================================="
        echo " Reformatted the $feature_dir/$seq_id.msa successfully. "
        echo "==========================================================="
        echo ""
    else
        echo ""
        echo "========================================================================================"
        echo " Error occured during the refomatting the alignment file $feature_dir/$seq_id.msa. "
        echo " "
        echo " Please check for $path_infernal/esl-reformat program. "
        echo "========================================================================================"
        echo ""
        exit 1
    fi

    ######### remove duplicates sequences from the alignment ###############
    echo ""
    echo "======================================================================="
    echo " Removing duplicates from the alignment. "
    echo "======================================================================="
    echo ""
    "$program_dir/utils/seqkit" rmdup -s "$feature_dir/temp.a2m" > "$feature_dir/$seq_id.a2m"

    if [ $? -eq 0 ]; then
        echo ""
        echo "==============================================="
        echo " Duplicate sequences removed successfully. "
        echo "==============================================="
        echo ""
    else
        echo ""
        echo "========================================================================================"
        echo " Error occured during the removel of duplicates from MSA-2. "
        echo " "
        echo " Please check for $program_dir/utils/seqkit program. "
        echo "========================================================================================"
        echo ""
        exit 1
    fi

    ############# multiline fasta to single line fasta file #############
    awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < "$feature_dir/$seq_id.a2m" | sed '/^$/d' > "$feature_dir/temp.a2m"
    ############# add query sequence at the top of MSA file #############
    cat "$feature_dir/$seq_id.fasta" "$feature_dir/temp.a2m" > "$feature_dir/$seq_id.a2m"

fi

############# check if pssm file already exists otherwise generate from alignment file #############
if [ -f "$feature_dir/$seq_id.pssm" ]; then
    echo ""
    echo "=============================================================================================================================================="
    echo " PSSM feature file $feature_dir/$seq_id.pssm already exists for query sequence $feature_dir/$seq_id.fasta. "
    echo "=============================================================================================================================================="
    echo ""
else
    echo ""
    echo "======================================================================================"
    echo " Extracting PSSM features from the alignment $feature_dir/$seq_id.a2m. "
    echo "======================================================================================"
    echo ""
    "$program_dir/utils/getpssm.pl" "$feature_dir/$seq_id.fasta" "$feature_dir/$seq_id.a2m" "$feature_dir/$seq_id.pssm"

    if [ $? -eq 0 ]; then
        echo ""
        echo "==============================================================="
        echo " PSSM extracted successfully from $feature_dir/$seq_id.a2m. "
        echo "==============================================================="
        echo ""
    else
        echo ""
        echo "========================================================================="
        echo " Error occured while extracting PSSM from $feature_dir/$seq_id.a2m. "
        echo " "
        echo " Please check for $program_dir/utils/getpssm.pl program. "
        echo "========================================================================="
        echo ""
        exit 1
    fi
fi

######### run linearpartition RNA secondary structure base-pair probability predictor ###############
echo ""
echo "============================================================================"
echo " Running LinearPartition-V for base-pair probabilty features. "
echo "============================================================================"
echo ""
# Feed only the sequence (everything after the header line) on stdin.
tail -n +2 "$feature_dir/$seq_id.fasta" | "$program_dir/LinearPartition/linearpartition" -V -r "$feature_dir/$seq_id.prob"

if [ $? -eq 0 ]; then
    echo ""
    echo "===================================================================="
    echo " Base-pair probabilty successfully obtained from LinearPartition. "
    echo "===================================================================="
    echo ""
else
    echo ""
    echo "============================================================================="
    echo " Error occured while running LinearPartition. "
    echo " "
    echo " Please check for $program_dir/LinearPartition/linearpartition program. "
    echo "============================================================================="
    echo ""
    exit 1
fi

############# check if dca file already exists otherwise generate from alignment file #############
if [ -f "$feature_dir/$seq_id.dca" ]; then
    echo ""
    echo "==============================================================="
    echo " GRELMLIN feature file $feature_dir/$seq_id.dca already "
    echo " exists for query sequence $feature_dir/$seq_id.fasta. "
    echo " "
    echo " Delete the existing file if want to generate new dca file. "
    echo "==============================================================="
    echo ""
else
    echo ""
    echo "============================================================================"
    echo " Running GREMLIN for DCA features. "
    echo "============================================================================"
    echo ""
    "$program_dir/GREMLIN_CPP/gremlin_cpp" -alphabet rna -i "$feature_dir/$seq_id.a2m" -o "$feature_dir/$seq_id.dca" > "$feature_dir/$seq_id.log_gremlin"
    if [ $? -eq 0 ]; then
        echo ""
        echo "===================================================="
        echo " DCA features successfully obtained from GREMLIN. "
        echo "===================================================="
        echo ""
    else
        echo ""
        echo "============================================================================="
        echo " Error occured while running GREMLIN. "
        echo " "
        echo " Please check for $program_dir/GREMLIN_CPP/gremlin_cpp program. "
        echo "============================================================================="
        echo ""
        exit 1
    fi
fi


echo ""
echo "============================================================================"
echo " Running SPOT-RNA2 for RNA secondary structure prediction. "
echo "============================================================================"
echo ""
source "$program_dir/venv/bin/activate" || conda activate venv
python3 "$program_dir/utils/SPOT-RNA2.py" --inputs "$feature_dir/$seq_id.fasta" --outputs "$output_dir" --motifs True
deactivate || conda deactivate

end=`date +%s`

runtime=$((end-start))

echo -e "\ncomputation time = "$runtime" seconds"