├── nt_database
└── empty_db.fasta
├── utils
├── seqkit
├── __pycache__
│ └── utils.cpython-36.pyc
├── FreeKnot
│ ├── COPYRIGHT
│ ├── COPYRIGHT.txt
│ ├── BpseqWriter.pm
│ ├── BracketPairs.pm
│ ├── ScoringFunctions.pm
│ ├── DPWriter.pm
│ ├── BpseqParser.pm
│ ├── README
│ ├── ChordModel.pm
│ ├── README.txt
│ ├── MIS.pm
│ ├── CircleGraph.pm
│ ├── DPParser.pm
│ ├── PrimitivePseudoknotExtractor.pm
│ ├── VertexSubset.pm
│ ├── MWIS.pm
│ └── remove_pseudoknot.pl
├── bpseq2dbn.py
├── getpssm.pl
├── SPOT-RNA2.py
├── utils.py
└── parse_blastn_local.pl
├── requirements.txt
├── sample_run
├── 6ufj.fasta
├── sample_seq.fasta
├── sample_seq_features
│ ├── sample_seq.db
│ ├── sample_seq.fasta
│ ├── temp.txt
│ ├── sample_seq.tfrecords
│ ├── sample_seq.dbn
│ ├── sample_seq.aln
│ ├── temp.sto
│ ├── sample_seq.sto
│ ├── sample_seq.bpseq.unknotted
│ ├── sample_seq.bpseq
│ ├── sample_seq.ct
│ ├── sample_seq.bla
│ ├── sample_seq.pssm
│ ├── sample_seq.prob
│ ├── temp.a2m
│ ├── sample_seq.a2m
│ ├── sample_seq.log_gremlin
│ └── sample_seq.msa
└── sample_seq_outputs
│ ├── sample_seq.bpseq
│ ├── sample_seq.st
│ └── sample_seq.ct
├── docs
├── SPOTRNA2_pipeline.png
└── benchmark_results.png
├── __pycache__
└── utils.cpython-36.pyc
├── Dockerfile
├── README.md
└── run_spotrna2.sh
/nt_database/empty_db.fasta:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/utils/seqkit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/utils/seqkit
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==1.14.0
2 | pandas
3 | numpy==1.16.4
4 | argparse
5 | tqdm
6 | six
7 |
--------------------------------------------------------------------------------
/sample_run/6ufj.fasta:
--------------------------------------------------------------------------------
1 | >6ufj: chain A,B
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
3 |
--------------------------------------------------------------------------------
/sample_run/sample_seq.fasta:
--------------------------------------------------------------------------------
1 |
2 | >6ufj_A_B
3 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
--------------------------------------------------------------------------------
/docs/SPOTRNA2_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/docs/SPOTRNA2_pipeline.png
--------------------------------------------------------------------------------
/docs/benchmark_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/docs/benchmark_results.png
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.db:
--------------------------------------------------------------------------------
1 | ((((((....))))))..........................((((((((..)))))))).
2 |
--------------------------------------------------------------------------------
/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.fasta:
--------------------------------------------------------------------------------
1 | >sample_seq
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/temp.txt:
--------------------------------------------------------------------------------
1 | #=GC SS_cons ((((((....))))))..........................((((((((..)))))))).
2 |
--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/utils/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.tfrecords:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/sample_run/sample_seq_features/sample_seq.tfrecords
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.dbn:
--------------------------------------------------------------------------------
1 | >single_seq
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
3 | ((((((....))))))..........................((((((((..)))))))).
4 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.aln:
--------------------------------------------------------------------------------
1 | >sample_seq E=0.0
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
3 | >6UFJ_A(1-51:51) Chain A, RNA (50-MER) 6UFJ_C Chain C, RNA (50-MER) 6UFK_A Chain A, RNA (50-MER) 6UFK_C Chain C, RNA (50-MER) E=2e-16 s/c=1.87 id=98% cov=85%
4 | ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA----------
5 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/temp.sto:
--------------------------------------------------------------------------------
1 | # STOCKHOLM 1.0
2 |
3 | #=GF DE E=0.0
4 | #=GC RF ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
5 | sample_seq ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
6 | 6UFJ_A(1-51:51) ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA----------
7 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.sto:
--------------------------------------------------------------------------------
1 | # STOCKHOLM 1.0
2 |
3 | #=GF DE E=0.0
4 | #=GC RF ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
5 | sample_seq ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
6 | 6UFJ_A(1-51:51) ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA----------
7 | #=GC SS_cons ((((((....))))))..........................((((((((..)))))))).
8 | //
9 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.bpseq.unknotted:
--------------------------------------------------------------------------------
1 | 1 A 16
2 | 2 C 15
3 | 3 U 14
4 | 4 C 13
5 | 5 G 12
6 | 6 U 11
7 | 7 U 0
8 | 8 U 0
9 | 9 G 0
10 | 10 A 0
11 | 11 G 6
12 | 12 C 5
13 | 13 G 4
14 | 14 A 3
15 | 15 G 2
16 | 16 U 1
17 | 17 A 0
18 | 18 U 0
19 | 19 A 0
20 | 20 A 0
21 | 21 A 0
22 | 22 C 0
23 | 23 A 0
24 | 24 G 0
25 | 25 C 0
26 | 26 U 0
27 | 27 G 0
28 | 28 G 0
29 | 29 U 0
30 | 30 U 0
31 | 31 A 0
32 | 32 A 0
33 | 33 G 0
34 | 34 C 0
35 | 35 U 0
36 | 36 C 0
37 | 37 A 0
38 | 38 A 0
39 | 39 A 0
40 | 40 G 0
41 | 41 C 0
42 | 42 G 0
43 | 43 G 60
44 | 44 A 59
45 | 45 G 58
46 | 46 A 57
47 | 47 G 56
48 | 48 C 55
49 | 49 A 54
50 | 50 G 53
51 | 51 A 0
52 | 52 U 0
53 | 53 C 50
54 | 54 U 49
55 | 55 G 48
56 | 56 C 47
57 | 57 U 46
58 | 58 C 45
59 | 59 U 44
60 | 60 C 43
61 | 61 G 0
62 |
--------------------------------------------------------------------------------
/utils/FreeKnot/COPYRIGHT:
--------------------------------------------------------------------------------
1 | Copyright (C) 2012 Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
2 |
3 | This file is part of FreeKnot.
4 |
5 | FreeKnot is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation, either version 3 of the License, or
8 | (at your option) any later version.
9 |
10 | FreeKnot is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License
16 | along with FreeKnot. If not, see <http://www.gnu.org/licenses/>.
17 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.bpseq:
--------------------------------------------------------------------------------
1 | #sample_seq
2 | 1 A 16
3 | 2 C 15
4 | 3 U 14
5 | 4 C 13
6 | 5 G 12
7 | 6 U 11
8 | 7 U 0
9 | 8 U 37
10 | 9 G 36
11 | 10 A 35
12 | 11 G 6
13 | 12 C 5
14 | 13 G 4
15 | 14 A 3
16 | 15 G 2
17 | 16 U 1
18 | 17 A 0
19 | 18 U 0
20 | 19 A 0
21 | 20 A 0
22 | 21 A 0
23 | 22 C 0
24 | 23 A 0
25 | 24 G 0
26 | 25 C 0
27 | 26 U 0
28 | 27 G 0
29 | 28 G 0
30 | 29 U 0
31 | 30 U 0
32 | 31 A 0
33 | 32 A 0
34 | 33 G 0
35 | 34 C 0
36 | 35 U 10
37 | 36 C 9
38 | 37 A 8
39 | 38 A 0
40 | 39 A 0
41 | 40 G 0
42 | 41 C 0
43 | 42 G 0
44 | 43 G 60
45 | 44 A 59
46 | 45 G 58
47 | 46 A 57
48 | 47 G 56
49 | 48 C 55
50 | 49 A 54
51 | 50 G 53
52 | 51 A 0
53 | 52 U 0
54 | 53 C 50
55 | 54 U 49
56 | 55 G 48
57 | 56 C 47
58 | 57 U 46
59 | 58 C 45
60 | 59 U 44
61 | 60 C 43
62 | 61 G 0
63 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_outputs/sample_seq.bpseq:
--------------------------------------------------------------------------------
1 | #sample_seq
2 | 1 A 16
3 | 2 C 15
4 | 3 U 14
5 | 4 C 13
6 | 5 G 0
7 | 6 U 39
8 | 7 U 38
9 | 8 U 37
10 | 9 G 36
11 | 10 A 35
12 | 11 G 34
13 | 12 C 33
14 | 13 G 4
15 | 14 A 3
16 | 15 G 2
17 | 16 U 1
18 | 17 A 0
19 | 18 U 0
20 | 19 A 0
21 | 20 A 0
22 | 21 A 0
23 | 22 C 0
24 | 23 A 0
25 | 24 G 0
26 | 25 C 0
27 | 26 U 0
28 | 27 G 0
29 | 28 G 0
30 | 29 U 0
31 | 30 U 0
32 | 31 A 0
33 | 32 A 0
34 | 33 G 12
35 | 34 C 11
36 | 35 U 10
37 | 36 C 9
38 | 37 A 8
39 | 38 A 7
40 | 39 A 6
41 | 40 G 0
42 | 41 C 0
43 | 42 G 61
44 | 43 G 60
45 | 44 A 59
46 | 45 G 58
47 | 46 A 57
48 | 47 G 56
49 | 48 C 55
50 | 49 A 54
51 | 50 G 53
52 | 51 A 0
53 | 52 U 0
54 | 53 C 50
55 | 54 U 49
56 | 55 G 48
57 | 56 C 47
58 | 57 U 46
59 | 58 C 45
60 | 59 U 44
61 | 60 C 43
62 | 61 G 42
63 |
--------------------------------------------------------------------------------
/utils/FreeKnot/COPYRIGHT.txt:
--------------------------------------------------------------------------------
1 | Copyright (C) 2012 Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
2 |
3 | This file is part of FreeKnot.
4 |
5 | FreeKnot is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation, either version 3 of the License, or
8 | (at your option) any later version.
9 |
10 | FreeKnot is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License
16 | along with FreeKnot. If not, see <http://www.gnu.org/licenses/>.
17 |
--------------------------------------------------------------------------------
/utils/FreeKnot/BpseqWriter.pm:
--------------------------------------------------------------------------------
1 | #Writer for BPSEQ format
2 |
3 | package BpseqWriter;
4 |
5 | use strict;
6 |
7 | sub output_results {
8 |     #Prints BPSEQ rows ("position base partner") to stdout. Arguments after the
9 |     #invocant: array ref of removal sets (hash refs keyed by position), array ref of
10 |     #bases, array ref of partner positions indexed from 1, and the number of bases.
11 |     my (undef, $removal_sets, $bases, $partners, $total) = @_;
12 |
13 |     #Without any removal set the structure is written once, exactly as received
14 |     unless (@{$removal_sets}) {
15 |         foreach my $pos (1 .. $total) {
16 |             print $pos . ' ' . $bases->[$pos - 1] . ' ' . $partners->[$pos] . "\n";
17 |         }
18 |     }
19 |
20 |     #One complete listing per removal set; removed positions are written as unpaired
21 |     foreach my $removed (@{$removal_sets}) {
22 |         foreach my $pos (1 .. $total) {
23 |             print $pos . ' ' . $bases->[$pos - 1] . ' ';
24 |             print exists($removed->{$pos}) ? "0\n" : $partners->[$pos] . "\n";
25 |         }
26 |     }
27 | }
28 |
29 | 1;
30 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_outputs/sample_seq.st:
--------------------------------------------------------------------------------
1 | #Name: sample_seq
2 | #Length: 61
3 | #PageNumber: 2
4 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
5 | [[[[.(((((((]]]]................)))))))..(((((((((..)))))))))
6 | EEEEESSSSSSSHHHHHHHHHHHHHHHHHHHHSSSSSSSXXSSSSSSSSSHHSSSSSSSSS
7 | KKKKNNNNNNNNKKKKNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
8 | S1 6..12 "UUUGAGC" 33..39 "GCUCAAA"
9 | S2 42..50 "GGAGAGCAG" 53..61 "CUGCUCUCG"
10 | H1 13..32 "GAGUAUAAACAGCUGGUUAA" (12,33) C:G PK{1}
11 | H2 51..52 "AU" (50,53) G:C
12 | X1 40..41 "GC" (39,6) A:U (42,61) G:G
13 | E1 1..5 "ACUCG" PK{1}
14 | PK1 4bp 1..4 13..16 E1 1..5 H1 13..32
15 | PK1.1 1 A 16 U
16 | PK1.2 2 C 15 G
17 | PK1.3 3 U 14 A
18 | PK1.4 4 C 13 G
19 | NCBP1 42 G 61 G S2
20 | segment1 7bp 6..12 UUUGAGC 33..39 GCUCAAA
21 | segment2 9bp 42..50 GGAGAGCAG 53..61 CUGCUCUCG
22 |
--------------------------------------------------------------------------------
/utils/FreeKnot/BracketPairs.pm:
--------------------------------------------------------------------------------
1 | #Bracket handler for DPParser
2 |
3 | package BracketPairs;
4 | use strict;
5 |
6 | my $open_bracket_map = {")" => "(", "]" => "[", "}" => "{", ">" => "<"};
7 |
8 | #Check whether a symbol (in dot-parentheses format) is an open bracket
9 | sub is_open_bracket {
10 | my (undef, $symbol) = @_;
11 |
12 | if ($symbol =~ /^[\(\[{]$/) {
24 | return $open_bracket_map->{$close_bracket};
25 | }
26 | elsif ($close_bracket =~ /^[a-z]$/) {
27 | return uc $close_bracket;
28 | }
29 | else {
30 | die "Unknown closing bracket\n";
31 | }
32 | }
33 |
34 | 1;
35 |
--------------------------------------------------------------------------------
/utils/bpseq2dbn.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import argparse
4 | import os
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--inputs', default='inputs', type=str, help='Path to input file in fasta format, accept multiple sequences as well in fasta format; default = ''inputs/2zzm-1-B.fasta''\n', metavar='')
8 | parser.add_argument('--outputs',default='inputs', type=str, help='Path to output files; SPOT-RNA outputs at least three files .ct, .bpseq, and .prob files; default = ''inputs/\n', metavar='')
9 | parser.add_argument('--rna_id', default='sample_seq', type=str, help='Name of the input sequence file\n')
10 |
11 | args = parser.parse_args()
12 |
13 | with open(os.path.join(args.inputs, args.rna_id + ".bpseq.unknotted")) as f:
14 | temp = pd.read_csv(f,comment='#', delim_whitespace=True, header=None, usecols=[0,1,2]).values
15 | seq = temp[:,1]
16 |
17 | pairs = [[i,j] for i,j in zip(temp[:,0], temp[:,2]) if i!=0 and j!=0 and i6UFJ_A Chain A, RNA (50-MER) 6UFJ_C Chain C, RNA (50-MER) 6UFK_A
25 | Chain A, RNA (50-MER) 6UFK_C Chain C, RNA (50-MER)
26 | Length=51
27 |
28 | Score = 95.3 bits (51), Expect = 2e-16
29 | Identities = 51/51 (100%), Gaps = 0/51 (0%)
30 | Strand=Plus/Plus
31 |
32 | Query 1 ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA 51
33 | |||||||||||||||||||||||||||||||||||||||||||||||||||
34 | Sbjct 1 ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA 51
35 |
36 |
37 |
38 | Lambda K H
39 | 1.33 0.621 1.12
40 |
41 | Gapped
42 | Lambda K H
43 | 1.28 0.460 0.850
44 |
45 | Effective search space used: 7769692438560
46 |
47 |
48 | Database: /nt_database/nt
49 | Posted date: May 30, 2020 5:58 AM
50 | Number of letters in database: 260,722,916,040
51 | Number of sequences in database: 55,908,648
52 |
53 |
54 |
55 | Matrix: blastn matrix 1 -2
56 | Gap Penalties: Existence: 0, Extension: 2.5
57 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 | MAINTAINER Jaswinder Singh (jaswinder.singh3@griffithuni.edu.au)
3 | # Point /bin/sh at bash; the virtualenv step below relies on the bash builtin `source`.
4 | RUN rm /bin/sh && ln -s /bin/bash /bin/sh
5 | RUN apt-get update && apt-get install -y build-essential wget virtualenv git python-minimal cpanminus gawk
6 | RUN cpanm Graph
7 | # Every download tries the primary URL, then the mirror. Mirror URLs end in "/?dl=1", so -O is required to save them under the name tar expects; the parentheses keep the fallback from swallowing the rest of the chain.
8 | RUN (wget 'https://www.dropbox.com/s/h6j53u7wjyj6uir/SPOT-RNA2.tar.xz' || wget -O SPOT-RNA2.tar.xz 'https://app.nihaocloud.com/f/3e826caf8efc43adaaa0/?dl=1') && tar -xvf SPOT-RNA2.tar.xz && rm SPOT-RNA2.tar.xz
9 | WORKDIR SPOT-RNA2
10 | # Model checkpoints and the Python environment.
11 | RUN (wget -O utils/models_ckps.tar.xz 'https://www.dropbox.com/s/udzcsva76lh5wvq/models_ckps.tar.xz' || wget -O utils/models_ckps.tar.xz 'https://app.nihaocloud.com/f/586acb2658d74ccb92b8/?dl=1') && tar -xvf utils/models_ckps.tar.xz -C utils/ && rm utils/models_ckps.tar.xz
12 | RUN virtualenv -p python3.6 venv && source ./venv/bin/activate && pip install tensorflow==1.14.0 && pip install -r requirements.txt && deactivate
13 | # External tools: Infernal, BLAST+, SPOT-RNA (with its models), GREMLIN_CPP and LinearPartition.
14 | RUN wget 'eddylab.org/infernal/infernal-1.1.3-linux-intel-gcc.tar.gz' && tar -xvzf infernal-*.tar.gz && rm infernal-*.tar.gz
15 | RUN wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*+-x64-linux.tar.gz' && tar -xvzf ncbi-blast-*+-x64-linux.tar.gz && rm ncbi-blast-*+-x64-linux.tar.gz
16 | RUN git clone https://github.com/jaswindersingh2/SPOT-RNA.git && cd SPOT-RNA && (wget 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1') && tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz && cd ../
17 | RUN git clone "https://github.com/sokrypton/GREMLIN_CPP" && cd GREMLIN_CPP && g++ -O3 -std=c++0x -o gremlin_cpp gremlin_cpp.cpp -fopenmp && cd ../
18 | RUN git clone 'https://github.com/LinearFold/LinearPartition.git' && cd LinearPartition/ && make && cd ../
19 |
--------------------------------------------------------------------------------
/utils/FreeKnot/ScoringFunctions.pm:
--------------------------------------------------------------------------------
1 | package ScoringFunctions;
2 |
3 | use strict;
4 |
5 | #my $free_energy_params;
6 | #my $canonical_base_pairs = {'AU' => 0, 'CG' => 0, 'GC' => 0, 'UA' => 0, 'GU' => 0, 'UG' => 0};
7 |
8 | #Return a scoring function according to the choice selected
9 | sub get_scoring_function {
10 |     #Looks up the scoring scheme named by $option (the argument after the invocant)
11 |     #and returns three values: a code ref to the scoring sub, the string 'max' or
12 |     #'min', and a 0/1 flag. An unknown option yields (undef, undef, undef).
13 |     my (undef, $option) = @_;
14 |
15 |     #The 'sstab' scheme (_stem_bp_stability, 'min', 1) is disabled in this version
16 |     #and is therefore not part of the table.
17 |     my %schemes = (
18 |         'bp'   => [\&_base_pair_score, 'max', 0],
19 |         'stem' => [\&_stem_score, 'max', 0],
20 |         'hb'   => [\&_hydrogen_bond, 'max', 0],
21 |         'fe'   => [\&_overall_stability, 'min', 1],
22 |     );
23 |
24 |     #A slice is returned rather than the array itself: in scalar context a slice
25 |     #evaluates to its last element, exactly like a literal comma-separated list
26 |     if (exists($schemes{$option})) {
27 |         return @{$schemes{$option}}[0 .. 2];
28 |     }
29 |
30 |     return undef, undef, undef;
31 | }
32 |
33 | #Number of base pairs in a stem as the stem score
34 | sub _base_pair_score {
35 |     #Scores a stem by the 'pair_count' entry of its attribute hash (the first
36 |     #argument); the score is 0 when that entry is not defined.
37 |     my ($attrs) = @_;
38 |
39 |     my $count = $attrs->{pair_count};
40 |     return 0 unless defined($count);
41 |
42 |     return $count;
43 | }
44 |
45 | #Each stem scores equally as 1
46 | sub _stem_score {
47 |     1;    #constant score: the arguments are ignored, every stem weighs the same
48 | }
49 |
50 | #GC and CG bonds = 3, other canonical or GU pairs = 2
51 | sub _hydrogen_bond {
52 |     #Sums a weight per base pair of a stem. $attrs->{base_pairs} is an array ref of
53 |     #[pos, pos] pairs (1-based positions); $bases is the array ref of bases.
54 |     my ($attrs, $bases) = @_;
55 |
56 |     #Weight by (upper-cased) pair type; types not listed here add nothing
57 |     my %weight = (
58 |         'GC' => 3, 'CG' => 3,
59 |         'AU' => 2, 'UA' => 2, 'GU' => 2, 'UG' => 2,
60 |     );
61 |
62 |     my ($pairs, $score) = ($attrs->{base_pairs}, 0);
63 |     foreach my $pair (@{$pairs}) {
64 |         my $type = uc($bases->[$pair->[0] - 1] . $bases->[$pair->[1] - 1]);
65 |         $score += $weight{$type} if exists($weight{$type});
66 |     }
67 |
68 |     return $score;
69 | }
70 |
71 | #This allows all MISs to be reported as MWISs and they will be converted to all possible
72 | #de-knotted structures to determine the minimum free energy (MFE)
73 | sub _overall_stability {
74 |     0;    #constant score of zero, whatever arguments are passed
75 | }
76 |
77 | 1;
78 |
--------------------------------------------------------------------------------
/utils/FreeKnot/DPWriter.pm:
--------------------------------------------------------------------------------
1 | #Writer for dot-parentheses format
2 |
3 | package DPWriter;
4 |
5 | use strict;
6 |
7 | use constant DOT => '.';
8 | use constant OPEN_BRACKET => '(';
9 | use constant CLOSE_BRACKET => ')';
10 | use constant TEMP_DP_FILE => 'MWIS_temp.dp';
11 |
12 | sub output_results {
13 |     #Prints the sequence string and a structure line to stdout: once per removal set
14 |     #(hash ref keyed by 1-based position), or a single time with the symbols
15 |     #untouched when the list of removal sets is empty.
16 |     my (undef, $removal_sets, $symbols, $sequence) = @_;
17 |
18 |     unless (@{$removal_sets}) {
19 |         print "$sequence\n" . join('', @{$symbols}) . "\n";
20 |     }
21 |
22 |     foreach my $removed (@{$removal_sets}) {
23 |         my $structure = '';
24 |         my $index = 0;
25 |         while ($index < @{$symbols}) {
26 |             #A symbol whose position is in the removal set is written as a dot
27 |             my $is_removed = exists($removed->{$index + 1});
28 |             $structure .= $is_removed ? DOT : $symbols->[$index];
29 |             $index++;
30 |         }
31 |         print "$sequence\n$structure\n";
32 |     }
33 | }
34 |
35 | sub output_mfe_candidate {
36 |     #Writes one candidate structure to TEMP_DP_FILE as two lines (sequence, then a
37 |     #structure that uses only '(', ')' and '.'). Positions that are keys of
38 |     #$base_pair_removal_pos become dots. The structure is taken from
39 |     #$paired_pos_ptrs (partner positions indexed from 1, 0 = unpaired) when that is
40 |     #defined, otherwise from $structure_symbols. Dies if the file cannot be opened
41 |     #or closed.
42 |     my (undef, $base_pair_removal_pos, $paired_pos_ptrs, $structure_symbols, $base_seq_str) = @_;
43 |
44 |     my $base_seq_len = length($base_seq_str);
45 |     my $output_structure = '';
46 |     if (defined($paired_pos_ptrs)) {
47 |         for (my $i = 1; $i <= $base_seq_len; $i++) {
48 |             my $paired_pos = $paired_pos_ptrs->[$i];
49 |             if (exists($base_pair_removal_pos->{$i}) || $paired_pos == 0) {
50 |                 $output_structure .= DOT;
51 |             }
52 |             elsif ($i < $paired_pos) {
53 |                 $output_structure .= OPEN_BRACKET;
54 |             }
55 |             else {
56 |                 $output_structure .= CLOSE_BRACKET;
57 |             }
58 |         }
59 |     }
60 |     elsif (defined($structure_symbols)) {
61 |         for (my $i = 1; $i <= $base_seq_len; $i++) {
62 |             my $removed = exists($base_pair_removal_pos->{$i});
63 |             $output_structure .= $removed ? DOT : $structure_symbols->[$i - 1];
64 |         }
65 |     }
66 |
67 |     #Collapse every bracket family to plain parentheses: [ { < and upper-case letters
68 |     #open a pair, ] } > and lower-case letters close one (the pairing BracketPairs
69 |     #uses). Turning openers into ')' would leave the structure unbalanced.
70 |     $output_structure =~ s/[\[\{<A-Z]/\(/g;
71 |     $output_structure =~ s/[\]\}>a-z]/\)/g;
72 |
73 |     open (DP, ">" . TEMP_DP_FILE) or die "Cannot open file at " . TEMP_DP_FILE;
74 |     print DP "$base_seq_str\n$output_structure\n";
75 |     close DP or die "Cannot close file at " . TEMP_DP_FILE;
76 | }
77 |
78 | 1;
79 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.pssm:
--------------------------------------------------------------------------------
1 | A 23 3 5 2 0
2 | C 0 3 8 22 0
3 | U 0 31 0 2 0
4 | C 0 3 1 29 0
5 | G 1 0 32 0 0
6 | U 3 24 0 6 0
7 | U 0 13 9 11 0
8 | U 0 31 1 1 0
9 | G 1 0 32 0 0
10 | A 26 0 6 1 0
11 | G 1 0 32 0 0
12 | C 0 2 0 31 0
13 | G 3 0 29 1 0
14 | A 31 0 2 0 0
15 | G 3 0 22 8 0
16 | U 3 24 4 2 0
17 | A 27 3 3 0 0
18 | U 3 28 0 2 0
19 | A 32 0 0 1 0
20 | A 33 0 0 0 0
21 | A 32 0 0 1 0
22 | C 3 6 3 21 0
23 | A 32 1 0 0 0
24 | G 0 0 32 1 0
25 | C 7 2 14 10 0
26 | U 2 14 0 16 1
27 | G 2 1 9 3 18
28 | G 8 14 5 4 2
29 | U 3 29 0 0 1
30 | U 0 29 0 4 0
31 | A 32 0 1 0 0
32 | A 18 3 11 0 1
33 | G 3 0 29 0 1
34 | C 3 0 0 30 0
35 | U 0 26 3 4 0
36 | C 1 1 0 31 0
37 | A 31 0 1 0 1
38 | A 13 0 10 9 1
39 | A 31 0 1 0 1
40 | G 0 0 31 1 1
41 | C 4 0 0 28 1
42 | G 0 1 30 1 1
43 | G 2 24 6 1 0
44 | A 8 8 1 16 0
45 | G 3 5 15 10 0
46 | A 13 7 3 9 1
47 | G 2 3 23 4 1
48 | C 7 2 9 14 1
49 | A 11 5 12 3 2
50 | G 2 4 21 0 6
51 | A 12 0 2 2 17
52 | U 2 10 1 0 20
53 | C 3 2 3 16 9
54 | U 5 9 3 12 4
55 | G 2 8 12 6 5
56 | C 2 3 4 18 6
57 | U 6 12 7 4 4
58 | C 6 2 10 12 3
59 | U 6 6 17 1 3
60 | C 11 2 12 4 4
61 | G 2 1 28 0 2
62 |
--------------------------------------------------------------------------------
/utils/FreeKnot/BpseqParser.pm:
--------------------------------------------------------------------------------
1 | #Parser for BPSEQ format
2 | #It returns primitive pseudoknot objects, base sequence and paired positions
3 |
4 | package BpseqParser;
5 |
6 | use strict;
7 |
8 | sub parse {
9 |     #Reads a BPSEQ file ("<position> <base> <paired position>" per line, 0 = unpaired;
10 |     #lines starting with '#' or whitespace are skipped) and returns
11 |     #($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count).
12 |     #$base_seq is indexed from 0 and $paired_pos_ptrs from 1. Dies on I/O errors,
13 |     #a gap in the position numbering, an inconsistent pair or an unknown line.
14 |     my (undef, $bpseq_file_path) = @_;
15 |
16 |     my ($base_seq, $paired_pos_ptrs) = ([], []);
17 |     my ($next_paired_pos, $prev_paired_pos, $matched_pos) = ({}, {}, {});
18 |     my ($last_paired_pos, $base_count) = (0, 0);
19 |
20 |     open (BPSEQ, "<$bpseq_file_path") or die "Cannot open file at $bpseq_file_path";
21 |     #Read the handle line by line; an empty loop condition would never read input
22 |     while (<BPSEQ>) {
23 |         if ($_ =~ /^([0-9]+) ([A-Za-z]{1}) ([0-9]+)[\r\n]*$/) {
24 |             my ($pos, $base, $paired_pos) = ($1, $2, $3);
25 |             if ($pos != ++$base_count) {
26 |                 die "Base position $base_count is missing";
27 |             }
28 |
29 |             if ($paired_pos > 0) {
30 |                 if ($pos < $paired_pos) {
31 |                     $matched_pos->{$pos} = $paired_pos;
32 |                 }
33 |                 elsif ($matched_pos->{$paired_pos} != $pos) {
34 |                     die "Unmatched pair position $pos and $paired_pos";
35 |                 }
36 |
37 |                 $next_paired_pos->{$last_paired_pos} = $pos;
38 |                 $prev_paired_pos->{$pos} = $last_paired_pos;
39 |                 $last_paired_pos = $pos;
40 |             }
41 |
42 |             $paired_pos_ptrs->[$pos] = $paired_pos;
43 |             $base_seq->[$pos - 1] = $base;
44 |         }
45 |         elsif ($_ !~ /^#.*/ && $_ !~ /^\s+/) {
46 |             die "Unknown input: $_";
47 |         }
48 |     }
49 |
50 |     $next_paired_pos->{$last_paired_pos} = 0;
51 |     $prev_paired_pos->{0} = $last_paired_pos;
52 |     close BPSEQ or die "Cannot close file at $bpseq_file_path";
53 |
54 |     #Group the base pairs into base pair stems
55 |     my ($stem_outermost_pairs, $stems) = _group_to_stems($next_paired_pos, $prev_paired_pos, $paired_pos_ptrs);
56 |     #Extract primitive pseudoknots from the base pair stems
57 |     my $primitive_pseudoknots = PrimitivePseudoknotExtractor->extract($stem_outermost_pairs, $stems, $paired_pos_ptrs);
58 |
59 |     return ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count);
60 | }
61 |
62 | sub _group_to_stems {
63 |     #Walks the paired positions in sequence order and groups base pairs into stems.
64 |     #Returns ($stem_outermost_pairs, $stems): the first pair of every stem, and a
65 |     #hash ref mapping that pair's opening position to the stem's list of pairs.
66 |     my ($next_paired_pos, $prev_paired_pos, $paired_pos_ptrs) = @_;
67 |
68 |     my (%stem_by_start, @first_pairs);
69 |     my ($open_stem, $previous);
70 |
71 |     for (my $pos = $next_paired_pos->{0}; $pos > 0; $pos = $next_paired_pos->{$pos}) {
72 |         my $partner = $paired_pos_ptrs->[$pos];
73 |
74 |         #A closing position ends the run of consecutive opening positions
75 |         if ($partner < $pos) {
76 |             undef $previous;
77 |             next;
78 |         }
79 |
80 |         my $pair = [$pos, $partner];
81 |         #Same stem only if this partner is the paired position immediately before
82 |         #the partner of the preceding pair
83 |         my $stacked = defined($previous) && $prev_paired_pos->{$previous->[1]} == $partner;
84 |         if (!$stacked) {
85 |             $open_stem = [];
86 |             $stem_by_start{$pos} = $open_stem;
87 |             push @first_pairs, $pair;
88 |         }
89 |
90 |         push @{$open_stem}, $pair;
91 |         $previous = $pair;
92 |     }
93 |
94 |     return (\@first_pairs, \%stem_by_start);
95 | }
96 |
97 | 1;
98 |
--------------------------------------------------------------------------------
/utils/FreeKnot/README:
--------------------------------------------------------------------------------
1 | -------------------------------------------------------------------------
2 | FreeKnot
3 | -------------------------------------------------------------------------
4 | Authors: Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
5 | Last updated on 15 Apr 2014
6 |
7 | -------------------------------------------------------------------------
8 | Purpose
9 |
10 | FreeKnot is a tool for RNA pseudoknot removal. It converts any pseudoknot
11 | into nested substructures in RNA secondary structures. It removes some
12 | crossing stems to eliminate crossings based on certain scoring functions
13 | (details will be provided later in this README file) and reports one or
14 | more optimized pseudoknot-free structures.
15 |
16 | -------------------------------------------------------------------------
17 | Platform and pre-requisites
18 |
19 | FreeKnot has been tested on various platforms including Linux (Ubuntu),
20 | Mac OS X and Windows. Perl (v5.14 or later) is recommended. Earlier
21 | versions might work but without guarantee. Windows users can download
22 | various Perl distributions for Windows. ViennaRNA package 2.1 is required
23 | for the free energy scoring function.
24 |
25 | -------------------------------------------------------------------------
26 | Program/Module Description
27 |
28 | BpseqParser.pm, DPParser.pm - parser to accept bpseq or
29 | dot-parentheses formats as input
30 | BpseqWriter.pm, DPWriter.pm - writer to output converted results in
31 | bpseq or dot-parentheses formats
32 | ChordModel.pm, CircleGraph.pm - graphical object for primitive
33 | pseudoknot representation
34 | MIS.pm - MIS algorithm (for free energy scoring
35 | function)
36 | MWIS.pm - MWIS algorithm
37 | ScoringFunctions.pm - scoring functions
38 | remove_pseudoknot.pl - main program for pseudoknot removal
39 | PrimitivePseudoknotExtractor.pm - primitive pseudoknot extraction from
40 | the input secondary structure
41 | BracketPairs.pm - processing brackets in input secondary
42 | structure
43 | VertexSubset.pm - subset object for storing graph
44 | vertices in the MIS algorithm
45 |
46 | -------------------------------------------------------------------------
47 | Usage
48 |
49 | FreeKnot is executed in console. The command is:
50 |
51 | perl remove_pseudoknot.pl -i <secondary structure format>
52 | -s <scoring function> <input file>
53 |
54 | Secondary structure format available: dp (dot-parentheses) / bpseq
55 | The secondary structure format for the output file follows that of the
56 | input file. So, if the input file is in bpseq format then the output
57 | file is also in bpseq format. Note that every line of data must end with
58 | a newline character (i.e. \n).
59 |
60 | Scoring function options: bp (# of base pairs) / stem (# of base pair
61 | stems) / hb (# of hydrogen bonds) / fe (structure overall free energy)
62 |
63 | The results are outputted to the console (stdout) by default. They can be
64 | directed to a file. For example,
65 |
66 | perl remove_pseudoknot.pl -i bpseq -s bp input.bpseq > output.bpseq
67 |
68 | -------------------------------------------------------------------------
69 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.prob:
--------------------------------------------------------------------------------
1 | 1 16 2.4607e-01
2 | 2 11 4.8058e-04
3 | 2 15 2.7600e-01
4 | 2 40 2.7181e-04
5 | 2 42 5.0162e-03
6 | 2 43 7.0046e-04
7 | 2 61 3.1390e-04
8 | 3 10 4.9712e-04
9 | 3 14 2.7639e-01
10 | 3 39 2.2049e-04
11 | 3 42 2.1906e-03
12 | 3 43 3.6608e-03
13 | 4 9 4.9913e-04
14 | 4 13 2.7651e-01
15 | 4 40 4.7134e-03
16 | 4 42 6.8810e-01
17 | 5 12 2.7567e-01
18 | 5 34 3.2105e-03
19 | 5 41 7.0013e-01
20 | 6 10 1.6400e-04
21 | 6 11 2.4958e-01
22 | 6 33 3.2126e-03
23 | 6 39 1.8319e-01
24 | 6 40 5.2746e-01
25 | 7 11 2.1717e-04
26 | 7 32 3.2300e-03
27 | 7 37 2.5131e-05
28 | 7 38 3.5702e-01
29 | 7 39 3.4129e-01
30 | 8 31 3.0849e-03
31 | 8 37 5.4291e-01
32 | 8 38 1.6981e-01
33 | 9 30 2.3436e-03
34 | 9 36 7.1718e-01
35 | 10 26 8.7135e-05
36 | 10 29 1.9660e-03
37 | 10 30 1.5846e-03
38 | 10 35 7.1728e-01
39 | 11 25 1.2568e-04
40 | 11 29 5.5166e-03
41 | 11 34 7.1395e-01
42 | 12 24 1.3000e-04
43 | 12 27 1.1131e-03
44 | 12 28 1.5587e-02
45 | 12 33 7.0364e-01
46 | 12 40 1.8068e-05
47 | 13 22 1.9136e-04
48 | 13 25 1.9429e-04
49 | 13 26 4.2317e-04
50 | 13 29 9.7416e-04
51 | 13 30 9.5602e-02
52 | 13 36 1.4656e-03
53 | 14 26 3.0481e-02
54 | 14 29 1.1306e-01
55 | 14 30 7.3809e-03
56 | 14 35 1.5024e-03
57 | 15 22 3.7622e-03
58 | 15 25 3.5921e-02
59 | 15 26 1.5459e-04
60 | 15 29 5.8125e-03
61 | 15 30 2.9255e-02
62 | 15 34 1.5186e-03
63 | 15 41 4.7121e-05
64 | 16 21 3.4426e-03
65 | 16 23 2.3995e-04
66 | 16 24 3.4999e-02
67 | 16 27 1.3027e-01
68 | 16 28 2.1053e-03
69 | 16 31 1.0100e-02
70 | 16 32 1.1925e-03
71 | 16 33 1.4888e-03
72 | 16 40 6.5565e-05
73 | 17 26 1.2737e-01
74 | 17 29 4.0552e-02
75 | 17 30 1.0248e-02
76 | 18 23 8.8966e-04
77 | 18 24 2.7892e-03
78 | 18 27 1.9342e-01
79 | 18 28 3.8211e-02
80 | 18 31 7.2033e-03
81 | 18 32 3.3442e-03
82 | 18 37 8.8863e-05
83 | 18 38 1.8289e-04
84 | 18 39 3.4985e-04
85 | 18 40 5.0928e-04
86 | 18 42 5.8368e-05
87 | 19 26 1.7940e-01
88 | 19 29 3.9621e-04
89 | 19 30 8.6332e-03
90 | 20 26 1.0309e-02
91 | 20 29 5.0768e-03
92 | 20 30 2.9850e-02
93 | 21 26 2.6013e-03
94 | 21 29 3.8919e-02
95 | 22 27 1.2742e-02
96 | 22 28 3.9957e-02
97 | 22 40 1.4106e-03
98 | 22 42 4.3392e-03
99 | 22 61 1.7007e-04
100 | 23 29 1.2407e-04
101 | 23 30 1.3139e-04
102 | 23 35 2.1023e-01
103 | 23 57 2.0713e-04
104 | 24 29 8.2146e-05
105 | 24 34 2.4004e-01
106 | 24 41 2.7964e-02
107 | 24 56 2.2795e-04
108 | 25 33 2.4021e-01
109 | 25 40 2.8015e-02
110 | 25 55 2.2719e-04
111 | 26 31 1.3356e-04
112 | 26 32 2.3811e-01
113 | 26 37 2.0999e-03
114 | 26 38 1.5837e-04
115 | 26 39 2.5461e-02
116 | 26 42 1.5538e-04
117 | 27 34 4.0980e-04
118 | 27 35 8.8677e-03
119 | 27 36 4.1981e-03
120 | 27 41 1.8599e-04
121 | 27 53 2.1974e-04
122 | 28 34 1.0404e-02
123 | 28 35 1.9487e-03
124 | 28 36 2.1486e-03
125 | 28 41 4.1504e-04
126 | 28 52 2.2328e-04
127 | 29 33 9.0913e-03
128 | 29 37 6.7256e-05
129 | 29 38 1.8006e-04
130 | 29 39 1.7555e-04
131 | 29 40 4.1642e-04
132 | 29 51 2.2158e-04
133 | 30 37 1.8760e-04
134 | 30 38 1.7507e-04
135 | 30 39 3.9742e-04
136 | 30 50 2.1004e-04
137 | 33 41 6.4184e-03
138 | 33 48 2.2673e-04
139 | 34 40 6.4329e-03
140 | 34 47 2.2717e-04
141 | 35 39 5.2554e-03
142 | 35 46 2.2709e-04
143 | 36 40 1.2025e-04
144 | 36 42 2.8760e-03
145 | 36 45 2.2703e-04
146 | 39 57 2.1183e-04
147 | 40 56 2.3156e-04
148 | 41 55 2.3165e-04
149 | 41 61 5.8108e-03
150 | 42 54 2.3119e-04
151 | 42 60 3.9038e-03
152 | 43 53 2.3140e-04
153 | 43 59 7.1042e-04
154 | 43 60 9.8885e-01
155 | 44 52 2.2817e-04
156 | 44 59 9.9764e-01
157 | 45 58 9.9940e-01
158 | 46 57 9.9913e-01
159 | 47 56 9.9941e-01
160 | 48 55 9.9908e-01
161 | 49 54 8.1076e-01
162 | 50 54 1.6715e-04
163 |
164 |
--------------------------------------------------------------------------------
/utils/getpssm.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
# getpssm.pl - write a per-position base-count profile for a query sequence.
#
# Usage: getpssm.pl <query fasta> <alignment file> <output file>
#
# The query sequence is taken from the SECOND line of the query file (the
# first line is assumed to be the FASTA header). The alignment file is parsed
# by wfreq(). One row is written per query position: the query base followed
# by the counts for A, U, G, C and '-' (in that order).
use strict;


my $ecut=0.001;   # only referenced by commented-out E-value filtering in wfreq()
my @AA=qw(A U G C -);
my %AA2index = ('A'=>'1', 'U'=>'2', 'G'=>'3', 'C'=>'4', '-'=>'5');

die "Usage: $0 <query fasta> <alignment file> <output file>\n" unless @ARGV >= 3;

my $seq=$ARGV[0];
my $aln=$ARGV[1];
my $outfile=$ARGV[2];

# Read the query file directly rather than through `cat $seq`: no shell is
# involved, so paths with spaces or metacharacters are safe and a missing
# file is reported instead of silently producing an empty profile.
open(my $seq_fh, "<", $seq) || die "Cant open $seq: $!";
my @seq=<$seq_fh>;
close($seq_fh);
chomp(@seq);

die "No sequence found on line 2 of $seq\n" unless defined $seq[1] && length $seq[1];
my $len=length $seq[1];

#print "parse ...\n";
my %freq=&wfreq($len, $aln);

my @nn=split(//, $seq[1]);
open(my $pro_fh, ">", $outfile) || die "Cant open $outfile for writing: $!";
for(my $i=1; $i<=$len; $i++)
{
    print {$pro_fh} "$nn[$i-1] ";
    foreach my $A(@AA)
    {
        # A position missing from the profile is reported as a zero count.
        my $count = $freq{$i, $A};
        printf {$pro_fh} "%6d ", (defined $count ? $count : 0);
    }

    print {$pro_fh} "\n";
}
close($pro_fh) || die "Error while writing $outfile: $!";
32 |
33 |
34 |
# Parse a FASTA-style alignment file and return the per-position base counts.
#
# Arguments:
#   $len  - length of the query sequence (used to build an all-zero profile
#           when the alignment contains no sequences)
#   $file - path of the alignment file; '>' lines start a record, every other
#           non-blank line is sequence data
#
# Returns a hash keyed by ($position, $base) with 1-based positions, as
# produced by frquency(). At most the first 50000 records are used.
sub wfreq
{
    my ($len, $file)=@_;

    my %ALN=();
    my $Pcount=0;
    open(my $aln_fh, "<", $file) || die "Cant open $file: $!";
    while(my $line=<$aln_fh>)
    {
        chomp($line);
        next if($line =~ /^\s*$/);    # a blank line must not wipe the stored sequence
        if($line =~ /^>(\S+)/)
        {
            my $Pname=$1;
            #	    my $Evalue= $1 if($line =~ /E=(\S+)/);
            #	    last if($Evalue>$ecut);
            $Pcount++;
            $ALN{$Pcount, 0}=$Pname;
            $ALN{$Pcount, 2}="";
            #	    $ALN{$Pcount, 1}=$Evalue;
        }
        elsif($Pcount > 0)            # ignore data that precedes the first header
        {
            $line =~ s/T/U/g;   ###replace T by U
            # Append so that a record wrapped over several lines is kept whole.
            $ALN{$Pcount, 2}.=$line;
        }
    }
    close($aln_fh);

    my %freq=();
    $Pcount=50000 if($Pcount>50000);
    printf "%d sequences\n", $Pcount;
    if($Pcount >= 1)
    {
        %freq = &frquency(\%ALN, $Pcount, \%AA2index);
    }
    else
    {
        # No aligned sequence at all: emit a zero count for every query position.
        for(my $j=1; $j<=$len; $j++)
        {
            foreach my $key (@AA)
            {
                $freq{$j, $key}=0;
            }
        }
    }

    return %freq;
}
83 |
84 |
# Count, for every non-gap query column, how often each base occurs.
#
# Arguments:
#   $ALN_ref - reference to the alignment hash; the aligned string of record
#              $i is stored under the key ($i, 2), and record 1 is the query
#   $Nseq    - number of records to use (records 1..$Nseq)
#   $AA_ref  - reference to a hash whose keys are the accepted symbols
#
# Returns a hash keyed by ($k, $base), where $k is the 1-based index of the
# column among the query's non-gap columns. A symbol that is not a key of
# $AA_ref (or a column missing from a short row) is counted as the query base
# of that column, and a "replace ... by ..." line is printed.
sub frquency
{
    my ($ALN_ref, $Nseq, $AA_ref)=@_;

    # Work through the references; copying the whole alignment is unnecessary.
    my $query = $ALN_ref->{join($;, 1, 2)};
    my @Qres = defined $query ? split(//, $query) : ();

    my @ARR=();
    for(my $i=1; $i<=$Nseq; $i++)
    {
        my $row = $ALN_ref->{join($;, $i, 2)};
        $ARR[$i] = [ defined $row ? split(//, $row) : () ];
    }

    my %AA_freq=();
    my $k=0;

    for(my $j=0; $j<=$#Qres; $j++)
    {
        next if($Qres[$j] eq '-');    # columns that are gaps in the query are skipped

        $k++;
        foreach my $key (@AA)
        {
            $AA_freq{$k, $key}=0;
        }

        for(my $i=1; $i<=$Nseq; $i++)
        {
            my $AAN = $ARR[$i][$j];

            if(!defined $AAN || !exists $AA_ref->{$AAN})
            {
                #replace nonstandard base in templates by query base
                printf "replace %s by %s\n", (defined $AAN ? $AAN : ''), $Qres[$j];
                $AAN = $Qres[$j];
            }

            $AA_freq{$k, $AAN} += 1;   ##frequency in column $j
        }
    }
    return %AA_freq;
}
157 |
--------------------------------------------------------------------------------
/utils/FreeKnot/ChordModel.pm:
--------------------------------------------------------------------------------
1 | #Chord model of the circle graph representing a primitive pseudoknot. Each chord denotes a unique
2 | #crossing base pair stem in the primitive pseudoknot. If two stems cross, then their corresponding
3 | #chords also cross. Each chord is associated with its underlying base pairs.
4 |
package ChordModel;

use strict;

# Build the chord model for one primitive pseudoknot.
#
# $primitive_pseudoknot->[0] is the list of stems; only the first base pair of
# each stem ($stem->[0][0], $stem->[0][1]) is used to place the chord. The two
# positions are renumbered to consecutive end point numbers (starting at 1)
# by _get_chord_end_point_num_map.
sub new {
    my (undef, $primitive_pseudoknot) = @_;

    my $stems = $primitive_pseudoknot->[0];
    my $end_point_num_of = _get_chord_end_point_num_map($stems);

    my (%edge_for, %base_pairs_for);
    my (@end_point_nums, @edge_at, @left_flag_at);

    for my $stem (@{$stems}) {
        my $left_num = $end_point_num_of->{$stem->[0][0]};
        my $right_num = $end_point_num_of->{$stem->[0][1]};
        my $chord_key = join('-', $left_num, $right_num);

        push @end_point_nums, $left_num, $right_num;

        # Assigned left first, right second, so the right value wins on a tie.
        @left_flag_at[$left_num, $right_num] = (1, 0);

        $base_pairs_for{$chord_key} = $stem;

        # One shared array reference is stored under the key and both end points.
        my $edge = [$left_num, $right_num];
        $edge_for{$chord_key} = $edge;
        @edge_at[$left_num, $right_num] = ($edge, $edge);
    }

    # End point numbers are kept in descending order.
    my @descending_nums = sort {$b <=> $a} @end_point_nums;

    return bless {
        chord_end_point_nums  => \@descending_nums,
        chord_edges           => \%edge_for,
        end_point_to_edge_map => \@edge_at,
        is_left_end_points    => \@left_flag_at,
        all_chord_base_pairs  => \%base_pairs_for,
    }, __PACKAGE__;
}

# Map every stem end position to its rank (1-based) among all end positions.
sub _get_chord_end_point_num_map {
    my $stems = shift;

    my @ascending_positions =
        sort {$a <=> $b}
        map { ($_->[0][0], $_->[0][1]) } @{$stems};

    my %rank_of;
    my $rank = 0;
    foreach my $position (@ascending_positions) {
        $rank_of{$position} = ++$rank;
    }

    return \%rank_of;
}

# Returns the reference to the list of end point numbers (descending order).
sub get_chord_end_point_nums {
    return $_[0]->{chord_end_point_nums};
}

# Returns the hash reference mapping "left-right" keys to [left, right] edges.
sub get_chord_edges {
    return $_[0]->{chord_edges};
}

# Returns the number of chord edges.
sub get_chord_edge_count {
    my ($self) = @_;

    my @edge_keys = keys %{$self->{chord_edges}};

    return scalar(@edge_keys);
}

# Returns the [left, right] edge that has the given end point number.
sub get_chord_edge_by_end_point {
    my ($self, $end_point_num) = @_;

    return $self->{end_point_to_edge_map}->[$end_point_num];
}

# Returns 1 if the end point number is a left end point, 0 if it is a right one.
sub is_left_end_point {
    my ($self, $end_point_num) = @_;

    return $self->{is_left_end_points}->[$end_point_num];
}

# Returns the stem (base pair list) stored for the chord with these end points.
sub get_chord_base_pairs {
    my ($self, $chord_left_end_point, $chord_right_end_point) = @_;

    my $chord_key = $chord_left_end_point . '-' . $chord_right_end_point;

    return $self->{all_chord_base_pairs}->{$chord_key};
}

1;
109 |
--------------------------------------------------------------------------------
/utils/FreeKnot/README.txt:
--------------------------------------------------------------------------------
1 | -------------------------------------------------------------------------
2 |
3 | FreeKnot
4 |
5 | -------------------------------------------------------------------------
6 |
7 | Authors: Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
8 |
9 | Last updated on 15 Apr 2014
10 |
11 |
12 |
13 | -------------------------------------------------------------------------
14 |
15 | Purpose
16 |
17 |
18 |
19 | FreeKnot is a tool for RNA pseudoknot removal. It converts any pseudoknot
20 |
21 | into nested substructures in RNA secondary structures. It removes some
22 |
23 | crossing stems to eliminate crossings based on certain scoring functions
24 |
25 | (details will be provided later in this README file) and reports one or
26 |
27 | more optimized pseudoknot-free structures.
28 |
29 |
30 |
31 | -------------------------------------------------------------------------
32 |
33 | Platform and pre-requisites
34 |
35 |
36 |
37 | FreeKnot has been tested on various platforms including Linux (Ubuntu),
38 |
39 | Mac OS X and Windows. Perl (v5.14 or later) is recommended. Earlier
40 |
41 | versions might work but without guarantee. Windows users can download
42 |
43 | various Perl distributions for Windows. ViennaRNA package 2.1 is required
44 |
45 | for the free energy scoring function.
46 |
47 |
48 |
49 | -------------------------------------------------------------------------
50 |
51 | Program/Module Description
52 |
53 |
54 |
55 | BpseqParser.pm, DPParser.pm - parser to accept bpseq or
56 |
57 | dot-parentheses formats as input
58 |
59 | BpseqWriter.pm, DPWriter.pm - writer to output converted results in
60 |
61 | bpseq or dot-parentheses formats
62 |
63 | ChordModel.pm, CircleGraph.pm - graphical object for primitive
64 |
65 | pseudoknot representation
66 |
67 | MIS.pm - MIS algorithm (for free energy scoring
68 |
69 | function)
70 | MWIS.pm - MWIS algorithm
71 |
72 | ScoringFunctions.pm - scoring functions
73 |
74 | remove_pseudoknot.pl - main program for pseudoknot removal
75 |
76 | PrimitivePseudoknotExtractor.pm - primitive pseudoknot extraction from
77 |
78 | the input secondary structure
79 |
80 | BracketPairs.pm - processing brackets in input secondary
81 |
82 | structure
83 |
84 |
85 | VertexSubset.pm - subset objects for storing graph
86 | vertices in the MIS algorithm
87 |
88 | -------------------------------------------------------------------------
89 |
90 | Usage
91 |
92 |
93 |
94 | FreeKnot is executed in console. The command is:
95 |
96 |
97 |
98 | perl remove_pseudoknot.pl -i <secondary structure format>
99 |
100 | -s <scoring function> <input file>
101 |
102 |
103 |
104 | Secondary structure format available: dp (dot-parentheses) / bpseq
105 |
106 | The secondary structure format for the output file follows that of the
107 |
108 | input file. So, if the input file is in bpseq format then the output
109 |
110 | file is also in bpseq format. Note that every line of data must end with
111 |
112 | a newline character (i.e. \n).
113 |
114 |
115 |
116 | Scoring function options: bp (# of base pairs) / stem (# of base pair
117 |
118 | stems) / hb (# of hydrogen bonds) / fe (structure overall free energy)
119 |
120 |
121 |
122 | The results are outputted to the console (stdout) by default. They can be
123 |
124 | directed to a file. For example,
125 |
126 |
127 |
128 | perl remove_pseudoknot.pl -i bpseq -s bp input.bpseq > output.bpseq
129 |
130 |
131 |
132 | -------------------------------------------------------------------------
133 |
--------------------------------------------------------------------------------
/utils/SPOT-RNA2.py:
--------------------------------------------------------------------------------
"""Run the SPOT-RNA2 ensemble on the sequences of a FASTA file.

The script builds the input features via ``create_tfr_files``, restores each
of the ``NUM_MODELS`` checkpoints from ``models_ckps/`` next to this file,
averages the sigmoid of their outputs per sequence and hands the result to
``prob_to_secondary_structure``.
"""
import tensorflow as tf
import numpy as np
import os
from tqdm import tqdm
import argparse
from utils import create_tfr_files, prob_to_secondary_structure
import time
start = time.time()
from argparse import RawTextHelpFormatter
from pathlib import Path


def _str2bool(value):
    """Convert a command-line value to bool.

    ``type=bool`` treats every non-empty string (including "False") as True,
    so the flags are parsed explicitly. The empty string stays False.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected True or False, got {!r}'.format(value))


parser = argparse.ArgumentParser()
parser.add_argument('--inputs', default='inputs/single_seq.fasta', type=str, help='Path to input file in fasta format, accept multiple sequences as well in fasta format; default = %(default)s\n', metavar='')
parser.add_argument('--outputs', default='outputs/', type=str, help='Path to output files; SPOT-RNA outputs at least three files .ct, .bpseq, and .prob files; default = %(default)s\n', metavar='')
parser.add_argument('--gpu', default=1, type=int, help='GPU number exported as CUDA_VISIBLE_DEVICES. If only one GPU in computer specify 0, use -1 for no GPU; default = %(default)s\n', metavar='')
parser.add_argument('--plots', default=False, type=_str2bool, help='Set this to "True" to get the 2D plots of predicted secondary structure by SPOT-RNA; default = False\n', metavar='')
parser.add_argument('--motifs', default=False, type=_str2bool, help='Set this to "True" to get the motifs of predicted secondary structure by SPOT-RNA; default = False\n', metavar='')
#parser.add_argument('--NC',default=True, type=bool, help='Set this to "False" to predict only canonical pairs; default = True\n', metavar='')
args = parser.parse_args()

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

base_path = os.path.dirname(os.path.realpath(__file__))

# Presumably writes the <input basename>.tfrecords file read below via test_loc;
# confirm against utils.create_tfr_files.
create_tfr_files(args)

# The FASTA file is read as alternating header / sequence lines (blank lines
# are dropped), i.e. every sequence must sit on a single line.
with open(args.inputs) as file:
    input_data = [line.strip() for line in file.read().splitlines() if line.strip()]

count = len(input_data) // 2

ids = [input_data[2*i].replace(">", "") for i in range(count)]
sequences = {}
for i, I in enumerate(ids):
    sequences[I] = input_data[2*i+1].replace(" ", "").replace("T", "U").upper()

os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
NUM_MODELS = 4

test_loc = [os.path.splitext(args.inputs)[0] + ".tfrecords"]

outputs = {}
mask = {}


def sigmoid(x):
    """Element-wise logistic function evaluated in extended precision."""
    # np.longdouble is the portable spelling; np.float128 is only defined on
    # platforms whose long double is 128 bits wide.
    return 1/(1+np.exp(-np.array(x, dtype=np.longdouble)))


#for MODEL in range(NUM_MODELS):
for MODEL in [0, 1, 2, 3]:
    print(MODEL)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    config.log_device_placement = False
    print('\nPredicting for SPOT-RNA2 model '+str(MODEL))
    with tf.Session(config=config) as sess:
        saver = tf.train.import_meta_graph(os.path.join(base_path, 'models_ckps'+'/model_'+str(MODEL)+'.meta'))
        saver.restore(sess, os.path.join(base_path, 'models_ckps'+'/model_'+str(MODEL)))
        graph = tf.get_default_graph()
        init_test = graph.get_operation_by_name('make_initializer_1')
        tmp_out = graph.get_tensor_by_name('output_FC/fully_connected/BiasAdd:0')
        name_tensor = graph.get_tensor_by_name('tensors_1/component_0:0')
        RNA_name = graph.get_tensor_by_name('IteratorGetNext:0')
        label_mask = graph.get_tensor_by_name('IteratorGetNext:4')
        sess.run([init_test], feed_dict={name_tensor: test_loc})

        pbar = tqdm(total=count)
        for rna in ids:
            out = sess.run([tmp_out, RNA_name, label_mask], feed_dict={'dropout:0': 1})
            # The name returned by the graph is replaced by the FASTA id, so
            # records are assumed to come back in FASTA order.
            out[1] = rna

            mask[out[1]] = out[2]

            if MODEL == 0:
                outputs[out[1]] = [sigmoid(out[0])]
            else:
                outputs[out[1]].append(sigmoid(out[0]))
            pbar.update(1)
        pbar.close()
    # Drop the restored graph before the next checkpoint is imported.
    tf.reset_default_graph()


RNA_ids = [i for i in list(outputs.keys())]
ensemble_outputs = {}

print('\nPost Processing and Saving Output')
for i in RNA_ids:
    #print(i, mask[i].shape, len(sequences[i]))
    # Ensemble = mean over the per-model sigmoid outputs.
    ensemble_outputs[i] = np.mean(outputs[i], 0)
    prob_to_secondary_structure(ensemble_outputs[i], mask[i], sequences[i], i, args)

print('\nFinished!')
end = time.time()
print('\nProcesssing Time {} seconds'.format(end - start))
97 |
--------------------------------------------------------------------------------
/utils/FreeKnot/MIS.pm:
--------------------------------------------------------------------------------
1 | #The MIS algorithm module. It is an extension of the k-MIS algorithm proposed by Byskov (Byskov, J., 2004)
2 |
package MIS;

use strict;

#Degree threshold that selects the branching rule in _search_mis
use constant D => 3;

#Independent sets collected by the current get_mis call
my $miss;
#Sets already accepted during the current call, keyed by their sorted vertex list joined with '-'
my $checked_sets;

#Entry point: returns a reference to the list of independent sets found for $circle_graph.
#Each set is a reference to a numerically sorted list of vertices.
sub get_mis {
    my (undef, $circle_graph) = @_;

    ($miss, $checked_sets) = ([], {});

    #Start the branching search from the vertex subset built for the whole graph with an empty candidate set
    my $all_vertices = VertexSubset->new($circle_graph);
    _search_mis($all_vertices, [], $circle_graph);

    undef $checked_sets;

    return $miss;
}

#Recursive branching search. $vertex_subset holds the vertices still available, $candidate_set the
#vertices chosen so far.
sub _search_mis {
    my ($vertex_subset, $candidate_set, $circle_graph) = @_;

    #Nothing left to branch on: the candidate set is sorted in place and recorded in $miss when it
    #has not been recorded before and it is an independent set
    if ($vertex_subset->get_size() == 0) {
        @{$candidate_set} = sort {$a <=> $b} @{$candidate_set};
        my $set_id = join('-', @{$candidate_set});

        return if exists($checked_sets->{$set_id});
        return unless _is_independent_set($candidate_set, $circle_graph);

        push @{$miss}, $candidate_set;
        $checked_sets->{$set_id} = $candidate_set;
        return;
    }

    my ($top_vertices, $top_degree) = $vertex_subset->get_highest_degree_vertex_info();

    #Highest degree reaches D: branch on the first vertex of that degree
    if ($top_degree >= D) {
        my $pivot = $top_vertices->[0];
        my @pivot_and_neighbours = (@{$vertex_subset->get_adjacent_vertices_at($pivot)}, $pivot);
        my @with_pivot = (@{$candidate_set}, $pivot);

        #Branch 1: the pivot joins the candidate set
        _search_mis($vertex_subset->get_subset(\@pivot_and_neighbours), \@with_pivot, $circle_graph);

        #Branch 2: the pivot is left out; the candidate set is passed on unchanged
        _search_mis($vertex_subset->get_subset([$pivot]), $candidate_set, $circle_graph);

        return;
    }

    #Highest degree is below D: branch on the first vertex of lowest degree instead
    my ($low_vertices) = $vertex_subset->get_lowest_degree_vertex_info();
    my $pivot = $low_vertices->[0];
    my $neighbours = $vertex_subset->get_adjacent_vertices_at($pivot);

    my @pivot_and_neighbours = (@{$neighbours}, $pivot);
    my @with_pivot = (@{$candidate_set}, $pivot);

    #Branch 1: the pivot joins the candidate set
    _search_mis($vertex_subset->get_subset(\@pivot_and_neighbours), \@with_pivot, $circle_graph);

    #Remaining branches: each neighbour of the pivot joins the candidate set in turn
    foreach my $neighbour (@{$neighbours}) {
        my @with_neighbour = (@{$candidate_set}, $neighbour);
        my @neighbour_and_adjacent = (@{$vertex_subset->get_adjacent_vertices_at($neighbour)}, $neighbour);
        _search_mis($vertex_subset->get_subset(\@neighbour_and_adjacent), \@with_neighbour, $circle_graph);
    }

    return;
}

#Returns 1 when no two vertices of $candidate_set are adjacent, 0 otherwise. Vertices are visited from
#the last to the first; the bitstrings of the visited ones are accumulated per segment and compared
#against the non-adjacency mask of each newly visited vertex.
sub _is_independent_set {
    my ($candidate_set, $circle_graph) = @_;

    my @visited_bits;

    foreach my $vertex (reverse @{$candidate_set}) {
        my $non_adj_mask = $circle_graph->get_non_adj_vertex_mask_at($vertex);

        #Every visited vertex must lie inside the mask of the current vertex
        foreach my $segment (0 .. $#visited_bits) {
            return 0 if ($visited_bits[$segment] & $non_adj_mask->[$segment]) != $visited_bits[$segment];
        }

        my ($segment_num, $vertex_bits) = @{$circle_graph->get_vertex_bitstring_segment_at($vertex)};
        $visited_bits[$segment_num] = $visited_bits[$segment_num] | $vertex_bits;
    }

    return 1;
}

1;
95 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/temp.a2m:
--------------------------------------------------------------------------------
1 | >6UFJ_A/1-51 Chain A, RNA (50-MER)
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGA----------
3 | >6UEY_A/1-50 Chain A, RNA (50-MER)
4 | ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCGGAGAGCAG-----------
5 | >HE577054.1/3246821-3246757 Paenibacillus polymyxa M1 main chromosome, complete genome
6 | ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCGUUCACCG----CGGUGAGG
7 | >MF288922.1/150528-150592 Bacillus phage Janet, complete genome
8 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGUAGAGAGG--CCUCUCUAG
9 | >CP033464.1/4485719-4485655 Brevibacillus laterosporus strain 1821L chromosome, complete genome
10 | ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCGUUGAGAAG--CUUCUCAGG
11 | >KT307976.1/157679-157741 Bacillus phage AvesoBmore, complete genome
12 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGCGGG
13 | >CP032410.1/870062-870126 Brevibacillus laterosporus strain E7593-50 chromosome, complete genome
14 | ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCGUUGAGGAG--CUUCUCAGG
15 | >MK892513.1/27480-27550 Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome
16 | AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGGCGUAUAG--CUAUACGCG
17 | >MF288921.1/151458-151522 Bacillus phage OTooleKemple52, complete genome
18 | ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGUAGAGAGG--CCUCUCUAG
19 | >KJ489397.1/151758-151822 Bacillus phage CAM003, complete genome
20 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGUAGGGAGG--CCUCUCUAG
21 | >KF669647.1/155754-155816 Bacillus phage BigBertha, complete genome
22 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGUGGG
23 | >CP009278.1/2800251-2800310 Sphingobacterium sp. ML3W, complete genome
24 | AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCGCCCCGAUAAUAAUCGGGAG
25 | >CP045298.1/5377890-5377826 Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome
26 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACCGGAUCCGGUGAGG
27 | >KF669662.1/155100-155162 Bacillus phage Spock, complete genome
28 | ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCGUCGCGGAGAUCUCCGCGGG
29 | >KR063281.1/60079-60028 Gordonia phage GMA2, complete genome
30 | ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCGGCC------------GGCG
31 | >KJ489402.1/153758-153819 Bacillus phage Riley, complete genome
32 | ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCGUCGCGGG----C--CCGCG
33 | >CP000154.2/3364238-3364174 Paenibacillus polymyxa E681, complete genome
34 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACUGGA-CCAGUGAGA
35 | >LN852800.1/7754-7693 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110
36 | GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCGUUCCGGUUAUGAUCGGAGG
37 | >CP019039.1/7984-8046 Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence
38 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCGUCCUUCC----GGAAGGGG
39 | >LN852940.1/1904-1844 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268
40 | GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGUCCAGGAUG-AUCCUGGAG
41 | >JN790865.1/35681-35620 Bacillus phage B4, complete genome
42 | AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--UCCCCCGUG
43 | >KY888882.1/156410-156472 Bacillus phage Flapjack, complete genome
44 | ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--CCCUGCG-G
45 | >CP014843.1/29638-29697 Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence
46 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCGUUUCCCUUCUAGGGGAGGU
47 | >CP045906.1/14639513-14639571 Caligus rogercresseyi isolate FCH chromosome 17
48 | UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAGUCUUGCCU--GAGCAAGAG
49 | >HG916826.1/843085-843030 Pseudomonas pseudoalcaligenes CECT 5344 complete genome
50 | CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCGAAAUUCGGGGCCG-----G
51 | >XM_028713395.1/30-87 PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA
52 | ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCACAGCGGCAGCACUGCCGCAG
53 | >AC100771.2/133706-133648 Homo sapiens chromosome 11, clone RP11-159H22, complete sequence
54 | GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCUUUAAAAAAAAUUUUUUUAA
55 | >CP022654.2/63818-63880 Bacillus velezensis strain SCDB 291 chromosome, complete genome
56 | AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCGUCCUUCC----GGAAGGGG
57 | >CP045899.1/5107513-5107456 Caligus rogercresseyi isolate FCH chromosome 10
58 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGCUG---AGCAAGAG
59 | >CP010557.1/4528803-4528858 Raoultella ornithinolytica strain S12, complete genome
60 | CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------UCAGGCAGAUCUGCCAGAG
61 | >MH153801.1/58164-58217 Microbacterium phage Count, complete genome
62 | AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCGGAUAGAUG------UAUUG
63 | >CP045896.1/486401-486459 Caligus rogercresseyi isolate FCH chromosome 7
64 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGC--AUGAGCAAGAG
65 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.a2m:
--------------------------------------------------------------------------------
1 | >sample_seq
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
3 | >6UFJ_A/1-51 Chain A, RNA (50-MER)
4 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGA----------
5 | >6UEY_A/1-50 Chain A, RNA (50-MER)
6 | ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCGGAGAGCAG-----------
7 | >HE577054.1/3246821-3246757 Paenibacillus polymyxa M1 main chromosome, complete genome
8 | ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCGUUCACCG----CGGUGAGG
9 | >MF288922.1/150528-150592 Bacillus phage Janet, complete genome
10 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGUAGAGAGG--CCUCUCUAG
11 | >CP033464.1/4485719-4485655 Brevibacillus laterosporus strain 1821L chromosome, complete genome
12 | ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCGUUGAGAAG--CUUCUCAGG
13 | >KT307976.1/157679-157741 Bacillus phage AvesoBmore, complete genome
14 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGCGGG
15 | >CP032410.1/870062-870126 Brevibacillus laterosporus strain E7593-50 chromosome, complete genome
16 | ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCGUUGAGGAG--CUUCUCAGG
17 | >MK892513.1/27480-27550 Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome
18 | AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGGCGUAUAG--CUAUACGCG
19 | >MF288921.1/151458-151522 Bacillus phage OTooleKemple52, complete genome
20 | ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGUAGAGAGG--CCUCUCUAG
21 | >KJ489397.1/151758-151822 Bacillus phage CAM003, complete genome
22 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGUAGGGAGG--CCUCUCUAG
23 | >KF669647.1/155754-155816 Bacillus phage BigBertha, complete genome
24 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGUGGG
25 | >CP009278.1/2800251-2800310 Sphingobacterium sp. ML3W, complete genome
26 | AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCGCCCCGAUAAUAAUCGGGAG
27 | >CP045298.1/5377890-5377826 Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome
28 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACCGGAUCCGGUGAGG
29 | >KF669662.1/155100-155162 Bacillus phage Spock, complete genome
30 | ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCGUCGCGGAGAUCUCCGCGGG
31 | >KR063281.1/60079-60028 Gordonia phage GMA2, complete genome
32 | ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCGGCC------------GGCG
33 | >KJ489402.1/153758-153819 Bacillus phage Riley, complete genome
34 | ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCGUCGCGGG----C--CCGCG
35 | >CP000154.2/3364238-3364174 Paenibacillus polymyxa E681, complete genome
36 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACUGGA-CCAGUGAGA
37 | >LN852800.1/7754-7693 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110
38 | GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCGUUCCGGUUAUGAUCGGAGG
39 | >CP019039.1/7984-8046 Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence
40 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCGUCCUUCC----GGAAGGGG
41 | >LN852940.1/1904-1844 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268
42 | GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGUCCAGGAUG-AUCCUGGAG
43 | >JN790865.1/35681-35620 Bacillus phage B4, complete genome
44 | AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--UCCCCCGUG
45 | >KY888882.1/156410-156472 Bacillus phage Flapjack, complete genome
46 | ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--CCCUGCG-G
47 | >CP014843.1/29638-29697 Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence
48 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCGUUUCCCUUCUAGGGGAGGU
49 | >CP045906.1/14639513-14639571 Caligus rogercresseyi isolate FCH chromosome 17
50 | UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAGUCUUGCCU--GAGCAAGAG
51 | >HG916826.1/843085-843030 Pseudomonas pseudoalcaligenes CECT 5344 complete genome
52 | CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCGAAAUUCGGGGCCG-----G
53 | >XM_028713395.1/30-87 PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA
54 | ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCACAGCGGCAGCACUGCCGCAG
55 | >AC100771.2/133706-133648 Homo sapiens chromosome 11, clone RP11-159H22, complete sequence
56 | GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCUUUAAAAAAAAUUUUUUUAA
57 | >CP022654.2/63818-63880 Bacillus velezensis strain SCDB 291 chromosome, complete genome
58 | AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCGUCCUUCC----GGAAGGGG
59 | >CP045899.1/5107513-5107456 Caligus rogercresseyi isolate FCH chromosome 10
60 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGCUG---AGCAAGAG
61 | >CP010557.1/4528803-4528858 Raoultella ornithinolytica strain S12, complete genome
62 | CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------UCAGGCAGAUCUGCCAGAG
63 | >MH153801.1/58164-58217 Microbacterium phage Count, complete genome
64 | AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCGGAUAGAUG------UAUUG
65 | >CP045896.1/486401-486459 Caligus rogercresseyi isolate FCH chromosome 7
66 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGC--AUGAGCAAGAG
67 |
--------------------------------------------------------------------------------
/utils/FreeKnot/CircleGraph.pm:
--------------------------------------------------------------------------------
1 | #Circle graph is the graphical model for a primitive pseudoknot. Each vertex represents a crossing stem of
2 | #the pseudoknot, and each edge represents a crossing between two stems. The vertex attributes store information
3 | #such as number of base pairs, paired positions.
4 | #
5 | #Every vertex is represented by a unique bitstring and its adjacent vertices are represented by a bitstring
6 | #mask. The least significant bit (LSB) represents the most preceding vertex and the most significant bit (MSB)
7 | #represents the least preceding vertex of the knot-stem graph. However, the no. of vertices may exceed the
8 | #length of one bitstring. To solve this problem, multiple bitstrings are required to form a bitstring long
9 | #enough for each bit position to uniquely identify a vertex. This 'long' bitstring is disassembled into an
10 | #array of bitstrings and each array element is called a bitstring segment. Every bit position of the 'long'
11 | #bitstring is then transformed by a (segment no., segment bitstring) pair.
12 |
13 | package CircleGraph;
14 |
15 | use strict;
16 |
#Build the circle graph of one primitive pseudoknot.
#
#Arguments (the invocant is ignored):
#  $primitive_pseudoknot - array ref [stems, stem crossings]; stems->[i] becomes the
#                          'stem_pairs' attribute of vertex i and crossings->[i] holds
#                          the vertex nos. listed as crossing vertex i
#  $os_bit               - number of bit positions used in one bitstring segment
#
#Each vertex receives a (segment no., segment bitstring) pair, plus one mask per segment
#holding the bits of the succeeding vertices that are not in its crossing list.
sub new {
    my (undef, $primitive_pseudoknot, $os_bit) = @_;

    my ($stems, $crossings) = @{$primitive_pseudoknot};
    my $vertex_count = @{$stems};

    my (@bit_segments, @non_adj_masks);
    my ($segment_num, $bit_pos) = (0, 0);

    #Walk from the last vertex to the first, so the last vertex takes bit 0 of segment 0
    foreach my $v (reverse(0 .. $vertex_count - 1)) {
        $bit_segments[$v] = [$segment_num, 1 << $bit_pos];

        #The crossing list is consumed from its tail while $w counts down; this
        #presumably relies on the list being sorted in ascending order - TODO confirm
        my $vertex_crossings = $crossings->[$v];
        my $pending = @{$vertex_crossings} - 1;
        my $pending_id;
        $pending_id = $vertex_crossings->[$pending] if $pending >= 0;

        my $mask = [];
        foreach my $w (reverse(($v + 1) .. ($vertex_count - 1))) {
            if ($pending >= 0 && $w == $pending_id) {
                #Listed as crossing: keep it out of the mask and move to the next crossing
                $pending--;
                $pending_id = $vertex_crossings->[$pending] if $pending >= 0;
                next;
            }

            my ($w_segment, $w_bit) = @{$bit_segments[$w]};
            $mask->[$w_segment] = $mask->[$w_segment] | $w_bit;
        }

        $non_adj_masks[$v] = $mask;

        #Current segment is full: continue with bit 0 of the next segment
        $bit_pos++;
        if ($bit_pos == $os_bit) {
            $segment_num++;
            $bit_pos = 0;
        }
    }

    my @vertex_attrs = map { +{stem_pairs => $stems->[$_]} } 0 .. $vertex_count - 1;

    my $self = {
        vertex_count              => $vertex_count,
        vertex_attrs              => \@vertex_attrs,
        edges                     => $crossings,
        vertex_bitstring_segments => \@bit_segments,
        non_adj_vertex_masks      => \@non_adj_masks,
    };

    bless $self;

    return $self;
}
85 |
#Return the number of vertices stored in the circle graph
sub get_vertex_count {
    my ($self) = @_;

    return $self->{vertex_count};
}
91 |
#Return the attribute hash of the given vertex, or an empty array ref when the vertex
#no. is not below the vertex count
sub get_vertex_attrs_at {
    my ($self, $vertex_num) = @_;

    return [] unless $vertex_num < $self->{vertex_count};

    my $all_attrs = $self->{vertex_attrs};

    return $all_attrs->[$vertex_num];
}
103 |
#Return the edge list stored for the given vertex, or an empty array ref when the vertex
#no. is not below the vertex count
sub get_edges_at {
    my ($self, $vertex_num) = @_;

    return [] unless $vertex_num < $self->{vertex_count};

    my $all_edges = $self->{edges};

    return $all_edges->[$vertex_num];
}
115 |
#Return the bitstring segment of the given vertex, i.e. its (segment no., segment bitstring)
#pair, or an empty array ref when the vertex no. is not below the vertex count
sub get_vertex_bitstring_segment_at {
    my ($self, $vertex_num) = @_;

    return [] unless $vertex_num < $self->{vertex_count};

    my $all_segments = $self->{vertex_bitstring_segments};

    return $all_segments->[$vertex_num];
}
129 |
#Return the non-adjacent vertex mask stored for the given vertex (an array ref of mask
#bitstrings indexed by segment no.), or 0 when the vertex no. is not below the vertex count
sub get_non_adj_vertex_mask_at {
    my ($self, $vertex_num) = @_;

    return 0 unless $vertex_num < $self->{vertex_count};

    my $all_masks = $self->{non_adj_vertex_masks};

    return $all_masks->[$vertex_num];
}
142 |
143 | 1;
144 |
--------------------------------------------------------------------------------
/utils/FreeKnot/DPParser.pm:
--------------------------------------------------------------------------------
1 | #Parser for dot-parentheses format
2 | #It returns primitive pseudoknot objects, base sequence and dot-parentheses array
3 |
4 | package DPParser;
5 |
6 | use strict;
7 |
8 | use constant DOT => '.';
9 |
#Parse a dot-parentheses file.
#
#Arguments (the invocant is ignored):
#  $dp_file_path - path of the file to read
#
#Lines made only of letters are appended to the base sequence, lines made of bracket, dot
#and letter symbols are appended to the secondary structure, and lines starting with '#'
#or with whitespace are skipped.
#
#Returns the list (primitive pseudoknots, array ref of the single bases, array ref of the
#structure symbols, base sequence string).
#
#Dies when the file cannot be opened or closed, when a line is not recognised, when the
#sequence or the structure is missing, or when their lengths differ.
sub parse {
    my (undef, $dp_file_path) = @_;

    my ($base_seq_str, $secondary_structure) = ('', '');

    #Three-argument open with a lexical handle: the path is never interpreted as a mode
    #and no global filehandle is left behind
    open(my $dp_fh, '<', $dp_file_path) or die "Cannot open file at $dp_file_path";

    #Read the file line by line (the loop previously had no readline in its condition)
    while (my $line = <$dp_fh>) {
        if ($line =~ /^([A-Za-z]+)[\r\n]*$/) {
            $base_seq_str .= $1;
        }
        elsif ($line =~ /^([\.\(\)\[\]\{\}<>A-Za-z]+)[\r\n]*$/) {
            $secondary_structure .= $1;
        }
        elsif ($line !~ /^#.*/ && $line !~ /^\s+/) {
            die "Unknown input: $line";
        }
    }

    close($dp_fh) or die "Cannot close file at $dp_file_path";

    if ($base_seq_str eq '') {
        die 'Base sequence is missing';
    }

    if ($secondary_structure eq '') {
        die 'Secondary structure is missing';
    }

    if (length($base_seq_str) != length($secondary_structure)) {
        die 'Base sequence length not equal to secondary structure length';
    }

    #Group the base pairs into base pair stems
    my ($stem_outermost_pairs, $stems, $paired_pos_ptrs, $structure_symbols) = _group_to_stems($secondary_structure);
    #Extract primitive pseudoknots from the base pair stems
    my $primitive_pseudoknots = PrimitivePseudoknotExtractor->extract($stem_outermost_pairs, $stems, $paired_pos_ptrs);
    my @base_seq = split(//, $base_seq_str);

    return $primitive_pseudoknots, \@base_seq, $structure_symbols, $base_seq_str;
}
51 |
#Group the base pairs of a dot-parentheses string into stems.
#
#Positions are 1-based. Returns the list (stem outermost pairs sorted by upstream position,
#hash ref mapping the upstream position of a stem's outermost pair to the stem's base
#pairs, array ref mapping each paired position to its partner, array ref of the single
#structure symbols).
#
#Dies when a closing bracket has no open partner or an open bracket is left unpaired.
sub _group_to_stems {
    my $secondary_structure = shift;

    my @structure_symbols = split(//, $secondary_structure);

    my ($stems, $open_pos_stacks, $next_paired_pos) = ({}, {}, {});
    my ($stem_outermost_pairs, $paired_pos_ptrs) = ([], []);
    #Stem being built and its current outermost pair; both empty while no stem is open
    my ($stem, $outermost_base_pair) = ([], []);
    my $last_paired_pos = 0;

    foreach my $index (0 .. $#structure_symbols) {
        my $symbol = $structure_symbols[$index];
        next if $symbol eq DOT;

        my $pos = $index + 1;

        if (BracketPairs->is_open_bracket($symbol)) {
            #Remember the open position on the stack of its bracket type
            push @{$open_pos_stacks->{$symbol} ||= []}, $pos;

            #An open bracket ends whatever stem was being built
            if (defined($outermost_base_pair->[0])) {
                ($stem_outermost_pairs, $stems, $outermost_base_pair, $stem) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
            }
        }
        else {
            my $open_bracket = BracketPairs->get_open_bracket($symbol);
            my $open_pos_stack = $open_pos_stacks->{$open_bracket};
            if (!defined($open_pos_stack) || !defined($open_pos_stack->[0])) {
                die "Closing bracket $symbol not paired\n";
            }

            my $upstream_pos = pop @{$open_pos_stack};

            #The new pair extends the current stem only when the paired position that
            #follows its upstream end is the upstream end of the current outermost pair
            my $extends_stem = defined($outermost_base_pair->[0])
                && $next_paired_pos->{$upstream_pos} == $outermost_base_pair->[0];

            if (defined($outermost_base_pair->[0]) && !$extends_stem) {
                ($stem_outermost_pairs, $stems, $outermost_base_pair, $stem) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
            }

            $outermost_base_pair = [$upstream_pos, $pos];
            if ($extends_stem) {
                unshift @{$stem}, $outermost_base_pair;
            }
            else {
                $stem = [$outermost_base_pair];
            }

            $paired_pos_ptrs->[$upstream_pos] = $pos;
            $paired_pos_ptrs->[$pos] = $upstream_pos;
        }

        #Chain the paired positions in sequence order
        $next_paired_pos->{$last_paired_pos} = $pos;
        $last_paired_pos = $pos;
    }

    if (!_is_all_open_bracket_settled($open_pos_stacks)) {
        die "Unpaired open bracket remains\n";
    }

    #Store the stem still being built when the structure ends
    if (defined($outermost_base_pair->[0])) {
        ($stem_outermost_pairs, $stems, undef, undef) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
    }

    my @sorted_outermost_pairs = sort {$a->[0] <=> $b->[0]} @{$stem_outermost_pairs};

    return (\@sorted_outermost_pairs, $stems, $paired_pos_ptrs, \@structure_symbols);
}
130 |
#Store a finished stem under the upstream position of its outermost pair and append that
#pair to the outermost pair list. Returns the two updated containers followed by two
#fresh empty array refs for the caller to reset its outermost pair and stem with.
sub _add_to_stems {
    my ($stem_outermost_pairs, $stems, $stem_outermost_pair, $stem) = @_;

    my $upstream_pos = $stem_outermost_pair->[0];

    push @{$stem_outermost_pairs}, $stem_outermost_pair;
    $stems->{$upstream_pos} = $stem;

    return ($stem_outermost_pairs, $stems, [], []);
}
139 |
#Return 1 when none of the position lists in the given hash still holds a defined first
#element, 0 otherwise
sub _is_all_open_bracket_settled {
    my ($unsettled_open_bracket_pos) = @_;

    my @unsettled = grep { defined($_->[0]) } values %{$unsettled_open_bracket_pos};

    return @unsettled ? 0 : 1;
}
151 |
152 | 1;
153 |
--------------------------------------------------------------------------------
/utils/FreeKnot/PrimitivePseudoknotExtractor.pm:
--------------------------------------------------------------------------------
1 | #Module that extracts primitive pseudoknots from all the base pair stems of the RNA secondary structure
2 |
3 | package PrimitivePseudoknotExtractor;
4 |
5 | use strict;
6 |
#Extract the primitive pseudoknots from the stems of a secondary structure.
#
#Arguments (the invocant is ignored): the stem outermost pairs, the stems hash and the
#paired position pointers. Returns the array ref of primitive pseudoknots.
sub extract {
    my (undef, $stem_outermost_pairs, $stems, $paired_pos_ptrs) = @_;

    #Group together the crossing stems of a pseudoknot
    my ($pos_groups, $pair_crossings) = _group_knotted_outermost_pairs($stem_outermost_pairs);

    #Create the pseudoknot objects
    return scalar _get_prim_pseudoknots($stems, $pos_groups, $pair_crossings, $paired_pos_ptrs);
}
17 |
#Group the stem outermost pairs that cross each other.
#
#Argument: array ref of [upstream pos, downstream pos] pairs; the inner loop stops at the
#first pair starting after the current pair ends, so the list is presumably sorted by
#upstream position - confirm against the caller.
#
#Returns the list (position groups, crossings):
#  position groups - array ref indexed by group id; each defined entry lists the paired
#                    positions of one set of crossing pairs, entries of groups absorbed
#                    by a merge are undef
#  crossings       - hash ref mapping the upstream position of every pair to the upstream
#                    positions of the succeeding pairs crossing it
sub _group_knotted_outermost_pairs {
    my $stem_outermost_pairs = shift;

    my $knotted_pair_pos_groups = [];
    my $outermost_pair_crossings = {};
    my $paired_pos_to_group_id = {};
    #Start the counter at 0 explicitly rather than relying on undef being incremented
    my $next_group_id = 0;

    my $outermost_pair_count = @{$stem_outermost_pairs};

    for (my $i = 0; $i < $outermost_pair_count; $i++) {
        my ($curr_upstream_pos, $curr_downstream_pos) = @{$stem_outermost_pairs->[$i]};
        my $curr_group_id = $paired_pos_to_group_id->{$curr_upstream_pos};

        my $succ_pair_crossings = [];

        for (my $j = $i + 1; $j < $outermost_pair_count; $j++) {
            my ($cand_upstream_pos, $cand_downstream_pos) = @{$stem_outermost_pairs->[$j]};

            #Candidate starts after the current pair ends: stop scanning
            last if $cand_upstream_pos > $curr_downstream_pos;
            #Candidate does not end after the current pair: no crossing
            next unless $cand_downstream_pos > $curr_downstream_pos;

            my $cand_group_id = $paired_pos_to_group_id->{$cand_upstream_pos};

            if (!defined($curr_group_id)) {
                if (defined($cand_group_id)) {
                    #Join the group the candidate already belongs to
                    $curr_group_id = $cand_group_id;
                    push @{$knotted_pair_pos_groups->[$curr_group_id]}, $curr_upstream_pos, $curr_downstream_pos;
                }
                else {
                    #Neither pair is grouped yet: open a new group holding both
                    $curr_group_id = $next_group_id++;
                    $knotted_pair_pos_groups->[$curr_group_id] = [$curr_upstream_pos, $curr_downstream_pos, $cand_upstream_pos, $cand_downstream_pos];
                    $paired_pos_to_group_id->{$cand_upstream_pos} = $curr_group_id;
                }
            }
            elsif (!defined($cand_group_id)) {
                #Pull the ungrouped candidate into the current group
                push @{$knotted_pair_pos_groups->[$curr_group_id]}, $cand_upstream_pos, $cand_downstream_pos;
                $paired_pos_to_group_id->{$cand_upstream_pos} = $curr_group_id;
            }
            elsif ($cand_group_id != $curr_group_id) {
                #Both pairs are grouped differently: merge the candidate's group into
                #the current one and redirect its registered positions
                my $absorbed_group = $knotted_pair_pos_groups->[$cand_group_id];
                $knotted_pair_pos_groups->[$curr_group_id] = [@{$knotted_pair_pos_groups->[$curr_group_id]}, @{$absorbed_group}];

                foreach my $pos (@{$absorbed_group}) {
                    if (exists($paired_pos_to_group_id->{$pos})) {
                        $paired_pos_to_group_id->{$pos} = $curr_group_id;
                    }
                }

                #Mark the slot as unused; delete() on array elements is discouraged
                $knotted_pair_pos_groups->[$cand_group_id] = undef;
            }

            push @{$succ_pair_crossings}, $cand_upstream_pos;
        }

        $outermost_pair_crossings->{$curr_upstream_pos} = $succ_pair_crossings;
    }

    return ($knotted_pair_pos_groups, $outermost_pair_crossings);
}
83 |
#Create one primitive pseudoknot per defined position group.
#
#Arguments: the stems hash (keyed by upstream position of the stem outermost pair), the
#position groups, the crossings hash (keyed by upstream position) and the paired position
#pointers.
#
#Returns an array ref of [stems, stem crossings] entries; stems are numbered in order of
#their upstream position within the knot and the crossings of stem i list the stem ids
#registered as crossing it.
sub _get_prim_pseudoknots {
    my ($stems, $knotted_pair_pos_groups, $outermost_pair_crossings, $paired_pos_ptrs) = @_;

    my @primitive_pseudoknots;

    foreach my $group_id (0 .. $#{$knotted_pair_pos_groups}) {
        my $pos_group = $knotted_pair_pos_groups->[$group_id];
        next unless defined($pos_group);

        my @knot_pos = sort {$a <=> $b} @{$pos_group};

        #Map every knot position to the knot position right before it
        my %prev_knot_pos;
        $prev_knot_pos{$knot_pos[$_]} = $knot_pos[$_ - 1] for 1 .. $#knot_pos;

        my (@knot_stems, %stem_id_of);
        my $pending_stem = [];

        #The last position is left out: it has no successor to compare with
        foreach my $k (0 .. $#knot_pos - 1) {
            my $curr_pos = $knot_pos[$k];
            my $curr_paired_pos = $paired_pos_ptrs->[$curr_pos];
            #Only upstream positions start a stem
            next if $curr_pos > $curr_paired_pos;

            $pending_stem = [@{$pending_stem}, @{$stems->{$curr_pos}}];

            #Keep accumulating while the next knot position pairs with the knot position
            #right before the partner of the current one; otherwise the stem is complete
            my $next_paired_pos = $paired_pos_ptrs->[$knot_pos[$k + 1]];
            if ($prev_knot_pos{$curr_paired_pos} != $next_paired_pos) {
                push @knot_stems, $pending_stem;
                $stem_id_of{$curr_pos} = $#knot_stems;
                $pending_stem = [];
            }
        }

        #Translate the crossings from upstream positions to stem ids
        my @knot_stem_crossings;
        foreach my $upstream_pos (keys %stem_id_of) {
            my $pair_crossings = $outermost_pair_crossings->{$upstream_pos};
            my $crossed_stem_ids = [];
            foreach my $crossing_pos (@{$pair_crossings}) {
                if (exists($stem_id_of{$crossing_pos})) {
                    push @{$crossed_stem_ids}, $stem_id_of{$crossing_pos};
                }
            }

            $knot_stem_crossings[$stem_id_of{$upstream_pos}] = $crossed_stem_ids;
        }

        push @primitive_pseudoknots, [\@knot_stems, \@knot_stem_crossings];
    }

    return \@primitive_pseudoknots;
}
141 |
142 | 1;
143 |
--------------------------------------------------------------------------------
/utils/FreeKnot/VertexSubset.pm:
--------------------------------------------------------------------------------
1 | #Module that represents the vertex subset in the MWIS algorithm. All the vertices of the knot-stem
2 | #graph are added to this subset (with the goal opposing vertices filtered) at initialization. When
3 | #the MWIS algorithm proceeds, vertices are gradually removed from this subset and the algorithm
4 | #stops when this subset is empty.
5 | #
6 | #This subset also keeps the adjacent vertices for each vertex in it, as well as the vertex degrees.
7 | #It enables the MWIS algorithm to select the highest degree and lowest degree vertices, and to
8 | #further generate a new subset of it while updating the adjacent vertices and vertex degrees.
9 |
10 | package VertexSubset;
11 |
12 | use strict;
13 |
#Create the vertex subset holding every vertex of the given circle graph.
#
#Arguments (the invocant is ignored):
#  $circle_graph - object answering get_vertex_count() and get_edges_at($vertex)
#
#Every edge reported for a vertex raises the degree of both of its ends and registers
#each end as adjacent to the other.
sub new {
    my (undef, $circle_graph) = @_;

    my (%degree_of, %neighbours_of);
    my $subset_size = 0;

    foreach my $vertex (reverse(0 .. $circle_graph->get_vertex_count() - 1)) {
        $degree_of{$vertex} = 0;

        foreach my $neighbour (@{$circle_graph->get_edges_at($vertex)}) {
            $degree_of{$vertex}++;
            $degree_of{$neighbour}++;
            $neighbours_of{$vertex}{$neighbour} = 1;
            $neighbours_of{$neighbour}{$vertex} = 1;
        }

        $subset_size++;
    }

    my ($highest_degree_vertices, $lowest_degree_vertices, $highest_vertex_degree, $lowest_vertex_degree) = _get_highest_and_lowest_degree_vertices(\%degree_of);

    my $self = {
        subset_size             => $subset_size,
        vertex_degrees          => \%degree_of,
        adj_vertex_sets         => \%neighbours_of,
        highest_degree_vertices => $highest_degree_vertices,
        lowest_degree_vertices  => $lowest_degree_vertices,
        highest_vertex_degree   => $highest_vertex_degree,
        lowest_vertex_degree    => $lowest_vertex_degree,
    };

    bless $self;

    return $self;
}
50 |
#Generate a new subset instance without the vertices listed in the input array ref.
#Degrees and adjacent vertex sets are rebuilt from the edges whose two ends both remain.
sub get_subset {
    my ($self, $vertices_to_remove) = @_;

    my %is_removed = map {$_ => 1} @{$vertices_to_remove};

    my (%kept_degree_of, %kept_neighbours_of);
    my $subset_size = 0;

    #Register every remaining vertex with degree 0 and no adjacent vertex
    my $vertex_degrees = $self->{vertex_degrees};
    foreach my $vertex (keys %{$vertex_degrees}) {
        next if exists($is_removed{$vertex});

        $kept_degree_of{$vertex} = 0;
        $kept_neighbours_of{$vertex} = {};
        $subset_size++;
    }

    #Copy the edges between remaining vertices; each edge is handled once, from its
    #lower numbered end
    my $adj_vertex_sets = $self->{adj_vertex_sets};
    foreach my $vertex (keys %{$adj_vertex_sets}) {
        next if exists($is_removed{$vertex});

        foreach my $neighbour (keys %{$adj_vertex_sets->{$vertex}}) {
            next unless $vertex < $neighbour && !exists($is_removed{$neighbour});

            $kept_neighbours_of{$vertex}{$neighbour} = 1;
            $kept_neighbours_of{$neighbour}{$vertex} = 1;
            $kept_degree_of{$vertex}++;
            $kept_degree_of{$neighbour}++;
        }
    }

    my ($highest_degree_vertices, $lowest_degree_vertices, $highest_vertex_degree, $lowest_vertex_degree) = _get_highest_and_lowest_degree_vertices(\%kept_degree_of);

    my $subset_self = {
        subset_size             => $subset_size,
        vertex_degrees          => \%kept_degree_of,
        adj_vertex_sets         => \%kept_neighbours_of,
        highest_degree_vertices => $highest_degree_vertices,
        lowest_degree_vertices  => $lowest_degree_vertices,
        highest_vertex_degree   => $highest_vertex_degree,
        lowest_vertex_degree    => $lowest_vertex_degree,
    };

    bless $subset_self;

    return $subset_self;
}
97 |
#Find the vertices of highest and of lowest degree in a vertex => degree hash.
#
#Returns the list (highest degree vertices, lowest degree vertices, highest degree,
#lowest degree); both vertex lists are array refs sorted numerically. For an empty hash
#both lists are empty and both degrees are -1.
sub _get_highest_and_lowest_degree_vertices {
    my $vertex_degrees = shift;

    my (@highest, @lowest);
    my ($highest_degree, $lowest_degree) = (-1, -1);

    foreach my $vertex (keys %{$vertex_degrees}) {
        my $degree = $vertex_degrees->{$vertex};

        if ($degree > $highest_degree) {
            @highest = ($vertex);
            $highest_degree = $degree;
        }
        elsif ($degree == $highest_degree) {
            push @highest, $vertex;
        }

        #A negative lowest degree means no vertex has been seen yet
        if ($lowest_degree < 0 || $degree < $lowest_degree) {
            @lowest = ($vertex);
            $lowest_degree = $degree;
        }
        elsif ($degree == $lowest_degree) {
            push @lowest, $vertex;
        }
    }

    return ([sort {$a <=> $b} @highest], [sort {$a <=> $b} @lowest], $highest_degree, $lowest_degree);
}
127 |
#Return the number of vertices in the subset
sub get_size {
    my ($self) = @_;

    return $self->{subset_size};
}
133 |
#Return the vertices of the subset as a numerically sorted array ref
sub get_vertices {
    my ($self) = @_;

    return [sort {$a <=> $b} keys %{$self->{vertex_degrees}}];
}
141 |
#Return the vertices adjacent to the given vertex as a numerically sorted array ref;
#the array is empty when no adjacent vertex set is registered for the vertex
sub get_adjacent_vertices_at {
    my ($self, $vertex) = @_;

    my $adj_vertex_sets = $self->{adj_vertex_sets};

    return [] unless exists($adj_vertex_sets->{$vertex});

    return [sort {$a <=> $b} keys %{$adj_vertex_sets->{$vertex}}];
}
153 |
#Return the list (highest degree vertices, highest vertex degree) stored in the subset
sub get_highest_degree_vertex_info {
    my ($self) = @_;

    my ($vertices, $degree) = @{$self}{qw(highest_degree_vertices highest_vertex_degree)};

    return ($vertices, $degree);
}
159 |
#Return the list (lowest degree vertices, lowest vertex degree) stored in the subset
sub get_lowest_degree_vertex_info {
    my ($self) = @_;

    my ($vertices, $degree) = @{$self}{qw(lowest_degree_vertices lowest_vertex_degree)};

    return ($vertices, $degree);
}
165 |
166 | 1;
167 |
--------------------------------------------------------------------------------
/utils/FreeKnot/MWIS.pm:
--------------------------------------------------------------------------------
1 | #Modified circle graph MWIS algorithm based on that proposed by Valiente (Valiente, G., 2003), with
2 | #enhancement suggested by Nash et al. (Nash, N., Lelait, S., and Gregg, D., 2009). It operates with
3 | #the chord model and reports either single solution or all solutions according to the user option.
4 |
5 | package MWIS;
6 |
7 | use strict;
8 |
#Compute the optimal weight independent set(s) of chords of a chord model.
#
#Arguments (the invocant is ignored):
#  $chord_model      - chord model object queried for its chord edges and end points
#  $base_seq         - base sequence, handed through to the scoring function
#  $scoring_function - code ref giving the weight of a chord
#  $criteria         - 'max' to maximize or 'min' to minimize the total weight
#  $is_report_all    - true to keep every solution of equal weight instead of one
#
#Returns an array ref of solutions, each an array ref of chord edges.
sub get_mwis {
    my (undef, $chord_model, $base_seq, $scoring_function, $criteria, $is_report_all) = @_;

    my $chord_weights = _get_chord_weights($chord_model, $base_seq, $scoring_function);

    my $end_point_count = $chord_model->get_chord_edge_count() * 2;

    #Enhancement by Nash et al. to get MWISs (in variable c) and the scores (in variable cmis) in
    #every region bounded by the endpoints of each chord.
    my ($m, $p) = ([], []);
    my ($cmis, $c) = ({}, {});

    foreach my $end_point (1 .. $end_point_count + 1) {
        $m->[$end_point] = 0;
        $p->[$end_point] = [0];
    }

    my $last = 1;

    foreach my $end_point (1 .. $end_point_count) {
        #Only right end points trigger the evaluation of their chord
        next if $chord_model->is_left_end_point($end_point);

        my ($left_end_point, $right_end_point) = @{$chord_model->get_chord_edge_by_end_point($end_point)};
        my $chord_key = $left_end_point . '-' . $right_end_point;

        for (my $j = $last; $j > $left_end_point; $j--) {
            #Default: inherit the result of the position to the right
            $m->[$j] = $m->[$j + 1];
            $p->[$j] = $p->[$j + 1];

            next unless $chord_model->is_left_end_point($j);

            my (undef, $inner_right_end_point) = @{$chord_model->get_chord_edge_by_end_point($j)};
            my $candidate_m = $m->[$inner_right_end_point + 1] + $cmis->{$j . '-' . $inner_right_end_point};

            my $is_better = ($criteria eq 'max' && $candidate_m > $m->[$j])
                || ($criteria eq 'min' && $candidate_m < $m->[$j]);

            if ($is_better) {
                $m->[$j] = $candidate_m;
                $p->[$j] = [$inner_right_end_point];
            }
            elsif ($is_report_all && $candidate_m == $m->[$j]) {
                #Equal weight: keep the chord starting here next to the inherited choices
                $p->[$j] = [$inner_right_end_point, @{$p->[$j + 1]}];
            }
        }

        $cmis->{$chord_key} = $m->[$left_end_point + 1] + $chord_weights->{$chord_key};
        $c->{$chord_key} = _add_front($p, $left_end_point + 1, $chord_model, []);
        $last = $left_end_point;
    }

    #Algorithm proposed by Valiente to obtain MWISs starting at each endpoint. Only those chords
    #in the MWIS that are not bounded by other chords in the same MWIS set are stored.
    my ($t_structures, $t_struct_weights) = ([], []);

    foreach my $end_point (@{$chord_model->get_chord_end_point_nums()}) {
        my $next_end_point = $end_point + 1;

        $t_structures->[$end_point] = [[]];

        if (!$chord_model->is_left_end_point($end_point)) {
            #Right end point: take over the result of the next end point, if there is one
            if ($end_point < $end_point_count) {
                @{$t_structures->[$end_point]} = @{$t_structures->[$next_end_point]};
                $t_struct_weights->[$end_point] = $t_struct_weights->[$next_end_point];
            }
            else {
                $t_struct_weights->[$end_point] = 0;
            }

            next;
        }

        my $chord_edge = $chord_model->get_chord_edge_by_end_point($end_point);
        #True when some end point follows the right end of this chord
        my $has_following = $chord_edge->[1] < $end_point_count;

        my $candidate_total_chord_weight = $cmis->{$chord_edge->[0] . '-' . $chord_edge->[1]};
        if ($has_following) {
            $candidate_total_chord_weight += $t_struct_weights->[$chord_edge->[1] + 1];
        }

        my $is_tie = $candidate_total_chord_weight == $t_struct_weights->[$next_end_point];
        my $is_better = ($criteria eq 'max' && $candidate_total_chord_weight > $t_struct_weights->[$next_end_point])
            || ($criteria eq 'min' && $candidate_total_chord_weight < $t_struct_weights->[$next_end_point]);

        if ($is_better || ($is_tie && $is_report_all)) {
            my $generated_new_t_structures;

            #On a tie the structures of the next end point are kept as well
            if ($is_tie) {
                @{$generated_new_t_structures} = @{$t_structures->[$next_end_point]};
            }
            else {
                $generated_new_t_structures = [];
            }

            if ($has_following) {
                #Put this chord in front of every structure found after its right end
                foreach my $t_structure (@{$t_structures->[$chord_edge->[1] + 1]}) {
                    push @{$generated_new_t_structures}, [$chord_edge, @{$t_structure}];
                }
            }
            else {
                push @{$generated_new_t_structures}, [$chord_edge];
            }

            $t_structures->[$end_point] = $generated_new_t_structures;
            $t_struct_weights->[$end_point] = $candidate_total_chord_weight;
        }
        else {
            $t_structures->[$end_point] = $t_structures->[$next_end_point];
            $t_struct_weights->[$end_point] = $t_struct_weights->[$next_end_point];
        }
    }

    my $mwiss = _restore_chord_mwiss($t_structures->[1], $c);

    return $mwiss;
}
122 |
#Generate all the MWISs in the region bounded by the endpoints of a single chord.
#
#Starting at $p->[$start_pos], every listed end point greater than 0 contributes its chord
#edge to each chord set collected so far, and the walk continues recursively from that
#end point. When the first entry of $p->[$start_pos] is not greater than 0 the collected
#chord sets are returned unchanged.
sub _add_front {
    my ($p, $start_pos, $chord_model, $org_c_element) = @_;

    my $p_element = $p->[$start_pos];

    return $org_c_element unless $p_element->[0] > 0;

    my @new_c_element;

    foreach my $end_point (@{$p_element}) {
        my $chord_edge = $chord_model->get_chord_edge_by_end_point($end_point);

        #Either start the first chord set or extend a copy of every existing one
        my $expanded_c_element = defined($org_c_element->[0])
            ? [map { [@{$_}, $chord_edge] } @{$org_c_element}]
            : [[$chord_edge]];

        my $new_values = _add_front($p, $end_point, $chord_model, $expanded_c_element);
        push @new_c_element, @{$new_values};
    }

    return \@new_c_element;
}
156 |
#Score every chord edge of the chord model.
#
#The scoring function is called with a hash ref holding the chord's base pairs
#('base_pairs') and their number ('pair_count'), followed by the base sequence.
#Returns a hash ref mapping 'left-right' end point keys to the chord weights.
sub _get_chord_weights {
    my ($chord_model, $base_seq, $scoring_function) = @_;

    my %chord_weights;

    foreach my $chord_edge (values %{$chord_model->get_chord_edges()}) {
        my ($left_end_point, $right_end_point) = ($chord_edge->[0], $chord_edge->[1]);
        my $chord_base_pairs = $chord_model->get_chord_base_pairs($left_end_point, $right_end_point);

        my %chord_attrs = (
            base_pairs => $chord_base_pairs,
            pair_count => scalar @{$chord_base_pairs},
        );

        $chord_weights{$left_end_point . '-' . $right_end_point} = $scoring_function->(\%chord_attrs, $base_seq);
    }

    return \%chord_weights;
}
172 |
#Recover the MWISs from the chord sets in variable c.
#
#Every chord edge set is combined with each MWIS recovered recursively from the sets that
#c stores under the 'left-right' key of each of its chords; chords with no stored set are
#left as they are. Returns an array ref of the resulting chord edge lists.
sub _restore_chord_mwiss {
    my ($chord_edge_sets, $c) = @_;

    my @chord_mwiss;

    foreach my $chord_edge_set (@{$chord_edge_sets}) {
        #MWISs built so far for this chord edge set
        my @set_mwiss = ($chord_edge_set);

        foreach my $chord_edge (@{$chord_edge_set}) {
            my $inner_chord_edge_sets = $c->{$chord_edge->[0] . '-' . $chord_edge->[1]};
            next unless defined($inner_chord_edge_sets->[0]);

            my $inner_chord_mwiss = _restore_chord_mwiss($inner_chord_edge_sets, $c);

            #Combine every MWIS built so far with every inner MWIS
            my @combined_mwiss;
            foreach my $set_mwis (@set_mwiss) {
                foreach my $inner_chord_mwis (@{$inner_chord_mwiss}) {
                    push @combined_mwiss, [@{$set_mwis}, @{$inner_chord_mwis}];
                }
            }

            @set_mwiss = @combined_mwiss;
        }

        push @chord_mwiss, @set_mwiss;
    }

    return \@chord_mwiss;
}
206 |
207 | 1;
208 |
--------------------------------------------------------------------------------
/utils/FreeKnot/remove_pseudoknot.pl:
--------------------------------------------------------------------------------
1 | #Main program for pseudoknot removal
2 | #It accepts input RNA secondary structure as BPSEQ format or dot-parentheses format
3 | #There are four choices of scoring functions: No. of base pairs, no. of stems, no. of hydrogen
4 | #bonds, and Turner free energy (Turner, D. H. & Mathews, D. H., NAR 2009). The optimization goal
5 | #for the first three options is to maximize the score as all the choices only give positive values.
6 | #For the last option, the goal is to minimize the score (i.e. free energy).
7 |
8 | #!/usr/bin/perl
9 |
10 | use BpseqParser;
11 | use BpseqWriter;
12 | use BracketPairs;
13 | use ChordModel;
14 | use CircleGraph;
15 | use DPParser;
16 | use DPWriter;
17 | use MIS;
18 | use MWIS;
19 | use PrimitivePseudoknotExtractor;
20 | use ScoringFunctions;
21 | use VertexSubset;
22 | use strict;
23 |
24 | #OS_BIT specifies the length of a bitstring used in the circle graph
25 | use constant OS_BIT => 32;
26 |
#Print the usage message and stop when fewer than five command line arguments are given.
#The placeholders name the values read by the option parsing below: the input file format
#after -i ('bpseq' or 'dp'), the scoring function option after -s, and the input file path.
if (@ARGV < 5) {
    print "Usage: perl $0 -i <bpseq|dp> -s <scoring_function> [-a : report all optimal solutions] <input_file>\n";
    exit;
}
31 |
32 | my ($input_file_path, $input_file_format, $scoring_fx_option);
33 | my $is_report_all = 0;
34 |
35 | #Scan the arguments once; -i and -s each consume the argument that follows them
36 | for (my $i = 0; $i < @ARGV; $i++) {
37 |     my $arg = $ARGV[$i];
38 |     if ($arg eq '-a') {
39 |         $is_report_all = 1;
40 |         next;
41 |     }
42 |     if ($arg eq '-i') {
43 |         if (defined($input_file_format)) {
44 |             print "Duplicated input file format specification\n";
45 |             exit;
46 |         }
47 |         $input_file_format = $ARGV[++$i];
48 |         next;
49 |     }
50 |     if ($arg eq '-s') {
51 |         if (defined($scoring_fx_option)) {
52 |             print "Duplicated scoring function specification\n";
53 |             exit;
54 |         }
55 |         $scoring_fx_option = $ARGV[++$i];
56 |         next;
57 |     }
58 |     if (substr($arg, 0, 1) eq '-') {
59 |         print "Unknown parameter $ARGV[$i]\n";
60 |         exit;
61 |     }
62 |     #The first plain argument is taken as the input file path; later ones are ignored
63 |     $input_file_path = $arg unless defined($input_file_path);
64 | }
65 |
66 | unless (defined($input_file_path)) {
67 |     print "No input file path specified\n";
68 |     exit;
69 | }
70 |
71 | #Resolve the -s option into the scoring function that scores each stem in the MWIS algorithm,
72 | #together with its criteria value and the flag telling whether free energy was selected
73 | my ($scoring_function, $criteria, $is_fe) = ScoringFunctions->get_scoring_function($scoring_fx_option);
74 | unless (defined($scoring_function)) {
75 |     print "Unknown scoring function specified: $scoring_fx_option\n";
76 |     exit;
77 | }
78 |
79 | my ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count, $structure_symbols, $base_seq_str);
80 |
81 | #Reject unsupported formats first, then let the matching parser generate the pseudoknot objects
82 | if ($input_file_format ne 'bpseq' && $input_file_format ne 'dp') {
83 |     print "Unknown input file format: $input_file_format\n";
84 |     exit;
85 | }
86 | if ($input_file_format eq 'bpseq') {
87 |     ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count) = BpseqParser->parse($input_file_path);
88 | }
89 | else {
90 |     ($primitive_pseudoknots, $base_seq, $structure_symbols, $base_seq_str) = DPParser->parse($input_file_path);
91 | }
92 |
93 | my $pseudoknot_base_pair_removal_pos = [];
94 | my $prim_pseudoknot_count = 0;
95 |
96 | #Every primitive pseudoknot contributes one list of candidate removal-position sets
97 | foreach my $primitive_pseudoknot (@{$primitive_pseudoknots}) {
98 |     my $base_pair_removal_pos;
99 |
100 |     if ($is_fe) {
101 |         #Free energy scoring: the MIS algorithm generates all MISs of the circle graph; the free
102 |         #energy of the resulting structures is evaluated after the combination step
103 |         my $circle_graph = CircleGraph->new($primitive_pseudoknot, OS_BIT);
104 |         my $miss = MIS->get_mis($circle_graph, $criteria);
105 |         $base_pair_removal_pos = convert_to_base_pair_removal_pos_circle_graph($circle_graph, $miss);
106 |     }
107 |     else {
108 |         #Other scoring functions: the MWIS algorithm generates one/all MWISs from the chord model
109 |         my $chord_model = ChordModel->new($primitive_pseudoknot);
110 |         my $mwiss = MWIS->get_mwis($chord_model, $base_seq, $scoring_function, $criteria, $is_report_all);
111 |         $base_pair_removal_pos = convert_to_base_pair_removal_pos($chord_model, $mwiss);
112 |     }
113 |
114 |     push @{$pseudoknot_base_pair_removal_pos}, $base_pair_removal_pos;
115 |     $prim_pseudoknot_count++;
116 | }
117 |
118 | #Combine the possible removal position sets of all primitive pseudoknots; the list built above
119 | #is consumed by this call
120 | my $combined_base_pair_removal_pos = combine_base_pair_removal_pos($pseudoknot_base_pair_removal_pos, []);
121 |
122 | #Determine the free energy of every structure converted from the MIS combinations of the different
123 | #primitive pseudoknots in the structure. Each candidate is written to a temporary file and RNAeval
124 | #(ViennaRNA package) is called on it; only the candidates with the minimum free energy are kept
125 | if ($is_fe) {
126 |     my $mfe;
127 |     my $mfe_base_pair_models = [];
128 |     $base_seq_str = join('', @{$base_seq}) if !defined($base_seq_str);
129 |     foreach my $candidate (@{$combined_base_pair_removal_pos}) {
130 |         DPWriter->output_mfe_candidate($candidate, $paired_pos_ptrs, $structure_symbols, $base_seq_str);
131 |         my $rna_eval_output = `RNAeval < MWIS_temp.dp`;
132 |         #Take the energy from this match only: an unchecked $1 let a failed RNAeval run pick an arbitrary candidate
133 |         my ($energy) = $rna_eval_output =~ /(-?\d+\.\d+)/;
134 |         if (!defined($energy)) {
135 |             print "RNAeval did not return a free energy for MWIS_temp.dp\n";
136 |             exit;
137 |         }
138 |         if (!defined($mfe) || $energy < $mfe) {
139 |             $mfe_base_pair_models = [$candidate];
140 |             $mfe = $energy;
141 |         }
142 |         elsif ($energy == $mfe) {
143 |             push @{$mfe_base_pair_models}, $candidate;
144 |         }
145 |     }
146 |     $combined_base_pair_removal_pos = $mfe_base_pair_models;
147 | }
148 |
149 | #Write the results with the writer that matches the input format
150 | BpseqWriter->output_results($combined_base_pair_removal_pos, $base_seq, $paired_pos_ptrs, $base_count)
151 |     if $input_file_format eq 'bpseq';
152 |
153 | DPWriter->output_results($combined_base_pair_removal_pos, $structure_symbols, $base_seq_str)
154 |     if $input_file_format eq 'dp';
155 |
156 | sub convert_to_base_pair_removal_pos_circle_graph {
157 |     #Turns every MIS of a circle graph into the set of base positions to remove.
158 |     #$circle_graph: object answering get_vertex_count() and get_vertex_attrs_at($vertex_num)
159 |     #$miss: array ref of MISs, each an array ref of vertex numbers (the gap walk below
160 |     #       presumably relies on them being in ascending order - confirm against MIS.pm)
161 |     #Returns an array ref holding one hash ref per MIS whose keys are the positions of both
162 |     #bases of every stem pair owned by a vertex that is not listed in that MIS
163 |     my ($circle_graph, $miss) = @_;
164 |     my @removal_pos_per_mis;
165 |
166 |     foreach my $mis (@{$miss}) {
167 |         #Vertices numbered below the first MIS vertex
168 |         my @dropped_vertices = (0 .. $mis->[0] - 1);
169 |
170 |         #Vertices lying strictly between two consecutive MIS vertices
171 |         foreach my $k (1 .. $#{$mis}) {
172 |             push @dropped_vertices, ($mis->[$k - 1] + 1 .. $mis->[$k] - 1);
173 |         }
174 |
175 |         #Vertices numbered above the last MIS vertex
176 |         push @dropped_vertices, ($mis->[-1] + 1 .. $circle_graph->get_vertex_count() - 1);
177 |
178 |         #Flag both ends of every stem pair attached to a dropped vertex
179 |         my %removal_pos;
180 |         foreach my $vertex_num (@dropped_vertices) {
181 |             my $vertex_attrs = $circle_graph->get_vertex_attrs_at($vertex_num);
182 |             my $stem_pairs = $vertex_attrs->{stem_pairs};
183 |             foreach my $stem_pair (@{$stem_pairs}) {
184 |                 my ($upstream_pos, $downstream_pos) = @{$stem_pair};
185 |                 @removal_pos{$upstream_pos, $downstream_pos} = (1, 1);
186 |             }
187 |         }
188 |         push @removal_pos_per_mis, \%removal_pos;
189 |     }
190 |
191 |     return \@removal_pos_per_mis;
192 | }
193 |
194 | sub convert_to_base_pair_removal_pos {
195 |     #Turns every MWIS of a chord model into the set of base positions to remove.
196 |     #$chord_model: object answering get_chord_edges() and get_chord_base_pairs($from, $to)
197 |     #$mwiss: array ref of MWISs, each an array ref of chord edges given as two-element array refs
198 |     #Returns an array ref with one hash ref per MWIS, keyed by the positions of both bases of every base pair on an edge the MWIS leaves out
199 |     my ($chord_model, $mwiss) = @_;
200 |     my @removal_pos_per_mwis;
201 |
202 |     foreach my $mwis (@{$mwiss}) {
203 |         #Copy all chord edges (keyed "from-to"), then strike out the ones listed in the MWIS
204 |         my %dropped_edges = %{$chord_model->get_chord_edges()};
205 |         delete $dropped_edges{$_->[0] . '-' . $_->[1]} foreach @{$mwis};
206 |
207 |         #Flag both ends of every base pair lying on a dropped chord edge
208 |         my %removal_pos;
209 |         foreach my $edge (values %dropped_edges) {
210 |             my $base_pairs = $chord_model->get_chord_base_pairs($edge->[0], $edge->[1]);
211 |             foreach my $base_pair (@{$base_pairs}) {
212 |                 @removal_pos{$base_pair->[0], $base_pair->[1]} = (1, 1);
213 |             }
214 |         }
215 |         push @removal_pos_per_mwis, \%removal_pos;
216 |     }
217 |     return \@removal_pos_per_mwis;
218 | }
219 |
220 | sub combine_base_pair_removal_pos {
221 |     #Builds every combination that takes one removal-position set from each primitive pseudoknot.
222 |     #$pseudoknot_base_pair_removal_pos: array ref with, per pseudoknot, an array ref of hash refs;
223 |     #    it is consumed (emptied) by this call
224 |     #$combined_base_pair_removal_pos: array ref of the combinations built so far ([] to start)
225 |     #Returns an array ref of hash refs, each holding the merged keys of one combination
226 |     my ($pseudoknot_base_pair_removal_pos, $combined_base_pair_removal_pos) = @_;
227 |
228 |     #Fold the per-pseudoknot lists in one at a time, from the last pseudoknot to the first
229 |     do {
230 |         my $candidates = pop @{$pseudoknot_base_pair_removal_pos};
231 |         my @expanded;
232 |         foreach my $removal_pos (@{$candidates}) {
233 |             #With nothing accumulated yet the candidate itself is the combination
234 |             my @merged = defined($combined_base_pair_removal_pos->[0])
235 |                 ? (map { +{ %{$removal_pos}, %{$_} } } @{$combined_base_pair_removal_pos})
236 |                 : ($removal_pos);
237 |             push @expanded, @merged;
238 |         }
239 |         $combined_base_pair_removal_pos = \@expanded;
240 |     } while (defined($pseudoknot_base_pair_removal_pos->[0]));
241 |     return $combined_base_pair_removal_pos;
242 | }
243 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.log_gremlin:
--------------------------------------------------------------------------------
1 | # ---------------------------------------------------------------------------------------------
2 | # GREMLIN_CPP v1.0
3 | # ---------------------------------------------------------------------------------------------
4 | # -i /home/jaswinder/github/SPOT-RNA2/sample_run/sample_seq_features/sample_seq.a2m
5 | # -o /home/jaswinder/github/SPOT-RNA2/sample_run/sample_seq_features/sample_seq.dca
6 | # ---------------------------------------------------------------------------------------------
7 | # -only_neff 0
8 | # -only_v 0
9 | # -gap_cutoff 0.5
10 | # -alphabet rna
11 | # -eff_cutoff 0.8
12 | # -lambda 0.01
13 | # ---------------------------------------------------------------------------------------------
14 | # -min_type lbfgs
15 | # -max_iter 100
16 | # ---------------------------------------------------------------------------------------------
17 | # removing 3 out of 61 positions with >= 50% gaps!
18 | # SEQ ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
19 | # CUT ACUCGUUUGAGCGAGUAUAAACAGCU-GUUAAGCUCAAAGCGGAGAGCAG--CUGCUCUCG
20 | # NC 58
21 | # NEFF 16.9
22 | # learning MRF ...
23 | # lbfgs::iter S_S fx: 1577.57 gnorm: 79.1061
24 | # lbfgs::iter 0_1 fx: 1569.68 gnorm: 78.7645
25 | # lbfgs::iter 1_1 fx: 867.926 gnorm: 14.9869
26 | # lbfgs::iter 2_1 fx: 835.903 gnorm: 7.66255
27 | # lbfgs::iter 3_1 fx: 823.585 gnorm: 6.19229
28 | # lbfgs::iter 4_1 fx: 814.088 gnorm: 5.88803
29 | # lbfgs::iter 5_1 fx: 808.359 gnorm: 3.65714
30 | # lbfgs::iter 6_1 fx: 804.819 gnorm: 2.33437
31 | # lbfgs::iter 7_1 fx: 801.784 gnorm: 2.29087
32 | # lbfgs::iter 8_1 fx: 801.007 gnorm: 3.56033
33 | # lbfgs::iter 9_1 fx: 800.087 gnorm: 0.639645
34 | # lbfgs::iter 10_1 fx: 800.041 gnorm: 0.287188
35 | # lbfgs::iter 11_1 fx: 800.021 gnorm: 0.235963
36 | # lbfgs::iter 12_1 fx: 799.996 gnorm: 0.228575
37 | # lbfgs::iter 13_1 fx: 799.98 gnorm: 0.446223
38 | # lbfgs::iter 14_1 fx: 799.957 gnorm: 0.200702
39 | # lbfgs::iter 15_1 fx: 799.942 gnorm: 0.176329
40 | # lbfgs::iter 16_1 fx: 799.93 gnorm: 0.171215
41 | # lbfgs::iter 17_1 fx: 799.93 gnorm: 0.42462
42 | # lbfgs::iter 18_1 fx: 799.916 gnorm: 0.110312
43 | # lbfgs::iter 19_1 fx: 799.914 gnorm: 0.0797637
44 | # lbfgs::iter 20_1 fx: 799.91 gnorm: 0.0900997
45 | # lbfgs::iter 21_1 fx: 799.907 gnorm: 0.112634
46 | # lbfgs::iter 22_1 fx: 799.906 gnorm: 0.106627
47 | # lbfgs::iter 23_1 fx: 799.905 gnorm: 0.0325551
48 | # lbfgs::iter 24_1 fx: 799.905 gnorm: 0.0234513
49 | # lbfgs::iter 25_1 fx: 799.905 gnorm: 0.0211741
50 | # lbfgs::iter 26_1 fx: 799.905 gnorm: 0.0347908
51 | # lbfgs::iter 27_1 fx: 799.905 gnorm: 0.00985535
52 | # lbfgs::iter 28_1 fx: 799.905 gnorm: 0.00879822
53 | # lbfgs::iter 29_1 fx: 799.904 gnorm: 0.00837304
54 | # lbfgs::iter 30_1 fx: 799.904 gnorm: 0.0161944
55 | # lbfgs::iter 31_1 fx: 799.904 gnorm: 0.00567908
56 | # lbfgs::iter 32_1 fx: 799.904 gnorm: 0.00464024
57 | # lbfgs::iter 33_1 fx: 799.904 gnorm: 0.00515186
58 | # lbfgs::iter 34_1 fx: 799.904 gnorm: 0.00769589
59 | # lbfgs::iter 35_1 fx: 799.904 gnorm: 0.00417999
60 | # lbfgs::iter 36_1 fx: 799.904 gnorm: 0.00196562
61 | # lbfgs::iter 37_1 fx: 799.904 gnorm: 0.00138669
62 | # lbfgs::iter 38_1 fx: 799.904 gnorm: 0.00217301
63 | # lbfgs::iter 39_1 fx: 799.904 gnorm: 0.00122576
64 | # lbfgs::iter 40_1 fx: 799.904 gnorm: 0.000992969
65 | # lbfgs::iter 41_1 fx: 799.904 gnorm: 0.00110079
66 | # lbfgs::iter 42_1 fx: 799.904 gnorm: 0.00177581
67 | # lbfgs::iter 43_1 fx: 799.904 gnorm: 0.000827648
68 | # lbfgs::iter 44_1 fx: 799.904 gnorm: 0.000548524
69 | # lbfgs::iter 45_1 fx: 799.904 gnorm: 0.000489395
70 | # lbfgs::iter 46_1 fx: 799.904 gnorm: 0.000693037
71 | # lbfgs::iter 47_1 fx: 799.904 gnorm: 0.00017789
72 | # lbfgs::iter 48_1 fx: 799.904 gnorm: 0.000143922
73 | # lbfgs::iter 49_1 fx: 799.904 gnorm: 0.000167766
74 | # lbfgs::iter 50_1 fx: 799.904 gnorm: 0.000338155
75 | # lbfgs::iter 51_1 fx: 799.904 gnorm: 0.000158961
76 | # lbfgs::iter 52_1 fx: 799.904 gnorm: 0.000106748
77 | # lbfgs::iter 53_1 fx: 799.904 gnorm: 0.000105824
78 | # lbfgs::iter 54_1 fx: 799.904 gnorm: 0.000298927
79 | # lbfgs::iter 55_1 fx: 799.904 gnorm: 7.52617e-05
80 | # lbfgs::iter 56_1 fx: 799.904 gnorm: 5.87916e-05
81 | # lbfgs::iter 57_1 fx: 799.904 gnorm: 6.59898e-05
82 | # lbfgs::iter 58_1 fx: 799.904 gnorm: 0.000150251
83 | # lbfgs::iter 59_1 fx: 799.904 gnorm: 4.73333e-05
84 | # lbfgs::iter 60_1 fx: 799.904 gnorm: 3.42972e-05
85 | # lbfgs::iter 61_1 fx: 799.904 gnorm: 3.77292e-05
86 | # lbfgs::iter 62_1 fx: 799.904 gnorm: 3.80927e-05
87 | # lbfgs::iter 63_1 fx: 799.904 gnorm: 9.36524e-05
88 | # lbfgs::iter 64_1 fx: 799.904 gnorm: 1.61026e-05
89 | # lbfgs::iter 65_1 fx: 799.904 gnorm: 1.07761e-05
90 | # lbfgs::iter 66_1 fx: 799.904 gnorm: 8.87304e-06
91 | # lbfgs::iter 67_1 fx: 799.904 gnorm: 1.4495e-05
92 | # lbfgs::iter 68_1 fx: 799.904 gnorm: 7.46466e-06
93 | # lbfgs::iter 69_1 fx: 799.904 gnorm: 6.47399e-06
94 | # lbfgs::iter 70_1 fx: 799.904 gnorm: 7.70032e-06
95 | # lbfgs::iter 71_1 fx: 799.904 gnorm: 1.43926e-05
96 | # lbfgs::iter 72_1 fx: 799.904 gnorm: 3.22099e-06
97 | # lbfgs::iter 73_1 fx: 799.904 gnorm: 2.48525e-06
98 | # lbfgs::iter 74_1 fx: 799.904 gnorm: 2.60497e-06
99 | # lbfgs::iter 75_1 fx: 799.904 gnorm: 3.34597e-06
100 | # lbfgs::iter 76_1 fx: 799.904 gnorm: 7.18017e-06
101 | # lbfgs::iter 77_1 fx: 799.904 gnorm: 1.53372e-06
102 | # lbfgs::iter 78_1 fx: 799.904 gnorm: 1.04848e-06
103 | # lbfgs::iter 79_1 fx: 799.904 gnorm: 9.07634e-07
104 | # lbfgs::iter 80_1 fx: 799.904 gnorm: 1.613e-06
105 | # lbfgs::iter 81_1 fx: 799.904 gnorm: 7.12023e-07
106 | # lbfgs::iter 82_1 fx: 799.904 gnorm: 5.6185e-07
107 | # lbfgs::iter 83_1 fx: 799.904 gnorm: 5.75372e-07
108 | # lbfgs::iter 84_1 fx: 799.904 gnorm: 1.19944e-06
109 | # lbfgs::iter 85_1 fx: 799.904 gnorm: 4.0767e-07
110 | # lbfgs::iter 86_1 fx: 799.904 gnorm: 2.73253e-07
111 | # lbfgs::iter 87_1 fx: 799.904 gnorm: 2.46659e-07
112 | # lbfgs::iter 88_1 fx: 799.904 gnorm: 4.38814e-07
113 | # lbfgs::iter 89_1 fx: 799.904 gnorm: 2.32778e-07
114 | # lbfgs::iter 90_1 fx: 799.904 gnorm: 1.52613e-07
115 | # lbfgs::iter 91_1 fx: 799.904 gnorm: 1.24444e-07
116 | # lbfgs::iter 92_1 fx: 799.904 gnorm: 1.97241e-07
117 | # lbfgs::iter 93_1 fx: 799.904 gnorm: 8.6033e-08
118 | # lbfgs::iter 94_1 fx: 799.904 gnorm: 7.09053e-08
119 | # lbfgs::iter 95_1 fx: 799.904 gnorm: 6.83742e-08
120 | # lbfgs::iter 96_1 fx: 799.904 gnorm: 1.44261e-07
121 | # lbfgs::iter 97_1 fx: 799.904 gnorm: 3.08042e-08
122 | # lbfgs::iter 98_1 fx: 799.904 gnorm: 2.62992e-08
123 | # lbfgs::iter 99_1 fx: 799.904 gnorm: 2.98886e-08
124 | # lbfgs::iter S_S fx: 799.904 gnorm: 152.499
125 | # lbfgs::iter 0_1 fx: 784.797 gnorm: 149.682
126 | # lbfgs::iter 1_1 fx: 395.893 gnorm: 133.288
127 | # lbfgs::iter 2_2 fx: 364.519 gnorm: 99.8387
128 | # lbfgs::iter 3_1 fx: 298.457 gnorm: 29.6813
129 | # lbfgs::iter 4_1 fx: 287.945 gnorm: 19.0757
130 | # lbfgs::iter 5_1 fx: 282.982 gnorm: 11.4838
131 | # lbfgs::iter 6_1 fx: 280.631 gnorm: 10.3144
132 | # lbfgs::iter 7_1 fx: 279.448 gnorm: 6.3234
133 | # lbfgs::iter 8_1 fx: 279.008 gnorm: 3.7695
134 | # lbfgs::iter 9_1 fx: 278.802 gnorm: 2.4342
135 | # lbfgs::iter 10_1 fx: 278.677 gnorm: 2.46252
136 | # lbfgs::iter 11_1 fx: 278.606 gnorm: 2.49599
137 | # lbfgs::iter 12_1 fx: 278.556 gnorm: 1.34479
138 | # lbfgs::iter 13_1 fx: 278.511 gnorm: 1.29845
139 | # lbfgs::iter 14_1 fx: 278.466 gnorm: 1.43844
140 | # lbfgs::iter 15_1 fx: 278.428 gnorm: 3.48674
141 | # lbfgs::iter 16_1 fx: 278.349 gnorm: 1.66639
142 | # lbfgs::iter 17_1 fx: 278.274 gnorm: 1.79046
143 | # lbfgs::iter 18_1 fx: 278.147 gnorm: 2.82733
144 | # lbfgs::iter 19_1 fx: 277.925 gnorm: 4.0464
145 | # lbfgs::iter 20_1 fx: 277.712 gnorm: 7.464
146 | # lbfgs::iter 21_1 fx: 277.334 gnorm: 3.37071
147 | # lbfgs::iter 22_1 fx: 277.034 gnorm: 3.46683
148 | # lbfgs::iter 23_1 fx: 276.762 gnorm: 3.62724
149 | # lbfgs::iter 24_2 fx: 276.697 gnorm: 3.55826
150 | # lbfgs::iter 25_1 fx: 276.443 gnorm: 2.13056
151 | # lbfgs::iter 26_2 fx: 276.428 gnorm: 1.77109
152 | # lbfgs::iter 27_1 fx: 276.376 gnorm: 1.13688
153 | # lbfgs::iter 28_1 fx: 276.358 gnorm: 0.928894
154 | # lbfgs::iter 29_1 fx: 276.357 gnorm: 1.96084
155 | # lbfgs::iter 30_1 fx: 276.342 gnorm: 0.534038
156 | # lbfgs::iter 31_1 fx: 276.338 gnorm: 0.423256
157 | # lbfgs::iter 32_1 fx: 276.33 gnorm: 0.588926
158 | # lbfgs::iter 33_1 fx: 276.323 gnorm: 0.630135
159 | # lbfgs::iter 34_1 fx: 276.312 gnorm: 1.41765
160 | # lbfgs::iter 35_1 fx: 276.293 gnorm: 0.710723
161 | # lbfgs::iter 36_1 fx: 276.277 gnorm: 0.761555
162 | # lbfgs::iter 37_1 fx: 276.25 gnorm: 1.13114
163 | # lbfgs::iter 38_1 fx: 276.24 gnorm: 2.68521
164 | # lbfgs::iter 39_1 fx: 276.209 gnorm: 1.1845
165 | # lbfgs::iter 40_1 fx: 276.183 gnorm: 0.777926
166 | # lbfgs::iter 41_1 fx: 276.163 gnorm: 0.93568
167 | # lbfgs::iter 42_1 fx: 276.145 gnorm: 1.4975
168 | # lbfgs::iter 43_1 fx: 276.134 gnorm: 0.83518
169 | # lbfgs::iter 44_1 fx: 276.129 gnorm: 0.427227
170 | # lbfgs::iter 45_1 fx: 276.125 gnorm: 0.378219
171 | # lbfgs::iter 46_1 fx: 276.119 gnorm: 0.444098
172 | # lbfgs::iter 47_1 fx: 276.115 gnorm: 0.874621
173 | # lbfgs::iter 48_1 fx: 276.109 gnorm: 0.501375
174 | # lbfgs::iter 49_1 fx: 276.102 gnorm: 0.515069
175 | # lbfgs::iter 50_1 fx: 276.094 gnorm: 0.662954
176 | # lbfgs::iter 51_1 fx: 276.079 gnorm: 1.44839
177 | # lbfgs::iter 52_1 fx: 276.055 gnorm: 1.06933
178 | # lbfgs::iter 53_1 fx: 276.023 gnorm: 0.856106
179 | # lbfgs::iter 54_1 fx: 275.997 gnorm: 1.6488
180 | # lbfgs::iter 55_1 fx: 275.976 gnorm: 0.904621
181 | # lbfgs::iter 56_1 fx: 275.967 gnorm: 0.634115
182 | # lbfgs::iter 57_1 fx: 275.956 gnorm: 0.598754
183 | # lbfgs::iter 58_1 fx: 275.949 gnorm: 0.767746
184 | # lbfgs::iter 59_1 fx: 275.942 gnorm: 0.506698
185 | # lbfgs::iter 60_1 fx: 275.935 gnorm: 0.478146
186 | # lbfgs::iter 61_1 fx: 275.93 gnorm: 0.839587
187 | # lbfgs::iter 62_1 fx: 275.922 gnorm: 0.640863
188 | # lbfgs::iter 63_1 fx: 275.895 gnorm: 0.832216
189 | # lbfgs::iter 64_1 fx: 275.881 gnorm: 1.28579
190 | # lbfgs::iter 65_1 fx: 275.865 gnorm: 0.934848
191 | # lbfgs::iter 66_1 fx: 275.833 gnorm: 0.930706
192 | # lbfgs::iter 67_1 fx: 275.815 gnorm: 1.23511
193 | # lbfgs::iter 68_1 fx: 275.793 gnorm: 0.828008
194 | # lbfgs::iter 69_1 fx: 275.777 gnorm: 0.945493
195 | # lbfgs::iter 70_1 fx: 275.769 gnorm: 0.724122
196 | # lbfgs::iter 71_1 fx: 275.765 gnorm: 0.422678
197 | # lbfgs::iter 72_1 fx: 275.761 gnorm: 0.263507
198 | # lbfgs::iter 73_1 fx: 275.759 gnorm: 0.253831
199 | # lbfgs::iter 74_1 fx: 275.758 gnorm: 0.341821
200 | # lbfgs::iter 75_1 fx: 275.758 gnorm: 0.102256
201 | # lbfgs::iter 76_1 fx: 275.758 gnorm: 0.0740558
202 | # lbfgs::iter 77_1 fx: 275.758 gnorm: 0.0552766
203 | # lbfgs::iter 78_1 fx: 275.758 gnorm: 0.170435
204 | # lbfgs::iter 79_1 fx: 275.757 gnorm: 0.0399481
205 | # lbfgs::iter 80_1 fx: 275.757 gnorm: 0.0298459
206 | # lbfgs::iter 81_1 fx: 275.757 gnorm: 0.0316006
207 | # lbfgs::iter 82_1 fx: 275.757 gnorm: 0.0479131
208 | # lbfgs::iter 83_1 fx: 275.757 gnorm: 0.0287621
209 | # lbfgs::iter 84_1 fx: 275.757 gnorm: 0.0368384
210 | # lbfgs::iter 85_1 fx: 275.757 gnorm: 0.0438442
211 | # lbfgs::iter 86_1 fx: 275.757 gnorm: 0.146457
212 | # lbfgs::iter 87_1 fx: 275.757 gnorm: 0.05263
213 | # lbfgs::iter 88_1 fx: 275.757 gnorm: 0.0397041
214 | # lbfgs::iter 89_1 fx: 275.757 gnorm: 0.0548509
215 | # lbfgs::iter 90_1 fx: 275.757 gnorm: 0.0667815
216 | # lbfgs::iter 91_1 fx: 275.757 gnorm: 0.182811
217 | # lbfgs::iter 92_1 fx: 275.757 gnorm: 0.0391792
218 | # lbfgs::iter 93_1 fx: 275.757 gnorm: 0.0289307
219 | # lbfgs::iter 94_1 fx: 275.757 gnorm: 0.0297646
220 | # lbfgs::iter 95_1 fx: 275.757 gnorm: 0.0807598
221 | # lbfgs::iter 96_1 fx: 275.757 gnorm: 0.0366649
222 | # lbfgs::iter 97_1 fx: 275.757 gnorm: 0.0289475
223 | # lbfgs::iter 98_1 fx: 275.757 gnorm: 0.0366467
224 | # lbfgs::iter 99_1 fx: 275.757 gnorm: 0.0515997
225 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SPOT-RNA2
2 | Improved RNA Secondary Structure and Tertiary Base-pairing Prediction using Evolutionary Profile, Mutational Coupling and Two-dimensional Transfer Learning.
3 |
4 | ## Contents
5 |
6 | * [Introduction](#introduction)
7 | * [Results](#results)
8 | * [System Requirments](#system-requirments)
9 | * [Installation](#installation)
10 | * [Usage](#Usage)
11 | * [Datasets](#datasets)
12 | * [Citation guide](#citation-guide)
13 | * [Licence](#licence)
14 | * [Contact](#contact)
15 |
16 | ## Introduction
17 |
18 | The recent discovery of numerous non-coding RNAs (long non-coding RNAs, in particular) has transformed our perception about the roles of RNAs in living organisms. Our ability to understand them, however, is hampered by our inability to solve their secondary and tertiary structures in high resolution efficiently by existing experimental techniques. Computational prediction of RNA secondary structure, on the other hand, has received much-needed improvement, recently, through deep learning of a large approximate data, followed by transfer learning with gold-standard base-pairing structures from high-resolution 3-D structures. Here, we expand this single-sequence-based learning to the use of evolutionary profiles and mutational coupling.
19 |
20 | |![SPOT-RNA2 pipeline](./docs/SPOTRNA2_pipeline.png)|
21 | |----|
22 | | Figure 1: (A) Input one-dimensional (1-D) and two-dimensional (2-D) features employed in SPOT-RNA2 (L is the RNA sequence length; BP is base-pair; CSS is consensus secondary structure). (B) An example of the model architecture of SPOT-RNA2. (C) The schematic diagram for model pre-training by the bpRNA data set (TR0) and transfer learning by PDB data set (TR1).|
23 |
24 | Results
25 | ----
26 | The new method allows large improvement not only in canonical base-pairs (RNA secondary structures) but more so in base-pairing associated with tertiary interactions such as pseudoknots, noncanonical and lone base-pairs. In particular, it is highly accurate for those RNAs of more than 1000 homologous sequences by achieving > 0.8 F1-score (harmonic mean of sensitivity and precision) for 14/16 RNAs tested. The method can also significantly improve base-pairing prediction by incorporating artificial but functional homologous sequences generated from deep mutational scanning without any modification. The fully automatic method should provide the scientific community a new powerful tool to capture not only the secondary structure but also tertiary base-pairing information for building three-dimensional models. It also highlights the future of accurately solving the base-pairing structure by using a large number of natural and/or artificial homologous sequences.
27 |
28 |
29 | |![Benchmark results](./docs/benchmark_results.png)|
30 | |----|
31 | | Figure 2: Distribution of F1-scores for individual RNAs on the combined test sets TS1, TS2, and TS3 given by various methods as labeled. On each box, the central mark indicates the median, and the bottom and top edges of the box indicate the 25th and 75th percentiles, respectively. The outliers are plotted individually by using the “+” symbol.|
32 |
33 |
34 | ## System Requirments
35 |
36 | **Hardware Requirements:**
37 | It is recommended that your system have 32 GB of RAM and 500 GB of disk space to support the in-memory operations for RNA sequences shorter than 500 nucleotides. Multiple CPU threads are also recommended, as the MSA generation process is computationally expensive.
38 |
39 | **Software Requirements:**
40 | * [Python3.6](https://docs.python-guide.org/starting/install3/linux/)
41 | * [Perl-5.4 or later](https://www.perl.org/get.html)
42 | * [virtualenv](https://virtualenv.pypa.io/en/latest/installation/) or [Anaconda](https://anaconda.org/anaconda/virtualenv)
43 | * [CUDA 10.0](https://developer.nvidia.com/cuda-10.0-download-archive) (Optional if using GPU)
44 | * [cuDNN (>= 7.4.1)](https://developer.nvidia.com/cudnn) (Optional if using GPU)
45 | * [Docker](https://docs.docker.com/engine/install/) (Optional if running SPOT-RNA2 through docker image)
46 |
47 | SPOT-RNA2 has been tested on Ubuntu 14.04, 16.04, and 18.04 operating systems.
48 |
49 |
50 | ## Installation
51 |
52 | ### Installation using Docker image:
53 |
54 | The following commands can be used to install SPOT-RNA2 and its dependencies:
55 |
56 | 1. `git clone https://github.com/jaswindersingh2/SPOT-RNA2.git && cd SPOT-RNA2`
57 |
58 | 2. `docker image build -t spot_rna2 .`
59 |
60 | ### Manual installation:
61 |
62 | To install SPOT-RNA2 and its dependencies, the following commands can be used in the terminal:
63 |
64 |
65 | 1. `git clone https://github.com/jaswindersingh2/SPOT-RNA2.git && cd SPOT-RNA2`
66 | 2. `wget -O utils/models_ckps.tar.xz 'https://www.dropbox.com/s/udzcsva76lh5wvq/models_ckps.tar.xz' || wget -O utils/models_ckps.tar.xz 'https://app.nihaocloud.com/f/586acb2658d74ccb92b8/?dl=1'`
67 | 3. `tar -xvf utils/models_ckps.tar.xz -C utils/ && rm utils/models_ckps.tar.xz`
68 | 4. `sudo apt install cpanminus && sudo cpanm Graph && sudo apt install gawk`
69 |
70 | Based on the virtual environment package manager (**virtualenv** or **conda**) you have, follow the steps below:
71 |
72 | | | virtualenv | conda |
73 | | :- | :-------- | :--- |
74 | | 5. | `virtualenv -p python3.6 venv` | `conda create -n venv python=3.6` |
75 | | 6. | `source ./venv/bin/activate` | `conda activate venv` |
76 | | 7. | `pip install -r requirements.txt && deactivate` | `while read p; do conda install --yes $p; done < requirements.txt && conda deactivate` |
77 |
78 | If you already have **Infernal** installed, please set the path to the `binaries/` directory of the **Infernal** installation in line 12 of `run_spotrna2.sh`. Otherwise, follow the command below to install the **Infernal** tool. If you run into an issue, please follow the [link](http://eddylab.org/infernal/) for more info.
79 |
80 | 8. `wget 'eddylab.org/infernal/infernal-1.1.3-linux-intel-gcc.tar.gz' && tar -xvzf infernal-*.tar.gz && rm infernal-*.tar.gz`
81 |
82 | If you already have **BLASTN** installed, please set the path to the `bin/` directory of the **BLASTN** installation in line 10 of `run_spotrna2.sh`. Otherwise, follow the command below to install the **BLASTN** tool. If you run into an issue, please follow the [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download) for more info.
83 |
84 | 9. `wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*+-x64-linux.tar.gz' && tar -xvzf ncbi-blast-*+-x64-linux.tar.gz && rm ncbi-blast-*+-x64-linux.tar.gz`
85 |
86 | To install the **SPOT-RNA** predictor, follow the commands below:
87 |
88 | 10. `git clone https://github.com/jaswindersingh2/SPOT-RNA.git && cd SPOT-RNA`
89 | 11. `wget 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1'`
90 | 12. `tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz && cd ../`
91 |
92 | To install the DCA predictor, follow the commands below:
93 |
94 | 13. `git clone "https://github.com/sokrypton/GREMLIN_CPP" && cd GREMLIN_CPP && g++ -O3 -std=c++0x -o gremlin_cpp gremlin_cpp.cpp -fopenmp && cd ../`
95 |
96 | To install the LinearPartition, follow the commands below:
97 |
98 | 14. `git clone 'https://github.com/LinearFold/LinearPartition.git' && cd LinearPartition/ && make && cd ../`
99 |
100 | If NCBI's nt database is already available on your system, please set the path to the database directory in lines 11 and 13 of the `run_spotrna2.sh` file. Otherwise, use the following command to download it. It can take a few hours for the download to finish, depending on your internet speed. If you run into an issue, please follow the [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download) for more info.
101 |
102 | 15. `wget -c "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" -O ./nt_database/nt.gz && gunzip ./nt_database/nt.gz`
103 |
104 | The database needs to be formatted for use with **BLASTN**. Please use the command below to format the database.
105 |
106 | 16. `./ncbi-blast-*+/bin/makeblastdb -in ./nt_database/nt -dbtype nucl`
107 |
108 |
109 | ## Usage
110 |
111 | ### Run SPOT-RNA2 using docker:
112 |
113 | `docker run --rm -ti -v $(pwd)/sample_run:/SPOT-RNA2/sample_run -v $(pwd)/nt_database/:/SPOT-RNA2/nt_database spot_rna2:latest ./run_spotrna2.sh sample_run/6ufj.fasta`
114 |
115 | ### Run SPOT-RNA2 using Manual installation:
116 | ```
117 | ./run_spotrna2.sh sample_run/6ufj.fasta
118 | ```
119 |
120 | Both commands create two folders, `6ufj_features` and `6ufj_outputs`, in the input file directory (`sample_run/` in this case). `6ufj_features/` contains all the alignments (MSA-1, MSA-2) and features (PSSM, DCA, bps probability) generated by the SPOT-RNA2 pipeline. `6ufj_outputs/` contains the predicted secondary structure in bpseq format (`6ufj.bpseq`), ct format (`6ufj.ct`), dbn format (`6ufj.st`) with secondary structure motifs, and the base-pair probability (`6ufj.prob`). To verify the results, compare them with the contents of the `sample_seq_features` and `sample_seq_outputs` folders; they should be the same because both sequences (`sample_seq.fasta` and `6ufj.fasta`) are the same.
121 |
122 | ## Datasets
123 |
124 | The following datasets were used for Initial Training:
125 | * bpRNA[6]: Initial Learning (Training TR0, validation VL0, and test TS0)
126 | [Dropbox](https://www.dropbox.com/s/sg1p1otsqnaqev8/bpRNA_dataset.tar.xz) or [Nihao Cloud](https://app.nihaocloud.com/f/6f7b456d874c4842b8ac/)
127 |
128 |
129 | The following datasets were used for Transfer Learning:
130 | * PDB[5]: Transfer Learning (Training TR1, validation VL1, and testsets TS1, TS2, TS3, and TS_hard)
131 | [Dropbox](https://www.dropbox.com/s/apqrsl7hm1091ie/PDB_dataset.tar.xz) or [Nihao Cloud](https://app.nihaocloud.com/f/f301baed4dac4474a185/)
132 |
133 | ## Citation guide
134 |
135 | **If you use SPOT-RNA2 for your research please cite the following papers:**
136 |
137 | Jaswinder Singh, Kuldip Paliwal, Tongchuan Zhang, Jaspreet Singh, Thomas Litfin, Yaoqi Zhou, Improved RNA Secondary Structure and Tertiary Base-pairing Prediction Using Evolutionary Profile, Mutational Coupling and Two-dimensional Transfer Learning, Bioinformatics, 2021, btab165, https://doi.org/10.1093/bioinformatics/btab165
138 |
139 | **If you use SPOT-RNA2 data sets and/or input feature pipeline, please consider citing the following papers:**
140 |
141 | [1] Zhang, T., Singh, J., Litfin, T., Zhan, J., Paliwal, K. and Zhou, Y., 2020. RNAcmap: A Fully Automatic Method for Predicting Contact Maps of RNAs by Evolutionary Coupling Analysis. bioRxiv.
142 |
143 | [2] Zhang, H., Zhang, L., Mathews, D.H. and Huang, L., 2020. LinearPartition: linear-time approximation of RNA folding partition function and base-pairing probabilities. Bioinformatics, 36(Supplement_1), pp.i258-i267.
144 |
145 | [3] Singh, J., Hanson, J., Paliwal, K. and Zhou, Y., 2019. RNA secondary structure prediction using an ensemble of two-dimensional deep neural networks and transfer learning. Nature communications, 10(1), pp.1-13.
146 |
147 | [4] Nawrocki, E.P. and Eddy, S.R., 2013. Infernal 1.1: 100-fold faster RNA homology searches. Bioinformatics, 29(22), pp.2933-2935.
148 |
149 | [5] H.M. Berman, J. Westbrook, Z. Feng, G. Gilliland, T.N. Bhat, H. Weissig, I.N. Shindyalov, P.E. Bourne. (2000) The Protein Data Bank. Nucleic Acids Research, 28: 235-242.
150 |
151 | [6] Padideh Danaee, Mason Rouches, Michelle Wiley, Dezhong Deng, Liang Huang, David Hendrix, bpRNA: large-scale automated annotation and analysis of RNA secondary structure, Nucleic Acids Research, Volume 46, Issue 11, 20 June 2018, Pages 5381–5394, https://doi.org/10.1093/nar/gky285
152 |
153 | [7] Kamisetty, H., Ovchinnikov, S. and Baker, D., 2013. Assessing the utility of coevolution-based residue–residue contact predictions in a sequence-and structure-rich era. Proceedings of the National Academy of Sciences, 110(39), pp.15674-15679.
154 |
155 | [8] Chiu, J.K.H. and Chen, Y.P.P., 2014. Efficient conversion of RNA pseudoknots to knot-free structures using a graphical model. IEEE Transactions on Biomedical Engineering, 62(5), pp.1265-1271.
156 |
157 | Licence
158 | -----
159 | Mozilla Public License 2.0
160 |
161 |
162 | Contact
163 | -----
164 | jaswinder.singh3@griffithuni.edu.au, yaoqi.zhou@griffith.edu.au
165 |
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os, six, sys, subprocess
3 | import tensorflow as tf
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from pathlib import Path
8 |
9 | # ------------- one hot encoding of RNA sequences -----------------#
def one_hot(seq):
    """One-hot encode an RNA sequence over the alphabet ``AUCG``.

    Args:
        seq: iterable of single-character bases; matching is case-insensitive.

    Returns:
        Integer array of shape ``(len(seq), 4)`` whose columns follow the
        order A, U, C, G.  A row is all ``-1`` when the symbol is not one of
        the four bases.  An empty sequence yields an array of shape ``(0, 4)``
        instead of raising (``np.concatenate`` rejects an empty list).
    """
    BASES = 'AUCG'
    symbols = [str(base).upper() for base in seq]

    # Start from the "unknown symbol" encoding and overwrite recognised bases.
    feat = np.full((len(symbols), len(BASES)), -1, dtype=int)
    for row, symbol in enumerate(symbols):
        if len(symbol) == 1 and symbol in BASES:
            feat[row] = 0
            feat[row, BASES.index(symbol)] = 1

    return feat
19 |
20 |
def z_mask(seq_len):
    """Return a (seq_len, seq_len) float mask that is 1 where j - i >= 2, else 0."""
    shape = (seq_len, seq_len)
    return np.triu(np.ones(shape), k=2)
24 |
def l_mask(inp, seq_len):
    """Return a (seq_len, seq_len) float mask that is 1 strictly above the diagonal.

    ``inp`` is accepted for interface compatibility; it is not read.
    """
    shape = (seq_len, seq_len)
    return np.triu(np.ones(shape), k=1)
28 |
def get_data(seq, rna_id, args):
    """Assemble the pairwise input features for one RNA.

    Feature files are looked up next to ``args.inputs`` using its path without
    extension: ``.pssm`` (required), ``.prob`` and ``.dca`` (optional; an
    all-zero matrix is used and a message printed when loading fails).

    Args:
        seq: RNA sequence string.
        rna_id: identifier, used only in the diagnostic messages.
        args: object exposing ``inputs`` (path of the input FASTA file).

    Returns:
        Tuple ``(seq_len, feature, zero_mask, label_mask, true_label)`` where
        the last four entries are flat Python lists.  ``feature`` is the
        flattened ``(seq_len, seq_len, 18)`` tensor; ``true_label`` is a copy
        of the flattened label mask.
    """
    seq_len = len(seq)
    one_hot_feat = one_hot(seq)
    base_path = os.path.splitext(args.inputs)[0]

    ############ load PSSM profile ##############################
    with open(base_path + '.pssm') as f:
        temp = pd.read_csv(f, comment='#', sep=r'\s+', header=None).values
    # From here on ``seq`` is the sequence column of the PSSM file (T -> U).
    seq = ['U' if k == 'T' else k for k in temp[:, 0]]
    profile = temp[:, 1:5].astype(float)
    # NOTE(review): the previous code tried to raise the pseudo-count of the
    # native base to 8.7 via ``BASES.index(K)``, but ``BASES`` is not defined
    # at module scope, so every iteration raised NameError inside a bare
    # ``except: pass`` and the boost was never applied.  The uniform 0.3
    # pseudo-count below reproduces the values that were actually computed;
    # confirm against the training pipeline before enabling the 8.7 boost.
    profile += 0.3
    profile /= np.sum(profile, axis=1, keepdims=True)
    profile = -np.log(profile)

    profile_one_hot = np.concatenate([profile, one_hot_feat], axis=1)

    ############ load base-pair prob from linearpartition ##############################
    try:
        with open(base_path + '.prob', 'r') as f:
            prob = pd.read_csv(f, sep=r'\s+', header=None, skiprows=[0]).values
        bp_prob_lp = np.zeros((len(seq), len(seq)))
        for i in prob:
            # Indices in the .prob file are shifted down by one before use.
            bp_prob_lp[int(i[0])-1, int(i[1])-1] = i[2]
        bp_prob_lp = bp_prob_lp + np.transpose(bp_prob_lp)
    except Exception:
        # Best effort: a missing or malformed file degrades to an empty matrix.
        print("linearpartition output missing",rna_id)
        bp_prob_lp = np.zeros((len(seq), len(seq)))

    ############ load dca obtained from gremlin ##############################
    try:
        with open(base_path + '.dca') as f:
            temp4 = pd.read_csv(f, comment='#', sep=r'\s+', header=None, skiprows=[0], usecols=[0,1,2]).values
        dca_output = np.zeros((len(seq), len(seq)))
        for k in temp4:
            # Couplings between positions closer than 4 are scaled by 0.01.
            if abs(int(k[0]) - int(k[1])) < 4:
                dca_output[int(k[0]), int(k[1])] = 0.01*k[2]
            else:
                dca_output[int(k[0]), int(k[1])] = k[2]
        dca_output = dca_output + np.transpose(dca_output)
    except Exception:
        print("dca missing", rna_id)
        dca_output = np.zeros((len(seq), len(seq)))

    zero_mask = z_mask(seq_len)[None, :, :, None]
    label_mask = l_mask(profile_one_hot, seq_len)

    # Pairwise tensor: per-position features of i and of j (2 x 8 channels),
    # followed by the DCA and base-pair-probability channels.
    temp = profile_one_hot[None, :, :]
    temp = np.tile(temp, (temp.shape[1], 1, 1))
    feature = np.concatenate([temp, np.transpose(temp, [1, 0, 2])], 2)
    feature = np.concatenate([feature, np.expand_dims(dca_output, axis=2)], axis=2)
    feature = np.concatenate([feature, np.expand_dims(bp_prob_lp, axis=2)], axis=2)

    assert feature.shape==(seq_len,seq_len, 18)

    flat_label_mask = label_mask.flatten().tolist()
    return (seq_len,
            feature.astype(float).flatten().tolist(),
            zero_mask.flatten().tolist(),
            flat_label_mask,
            list(flat_label_mask))
88 |
def _int64_feature(value):
    """Wrap a scalar, list or ndarray in an int64 ``tf.train.Feature``."""
    values = value if isinstance(value, (list, np.ndarray)) else [value]
    int_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int_list)
94 |
95 |
def _float_feature(value):
    """Wrap a scalar, list or ndarray in a float ``tf.train.Feature``."""
    values = value if isinstance(value, (list, np.ndarray)) else [value]
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)
101 |
102 |
def _bytes_feature(value):
    """Wrap a single value in a bytes ``tf.train.Feature``; text is UTF-8 encoded first."""
    payload = value
    if isinstance(payload, six.string_types):
        payload = six.binary_type(payload, encoding='utf-8')
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)
108 |
def _parse_fasta_records(lines):
    """Turn stripped, non-empty FASTA lines into ``(name, sequence)`` tuples.

    Sequences are upper-cased *before* T is mapped to U so that lower-case
    ``t`` is converted as well.  When the first line is a ``>`` header, every
    following non-header line is appended to the current record, so wrapped
    (multi-line) sequences are supported; headers without any sequence line
    are skipped.  When the first line is not a header, the legacy layout of
    alternating name/sequence lines is assumed.
    """
    def normalise(raw):
        return raw.replace(" ", "").upper().replace("T", "U")

    if not lines:
        return []

    if not lines[0].startswith(">"):
        # Legacy layout: line 2*i is the name, line 2*i+1 the sequence.
        return [(lines[2 * i].replace(">", ""), normalise(lines[2 * i + 1]))
                for i in range(len(lines) // 2)]

    records = []
    name, chunks = None, []
    for line in lines:
        if line.startswith(">"):
            if name is not None and chunks:
                records.append((name, normalise("".join(chunks))))
            name, chunks = line.replace(">", ""), []
        else:
            chunks.append(line)
    if name is not None and chunks:
        records.append((name, normalise("".join(chunks))))
    return records


def create_tfr_files(args):
    """Write the TFRecords file consumed by SPOT-RNA2.

    Reads the FASTA file at ``args.inputs`` and writes one serialized
    ``tf.train.Example`` per record to ``<args.inputs without extension>.tfrecords``.

    Args:
        args: object exposing ``inputs`` (path of the FASTA file); it is also
            forwarded to ``get_data``.
    """
    print('\nPreparing tfr records file for SPOT-RNA2:')
    path_tfrecords = os.path.splitext(args.inputs)[0] + ".tfrecords"
    with open(args.inputs) as file:
        input_data = [line.strip() for line in file.read().splitlines() if line.strip()]

    records = _parse_fasta_records(input_data)

    # The context manager closes the writer; no explicit close() is needed.
    with tf.io.TFRecordWriter(path_tfrecords) as writer:
        for name, sequence in tqdm(records):
            seq_len, feature, zero_mask, label_mask, true_label = get_data(sequence, name, args)

            example = tf.train.Example(features=tf.train.Features(feature={
                'rna_name': _bytes_feature(name),
                'seq_len': _int64_feature(seq_len),
                'feature': _float_feature(feature),
                'zero_mask': _float_feature(zero_mask),
                'label_mask': _float_feature(label_mask),
                'true_label': _float_feature(true_label)}))

            writer.write(example.SerializeToString())
139 |
140 | # ----------------------- hair pin loop assumption: pairs with |i - j| < 3 are flagged --------------------------------#
def hair_pin_assumption(pred_pairs):
    """Return the (i, j) part of every predicted pair whose indices differ by less than 3."""
    return [pair[:2] for pair in pred_pairs if abs(pair[0] - pair[1]) < 3]
148 |
def flatten(x):
    """Recursively flatten nested iterables into one list; strings are kept whole."""
    flat = []
    for item in x:
        nested = hasattr(item, "__iter__") and not isinstance(item, str)
        if not nested:
            flat.append(item)
            continue
        flat += flatten(item)
    return flat
157 |
def type_pairs(pairs, sequence):
    """Split base pairs into Watson-Crick, wobble and other pairs.

    Args:
        pairs: iterable of index pairs into ``sequence``.
        sequence: RNA sequence; compared case-insensitively.

    Returns:
        ``(watson, wobble, other)`` lists.  ``watson`` lists all A-U pairs
        first, followed by all G-C pairs; ``wobble`` holds the G-U pairs.
    """
    bases = [b.upper() for b in sequence]
    category_of = {
        ("A", "U"): "AU", ("U", "A"): "AU",
        ("G", "C"): "GC", ("C", "G"): "GC",
        ("G", "U"): "GU", ("U", "G"): "GU",
    }
    buckets = {"AU": [], "GC": [], "GU": [], "other": []}
    for pair in pairs:
        key = (bases[pair[0]], bases[pair[1]])
        buckets[category_of.get(key, "other")].append(pair)

    return buckets["AU"] + buckets["GC"], buckets["GU"], buckets["other"]
180 |
181 | # ----------------------- find multiplets pairs--------------------------------#
def multiplets_pairs(pred_pairs):
    """Group predicted pairs by the indices that occur in more than one pair.

    Returns a list with one entry per repeated index (in ascending index
    order); each entry lists, in input order, the (i, j) pairs that contain
    that index.
    """
    trimmed = [pair[:2] for pair in pred_pairs]

    # Count how often each index appears across all pairs.
    occurrences = {}
    for index in flatten(trimmed):
        occurrences[index] = occurrences.get(index, 0) + 1

    repeated = [index for index in sorted(occurrences) if occurrences[index] > 1]

    return [[pair for pair in trimmed if index in pair] for index in repeated]
209 |
def multiplets_free_bp(pred_pairs, y_pred):
    """Remove multiplets by repeatedly dropping the lowest-scoring pair of each group.

    Args:
        pred_pairs: list of predicted index pairs.
        y_pred: 2-D score matrix indexed as ``y_pred[i, j]``.

    Returns:
        ``(kept_pairs, removed_pairs)``; ``removed_pairs`` is de-duplicated.
    """
    initial_count = len(pred_pairs)
    dropped = []
    groups = multiplets_pairs(pred_pairs)
    while len(groups) > 0:
        weakest_per_group = []
        for group in groups:
            scores = [y_pred[pair[0], pair[1]] for pair in group]
            weakest = group[scores.index(min(scores))]
            weakest_per_group.append(weakest)
            dropped.append(weakest)
        pred_pairs = [pair for pair in pred_pairs if pair not in weakest_per_group]
        groups = multiplets_pairs(pred_pairs)
    # A pair can be the weakest member of two groups; keep it once.
    dropped = [list(item) for item in set(tuple(pair) for pair in dropped)]
    assert initial_count == len(pred_pairs)+len(dropped)
    return pred_pairs, dropped
228 |
def output_mask(seq, NC=True):
    """Build a (len(seq), len(seq)) float mask of allowed base combinations.

    Entry ``[i, j]`` is 1 when ``str(seq[i]) + str(seq[j])`` is an allowed
    pair.  With ``NC=False`` only AU/UA/GC/CG/GU/UG are allowed; with
    ``NC=True`` the listed non-canonical combinations are allowed too.
    """
    canonical = ['AU', 'UA', 'GC', 'CG', 'GU', 'UG']
    non_canonical = ['CC', 'GG', 'AG', 'CA', 'AC', 'UU', 'AA', 'CU', 'GA', 'UC']
    allowed = canonical + non_canonical if NC else canonical

    size = len(seq)
    mask = np.zeros((size, size))
    symbols = [str(s) for s in seq]
    for row, first in enumerate(symbols):
        for col, second in enumerate(symbols):
            if first + second in allowed:
                mask[row, col] = 1
    return mask
240 |
def ct_file_output(pairs, seq, id, save_result_path):
    """Write the predicted structure as ``<save_result_path>/<id>.ct``.

    Args:
        pairs: iterable of 0-based index pairs.
        seq: RNA sequence.
        id: record name; used for the file name and in the header line.
        save_result_path: output directory.
    """
    length = len(seq)
    index = np.arange(1, length + 1, 1)
    bases = np.array(list(seq))
    previous = np.arange(0, length, 1)
    following = np.append(np.delete(index, 0), [0])
    partner = np.zeros(length, dtype=int)

    # Partners are stored 1-based; 0 means unpaired.
    for pair in pairs:
        left, right = pair[0], pair[1]
        partner[left] = int(right) + 1
        partner[right] = int(left) + 1
    numbering = np.arange(1, length + 1, 1)

    columns = (np.char.mod('%d', index), bases, np.char.mod('%d', previous),
               np.char.mod('%d', following), np.char.mod('%d', partner),
               np.char.mod('%d', numbering))
    table = np.vstack(columns).T

    header_line = '\t\t'.join([str(length), str(id), 'SPOT-RNA2 output\n'])
    target = os.path.join(save_result_path, str(id)) + '.ct'
    np.savetxt(target, table, delimiter='\t\t', fmt="%s", header=header_line, comments='')

    return
259 |
def bpseq_file_output(pairs, seq, id, save_result_path):
    """Write the predicted structure as ``<save_result_path>/<id>.bpseq``.

    Args:
        pairs: iterable of 0-based index pairs.
        seq: RNA sequence.
        id: record name; used for the file name and the ``#`` header line.
        save_result_path: output directory.
    """
    length = len(seq)
    index = np.arange(1, length + 1, 1)
    bases = np.array(list(seq))
    partner = np.zeros(length, dtype=int)

    # Partners are stored 1-based; 0 means unpaired.
    for pair in pairs:
        left, right = pair[0], pair[1]
        partner[left] = int(right) + 1
        partner[right] = int(left) + 1

    columns = (np.char.mod('%d', index), bases, np.char.mod('%d', partner))
    table = np.vstack(columns).T

    target = os.path.join(save_result_path, str(id)) + '.bpseq'
    np.savetxt(target, table, delimiter=' ', fmt="%s", header='#' + str(id), comments='')

    return
278 |
def lone_pair(pairs):
    """Return the pairs that have no stacked neighbour.

    A pair ``[i, j]`` is kept when neither ``[i-1, j+1]`` nor ``[i+1, j-1]``
    is present in ``pairs``.  ``pairs`` is sorted in place.
    """
    pairs.sort()
    isolated = []
    for pair in pairs:
        outer = [pair[0] - 1, pair[1] + 1]
        inner = [pair[0] + 1, pair[1] - 1]
        if outer in pairs or inner in pairs:
            continue
        isolated.append(pair)

    return isolated
287 |
def prob_to_secondary_structure(ensemble_outputs, label_mask, seq, name, args, threshold=0.795):
    """Convert pair probabilities into a secondary structure and write it to disk.

    Writes ``<name>.ct``, ``<name>.bpseq`` and ``<name>.prob`` into the output
    directory and, when ``args.motifs`` is set, starts the bpRNA script on the
    ``.bpseq`` file.

    Args:
        ensemble_outputs: array with one probability per upper-triangular
            (i < j) position, in row-major order.
        label_mask: ignored; the upper-triangular mask is rebuilt from ``seq``.
            Kept for interface compatibility.
        seq: RNA sequence.
        name: record name used for the output file names.
        args: object exposing ``outputs`` (output directory) and ``motifs``.
        threshold: minimum probability for a position to be called a pair.
    """
    seq_len = len(seq)
    tri_inds = np.triu_indices(seq_len, k=1)

    # Scatter the flat predictions into the upper triangle of a square matrix.
    scores = np.asarray(ensemble_outputs)
    n_scores = scores.shape[0]
    y_pred = np.zeros((seq_len, seq_len))
    y_pred[tri_inds[0][:n_scores], tri_inds[1][:n_scores]] = scores.reshape(n_scores)

    selected = np.flatnonzero(y_pred[tri_inds] >= threshold)
    pred_pairs = [[tri_inds[0][j], tri_inds[1][j]] for j in selected]
    pred_pairs, save_multiplets = multiplets_free_bp(pred_pairs, y_pred)

    repo_root = Path(os.path.dirname(os.path.realpath(__file__))).parent
    if args.outputs=='outputs/':
        # The default output folder lives next to the ``utils`` package.
        output_path = os.path.join(repo_root, args.outputs)
    else:
        output_path = args.outputs
    # Absolute path: results must not depend on the current working directory,
    # and the bpRNA subprocess below runs with a different cwd.
    output_path = os.path.abspath(output_path)
    os.makedirs(output_path, exist_ok=True)

    ct_file_output(pred_pairs, seq, name, output_path)
    bpseq_file_output(pred_pairs, seq, name, output_path)
    np.savetxt(os.path.join(output_path, name + '.prob'), y_pred, delimiter='\t')

    if args.motifs:
        try:
            # Run the script inside the output folder via ``cwd`` instead of
            # os.chdir(): chdir changed the working directory of the whole
            # process and broke relative output paths on subsequent calls.
            subprocess.Popen(['perl', os.path.join(repo_root, 'utils/bpRNA.pl'),
                              os.path.join(output_path, name + '.bpseq')],
                             cwd=output_path)
        except Exception:
            print('\nUnable to run bpRNA script;\nplease refer to "https://github.com/hendrixlab/bpRNA/" for system requirments to use bpRNA')

    return
330 |
--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.msa:
--------------------------------------------------------------------------------
1 | # STOCKHOLM 1.0
2 | #=GF AU Infernal 1.1.3
3 |
4 | #=GS 6UFJ_A/1-51 DE Chain A, RNA (50-MER)
5 | #=GS 6UEY_A/1-50 DE Chain A, RNA (50-MER)
6 | #=GS HE577054.1/3246821-3246757 DE Paenibacillus polymyxa M1 main chromosome, complete genome
7 | #=GS MF288922.1/150528-150592 DE Bacillus phage Janet, complete genome
8 | #=GS CP033464.1/4485719-4485655 DE Brevibacillus laterosporus strain 1821L chromosome, complete genome
9 | #=GS KT307976.1/157679-157741 DE Bacillus phage AvesoBmore, complete genome
10 | #=GS CP032410.1/870062-870126 DE Brevibacillus laterosporus strain E7593-50 chromosome, complete genome
11 | #=GS MK892513.1/27480-27550 DE Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome
12 | #=GS MK892777.1/32264-32334 DE Prokaryotic dsDNA virus sp. isolate Tp1_39_SUR_34326_1, partial genome
13 | #=GS MF288921.1/151458-151522 DE Bacillus phage OTooleKemple52, complete genome
14 | #=GS MH638310.1/151443-151507 DE Bacillus phage Kamfam, complete genome
15 | #=GS KJ489397.1/151758-151822 DE Bacillus phage CAM003, complete genome
16 | #=GS KJ489398.1/150857-150921 DE Bacillus phage Evoli, complete genome
17 | #=GS KJ489400.1/150952-151016 DE Bacillus phage Hoody T, complete genome
18 | #=GS KU737346.1/152020-152084 DE Bacillus phage Vinny, complete genome
19 | #=GS KF669647.1/155754-155816 DE Bacillus phage BigBertha, complete genome
20 | #=GS KU737345.1/154884-154946 DE Bacillus phage Juglone, complete genome
21 | #=GS KU737347.1/155734-155796 DE Bacillus phage Phrodo, complete genome
22 | #=GS MN038178.1/155190-155252 DE Bacillus phage Beyonphe, complete genome
23 | #=GS KF208639.2/156075-156137 DE Bacillus phage Troll, complete genome
24 | #=GS CP009278.1/2800251-2800310 DE Sphingobacterium sp. ML3W, complete genome
25 | #=GS CP045298.1/5377890-5377826 DE Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome
26 | #=GS KF669662.1/155100-155162 DE Bacillus phage Spock, complete genome
27 | #=GS KR063281.1/60079-60028 DE Gordonia phage GMA2, complete genome
28 | #=GS KJ489402.1/153758-153819 DE Bacillus phage Riley, complete genome
29 | #=GS MF765814.1/155980-156041 DE Bacillus phage Taffo16, complete genome
30 | #=GS CP000154.2/3364238-3364174 DE Paenibacillus polymyxa E681, complete genome
31 | #=GS LN852800.1/7754-7693 DE Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110
32 | #=GS CP019039.1/7984-8046 DE Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence
33 | #=GS LN852940.1/1904-1844 DE Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268
34 | #=GS JN790865.1/35681-35620 DE Bacillus phage B4, complete genome
35 | #=GS JN797796.1/35736-35675 DE Bacillus phage B5S, complete genome
36 | #=GS KY888882.1/156410-156472 DE Bacillus phage Flapjack, complete genome
37 | #=GS CP014843.1/29638-29697 DE Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence
38 | #=GS CP021670.1/37922-37863 DE Bacillus licheniformis strain SRCM100141 plasmid pBL141-2 sequence
39 | #=GS CP035189.1/167253-167194 DE Bacillus licheniformis strain SRCM103914 plasmid unnamed1, complete sequence
40 | #=GS CP045906.1/14639513-14639571 DE Caligus rogercresseyi isolate FCH chromosome 17
41 | #=GS HG916826.1/843085-843030 DE Pseudomonas pseudoalcaligenes CECT 5344 complete genome
42 | #=GS LK391695.1/845304-845249 DE Pseudomonas pseudoalcaligenes genome assembly Ppseudo_Pac, chromosome : I
43 | #=GS XM_028713395.1/30-87 DE PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA
44 | #=GS AC100771.2/133706-133648 DE Homo sapiens chromosome 11, clone RP11-159H22, complete sequence
45 | #=GS CP022654.2/63818-63880 DE Bacillus velezensis strain SCDB 291 chromosome, complete genome
46 | #=GS CP023320.1/44833-44771 DE Bacillus velezensis strain SCGB 1 chromosome, complete genome
47 | #=GS CP045899.1/5107513-5107456 DE Caligus rogercresseyi isolate FCH chromosome 10
48 | #=GS CP045890.1/2686952-2687009 DE Caligus rogercresseyi isolate FCH chromosome 1
49 | #=GS CP010557.1/4528803-4528858 DE Raoultella ornithinolytica strain S12, complete genome
50 | #=GS LR134253.1/1479651-1479596 DE Klebsiella aerogenes strain NCTC9997 genome assembly, chromosome: 3
51 | #=GS MH153801.1/58164-58217 DE Microbacterium phage Count, complete genome
52 | #=GS CP045896.1/486401-486459 DE Caligus rogercresseyi isolate FCH chromosome 7
53 | #=GS CP045901.1/8022709-8022767 DE Caligus rogercresseyi isolate FCH chromosome 12
54 |
55 | 6UFJ_A/1-51 ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCG.GAGAGCAG...A-...............---------
56 | #=GR 6UFJ_A/1-51 PP ******************************************.********...8.........................
57 | 6UEY_A/1-50 ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCG.GAGAGCAG...--...............---------
58 | #=GR 6UEY_A/1-50 PP ******************************************.********.............................
59 | HE577054.1/3246821-3246757 ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCG.UUCACCG-...--ggau....caug...-CGGUGAGG
60 | #=GR HE577054.1/3246821-3246757 PP ******************************************.******8......5666....6665....8*******
61 | MF288922.1/150528-150592 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG
62 | #=GR MF288922.1/150528-150592 PP *************************7.59*************88******9.....577......777...9********
63 | CP033464.1/4485719-4485655 ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCG.UUGAGAAG...--caa.....aaag...CUUCUCAGG
64 | #=GR CP033464.1/4485719-4485655 PP ************************76.59*************.*******9.....677.....7777...9********
65 | KT307976.1/157679-157741 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGCGGG
66 | #=GR KT307976.1/157679-157741 PP ***********************854.499************.********76666...............*********
67 | CP032410.1/870062-870126 ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCG.UUGAGGAG...--cga.....ucag...CUUCUCAGG
68 | #=GR CP032410.1/870062-870126 PP ************************76.59*************.*******9.....677.....7777...9********
69 | MK892513.1/27480-27550 AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGuGCGUAUAG...--cuaggucaagug...CUAUACGCG
70 | #=GR MK892513.1/27480-27550 PP ************************9.89**********************9.....8***********...9********
71 | MK892777.1/32264-32334 AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGuGCGUAUAG...--cuaggucaagug...CUAUACGCG
72 | #=GR MK892777.1/32264-32334 PP ************************9.89**********************9.....8***********...9********
73 | MF288921.1/151458-151522 ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG
74 | #=GR MF288921.1/151458-151522 PP ************************75.59*************88******9.....577......777...9********
75 | MH638310.1/151443-151507 ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG
76 | #=GR MH638310.1/151443-151507 PP ************************75.59*************88******9.....577......777...9********
77 | KJ489397.1/151758-151822 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
78 | #=GR KJ489397.1/151758-151822 PP *************************7.59*************889*99999.....577......777...999****9*
79 | KJ489398.1/150857-150921 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
80 | #=GR KJ489398.1/150857-150921 PP *************************7.59*************889*99999.....577......777...999****9*
81 | KJ489400.1/150952-151016 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
82 | #=GR KJ489400.1/150952-151016 PP *************************7.59*************889*99999.....577......777...999****9*
83 | KU737346.1/152020-152084 ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
84 | #=GR KU737346.1/152020-152084 PP *************************7.59*************889*99999.....577......777...999****9*
85 | KF669647.1/155754-155816 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
86 | #=GR KF669647.1/155754-155816 PP ***********************854.499************.********76666...............*********
87 | KU737345.1/154884-154946 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
88 | #=GR KU737345.1/154884-154946 PP ***********************854.499************.********76666...............*********
89 | KU737347.1/155734-155796 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
90 | #=GR KU737347.1/155734-155796 PP ***********************854.499************.********76666...............*********
91 | MN038178.1/155190-155252 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
92 | #=GR MN038178.1/155190-155252 PP ***********************854.499************.********76666...............*********
93 | KF208639.2/156075-156137 ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGcuuAU...............CCCCGUGGG
94 | #=GR KF208639.2/156075-156137 PP ***********************854.499************.********76666...............*********
95 | CP009278.1/2800251-2800310 AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCG.CCCCGAUA...AU...............AAUCGGGAG
96 | #=GR CP009278.1/2800251-2800310 PP ************************98.499************.********...**...............*********
97 | CP045298.1/5377890-5377826 GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCG.UUCACCGG..gAU............cauCCGGUGAGG
98 | #=GR CP045298.1/5377890-5377826 PP ******************************************.*******9..643............334*********
99 | KF669662.1/155100-155162 ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCG.UCGCGGAGuuuAU...............CUCCGCGGG
100 | #=GR KF669662.1/155100-155162 PP *********************99843.499************.********76666...............*********
101 | KR063281.1/60079-60028 ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCG.GCC-----...--ga........ga...-----GGCG
102 | #=GR KR063281.1/60079-60028 PP ************************88.59*************.985..........67........76........589*
103 | KJ489402.1/153758-153819 ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCG.UCGCGGG-...--guuu....aucu...-C--CCGCG
104 | #=GR KJ489402.1/153758-153819 PP ***********************854.49*************.6665555......4566....6654....4..5555*
105 | MF765814.1/155980-156041 ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCG.UCGCGGG-...--guuu....aucu...-C--CCGCG
106 | #=GR MF765814.1/155980-156041 PP ***********************854.49*************.6665555......4566....6654....4..5555*
107 | CP000154.2/3364238-3364174 GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCG.UUCACUGG...A-uu.......cgu...CCAGUGAGA
108 | #=GR CP000154.2/3364238-3364174 PP ******************************************.********...8.55.......555...*********
109 | LN852800.1/7754-7693 GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCG.UUCCGGUU...AU............a.uGAUCGGAGG
110 | #=GR LN852800.1/7754-7693 PP **************************5.9*************.*****998...64............3.3789******
111 | CP019039.1/7984-8046 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCG.UCCUUCC-...--ugc.....uucg...-GGAAGGGG
112 | #=GR CP019039.1/7984-8046 PP ***********************975.69*************.***9997......688.....8886....7999****
113 | LN852940.1/1904-1844 GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGuUCCAGGAU...G-...............AUCCUGGAG
114 | #=GR LN852940.1/1904-1844 PP ******************************************889******...9................*********
115 | JN790865.1/35681-35620 AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uu........ua...UCCCCCGUG
116 | #=GR JN790865.1/35681-35620 PP ***********************854.499************.99977665.....33........33...34555888*
117 | JN797796.1/35736-35675 AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uu........ua...UCCCCCGUG
118 | #=GR JN797796.1/35736-35675 PP ***********************854.499************.99977665.....33........33...34555888*
119 | KY888882.1/156410-156472 ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uuu......auc...CCCUGCG-G
120 | #=GR KY888882.1/156410-156472 PP ***********************854.499************.99999999.....455......555...8899999.*
121 | CP014843.1/29638-29697 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU
122 | #=GR CP014843.1/29638-29697 PP ***********************975.69*************.********...**...............*********
123 | CP021670.1/37922-37863 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU
124 | #=GR CP021670.1/37922-37863 PP ***********************975.69*************.********...**...............*********
125 | CP035189.1/167253-167194 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU
126 | #=GR CP035189.1/167253-167194 PP ***********************975.69*************.********...**...............*********
127 | CP045906.1/14639513-14639571 UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAG.UCUUGCCU...--...............GAGCAAGAG
128 | #=GR CP045906.1/14639513-14639571 PP ***************************999999999******.*****943....................569******
129 | HG916826.1/843085-843030 CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCG.AAAUUCGG...GG...............CCG-----G
130 | #=GR HG916826.1/843085-843030 PP ******************************************.********...**...............***.....*
131 | LK391695.1/845304-845249 CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCG.AAAUUCGG...GG...............CCG-----G
132 | #=GR LK391695.1/845304-845249 PP ******************************************.********...**...............***.....*
133 | XM_028713395.1/30-87 ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCAC.AGCGGCAG...CA...............CUGCCGCAG
134 | #=GR XM_028713395.1/30-87 PP *************************7...669****998888.9*******...99...............*********
135 | AC100771.2/133706-133648 GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCU.UUAAAAAA...AA...............UUUUUUUAA
136 | #=GR AC100771.2/133706-133648 PP ******************************9..9********.98888888...88...............*********
137 | CP022654.2/63818-63880 AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCG.UCCUUCC-...--ugc.....uacg...-GGAAGGGG
138 | #=GR CP022654.2/63818-63880 PP ************************86.69*************.****997......678.....8886....899*****
139 | CP023320.1/44833-44771 AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCG.UCCUUCC-...--ugc.....uacg...-GGAAGGGG
140 | #=GR CP023320.1/44833-44771 PP ************************86.69*************.****997......678.....8886....899*****
141 | CP045899.1/5107513-5107456 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGCUG...--...............-AGCAAGAG
142 | #=GR CP045899.1/5107513-5107456 PP ***************************999999999******.******85.....................59******
143 | CP045890.1/2686952-2687009 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGCUG...--...............-AGCAAGAG
144 | #=GR CP045890.1/2686952-2687009 PP ***************************999999999******.******85.....................59******
145 | CP010557.1/4528803-4528858 CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------.UCAGGCAG...AU..............uCUGCCAGAG
146 | #=GR CP010557.1/4528803-4528858 PP **********************************96.......8889****...96..............6*********
147 | LR134253.1/1479651-1479596 CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------.UCAGGCAG...AU..............uCUGCCAGAG
148 | #=GR LR134253.1/1479651-1479596 PP **********************************96.......8889****...96..............6*********
149 | MH153801.1/58164-58217 AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCG.GAUAGAUG...--...............----UAUUG
150 | #=GR MH153801.1/58164-58217 PP ************************98.49*************.*9985433........................4566*
151 | CP045896.1/486401-486459 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGC--...AU...............GAGCAAGAG
152 | #=GR CP045896.1/486401-486459 PP ***************************999999999******.*****9.....77...............78*******
153 | CP045901.1/8022709-8022767 UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGC--...AU...............GAGCAAGAG
154 | #=GR CP045901.1/8022709-8022767 PP ***************************999999999******.*****9.....77...............78*******
155 | #=GC SS_cons <<<<<<____>>>>>>--------------------------.<<<<<<<<...__~~~~~~~~~~~~...>>>>>>>>:
156 | #=GC RF ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCG.GAGAGCAG...AU~~~~~~~~~~~~...CUGCUCUCG
157 | //
158 |
--------------------------------------------------------------------------------
/utils/parse_blastn_local.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use strict;
3 |
4 |
5 |
# Default options
# NOTE(review): most of these thresholds are only declared in this part of the
# script; how they are applied is decided further down and should be confirmed there.
my $E_max=10;
my $E_min=-1.0;
my $P_max=1;
my $cov_thrshd=0;
my $sc_thrshd=-10;
my $qid_thrshd=0;
my $bl=-10; # minimum per-residue bit score with query at ends of HSP for loose end pruning
my $bs=-10; # minimum per-residue bit score with query at ends of HSP for strict end pruning
my $bg=30; # below this number of end gaps the loose HSP pruning score is used
my $outformat="fas"; # output format; "psi" selects a fixed-width name column for the query
my $append=0;
my $query_file=""; # overwritten from $ARGV[1] below
my $infile; # set from $ARGV[0] below
my $outfile; # set from $ARGV[2] below
my $v=2;

# Variable declarations
my $i; # residue index
my $j; # residue index
my $k; # sequence index
my $options="";
my $line; # line read in from file
my $query_length=0; # number of residues in query sequence
my $query_match=0; # number of upper-case residues (=match states) in query sequence
my $capitalize=0; # capitalize query
my $nameline; # >template_name
my $Evalue; # e-value of hit
my $score; # bit score of hit
my $hit_length; # number of residues in HSP
my $coverage; # hit-length/$query_length
my $score_col; # score per column
my $score_min=0; # $score_min=-3*log($P_max)/log(2);

my $query_name; # name of query file
my $queryseq; # residues of query read in with -q or -q2m option
my $qfirst; # index of first query residue in pairwise alignment
my $qlast; # index of last query residue in pairwise alignment
my $tfirst; # index of first template residue in pairwise alignment
my $tlast; # index of last template residue in pairwise alignment
my $tlen=0; # length of template in pairwise alignment
my @query_res; # query residues from current pairwise alignment
my @template_res; # template residues from current pairwise alignment
my $query_res; # query residues from current pairwise alignment
my $template_res; # template residues from current pairwise alignment
my $line_number=0;
my $new_hit=""; # new sequence record; is only written if coverage threshold is exceeded
my $nhit=0; # counts the number of sequences already in alignment
my @hitnames; # $hitnames[$nhit] is the nameline of the ihit'th hit
my @hitseqs; # $hitseqs[$nhit] contains the residues of the ihit'th hit
my @match; # for -q option: $match[$i]=1 if $i'th query residue is capital letter in query, else 0
my $qid; # $qid is sequence identity with query (for -q option: CONSIDER ONLY MATCH STATES)
my $len; # $len is sequence number of residues of seq k aligned with a match state in query
my $b; # minimum per-residue bit score with query at ends of HSP
my $pfile=""; # alignment file used to calculate PSSM for -p and s/c options
my $bfile=""; # alignment file used to calculate PSSM for -b option
my $GAP=11.0/3.0; # gap opening penalty in bits (for BLOSUM62: 11 bits/3)
my $EXTEND=1.0/3.0; # gap extension penalty in bits (for BLOSUM62: 1 bits/3)
my @queryseq;
my $skip=0; # skip this template sequence because it might be a synthetic fusion protein
my $best=0; # extract only the best HSP per sequence
my $rescaled_Gonnet=0; # Gonnet matrix not yet rescaled to bits
# NOTE(review): the array names inside the next two comments look swapped
# (@qp is described as $qb and vice versa) - confirm against the code that fills them.
my @qp=(); # $qb[$i][$a] is PSSM from alignment read in with -B option
my @qb=(); # $qp[$i][$a] is PSSM from alignment read in with -P option


# Positional command-line arguments: input file, query file, output file.
$infile=$ARGV[0];
$query_file= $ARGV[1];
$outfile=$ARGV[2];
76 | #Include query sequence as first sequence in alignment?
77 | if ($query_file) {
78 | open(QUERYFILE,"<$query_file") or die ("ERROR: Cannot open $query_file: $!\n");
79 | while($line=) # Read name line
80 | {
81 | if ($line=~/^>(.*)/)
82 | {
83 | $query_name=$1;
84 | last;
85 | }
86 | }
87 | $hitseqs[0]="";
88 | while($line=) # Read residues
89 | {
90 | if ($line=~/^>/) {last;}
91 | chomp($line);
92 | $line=~s/\s+//g; # remove white space
93 | $hitseqs[0].=$line;
94 | }
95 | close(QUERYFILE);
96 |
97 | # Prepare name line of hit
98 | if ($outformat eq "psi") {
99 | $query_name=~/^(\S{1,20})\S*\s*(.*)/; # delete everything after first block
100 | $line=sprintf("%s",$1);
101 | $line=~ tr/ /_/;
102 | $hitnames[0] = sprintf("%-31.31s ",$line);
103 | } else {
104 | $hitnames[0] = sprintf(">%s E=0.0",$query_name);
105 | }
106 | $hitseqs[0] =~ tr/-.//d; # delete all gaps from query
107 | $queryseq = $hitseqs[0];
108 | $hitseqs[0] =~ tr/a-z/A-Z/d; # capitalize hitseq[0] and delete gaps
109 | # $hitseqs[0] =~ tr/Uu/Cc/; # nicht mehr noetig in blast. Kann aber alignhits.pl zum abschmieren bringen.
110 | $nhit=1;
111 |
112 | # Capitalize query?
113 | if ($capitalize) {$queryseq =~ tr/a-z/A-Z/;}
114 | $query_match = ($queryseq=~tr/A-Z/A-Z/); # count number of match states in query
115 |
116 | # Determine match columns as those with upper case residue in query
117 | @queryseq=unpack("C*",$queryseq);
118 | for ($j=0; $j<@queryseq; $j++) {
119 | if ($queryseq[$j]>=65 && $queryseq[$j]<=90) {$match[$j]=1;} else {$match[$j]=0;}
120 | }
121 | }
122 |
123 |
124 |
125 |
126 | # Scan Blast output file for query length (needed for coverage); INFILE stays open for the main parsing loop
127 | open(INFILE,"<",$infile) or die ("Error: cannot open $infile: $!\n");
128 | $line_number++;
129 | while ($line=<INFILE>)
130 | {
131 | if ($line=~/^Length\s*=\s*(\d+)/) {$query_length = $1; last;} # first "Length=" line found is taken as the query length
132 | $line_number++;
133 | }
134 | #print("Query length = $query_length\n");
135 |
136 | while ($line = <INFILE>) #scan through PsiBlast-output line by line
137 | {
138 | # New nameline found?
139 | #print "$line";
140 | #if ($line=~/^Length\s*=\s*(\d+)/) { print "length=$1\n\n\n\n";}
141 |
142 | if ($line=~s/^>//)
143 | {
144 | #print "$line";
145 | $line=~s/\s+/ /g;
146 | chomp($line);
147 | $nameline=$line;
148 | while ($line=<INFILE>)
149 | {
150 | if ($line=~/^Length\s*=\s*(\d+)/) {last;}
151 | chomp($line);
152 | $nameline.=$line;
153 | }
154 | $line=~/^Length\s*=\s*(\d+)/;
155 | $tlen=$1;
156 | $nameline=~s/\s+/ /g;
157 | $nameline=~s/\s+gi\|/ gi\|/g;
158 | # Is sequence a synthetic fusion protein ?
159 | #if ($nameline=~/(\[synthetic| synthetic|construct|cloning|vector|chimeric|fusion)/i) {$skip=1;} else {$skip=0;}
160 |
161 | #print "$nameline\n";
162 | }
163 |
164 | # New HSP found?
165 | elsif (!$skip && $line=~/^ Score =/)
166 | {
167 | if($best) {$skip=1;} # skip all following hits with same sequence?
168 |
169 | # First check whether E-value is small enough
170 | if($line =~ /^ Score =\s*(\S+)\s*bits\s*\S*\s*Expect =\s*(\S+)/)
171 | {
172 | $score=$1;
173 | $Evalue=$2;
174 |
175 | #print "$score, $Evalue\n";
176 | }
177 | else
178 | {
179 | print("\nWARNING: wrong format in blast output. Expecting Score = ... Expect = ..\n$line\n");
180 | }
181 | $Evalue=~s/^(e|E)/1$1/; # Expect = e-123 -> 1e-123
182 | $Evalue=~tr/,//d;
183 | if ($Evalue>$E_max || $Evalue<$E_min) {$new_hit=""; next;} # reject hit
184 |
185 | # Record sequence identity
186 | # (not needed, qid calculated afterwards WITHOUT counting template residues aligned to gaps in query)
187 | $line=<INFILE>;
188 | if ($line =~ /^ Identities =\s*\S+\/(\S+)\s+\((\S+)%\)/)
189 | {
190 | $qid=$2;
191 | #print "$qid\n";
192 | $line=<INFILE>;
193 | }
194 | else
195 | {
196 | $qid=0.0; # if match is too poor then no identities are given
197 | }
198 |
199 | # Skip another line and read following line
200 |
201 | $line=<INFILE>;
202 | $line=<INFILE>;
203 |
204 | # Read pairwise alignment
205 | $qfirst="";
206 | $tfirst="";
207 | $query_res="";
208 | $template_res="";
209 | while ($line=~/^Query\s+\d+\s+\S+\s+\d*/) # Cycle in this loop until no new "Query:" lines are found
210 | {
211 | if ($line!~/^Query\s+(\d+)\s+(\S+)\s+(\d*)/)
212 | {
213 | print("WARNING 1: wrong format of blast output in $infile, line $.\n");
214 | last;
215 | }
216 | if ($3 eq "") {
217 | <INFILE>; <INFILE>; <INFILE>; $line=<INFILE>;
218 | print("WARNING 2: wrong format of blast output in $infile, line $. Skipping alignment block.\n");
219 | next;
220 | }
221 | if ($qfirst eq "") {$qfirst=$1;}
222 | $query_res .= $2;
223 | $qlast=$3;
224 | <INFILE>; $line=<INFILE>;
225 | if ($line!~/^Sbjct\s+(\d+)\s+(\S+)\s+(\d+)/)
226 | {
227 | print("WARNING 3: wrong format of blast output in $infile, line $.\n");
228 | last;
229 | }
230 | if ($tfirst eq "") {$tfirst=$1;}
231 | $template_res .= $2;
232 | $tlast=$3;
233 | <INFILE>; $line=<INFILE>;
234 | } # end while(1)
235 | # Check lengths
236 | $query_res = uc($query_res);
237 | $template_res = uc($template_res);
238 | if (length($template_res)!=length($query_res)) {
239 | print("WARNING: Query and template lines do not have the same length in $infile, line $.\n");
240 | print("Q: $query_res\n");
241 | print("T: $template_res\n");
242 | next;
243 | }
244 |
245 |
246 | #print "$query_res\n";
247 | #print "$template_res\n";
248 |
249 | # Check whether hit has sufficient score per column
250 | $hit_length=($template_res=~tr/a-zA-Z/a-zA-Z/);
251 | if ($hit_length==0) {next;} # Reject hit?
252 | $score_col=$score/$hit_length;
253 |
254 | @query_res =unpack("C*",$query_res);
255 | @template_res=unpack("C*",$template_res);
256 |
257 | # Prune ends of HSP which are not reliably homologous
258 | #if (($bs>-9 || $bl>-9) && !&PruneHSP()) {next;} # if entire HSP is pruned away, goto next alignment
259 |
260 | # Check whether hit has sufficient sequence identity and coverage with query
261 | if (!$query_file)
262 | {
263 | $len=0; $qid=0;
264 | for ($i=0; $i-9 || $score_min>0) {
297 | if (!&CheckScorePerColumn()) {next;}
298 | }
299 |
300 | if ($v>=3) {printf("nhit=%-2i qid=%-3i qlen=%-3i qid=%-3i%% s/c=%-6.3f\n",$nhit,$qid,$len,100*$qid/$len,$score_col);}
301 |
302 | # Record residues
303 | $new_hit = "-"x($qfirst-1); # Print gaps at beginning of sequence
304 | if ($outformat eq "psi") {
305 | for ($i=0; $i%s(%i-%i:%i) %s E=%g s/c=%4.2f id=%.0f%% cov=%.0f%%",
335 | $1,$tfirst,$tlast,$tlen,$2,$Evalue,$score_col,100*$qid/$len,$coverage);
336 | }
337 |
338 | $nhit++;
339 |
340 | #print "$nhit\n" if($nhit%100 ==0);
341 | } # end elseif new HSP found
342 | } # end while ($line)
343 |
344 | close(INFILE);
345 |
346 |
347 |
348 | # If output format is not "psi" (e.g. fasta or a2m) we have to insert gaps so that every sequence gets the same columns:
349 | if ($outformat ne "psi")
350 | {
351 | my @len_ins; # $len_ins[$j] holds the maximum length, over all sequences, of the $j'th element of the split list built below
352 | my @inserts; # split list of sequence $k: variable-length inserts alternating with single match-state characters ([A-Z] or '-')
353 | my $insert; # loop variable over the elements of @inserts
354 | my $ngap; # declared but not referenced inside this block
355 |
356 | # First pass: for each list position determine the length of the LONGEST element over all hits and store it in @len_ins
357 | for ($k=0; $k<$nhit; $k++) {
358 | # split into list of single match states and variable-length inserts
359 | # ([A-Z]|-) is the split pattern. The parentheses indicate that split patterns are to be included as list elements
360 | # A '#' is added at both ends (removed again in the second pass) so the first and last list elements are never empty
361 | $j=0;
362 | @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
363 | # printf("%3i: %12.12s %s\n",$k,$hitnames[$k],$hitseqs[$k]);
364 | # printf("Sequence $k: @inserts\n");
365 | foreach $insert (@inserts) {
366 | if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {
367 | $len_ins[$j]=length($insert);
368 | }
369 | $j++;
370 | # printf("$insert|");
371 | }
372 | # for (my $i=0; $i<@inserts; $i++) {printf("%s%-2i ",$inserts[$i],$len_ins[$i]);}
373 | # printf("\n");
374 | }
375 |
376 | # Second pass: pad every list element of every hit with gap characters up to $len_ins[$j] characters
377 | for ($k=0; $k<$nhit; $k++) {
378 | # split into list of single match states and variable-length inserts
379 | @inserts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
380 | $j=0;
381 |
382 | # append the missing number of gaps; $insert aliases the array element, so appending changes @inserts itself
383 | foreach $insert (@inserts) {
384 | if($outformat eq "fas") {
385 | for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.="-";}
386 | }
387 | else {
388 | for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
389 | }
390 | $j++;
391 | }
392 | $hitseqs[$k] = join("",@inserts); # re-assemble the padded sequence
393 | $hitseqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
394 | }
395 | }
396 |
397 |
398 | if ($query_file) { # only possible when a query was read, because @match is filled from the query file
399 | # Determine match states: build @matchali, a 0/1 flag per column of the gapped query row
400 | my @qa2m = unpack("C*",$hitseqs[0]); # $hitseq[0] is query sequence WITH INSERTS (as byte values)
401 | my @matchali=(); # $matchali[$j]=1 if alignment column $j is a match column, else 0
402 | my $L=scalar(@qa2m); # number of columns in the query row
403 | $j=0;
404 | for ($i=0; $i<@match; $i++) {
405 | while ($j<$L && !($qa2m[$j]>=65 && $qa2m[$j]<=90)) {$matchali[$j++]=0;} #move to column with next upper case residue
406 | $matchali[$j++]=$match[$i]; #is next query residue upper-case or not?
407 | }
408 |
409 | # Set all match states to upper case, non-match states to lower case
410 | my @res; # byte values of the sequence currently being converted
411 | for ($k=0; $k<$nhit; $k++) {
412 | @res = unpack("C*",$hitseqs[$k]);
413 | # printf("Q: %s\n",$hitseqs[0]);
414 | # printf("T: %s\n",$hitseqs[$k]);
415 | for ($i=0; $i<@res; $i++) {
416 | if ($matchali[$i]) {
417 | if ($res[$i]>=97 && $res[$i]<=122) {$res[$i]-=32;} #convert to upper case
418 | } else {
419 | if ($res[$i]>=65 && $res[$i]<=90) {$res[$i]+=32;} # convert to lower case
420 | elsif ($res[$i]==45) {$res[$i]=46;} # convert '-' to '.'
421 | }
422 | # printf("%3i Q:%s T:%s match=%i len=%i\n",$i,chr($qa2m[$i]),chr($res[$i]),$qid[$k],$len);
423 | }
424 | $hitseqs[$k] = pack("C*",@res); # back from byte values to a string
425 | }
426 | }
427 |
428 |
429 | # Remove gaps? Capitalize? (per output format; any other format is left unchanged here)
430 | if ($outformat eq "ufas") {
431 | for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z.-/A-Z/d;} # Transform to upper case and remove all gaps ('.' and '-')
432 | } elsif ($outformat eq "fas") {
433 | for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z./A-Z-/;} # Transform to upper case and turn '.' into '-'
434 | } elsif ($outformat eq "a3m") {
435 | for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/.//d;} # Remove gaps aligned to inserts
436 | }
437 |
438 | # Write sequences into output file (psi: "name sequence" on one line; otherwise name line followed by sequence line)
439 | open (OUTFILE, ">", $outfile) or die ("cannot open $outfile:$!\n"); # 3-argument open: file name is never parsed for a mode
440 | if ($outformat eq "psi") {
441 | for ($k=0; $k<$nhit; $k++) {
442 | $hitseqs[$k] =~ tr/./-/;
443 | printf(OUTFILE "%s %s\n",$hitnames[$k],$hitseqs[$k]);
444 | }
445 | }
446 | else {
447 | for ($k=0; $k<$nhit; $k++) {
448 | printf(OUTFILE "%s\n%s\n",$hitnames[$k],$hitseqs[$k]);
449 | }
450 | }
451 | close(OUTFILE) or die ("cannot close $outfile:$!\n"); # a failing close reveals buffered write errors (e.g. disk full)
452 |
453 | if ($v>=1) {print("$nhit sequences extracted from $infile and written to $outfile\n");} # print, not printf: file names may contain '%'
454 | exit(0);
455 |
456 |
457 |
458 |
459 |
--------------------------------------------------------------------------------
/run_spotrna2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Pipeline driver; the first argument ($1) is the query FASTA file.
3 | start=$(date +%s) # start time in seconds since the epoch
4 |
5 | input="$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" # absolute path of the query file
6 | input_dir=$(dirname $input)
7 | seq_id=$(basename $input | cut -d. -f1) # file name up to its first '.'
8 | program_dir=$(dirname $(readlink -f $0)) # directory that contains this script
9 |
10 | path_blastn=$program_dir/ncbi-blast-*+/bin # set path to the folder contains executable binary files of Blast package ('*' is expanded where the variable is used unquoted)
11 | path_blastn_database=$program_dir/nt_database/nt # set path to the formatted NCBI's database file without extension
12 | path_infernal=$program_dir/infernal-*-linux-intel-gcc/binaries # set path to the folder contains executable binary files Infernal package
13 | path_infernal_database=$program_dir/nt_database/nt # set path to the NCBI's database database file
14 |
15 | feature_dir=$input_dir/${seq_id}_features # intermediate files
16 | output_dir=$input_dir/${seq_id}_outputs # final outputs
17 |
18 | mkdir -p $feature_dir && mkdir -p $output_dir
19 | echo ">"$seq_id > $feature_dir/$seq_id.fasta # header line of the working copy of the query
20 | awk -i inplace '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' $input # joins wrapped sequence lines; NOTE: rewrites the input file itself (gawk inplace)
21 | tail -n1 $input >> $feature_dir/$seq_id.fasta # last line of the rewritten input = sequence of its last record
22 |
23 | #exit 1
24 |
25 | if [ ! -f $path_blastn_database ]; then # the unformatted nt FASTA file is missing
26 | echo ""
27 | echo "========================================================================================"
28 | echo " Looks like nt database doesn't exists in the path $path_blastn_database. "
29 | echo " If you want to download the database now, please make sure you have enough "
30 | echo " space in mounted directory and internet connection have enough bandwidth as "
31 | echo " file is of size 270 GBs after unzip. It may take forever to download if "
32 | echo " internet is slow! "
33 | echo "========================================================================================"
34 | echo ""
35 | # Ask for confirmation before starting the download
36 | echo -n "Type 'y' for download or any other key to exit: "
37 | read userinput
38 |
39 | if [[ $(echo $userinput | tr '[A-Z]' '[a-z]') == 'y' ]]; then # answer is lower-cased first, so 'Y' is accepted too
40 |
41 | echo ""
42 | echo "=============================================================================================="
43 | echo " Downloading NCBI's database form ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz link. "
44 | echo " May take few hours to download. "
45 | echo "=============================================================================================="
46 | echo ""
47 | wget -c "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" -O $program_dir/nt_database/nt.gz
48 | # the check below reads the exit status of wget (comment and blank lines do not change $?)
49 |
50 | if [[ $? -eq 0 ]]; then
51 | echo ""
52 | echo "======================================================================="
53 | echo " nt database is completed successfully. "
54 | echo "======================================================================="
55 | echo ""
56 | else
57 | echo ""
58 | echo "======================================================================="
59 | echo " Error! Unable to download database sucessfully. "
60 | echo " Check wget command or internet connection. "
61 | echo "======================================================================="
62 | echo ""
63 | exit 1
64 | fi
65 |
66 | echo ""
67 | echo "======================================================================"
68 | echo " Unziping the downloaded nt database. "
69 | echo " May take few hours as size of unzipped file is around 270 GBs. "
70 | echo "======================================================================"
71 | echo ""
72 |
73 | ############ unzip the nt data base file (nt.gz -> nt in the same directory) ############
74 | gunzip $program_dir/nt_database/nt.gz
75 |
76 | if [[ $? -eq 0 ]]; then
77 | echo ""
78 | echo "======================================================================="
79 | echo " nt database unzip completed successfully. "
80 | echo "======================================================================="
81 | echo ""
82 | else
83 | echo ""
84 | echo "======================================================================="
85 | echo " Error! unable to unzip database sucessfully. "
86 | echo " Please check if gunzip program exists! "
87 | echo "======================================================================="
88 | echo ""
89 | exit 1
90 | fi
91 |
92 | else
93 | echo ""
94 | echo "==========================================================="
95 | echo " Exiting the program because nt database is missing! "
96 | echo "==========================================================="
97 | echo ""
98 | exit 1
99 | fi
100 |
101 | fi
102 |
103 |
104 | ###### check if aligned homologous sequences file already exists ############
105 | if [ -f $feature_dir/$seq_id.a2m ]; then
106 | echo ""
107 | echo "======================================================================"
108 | echo " MSA file $feature_dir/$seq_id.a2m from Infernal Pipeline already "
109 | echo " exists for query sequence $feature_dir/$seq_id.fasta. "
110 | echo " "
111 | echo " Delete existing $feature_dir/$seq_id.a2m if want to generate new "
112 | echo " alignment file "
113 | echo "======================================================================"
114 | echo ""
115 | else
116 |
117 | #### check if formatted nt database exists or not (presence of $path_blastn_database.nal is taken as "already formatted") #####
118 | if [[ ! -f "$path_blastn_database.nal" ]]; then
119 | echo ""
120 | echo "====================================================================="
121 | echo " Nucleotide database file $path_blastn_database need to formated "
122 | echo " formated to use with 'makeblastdb' program in BLAST-N program. "
123 | echo ""
124 | echo " Formatting may take 2-3 hours as size of file is around 270 GBs. "
125 | echo "====================================================================="
126 | echo ""
127 | $path_blastn/makeblastdb -in $path_blastn_database -dbtype nucl
128 | # \$path_database was never defined in this script; \$path_blastn_database already ends in /nt
129 | if [[ $? -eq 0 ]]; then
130 | echo ""
131 | echo "======================================================="
132 | echo " nt database formatted successfully. "
133 | echo "======================================================="
134 | echo ""
135 | else
136 | echo ""
137 | echo "=================================================================="
138 | echo " Error occured while formatting the nt database. "
139 | echo ""
140 | echo " Check for '$path_blastn/makeblastdb' program in BLAST package "
141 | echo "=================================================================="
142 | echo ""
143 | exit 1
144 | fi
145 | fi
146 |
147 |
148 | #################### check if blastn alignment file already exists, otherwise run BLASTN ######################
149 | if [ -f $feature_dir/$seq_id.bla ]; then
150 | echo ""
151 | echo "======================================================================="
152 | echo " MSA-1 file $feature_dir/$seq_id.bla from BLASTN already "
153 | echo " exists for query sequence $feature_dir/$seq_id.fasta. "
154 | echo " "
155 | echo " Delete existing $feature_dir/$seq_id.bla if want to generate new "
156 | echo " alignment file. "
157 | echo "======================================================================="
158 | echo ""
159 | else
160 | echo ""
161 | echo "==========================================================================================================================="
162 | echo " Running BLASTN for first round of homologous sequence search for query sequence $feature_dir/$seq_id.fasta. "
163 | echo " May take 5 mins to few hours depending on sequence length and no. of homologous sequences in database. "
164 | echo "==========================================================================================================================="
165 | echo ""
166 | $path_blastn/blastn -db $path_blastn_database -query $feature_dir/$seq_id.fasta -out $feature_dir/$seq_id.bla -evalue 0.001 -num_descriptions 1 -num_threads 8 -line_length 1000 -num_alignments 50000
167 | fi
168 | # $? is the status of the if/else above: the last echo (0) when BLASTN was skipped, otherwise the status of blastn
169 | if [ $? -eq 0 ]; then
170 | echo ""
171 | echo "==========================================================="
172 | echo " First round of MSA-1 search completed successfully. "
173 | echo "==========================================================="
174 | echo ""
175 | else
176 | echo ""
177 | echo "=================================================================="
178 | echo " Error occured while running BLASTN (first round search). "
179 | echo ""
180 | echo " Check for '$path_blastn/blastn' program in BLAST package "
181 | echo "=================================================================="
182 | echo ""
183 | rm -f $feature_dir/$seq_id.bla; exit 1 # remove the incomplete output so the next run does not treat it as a finished search
184 | fi
185 |
186 | ######## reformat the output: BLASTN report -> .aln (parse_blastn_local.pl) -> Stockholm .sto (reformat.pl) ################
187 | echo ""
188 | echo "========================================================================================"
189 | echo " Converting $feature_dir/$seq_id.bla from BLASTN to $feature_dir/$seq_id.sto. "
190 | echo "========================================================================================"
191 | echo ""
192 | $program_dir/utils/parse_blastn_local.pl $feature_dir/$seq_id.bla $feature_dir/$seq_id.fasta $feature_dir/$seq_id.aln &&
193 | $program_dir/utils/reformat.pl fas sto $feature_dir/$seq_id.aln $feature_dir/$seq_id.sto
194 | # the two converters are chained with && so that the check below also catches a failure of the first one
195 |
196 | if [ $? -eq 0 ]; then
197 | echo ""
198 | echo "=========================================="
199 | echo " Converison completed successfully. "
200 | echo "=========================================="
201 | echo ""
202 | else
203 | echo ""
204 | echo "============================================================================================="
205 | echo " Error occured while Converting $feature_dir/$seq_id.bla to $feature_dir/$seq_id.sto "
206 | echo " "
207 | echo " Check for $program_dir/utils/parse_blastn_local.pl and $program_dir/utils/reformat.pl file."
208 | echo "============================================================================================="
209 | echo ""
210 | exit 1
211 | fi
212 |
213 | ######## predict secondary structure from SPOT-RNA and attach it to the BLASTN alignment as SS_cons ################
214 | echo ""
215 | echo "==============================================================================================================================="
216 | echo " Predicting Consensus Secondary Structure (CSS) of query sequence $feature_dir/$seq_id.fasta using SPOT-RNA predictor. "
217 | echo "==============================================================================================================================="
218 | echo ""
219 | source $program_dir/venv/bin/activate || conda activate venv
220 | cd $program_dir/SPOT-RNA
221 | python3 SPOT-RNA.py --inputs $feature_dir/$seq_id.fasta --outputs $feature_dir; css_status=$? # remember the predictor's status; later commands would overwrite $?
222 | cd -
223 |
224 | export PERL5LIB=$program_dir/utils/FreeKnot
225 | perl $program_dir/utils/FreeKnot/remove_pseudoknot.pl -i bpseq -s bp $feature_dir/$seq_id.bpseq > $feature_dir/$seq_id.bpseq.unknotted &&
226 | python3 $program_dir/utils/bpseq2dbn.py --inputs $feature_dir --outputs $feature_dir --rna_id $seq_id &&
227 | tail -n +3 $feature_dir/$seq_id.dbn > $feature_dir/$seq_id.db || css_status=1
228 |
229 | deactivate || conda deactivate
230 |
231 | ################ reformat ss according to gaps in reference sequence of .sto file from blastn: for every '-' in column 2 of line 5 of the .sto, insert a '-' at the same offset of the .db string ################
232 | for i in `awk '{print $2}' $feature_dir/$seq_id.sto | head -n5 | tail -n1 | grep -b -o - | sed 's/..$//'`; do sed -i "s/./&-/$i" $feature_dir/$seq_id.db; done
233 |
234 | ######### add reformated ss from last step to .sto file of blastn (last line of the .sto is dropped, SS_cons and "//" are appended) ##############
235 | head -n -1 $feature_dir/$seq_id.sto > $feature_dir/temp.sto &&
236 | echo "#=GC SS_cons "`cat $feature_dir/$seq_id.db` > $feature_dir/temp.txt &&
237 | cat $feature_dir/temp.sto $feature_dir/temp.txt > $feature_dir/$seq_id.sto &&
238 | echo "//" >> $feature_dir/$seq_id.sto || css_status=1
239 | # css_status is non-zero if SPOT-RNA, the pseudoknot removal / dbn conversion, or the .sto rewrite failed
240 | if [ $css_status -eq 0 ]; then
241 | echo ""
242 | echo "=================================================================="
243 | echo " Consensus Secondary Structure (CSS) generated successfully. "
244 | echo "=================================================================="
245 | echo ""
246 | else
247 | echo ""
248 | echo "=============================================================================="
249 | echo " Error occured while generating structure from SPOT-RNA. "
250 | echo " "
251 | echo " Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA2/issues'."
252 | echo "=============================================================================="
253 | echo ""
254 | exit 1
255 | fi
256 |
257 | ######## run infernal: build a covariance model ($seq_id.cm) from the annotated alignment $seq_id.sto ################
258 | echo ""
259 | echo "=============================================================================================================="
260 | echo " Building Covariance Model from BLASTN alignment (with SS from SPOT-RNA) from $feature_dir/$seq_id.sto file. "
261 | echo "=============================================================================================================="
262 | echo ""
263 | $path_infernal/cmbuild --hand -F $feature_dir/$seq_id.cm $feature_dir/$seq_id.sto
264 | # the check below reads the exit status of cmbuild
265 | if [ $? -eq 0 ]; then
266 | echo ""
267 | echo "============================================================================"
268 | echo " Covariance Model (CM) built successfully from $feature_dir/$seq_id.sto. "
269 | echo "============================================================================"
270 | echo ""
271 | else
272 | echo ""
273 | echo "==============================================================================================="
274 | echo " Error occured while building Covariance Model (CM) from $path_infernal/cmbuild. "
275 | echo " "
276 | echo " Please check for $path_infernal/cmbuild program. "
277 | echo "==============================================================================================="
278 | echo ""
279 | exit 1
280 | fi
281 |
282 | echo "" # calibration step: runs cmcalibrate on the model built in the previous step
283 | echo "===================================================================="
284 | echo " Calibrating the Covariance Model $feature_dir/$seq_id.cm. "
285 | echo "===================================================================="
286 | echo ""
287 | $path_infernal/cmcalibrate $feature_dir/$seq_id.cm
288 | # the check below reads the exit status of cmcalibrate
289 | if [ $? -eq 0 ]; then
290 | echo ""
291 | echo "==========================================================="
292 | echo " CM calibrated $feature_dir/$seq_id.cm successfully. "
293 | echo "==========================================================="
294 | echo ""
295 | else
296 | echo ""
297 | echo "==============================================================="
298 | echo " Error occured while calibrating $feature_dir/$seq_id.cm. "
299 | echo " "
300 | echo " Please check for $path_infernal/cmcalibrate program. "
301 | echo "==============================================================="
302 | echo ""
303 | exit 1
304 | fi
305 |
306 | echo "" # second search round: cmsearch with the calibrated model against $path_infernal_database
307 | echo "======================================================================================================================"
308 | echo " Second round of homologous sequences search using the calibrated covariance model $feature_dir/$seq_id.cm. "
309 | echo " May take 15 mins to few hours for this step. "
310 | echo "======================================================================================================================"
311 | echo ""
312 | $path_infernal/cmsearch -o $feature_dir/$seq_id.out -A $feature_dir/$seq_id.msa --cpu 24 --incE 10.0 $feature_dir/$seq_id.cm $path_infernal_database
313 | # $seq_id.msa (given with -A) is the file the reformatting step further down reads; the check below reads cmsearch's exit status
314 | if [ $? -eq 0 ]; then
315 | echo ""
316 | echo "==========================================================="
317 | echo " Second round of MSA-2 search completed successfully. "
318 | echo "==========================================================="
319 | echo ""
320 | else
321 | echo ""
322 | echo "===================================================================================="
323 | echo " Error occured during the second round search using CM $feature_dir/$seq_id.cm. "
324 | echo " "
325 | echo " Please check for $path_infernal/cmsearch program. "
326 | echo "===================================================================================="
327 | echo ""
328 | exit 1
329 | fi
330 |
331 | ######### reformat the alignment without gaps and dashes: $seq_id.msa -> temp.a2m ###############
332 | echo ""
333 | echo "======================================================================="
334 | echo " Reformatting the output alignment $feature_dir/$seq_id.msa "
335 | echo " for PSSM and DCA features by removing the gaps and dashes. "
336 | echo "======================================================================="
337 | echo ""
338 |
339 | ##### check if .msa is not empty (-s: file exists and has a size greater than zero) #########
340 | if [[ -s $feature_dir/$seq_id.msa ]]
341 | then
342 | $path_infernal/esl-reformat --replace acgturyswkmbdhvn:................ a2m $feature_dir/$seq_id.msa > $feature_dir/temp.a2m
343 | else # fallback for an empty/missing .msa: write the query record twice, then replace the last character of the last line by '.'
344 | cat $feature_dir/$seq_id.fasta > $feature_dir/temp.a2m
345 | cat $feature_dir/$seq_id.fasta >> $feature_dir/temp.a2m
346 | sed -i '$ s/.$/./' $feature_dir/temp.a2m
347 | fi
348 | # $? below is the status of the branch that ran: esl-reformat, or the final sed of the fallback
349 | # $path_infernal/esl-reformat --replace acgturyswkmbdhvn:................ a2m $feature_dir/$seq_id.msa > $feature_dir/temp.a2m
350 |
351 | if [ $? -eq 0 ]; then
352 | echo ""
353 | echo "==========================================================="
354 | echo " Reformatted the $feature_dir/$seq_id.msa successfully. "
355 | echo "==========================================================="
356 | echo ""
357 | else
358 | echo ""
359 | echo "========================================================================================"
360 | echo " Error occured during the refomatting the alignment file $feature_dir/$seq_id.msa. "
361 | echo " "
362 | echo " Please check for $path_infernal/esl-reformat program. "
363 | echo "========================================================================================"
364 | echo ""
365 | exit 1
366 | fi
367 |
368 | ######### remove duplicate sequences from the alignment: temp.a2m -> $seq_id.a2m ###############
369 | echo ""
370 | echo "======================================================================="
371 | echo " Removing duplicates from the alignment. "
372 | echo "======================================================================="
373 | echo ""
374 | $program_dir/utils/seqkit rmdup -s $feature_dir/temp.a2m > $feature_dir/$seq_id.a2m
375 | # the check below reads the exit status of seqkit
376 | if [ $? -eq 0 ]; then
377 | echo ""
378 | echo "==============================================="
379 | echo " Duplicate sequences removed successfully. "
380 | echo "==============================================="
381 | echo ""
382 | else
383 | echo ""
384 | echo "========================================================================================"
385 | echo " Error occured during the removel of duplicates from MSA-2. "
386 | echo " "
387 | echo " Please check for $program_dir/utils/seqkit program. "
388 | echo "========================================================================================"
389 | echo ""
390 | exit 1
391 | fi
392 |
393 | ############# multiline fasta to single line fasta file: join wrapped sequence lines, drop empty lines, write to temp.a2m #############
394 | awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < $feature_dir/$seq_id.a2m | sed '/^$/d' > $feature_dir/temp.a2m
395 | ############# add query sequence at the top of MSA file (overwrites $seq_id.a2m with query record + temp.a2m) #############
396 | cat $feature_dir/$seq_id.fasta $feature_dir/temp.a2m > $feature_dir/$seq_id.a2m
397 |
398 | fi
399 |
400 | ############# check if pssm file already exists otherwise generate from alignment file (an existing $seq_id.pssm is reused as is) #############
401 | if [ -f $feature_dir/$seq_id.pssm ]; then
402 | echo ""
403 | echo "=============================================================================================================================================="
404 | echo " PSSM feature file $feature_dir/$seq_id.pssm already exists for query sequence $feature_dir/$seq_id.fasta. "
405 | echo "=============================================================================================================================================="
406 | echo ""
407 | else
408 | echo ""
409 | echo "======================================================================================"
410 | echo " Extracting PSSM features from the alignment $feature_dir/$seq_id.a2m. "
411 | echo "======================================================================================"
412 | echo ""
413 | $program_dir/utils/getpssm.pl $feature_dir/$seq_id.fasta $feature_dir/$seq_id.a2m $feature_dir/$seq_id.pssm
414 | # arguments passed: query fasta, alignment (.a2m), output (.pssm); the check below reads getpssm.pl's exit status
415 | if [ $? -eq 0 ]; then
416 | echo ""
417 | echo "==============================================================="
418 | echo " PSSM extracted successfully from $feature_dir/$seq_id.a2m. "
419 | echo "==============================================================="
420 | echo ""
421 | else
422 | echo ""
423 | echo "========================================================================="
424 | echo " Error occured while extracting PSSM from $feature_dir/$seq_id.a2m. "
425 | echo " "
426 | echo " Please check for $program_dir/utils/getpssm.pl program. "
427 | echo "========================================================================="
428 | echo ""
429 | exit 1
430 | fi
431 | fi
432 |
433 | ######### run linearpartition RNA secondary structure base-pair probability predictor (runs on every invocation; no "already exists" check) ###############
434 | echo ""
435 | echo "============================================================================"
436 | echo " Running LinearPartition-V for base-pair probabilty features. "
437 | echo "============================================================================"
438 | echo ""
439 | tail -n +2 $feature_dir/$seq_id.fasta | $program_dir/LinearPartition/linearpartition -V -r $feature_dir/$seq_id.prob
440 | # tail -n +2 skips the FASTA header line; $? below is the status of the last command of the pipeline (linearpartition)
441 | if [ $? -eq 0 ]; then
442 | echo ""
443 | echo "===================================================================="
444 | echo " Base-pair probabilty successfully obtained from LinearPartition. "
445 | echo "===================================================================="
446 | echo ""
447 | else
448 | echo ""
449 | echo "============================================================================="
450 | echo " Error occured while running LinearPartition. "
451 | echo " "
452 | echo " Please check for $program_dir/LinearPartition/linearpartition program. "
453 | echo "============================================================================="
454 | echo ""
455 | exit 1
456 | fi
457 |
############# check if dca file already exists otherwise generate from alignment file #############
# DCA step: reuse $feature_dir/$seq_id.dca when present; otherwise run
# gremlin_cpp with the RNA alphabet on the .a2m alignment, writing the .dca
# file and redirecting the program's stdout to $seq_id.log_gremlin.
# The script aborts with status 1 if gremlin_cpp fails.
# Path expansions (including the redirect target) are double-quoted so paths
# with spaces or glob characters are handled as single words.
if [ -f "$feature_dir/$seq_id.dca" ]; then
    echo ""
    echo "==============================================================="
    echo " GREMLIN feature file $feature_dir/$seq_id.dca already "
    echo " exists for query sequence $feature_dir/$seq_id.fasta. "
    echo " "
    echo " Delete the existing file if want to generate new dca file. "
    echo "==============================================================="
    echo ""
else
    echo ""
    echo "============================================================================"
    echo " Running GREMLIN for DCA features. "
    echo "============================================================================"
    echo ""

    # Test the command directly; the redirection does not affect the status
    # that `if` sees, which is gremlin_cpp's own exit code.
    if "$program_dir/GREMLIN_CPP/gremlin_cpp" -alphabet rna -i "$feature_dir/$seq_id.a2m" -o "$feature_dir/$seq_id.dca" > "$feature_dir/$seq_id.log_gremlin"; then
        echo ""
        echo "===================================================="
        echo " DCA features successfully obtained from GREMLIN. "
        echo "===================================================="
        echo ""
    else
        echo ""
        echo "============================================================================="
        echo " Error occured while running GREMLIN. "
        echo " "
        echo " Please check for $program_dir/GREMLIN_CPP/gremlin_cpp program. "
        echo "============================================================================="
        echo ""
        exit 1
    fi
fi
492 |
493 |
# Final step: activate the Python environment (the venv under $program_dir,
# falling back to a conda environment named "venv"), run utils/SPOT-RNA2.py on
# the query FASTA with outputs written to $output_dir, deactivate the
# environment, and report the elapsed wall-clock time.
echo ""
echo "============================================================================"
echo " Running SPOT-RNA2 for RNA secondary structure prediction. "
echo "============================================================================"
echo ""
source "$program_dir/venv/bin/activate" || conda activate venv
python3 "$program_dir/utils/SPOT-RNA2.py" --inputs "$feature_dir/$seq_id.fasta" --outputs "$output_dir" --motifs True
# Capture the predictor's status immediately; the deactivate call below would
# otherwise overwrite $?.
spotrna2_status=$?
deactivate || conda deactivate

# Fail like every earlier pipeline step does, instead of printing the timing
# and exiting 0 after a failed prediction.
if [ "$spotrna2_status" -ne 0 ]; then
    echo ""
    echo "============================================================================="
    echo " Error occurred while running SPOT-RNA2. "
    echo " "
    echo " Please check for $program_dir/utils/SPOT-RNA2.py program. "
    echo "============================================================================="
    echo ""
    exit 1
fi

# $start is expected to hold the epoch seconds recorded earlier in the script
# (set outside this section).
end=$(date +%s)

runtime=$((end - start))

echo -e "\ncomputation time = $runtime seconds"
508 |
509 |
--------------------------------------------------------------------------------