├── nt_database
    └── empty_db.fasta
├── utils
    ├── seqkit
    ├── __pycache__
    │   └── utils.cpython-36.pyc
    ├── FreeKnot
    │   ├── COPYRIGHT
    │   ├── COPYRIGHT.txt
    │   ├── BpseqWriter.pm
    │   ├── BracketPairs.pm
    │   ├── ScoringFunctions.pm
    │   ├── DPWriter.pm
    │   ├── BpseqParser.pm
    │   ├── README
    │   ├── ChordModel.pm
    │   ├── README.txt
    │   ├── MIS.pm
    │   ├── CircleGraph.pm
    │   ├── DPParser.pm
    │   ├── PrimitivePseudoknotExtractor.pm
    │   ├── VertexSubset.pm
    │   ├── MWIS.pm
    │   └── remove_pseudoknot.pl
    ├── bpseq2dbn.py
    ├── getpssm.pl
    ├── SPOT-RNA2.py
    ├── utils.py
    └── parse_blastn_local.pl
├── requirements.txt
├── sample_run
    ├── 6ufj.fasta
    ├── sample_seq.fasta
    ├── sample_seq_features
    │   ├── sample_seq.db
    │   ├── sample_seq.fasta
    │   ├── temp.txt
    │   ├── sample_seq.tfrecords
    │   ├── sample_seq.dbn
    │   ├── sample_seq.aln
    │   ├── temp.sto
    │   ├── sample_seq.sto
    │   ├── sample_seq.bpseq.unknotted
    │   ├── sample_seq.bpseq
    │   ├── sample_seq.ct
    │   ├── sample_seq.bla
    │   ├── sample_seq.pssm
    │   ├── sample_seq.prob
    │   ├── temp.a2m
    │   ├── sample_seq.a2m
    │   ├── sample_seq.log_gremlin
    │   └── sample_seq.msa
    └── sample_seq_outputs
    │   ├── sample_seq.bpseq
    │   ├── sample_seq.st
    │   └── sample_seq.ct
├── docs
    ├── SPOTRNA2_pipeline.png
    └── benchmark_results.png
├── __pycache__
    └── utils.cpython-36.pyc
├── Dockerfile
├── README.md
└── run_spotrna2.sh


/nt_database/empty_db.fasta:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/utils/seqkit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/utils/seqkit


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==1.14.0
2 | pandas
3 | numpy==1.16.4
4 | argparse
5 | tqdm
6 | six
7 | 


--------------------------------------------------------------------------------
/sample_run/6ufj.fasta:
--------------------------------------------------------------------------------
1 | >6ufj: chain A,B
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
3 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq.fasta:
--------------------------------------------------------------------------------
1 | 
2 | >6ufj_A_B
3 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG


--------------------------------------------------------------------------------
/docs/SPOTRNA2_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/docs/SPOTRNA2_pipeline.png


--------------------------------------------------------------------------------
/docs/benchmark_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/docs/benchmark_results.png


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.db:
--------------------------------------------------------------------------------
1 | ((((((....))))))..........................((((((((..)))))))).
2 | 


--------------------------------------------------------------------------------
/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.fasta:
--------------------------------------------------------------------------------
1 | >sample_seq
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/temp.txt:
--------------------------------------------------------------------------------
1 | #=GC SS_cons                     ((((((....))))))..........................((((((((..)))))))).
2 | 


--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/utils/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.tfrecords:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaswindersingh2/SPOT-RNA2/HEAD/sample_run/sample_seq_features/sample_seq.tfrecords


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.dbn:
--------------------------------------------------------------------------------
1 | >single_seq
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
3 | ((((((....))))))..........................((((((((..)))))))).
4 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.aln:
--------------------------------------------------------------------------------
1 | >sample_seq  E=0.0
2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
3 | >6UFJ_A(1-51:51) Chain A, RNA (50-MER) 6UFJ_C Chain C, RNA (50-MER) 6UFK_A Chain A, RNA (50-MER) 6UFK_C Chain C, RNA (50-MER)  E=2e-16 s/c=1.87 id=98% cov=85%
4 | ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA----------
5 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/temp.sto:
--------------------------------------------------------------------------------
1 | # STOCKHOLM 1.0
2 | 
3 | #=GF DE                          E=0.0
4 | #=GC RF                          ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
5 | sample_seq                       ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
6 | 6UFJ_A(1-51:51)                  ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA----------
7 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.sto:
--------------------------------------------------------------------------------
1 | # STOCKHOLM 1.0
2 | 
3 | #=GF DE                          E=0.0
4 | #=GC RF                          ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
5 | sample_seq                       ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
6 | 6UFJ_A(1-51:51)                  ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA----------
7 | #=GC SS_cons                     ((((((....))))))..........................((((((((..)))))))).
8 | //
9 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.bpseq.unknotted:
--------------------------------------------------------------------------------
 1 | 1 A 16
 2 | 2 C 15
 3 | 3 U 14
 4 | 4 C 13
 5 | 5 G 12
 6 | 6 U 11
 7 | 7 U 0
 8 | 8 U 0
 9 | 9 G 0
10 | 10 A 0
11 | 11 G 6
12 | 12 C 5
13 | 13 G 4
14 | 14 A 3
15 | 15 G 2
16 | 16 U 1
17 | 17 A 0
18 | 18 U 0
19 | 19 A 0
20 | 20 A 0
21 | 21 A 0
22 | 22 C 0
23 | 23 A 0
24 | 24 G 0
25 | 25 C 0
26 | 26 U 0
27 | 27 G 0
28 | 28 G 0
29 | 29 U 0
30 | 30 U 0
31 | 31 A 0
32 | 32 A 0
33 | 33 G 0
34 | 34 C 0
35 | 35 U 0
36 | 36 C 0
37 | 37 A 0
38 | 38 A 0
39 | 39 A 0
40 | 40 G 0
41 | 41 C 0
42 | 42 G 0
43 | 43 G 60
44 | 44 A 59
45 | 45 G 58
46 | 46 A 57
47 | 47 G 56
48 | 48 C 55
49 | 49 A 54
50 | 50 G 53
51 | 51 A 0
52 | 52 U 0
53 | 53 C 50
54 | 54 U 49
55 | 55 G 48
56 | 56 C 47
57 | 57 U 46
58 | 58 C 45
59 | 59 U 44
60 | 60 C 43
61 | 61 G 0
62 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/COPYRIGHT:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2012  Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
 2 | 
 3 | This file is part of FreeKnot.
 4 | 
 5 | FreeKnot is free software: you can redistribute it and/or modify
 6 | it under the terms of the GNU General Public License as published by
 7 | the Free Software Foundation, either version 3 of the License, or
 8 | (at your option) any later version.
 9 | 
10 | FreeKnot is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | GNU General Public License for more details.
14 | 
15 | You should have received a copy of the GNU General Public License
16 | along with FreeKnot.  If not, see <http://www.gnu.org/licenses/>.
17 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.bpseq:
--------------------------------------------------------------------------------
 1 | #sample_seq
 2 | 1 A 16
 3 | 2 C 15
 4 | 3 U 14
 5 | 4 C 13
 6 | 5 G 12
 7 | 6 U 11
 8 | 7 U 0
 9 | 8 U 37
10 | 9 G 36
11 | 10 A 35
12 | 11 G 6
13 | 12 C 5
14 | 13 G 4
15 | 14 A 3
16 | 15 G 2
17 | 16 U 1
18 | 17 A 0
19 | 18 U 0
20 | 19 A 0
21 | 20 A 0
22 | 21 A 0
23 | 22 C 0
24 | 23 A 0
25 | 24 G 0
26 | 25 C 0
27 | 26 U 0
28 | 27 G 0
29 | 28 G 0
30 | 29 U 0
31 | 30 U 0
32 | 31 A 0
33 | 32 A 0
34 | 33 G 0
35 | 34 C 0
36 | 35 U 10
37 | 36 C 9
38 | 37 A 8
39 | 38 A 0
40 | 39 A 0
41 | 40 G 0
42 | 41 C 0
43 | 42 G 0
44 | 43 G 60
45 | 44 A 59
46 | 45 G 58
47 | 46 A 57
48 | 47 G 56
49 | 48 C 55
50 | 49 A 54
51 | 50 G 53
52 | 51 A 0
53 | 52 U 0
54 | 53 C 50
55 | 54 U 49
56 | 55 G 48
57 | 56 C 47
58 | 57 U 46
59 | 58 C 45
60 | 59 U 44
61 | 60 C 43
62 | 61 G 0
63 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_outputs/sample_seq.bpseq:
--------------------------------------------------------------------------------
 1 | #sample_seq
 2 | 1 A 16
 3 | 2 C 15
 4 | 3 U 14
 5 | 4 C 13
 6 | 5 G 0
 7 | 6 U 39
 8 | 7 U 38
 9 | 8 U 37
10 | 9 G 36
11 | 10 A 35
12 | 11 G 34
13 | 12 C 33
14 | 13 G 4
15 | 14 A 3
16 | 15 G 2
17 | 16 U 1
18 | 17 A 0
19 | 18 U 0
20 | 19 A 0
21 | 20 A 0
22 | 21 A 0
23 | 22 C 0
24 | 23 A 0
25 | 24 G 0
26 | 25 C 0
27 | 26 U 0
28 | 27 G 0
29 | 28 G 0
30 | 29 U 0
31 | 30 U 0
32 | 31 A 0
33 | 32 A 0
34 | 33 G 12
35 | 34 C 11
36 | 35 U 10
37 | 36 C 9
38 | 37 A 8
39 | 38 A 7
40 | 39 A 6
41 | 40 G 0
42 | 41 C 0
43 | 42 G 61
44 | 43 G 60
45 | 44 A 59
46 | 45 G 58
47 | 46 A 57
48 | 47 G 56
49 | 48 C 55
50 | 49 A 54
51 | 50 G 53
52 | 51 A 0
53 | 52 U 0
54 | 53 C 50
55 | 54 U 49
56 | 55 G 48
57 | 56 C 47
58 | 57 U 46
59 | 58 C 45
60 | 59 U 44
61 | 60 C 43
62 | 61 G 42
63 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/COPYRIGHT.txt:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2012  Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
 2 | 
 3 | This file is part of FreeKnot.
 4 | 
 5 | FreeKnot is free software: you can redistribute it and/or modify
 6 | it under the terms of the GNU General Public License as published by
 7 | the Free Software Foundation, either version 3 of the License, or
 8 | (at your option) any later version.
 9 | 
10 | FreeKnot is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | GNU General Public License for more details.
14 | 
15 | You should have received a copy of the GNU General Public License
16 | along with FreeKnot.  If not, see <http://www.gnu.org/licenses/>.
17 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/BpseqWriter.pm:
--------------------------------------------------------------------------------
 1 | #Writer for BPSEQ format
 2 | 
 3 | package BpseqWriter;
 4 | 
 5 | use strict;
 6 | 
 7 | #Print the structure as BPSEQ records ("position base partner", one per line)
 8 | #to the currently selected output handle, once per de-knotted solution.
 9 | #  $combined_base_pair_removal_pos - array ref of hash refs; the keys of each
10 | #      hash are the 1-based positions whose partner is reported as 0
11 | #  $base_seq        - array ref of bases, indexed from 0
12 | #  $paired_pos_ptrs - array ref of partner positions, indexed by 1-based position
13 | #  $base_count      - number of positions to print
14 | #When the solution list is empty the input structure is printed once, unchanged.
15 | sub output_results {
16 |     my (undef, $combined_base_pair_removal_pos, $base_seq, $paired_pos_ptrs, $base_count) = @_;
17 | 
18 |     #An empty removal set leaves every partner untouched, so it stands in for
19 |     #the "nothing to remove" case
20 |     my @removal_sets = @{$combined_base_pair_removal_pos};
21 |     @removal_sets = ({}) unless @removal_sets;
22 | 
23 |     foreach my $removed (@removal_sets) {
24 |         foreach my $pos (1 .. $base_count) {
25 |             my $partner = exists($removed->{$pos}) ? 0 : $paired_pos_ptrs->[$pos];
26 |             print $pos . ' ' . $base_seq->[$pos - 1] . ' ' . $partner . "\n";
27 |         }
28 |     }
29 | }
30 | 
31 | 1;
32 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_outputs/sample_seq.st:
--------------------------------------------------------------------------------
 1 | #Name: sample_seq
 2 | #Length:  61 
 3 | #PageNumber: 2
 4 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
 5 | [[[[.(((((((]]]]................)))))))..(((((((((..)))))))))
 6 | EEEEESSSSSSSHHHHHHHHHHHHHHHHHHHHSSSSSSSXXSSSSSSSSSHHSSSSSSSSS
 7 | KKKKNNNNNNNNKKKKNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
 8 | S1 6..12 "UUUGAGC" 33..39 "GCUCAAA"
 9 | S2 42..50 "GGAGAGCAG" 53..61 "CUGCUCUCG"
10 | H1 13..32 "GAGUAUAAACAGCUGGUUAA" (12,33) C:G PK{1}
11 | H2 51..52 "AU" (50,53) G:C 
12 | X1 40..41 "GC" (39,6) A:U (42,61) G:G 
13 | E1 1..5 "ACUCG" PK{1}
14 | PK1 4bp 1..4 13..16 E1 1..5 H1 13..32
15 | PK1.1 1 A 16 U
16 | PK1.2 2 C 15 G
17 | PK1.3 3 U 14 A
18 | PK1.4 4 C 13 G
19 | NCBP1 42 G 61 G S2
20 | segment1 7bp 6..12 UUUGAGC 33..39 GCUCAAA
21 | segment2 9bp 42..50 GGAGAGCAG 53..61 CUGCUCUCG
22 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/BracketPairs.pm:
--------------------------------------------------------------------------------
 1 | #Bracket handler for DPParser
 2 | 
 3 | package BracketPairs;
 4 | use strict;
 5 | 
 6 | #Closing punctuation bracket => the opening bracket it pairs with
 7 | my $open_bracket_map = {")" => "(", "]" => "[", "}" => "{", ">" => "<"};
 8 | 
 9 | #Tell whether a dot-parentheses symbol opens a pair.
10 | #Opening symbols are ( [ { < and the upper-case letters A-Z.
11 | #Returns 1 for an opening symbol and 0 for anything else.
12 | sub is_open_bracket {
13 |     my (undef, $symbol) = @_;
14 | 
15 |     return ($symbol =~ /^[\(\[{<A-Z]$/) ? 1 : 0;
16 | }
17 | 
18 | #Return the opening bracket that corresponds to a closing bracket:
19 | #) ] } > map to ( [ { <, and a lower-case letter maps to its upper-case form.
20 | #Dies with "Unknown closing bracket" for any other input.
21 | sub get_open_bracket {
22 |     my (undef, $close_bracket) = @_;
23 | 
24 |     if ($close_bracket =~ /^[a-z]$/) {
25 |         return uc $close_bracket;
26 |     }
27 |     unless ($close_bracket =~ /^[\)\]}>]$/) {
28 |         die "Unknown closing bracket\n";
29 |     }
30 | 
31 |     return $open_bracket_map->{$close_bracket};
32 | }
33 | 
34 | 1;
35 | 


--------------------------------------------------------------------------------
/utils/bpseq2dbn.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import argparse
 4 | import os
 5 | 
 6 | parser = argparse.ArgumentParser()
 7 | parser.add_argument('--inputs', default='inputs', type=str, help='Path to input file in fasta format, accept multiple sequences as well in fasta format; default = ''inputs/2zzm-1-B.fasta''\n', metavar='')
 8 | parser.add_argument('--outputs',default='inputs', type=str, help='Path to output files; SPOT-RNA outputs at least three files .ct, .bpseq, and .prob files; default = ''inputs/\n', metavar='')
 9 | parser.add_argument('--rna_id', default='sample_seq', type=str, help='Name of the input sequence file\n')
10 | 
11 | args = parser.parse_args()
12 | 
13 | with open(os.path.join(args.inputs, args.rna_id + ".bpseq.unknotted")) as f:
14 |     temp = pd.read_csv(f,comment='#', delim_whitespace=True, header=None, usecols=[0,1,2]).values
15 | seq = temp[:,1]
16 | 
17 | pairs = [[i,j] for i,j in zip(temp[:,0], temp[:,2]) if i!=0 and j!=0 and i<j]
18 | 
19 | 
20 | dbn = ['.']*temp.shape[0]
21 | for pair in pairs:
22 | 	dbn[pair[0]-1] = '('
23 | 	dbn[pair[1]-1] = ')'
24 | 
25 | row1 = seq
26 | row2 = np.array(dbn)
27 | temp = np.vstack((row1, row2))
28 | 
29 | np.savetxt(os.path.join(args.outputs, args.rna_id + '.dbn'), temp, delimiter='', fmt="%s", header='>' + 'single_seq', comments='')
30 | 
31 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.ct:
--------------------------------------------------------------------------------
 1 | 61		sample_seq		SPOT-RNA output
 2 | 
 3 | 1		A		0		2		16		1
 4 | 2		C		1		3		15		2
 5 | 3		U		2		4		14		3
 6 | 4		C		3		5		13		4
 7 | 5		G		4		6		12		5
 8 | 6		U		5		7		11		6
 9 | 7		U		6		8		0		7
10 | 8		U		7		9		37		8
11 | 9		G		8		10		36		9
12 | 10		A		9		11		35		10
13 | 11		G		10		12		6		11
14 | 12		C		11		13		5		12
15 | 13		G		12		14		4		13
16 | 14		A		13		15		3		14
17 | 15		G		14		16		2		15
18 | 16		U		15		17		1		16
19 | 17		A		16		18		0		17
20 | 18		U		17		19		0		18
21 | 19		A		18		20		0		19
22 | 20		A		19		21		0		20
23 | 21		A		20		22		0		21
24 | 22		C		21		23		0		22
25 | 23		A		22		24		0		23
26 | 24		G		23		25		0		24
27 | 25		C		24		26		0		25
28 | 26		U		25		27		0		26
29 | 27		G		26		28		0		27
30 | 28		G		27		29		0		28
31 | 29		U		28		30		0		29
32 | 30		U		29		31		0		30
33 | 31		A		30		32		0		31
34 | 32		A		31		33		0		32
35 | 33		G		32		34		0		33
36 | 34		C		33		35		0		34
37 | 35		U		34		36		10		35
38 | 36		C		35		37		9		36
39 | 37		A		36		38		8		37
40 | 38		A		37		39		0		38
41 | 39		A		38		40		0		39
42 | 40		G		39		41		0		40
43 | 41		C		40		42		0		41
44 | 42		G		41		43		0		42
45 | 43		G		42		44		60		43
46 | 44		A		43		45		59		44
47 | 45		G		44		46		58		45
48 | 46		A		45		47		57		46
49 | 47		G		46		48		56		47
50 | 48		C		47		49		55		48
51 | 49		A		48		50		54		49
52 | 50		G		49		51		53		50
53 | 51		A		50		52		0		51
54 | 52		U		51		53		0		52
55 | 53		C		52		54		50		53
56 | 54		U		53		55		49		54
57 | 55		G		54		56		48		55
58 | 56		C		55		57		47		56
59 | 57		U		56		58		46		57
60 | 58		C		57		59		45		58
61 | 59		U		58		60		44		59
62 | 60		C		59		61		43		60
63 | 61		G		60		0		0		61
64 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_outputs/sample_seq.ct:
--------------------------------------------------------------------------------
 1 | 61		sample_seq		SPOT-RNA2 output
 2 | 
 3 | 1		A		0		2		16		1
 4 | 2		C		1		3		15		2
 5 | 3		U		2		4		14		3
 6 | 4		C		3		5		13		4
 7 | 5		G		4		6		0		5
 8 | 6		U		5		7		39		6
 9 | 7		U		6		8		38		7
10 | 8		U		7		9		37		8
11 | 9		G		8		10		36		9
12 | 10		A		9		11		35		10
13 | 11		G		10		12		34		11
14 | 12		C		11		13		33		12
15 | 13		G		12		14		4		13
16 | 14		A		13		15		3		14
17 | 15		G		14		16		2		15
18 | 16		U		15		17		1		16
19 | 17		A		16		18		0		17
20 | 18		U		17		19		0		18
21 | 19		A		18		20		0		19
22 | 20		A		19		21		0		20
23 | 21		A		20		22		0		21
24 | 22		C		21		23		0		22
25 | 23		A		22		24		0		23
26 | 24		G		23		25		0		24
27 | 25		C		24		26		0		25
28 | 26		U		25		27		0		26
29 | 27		G		26		28		0		27
30 | 28		G		27		29		0		28
31 | 29		U		28		30		0		29
32 | 30		U		29		31		0		30
33 | 31		A		30		32		0		31
34 | 32		A		31		33		0		32
35 | 33		G		32		34		12		33
36 | 34		C		33		35		11		34
37 | 35		U		34		36		10		35
38 | 36		C		35		37		9		36
39 | 37		A		36		38		8		37
40 | 38		A		37		39		7		38
41 | 39		A		38		40		6		39
42 | 40		G		39		41		0		40
43 | 41		C		40		42		0		41
44 | 42		G		41		43		61		42
45 | 43		G		42		44		60		43
46 | 44		A		43		45		59		44
47 | 45		G		44		46		58		45
48 | 46		A		45		47		57		46
49 | 47		G		46		48		56		47
50 | 48		C		47		49		55		48
51 | 49		A		48		50		54		49
52 | 50		G		49		51		53		50
53 | 51		A		50		52		0		51
54 | 52		U		51		53		0		52
55 | 53		C		52		54		50		53
56 | 54		U		53		55		49		54
57 | 55		G		54		56		48		55
58 | 56		C		55		57		47		56
59 | 57		U		56		58		46		57
60 | 58		C		57		59		45		58
61 | 59		U		58		60		44		59
62 | 60		C		59		61		43		60
63 | 61		G		60		0		42		61
64 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.bla:
--------------------------------------------------------------------------------
 1 | BLASTN 2.10.1+
 2 | 
 3 | 
 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
 6 | Comput Biol 2000; 7(1-2):203-14.
 7 | 
 8 | 
 9 | 
10 | Database: /nt_database/nt
11 |            55,908,648 sequences; 260,722,916,040 total letters
12 | 
13 | 
14 | 
15 | Query= sample_seq
16 | 
17 | Length=61
18 |                                                                       Score     E
19 | Sequences producing significant alignments:                          (Bits)  Value
20 | 
21 | 6UFJ_A Chain A, RNA (50-MER) 6UFJ_C Chain C, RNA (50-MER) 6UFK_A ...  95.3    2e-16
22 | 
23 | 
24 | >6UFJ_A Chain A, RNA (50-MER) 6UFJ_C Chain C, RNA (50-MER) 6UFK_A 
25 | Chain A, RNA (50-MER) 6UFK_C Chain C, RNA (50-MER)
26 | Length=51
27 | 
28 |  Score = 95.3 bits (51),  Expect = 2e-16
29 |  Identities = 51/51 (100%), Gaps = 0/51 (0%)
30 |  Strand=Plus/Plus
31 | 
32 | Query  1   ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA  51
33 |            |||||||||||||||||||||||||||||||||||||||||||||||||||
34 | Sbjct  1   ACTCGTTTGAGCGAGTATAAACAGCTGGTTAAGCTCAAAGCGGAGAGCAGA  51
35 | 
36 | 
37 | 
38 | Lambda      K        H
39 |     1.33    0.621     1.12 
40 | 
41 | Gapped
42 | Lambda      K        H
43 |     1.28    0.460    0.850 
44 | 
45 | Effective search space used: 7769692438560
46 | 
47 | 
48 |   Database: /nt_database/nt
49 |     Posted date:  May 30, 2020  5:58 AM
50 |   Number of letters in database: 260,722,916,040
51 |   Number of sequences in database:  55,908,648
52 | 
53 | 
54 | 
55 | Matrix: blastn matrix 1 -2
56 | Gap Penalties: Existence: 0, Extension: 2.5
57 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04
 2 | # LABEL replaces the deprecated MAINTAINER instruction.
 3 | LABEL maintainer="Jaswinder Singh (jaswinder.singh3@griffithuni.edu.au)"
 4 | 
 5 | # Point /bin/sh at bash: a later RUN step uses the bash built-in `source`.
 6 | RUN rm /bin/sh && ln -s /bin/bash /bin/sh
 7 | RUN apt-get update && apt-get install -y build-essential wget virtualenv git python-minimal cpanminus gawk
 8 | RUN cpanm Graph
 9 | 
10 | # SPOT-RNA2 sources. Primary and mirror downloads both write to the same file
11 | # name (-O), so the tar step works whichever one succeeded; the parentheses make
12 | # the "primary or mirror, then unpack" grouping explicit.
13 | RUN (wget -O SPOT-RNA2.tar.xz 'https://www.dropbox.com/s/h6j53u7wjyj6uir/SPOT-RNA2.tar.xz' || wget -O SPOT-RNA2.tar.xz 'https://app.nihaocloud.com/f/3e826caf8efc43adaaa0/?dl=1') && tar -xvf SPOT-RNA2.tar.xz && rm SPOT-RNA2.tar.xz
14 | WORKDIR SPOT-RNA2
15 | 
16 | # Model checkpoints, unpacked into utils/.
17 | RUN (wget -O utils/models_ckps.tar.xz 'https://www.dropbox.com/s/udzcsva76lh5wvq/models_ckps.tar.xz' || wget -O utils/models_ckps.tar.xz 'https://app.nihaocloud.com/f/586acb2658d74ccb92b8/?dl=1') && tar -xvf utils/models_ckps.tar.xz -C utils/ && rm utils/models_ckps.tar.xz
18 | RUN virtualenv -p python3.6 venv && source ./venv/bin/activate &&  pip install tensorflow==1.14.0 && pip install -r requirements.txt && deactivate
19 | 
20 | # External tools, each fetched into its own directory under the work directory.
21 | RUN wget 'eddylab.org/infernal/infernal-1.1.3-linux-intel-gcc.tar.gz' && tar -xvzf infernal-*.tar.gz && rm infernal-*.tar.gz
22 | RUN wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*+-x64-linux.tar.gz' && tar -xvzf ncbi-blast-*+-x64-linux.tar.gz && rm ncbi-blast-*+-x64-linux.tar.gz
23 | # The download fallback is grouped so that a failed clone or cd aborts the step
24 | # instead of falling through to a download in the wrong directory.
25 | RUN git clone https://github.com/jaswindersingh2/SPOT-RNA.git && cd SPOT-RNA && (wget -O SPOT-RNA-models.tar.gz 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1') && tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz && cd ../
26 | RUN git clone "https://github.com/sokrypton/GREMLIN_CPP" && cd GREMLIN_CPP && g++ -O3 -std=c++0x -o gremlin_cpp gremlin_cpp.cpp -fopenmp && cd ../
27 | RUN git clone 'https://github.com/LinearFold/LinearPartition.git' && cd LinearPartition/ && make && cd ../
28 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/ScoringFunctions.pm:
--------------------------------------------------------------------------------
 1 | package ScoringFunctions;
 2 | 
 3 | use strict;
 4 | 
 5 | #my $free_energy_params;
 6 | #my $canonical_base_pairs = {'AU' => 0, 'CG' => 0, 'GC' => 0, 'UA' => 0, 'GU' => 0, 'UG' => 0};
 7 | 
 8 | #Look up the scoring function registered for an option name.
 9 | #Returns a three-element list: (code ref, 'max' or 'min', 0 or 1), passed back
10 | #exactly as listed in the table below.
11 | #An unrecognised option gives (undef, undef, undef).
12 | sub get_scoring_function {
13 |     my (undef, $option) = @_;
14 | 
15 |     my %scoring_table = (
16 |         'bp'   => [\&_base_pair_score,   'max', 0],
17 |         'stem' => [\&_stem_score,        'max', 0],
18 | #       'sstab' is disabled; it would set $free_energy_params from
19 | #       _init_free_energy_parameters() and return \&_stem_bp_stability, 'min', 1
20 |         'hb'   => [\&_hydrogen_bond,     'max', 0],
21 |         'fe'   => [\&_overall_stability, 'min', 1],
22 |     );
23 | 
24 |     my $entry = $scoring_table{$option};
25 |     if (!defined($entry)) {
26 |         return undef, undef, undef;
27 |     }
28 | 
29 |     return $entry->[0], $entry->[1], $entry->[2];
30 | }
31 | 
32 | #Score a stem by its number of base pairs: the chord's pair_count attribute,
33 | #or 0 when that attribute is not defined
34 | sub _base_pair_score {
35 |     my ($chord_attrs) = @_;
36 | 
37 |     my $stem_pair_count = $chord_attrs->{pair_count};
38 | 
39 |     return defined($stem_pair_count) ? $stem_pair_count : 0;
40 | }
41 | 
42 | #Every stem counts the same: the score is always 1
43 | sub _stem_score {
44 |     return 1;
45 | }
46 | 
47 | #Score a stem by hydrogen bonds: GC and CG pairs = 3, AU, UA, GU and UG pairs = 2,
48 | #any other pair = 0. Bases are compared case-insensitively.
49 | #  $chord_attrs - hash ref whose base_pairs entry is an array ref of
50 | #                 [pos, paired pos] pairs, both 1-based
51 | #  $base_seq    - array ref of bases, indexed from 0
52 | sub _hydrogen_bond {
53 |     my ($chord_attrs, $base_seq) = @_;
54 | 
55 |     my %bonds_for = ('GC' => 3, 'CG' => 3, 'AU' => 2, 'UA' => 2, 'GU' => 2, 'UG' => 2);
56 | 
57 |     my $stem_base_pairs = $chord_attrs->{base_pairs};
58 |     my $total_score = 0;
59 | 
60 |     foreach my $pair (@{$stem_base_pairs}) {
61 |         my $first_base = $base_seq->[$pair->[0] - 1];
62 |         my $second_base = $base_seq->[$pair->[1] - 1];
63 |         my $bonds = $bonds_for{uc($first_base . $second_base)};
64 |         $total_score += $bonds if defined($bonds);
65 |     }
66 | 
67 |     return $total_score;
68 | }
69 | 
70 | #Constant score of 0, so that all MISs are reported as MWISs; they will then be
71 | #converted to all possible de-knotted structures to determine the minimum free
72 | #energy (MFE)
73 | sub _overall_stability {
74 |     return 0;
75 | }
76 | 
77 | 1;
78 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/DPWriter.pm:
--------------------------------------------------------------------------------
 1 | #Writer for dot-parentheses format
 2 | 
 3 | package DPWriter;
 4 | 
 5 | use strict;
 6 | 
 7 | use constant DOT => '.';
 8 | use constant OPEN_BRACKET => '(';
 9 | use constant CLOSE_BRACKET => ')';
10 | use constant TEMP_DP_FILE => 'MWIS_temp.dp';
11 | 
12 | #Print the base sequence followed by a dot-parentheses structure to the
13 | #currently selected output handle, once per de-knotted solution.
14 | #  $combined_base_pair_removal_pos - array ref of hash refs keyed by the 1-based
15 | #      positions to replace with a dot
16 | #  $structure_symbols - array ref of structure symbols, indexed from 0
17 | #  $base_seq_str      - base sequence as a string
18 | #When the solution list is empty the structure is printed once, unchanged.
19 | sub output_results {
20 |     my (undef, $combined_base_pair_removal_pos, $structure_symbols, $base_seq_str) = @_;
21 | 
22 |     if (@{$combined_base_pair_removal_pos} == 0) {
23 |         print "$base_seq_str\n" . join('', @{$structure_symbols}) . "\n";
24 |     }
25 | 
26 |     foreach my $removed (@{$combined_base_pair_removal_pos}) {
27 |         my $output_structure = '';
28 |         my $pos = 0;
29 |         foreach my $symbol (@{$structure_symbols}) {
30 |             $pos++;
31 |             $output_structure .= exists($removed->{$pos}) ? DOT : $symbol;
32 |         }
33 | 
34 |         print "$base_seq_str\n$output_structure\n";
35 |     }
36 | }
37 | 
38 | #Write one candidate structure to TEMP_DP_FILE (relative to the current working
39 | #directory) as two lines: the base sequence, then the structure with every
40 | #bracket kind collapsed to plain parentheses.
41 | #  $base_pair_removal_pos - hash ref keyed by the 1-based positions to replace
42 | #                           with a dot
43 | #  $paired_pos_ptrs   - array ref of partner positions indexed by 1-based
44 | #                       position (0 = unpaired); used when defined
45 | #  $structure_symbols - array ref of symbols indexed from 0; used only when
46 | #                       $paired_pos_ptrs is undefined
47 | #  $base_seq_str      - base sequence as a string; its length sets the output length
48 | #Dies if the file cannot be opened or closed.
49 | sub output_mfe_candidate {
50 |     my (undef, $base_pair_removal_pos, $paired_pos_ptrs, $structure_symbols, $base_seq_str) = @_;
51 | 
52 |     my $base_seq_len = length($base_seq_str);
53 |     my $output_structure = '';
54 |     if (defined($paired_pos_ptrs)) {
55 |         for (my $i = 1; $i <= $base_seq_len; $i++) {
56 |             if (exists($base_pair_removal_pos->{$i})) {
57 |                 $output_structure .= DOT;
58 |             }
59 |             else {
60 |                 #The partner's side decides whether this end opens or closes
61 |                 my $paired_pos = $paired_pos_ptrs->[$i];
62 |                 if ($paired_pos == 0) {
63 |                     $output_structure .= DOT;
64 |                 }
65 |                 elsif ($i < $paired_pos) {
66 |                     $output_structure .= OPEN_BRACKET;
67 |                 }
68 |                 else {
69 |                     $output_structure .= CLOSE_BRACKET;
70 |                 }
71 |             }
72 |         }
73 |     }
74 |     elsif (defined($structure_symbols)) {
75 |         for (my $i = 1; $i <= $base_seq_len; $i++) {
76 |             $output_structure .= exists($base_pair_removal_pos->{$i}) ? DOT : $structure_symbols->[$i - 1];
77 |         }
78 |     }
79 | 
80 |     #Collapse the other bracket kinds and letter pairs to plain parentheses
81 |     $output_structure =~ s/[\[\{<A-Z]/\(/g;
82 |     $output_structure =~ s/[\]\}>a-z]/\)/g;
83 | 
84 |     #Three-argument open on a lexical handle: no package-global DP handle, and
85 |     #the failure message carries the operating-system reason ($!)
86 |     open(my $dp_fh, '>', TEMP_DP_FILE) or die "Cannot open file at " . TEMP_DP_FILE . ": $!";
87 |     print {$dp_fh} "$base_seq_str\n$output_structure\n";
88 |     close($dp_fh) or die "Cannot close file at " . TEMP_DP_FILE . ": $!";
89 | }
90 | 
91 | 1;
92 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.pssm:
--------------------------------------------------------------------------------
 1 | A     23      3      5      2      0 
 2 | C      0      3      8     22      0 
 3 | U      0     31      0      2      0 
 4 | C      0      3      1     29      0 
 5 | G      1      0     32      0      0 
 6 | U      3     24      0      6      0 
 7 | U      0     13      9     11      0 
 8 | U      0     31      1      1      0 
 9 | G      1      0     32      0      0 
10 | A     26      0      6      1      0 
11 | G      1      0     32      0      0 
12 | C      0      2      0     31      0 
13 | G      3      0     29      1      0 
14 | A     31      0      2      0      0 
15 | G      3      0     22      8      0 
16 | U      3     24      4      2      0 
17 | A     27      3      3      0      0 
18 | U      3     28      0      2      0 
19 | A     32      0      0      1      0 
20 | A     33      0      0      0      0 
21 | A     32      0      0      1      0 
22 | C      3      6      3     21      0 
23 | A     32      1      0      0      0 
24 | G      0      0     32      1      0 
25 | C      7      2     14     10      0 
26 | U      2     14      0     16      1 
27 | G      2      1      9      3     18 
28 | G      8     14      5      4      2 
29 | U      3     29      0      0      1 
30 | U      0     29      0      4      0 
31 | A     32      0      1      0      0 
32 | A     18      3     11      0      1 
33 | G      3      0     29      0      1 
34 | C      3      0      0     30      0 
35 | U      0     26      3      4      0 
36 | C      1      1      0     31      0 
37 | A     31      0      1      0      1 
38 | A     13      0     10      9      1 
39 | A     31      0      1      0      1 
40 | G      0      0     31      1      1 
41 | C      4      0      0     28      1 
42 | G      0      1     30      1      1 
43 | G      2     24      6      1      0 
44 | A      8      8      1     16      0 
45 | G      3      5     15     10      0 
46 | A     13      7      3      9      1 
47 | G      2      3     23      4      1 
48 | C      7      2      9     14      1 
49 | A     11      5     12      3      2 
50 | G      2      4     21      0      6 
51 | A     12      0      2      2     17 
52 | U      2     10      1      0     20 
53 | C      3      2      3     16      9 
54 | U      5      9      3     12      4 
55 | G      2      8     12      6      5 
56 | C      2      3      4     18      6 
57 | U      6     12      7      4      4 
58 | C      6      2     10     12      3 
59 | U      6      6     17      1      3 
60 | C     11      2     12      4      4 
61 | G      2      1     28      0      2 
62 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/BpseqParser.pm:
--------------------------------------------------------------------------------
 1 | #Parser for BPSEQ format
 2 | #It returns primitive pseudoknot objects, base sequence and paired positions
 3 | 
 4 | package BpseqParser;
 5 | 
 6 | use strict;
 7 | 
 8 | sub parse {
 9 |     my (undef, $bpseq_file_path) = @_;
10 | 
11 |     my ($base_seq, $paired_pos_ptrs) = ([], []);
12 |     my ($next_paired_pos, $prev_paired_pos) = ({}, {});
13 |     my $matched_pos = {};
14 |     my $last_paired_pos = 0;
15 |     my $base_count = 0;
16 | 
17 |     open (BPSEQ, "<$bpseq_file_path") or die "Cannot open file at $bpseq_file_path";
18 | 
19 |     while (<BPSEQ>) {
20 | 	if ($_ =~ /^([0-9]+) ([A-Za-z]{1}) ([0-9]+)[\r\n]*$/) {
21 | 	    my ($pos, $base, $paired_pos) = ($1, $2, $3);
22 | 	    if ($pos != ++$base_count) {
23 | 		die "Base position $base_count is missing";
24 | 	    }
25 | 
26 | 	    if ($paired_pos > 0) {
27 | 		if ($pos < $paired_pos) {
28 | 		    $matched_pos->{$pos} = $paired_pos;
29 | 		}
30 | 		else {
31 | 		    if ($matched_pos->{$paired_pos} != $pos) {
32 | 			die "Unmatched pair position $pos and $paired_pos";
33 | 		    }
34 | 		}
35 | 
36 | 		$next_paired_pos->{$last_paired_pos} = $pos;
37 | 		$prev_paired_pos->{$pos} = $last_paired_pos;
38 | 		$last_paired_pos = $pos;
39 | 	    }
40 | 
41 | 	    $paired_pos_ptrs->[$pos] = $paired_pos;
42 | 	    $base_seq->[$pos - 1] = $base;
43 | 	}
44 | 	elsif ($_ !~ /^#.*/ && $_ !~ /^\s+/) {
45 | 	    die "Unknown input: $_";
46 | 	}
47 |     }
48 | 
49 |     $next_paired_pos->{$last_paired_pos} = 0;
50 |     $prev_paired_pos->{0} = $last_paired_pos;
51 | 
52 |     close BPSEQ or die "Cannot close file at $bpseq_file_path";
53 | 
54 |     #Group the base pairs into base pair stems
55 |     my ($stem_outermost_pairs, $stems) = _group_to_stems($next_paired_pos, $prev_paired_pos, $paired_pos_ptrs);
56 |     #Extract primitive pseudoknots from the base pair stems
57 |     my $primitive_pseudoknots = PrimitivePseudoknotExtractor->extract($stem_outermost_pairs, $stems, $paired_pos_ptrs);
58 | 
59 |     return ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count);
60 | }
61 | 
# Walk the linked list of paired positions and collect consecutive, stacked
# base pairs into stems.
#
# Arguments:
#   $next_paired_pos - hash ref: paired position -> next paired position
#                      (key 0 is the list head, value 0 ends the list)
#   $prev_paired_pos - hash ref: paired position -> previous paired position
#   $paired_pos_ptrs - array ref: position -> partner position
#
# Returns ($stem_outermost_pairs, $stems): an array ref with the first
# [left, right] pair of every stem, and a hash ref mapping the left position
# of that first pair to the array ref of all pairs of the stem.
sub _group_to_stems {
    my ($next_paired_pos, $prev_paired_pos, $paired_pos_ptrs) = @_;

    my %stem_by_start;
    my @outermost_pairs;
    my ($open_stem, $previous_pair);

    for (my $pos = $next_paired_pos->{0}; $pos > 0; $pos = $next_paired_pos->{$pos}) {
        my $partner = $paired_pos_ptrs->[$pos];

        # Closing side of a pair: it breaks any stem currently being built
        if ($partner < $pos) {
            undef $previous_pair;
            next;
        }

        my $pair = [$pos, $partner];

        # The pair stacks on the previous one when its right end is the paired
        # position immediately preceding the previous pair's right end.
        my $stacks_on_previous = defined($previous_pair)
            && $prev_paired_pos->{$previous_pair->[1]} == $partner;

        unless ($stacks_on_previous) {
            $open_stem = [];
            $stem_by_start{$pos} = $open_stem;
            push @outermost_pairs, $pair;
        }
        push @{$open_stem}, $pair;

        $previous_pair = $pair;
    }

    return (\@outermost_pairs, \%stem_by_start);
}
96 | 
97 | 1;
98 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/README:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------------------
 2 | FreeKnot
 3 | -------------------------------------------------------------------------
 4 | Authors: Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
 5 | Last updated on 15 Apr 2014
 6 | 
 7 | -------------------------------------------------------------------------
 8 | Purpose
 9 | 
10 | FreeKnot is a tool for RNA pseudoknot removal. It converts any pseudoknot
11 | into nested substructures in RNA secondary structures. It removes some
12 | crossing stems to eliminate crossings based on certain scoring functions
13 | (details will be provided later in this README file) and reports one or
14 | more optimized pseudoknot-free structures.
15 | 
16 | -------------------------------------------------------------------------
17 | Platform and pre-requisites
18 | 
19 | FreeKnot has been tested on various platforms including Linux (Ubuntu),
20 | Mac OS X and Windows. Perl (v5.14 or later) is recommended. Earlier
21 | versions might work but without guarantee. Windows users can download
22 | various Perl distributions for Windows. ViennaRNA package 2.1 is required
23 | for the free energy scoring function.
24 | 
25 | -------------------------------------------------------------------------
26 | Program/Module Description
27 | 
28 | BpseqParser.pm, DPParser.pm     - parser to accept bpseq or
29 |                                   dot-parentheses formats as input
30 | BpseqWriter.pm, DPWriter.pm     - writer to output converted results in
31 |                                   bpseq or dot-parentheses formats
32 | ChordModel.pm, CircleGraph.pm   - graphical object for primitive
33 |                                   pseudoknot representation
34 | MIS.pm				- MIS algorithm (for free energy scoring
35 | 				  function)
36 | MWIS.pm                         - MWIS algorithm
37 | ScoringFunctions.pm             - scoring functions
38 | remove_pseudoknot.pl            - main program for pseudoknot removal
39 | PrimitivePseudoknotExtractor.pm - primitive pseudoknot extraction from
40 |                                   the input secondary structure
41 | BracketPairs.pm                 - processing brackets in input secondary
42 |                                   structure
43 | VertexSubset.pm			- subset object for storing graph
44 | 				  vertices in the MIS algorithm
45 | 
46 | -------------------------------------------------------------------------
47 | Usage
48 | 
FreeKnot is run from the command line. The command is:
50 | 
51 | perl remove_pseudoknot.pl -i <secondary structure format of input file>
52 |     -s <scoring function option> <input file path>
53 | 
54 | Secondary structure format available: dp (dot-parentheses) / bpseq
55 | The secondary structure format for the output file follows that of the
56 | input file. So, if the input file is in bpseq format then the output
57 | file is also in bpseq format. Note that every line of data must end with
58 | a newline character (i.e. \n).
59 | 
60 | Scoring function options: bp (# of base pairs) / stem (# of base pair
61 | stems) / hb (# of hydrogen bonds) / fe (structure overall free energy)
62 | 
The results are written to the console (stdout) by default. They can be
redirected to a file. For example,
65 | 
66 | perl remove_pseudoknot.pl -i bpseq -s bp input.bpseq > output.bpseq
67 | 
68 | -------------------------------------------------------------------------
69 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.prob:
--------------------------------------------------------------------------------
  1 | 1 16 2.4607e-01
  2 | 2 11 4.8058e-04
  3 | 2 15 2.7600e-01
  4 | 2 40 2.7181e-04
  5 | 2 42 5.0162e-03
  6 | 2 43 7.0046e-04
  7 | 2 61 3.1390e-04
  8 | 3 10 4.9712e-04
  9 | 3 14 2.7639e-01
 10 | 3 39 2.2049e-04
 11 | 3 42 2.1906e-03
 12 | 3 43 3.6608e-03
 13 | 4 9 4.9913e-04
 14 | 4 13 2.7651e-01
 15 | 4 40 4.7134e-03
 16 | 4 42 6.8810e-01
 17 | 5 12 2.7567e-01
 18 | 5 34 3.2105e-03
 19 | 5 41 7.0013e-01
 20 | 6 10 1.6400e-04
 21 | 6 11 2.4958e-01
 22 | 6 33 3.2126e-03
 23 | 6 39 1.8319e-01
 24 | 6 40 5.2746e-01
 25 | 7 11 2.1717e-04
 26 | 7 32 3.2300e-03
 27 | 7 37 2.5131e-05
 28 | 7 38 3.5702e-01
 29 | 7 39 3.4129e-01
 30 | 8 31 3.0849e-03
 31 | 8 37 5.4291e-01
 32 | 8 38 1.6981e-01
 33 | 9 30 2.3436e-03
 34 | 9 36 7.1718e-01
 35 | 10 26 8.7135e-05
 36 | 10 29 1.9660e-03
 37 | 10 30 1.5846e-03
 38 | 10 35 7.1728e-01
 39 | 11 25 1.2568e-04
 40 | 11 29 5.5166e-03
 41 | 11 34 7.1395e-01
 42 | 12 24 1.3000e-04
 43 | 12 27 1.1131e-03
 44 | 12 28 1.5587e-02
 45 | 12 33 7.0364e-01
 46 | 12 40 1.8068e-05
 47 | 13 22 1.9136e-04
 48 | 13 25 1.9429e-04
 49 | 13 26 4.2317e-04
 50 | 13 29 9.7416e-04
 51 | 13 30 9.5602e-02
 52 | 13 36 1.4656e-03
 53 | 14 26 3.0481e-02
 54 | 14 29 1.1306e-01
 55 | 14 30 7.3809e-03
 56 | 14 35 1.5024e-03
 57 | 15 22 3.7622e-03
 58 | 15 25 3.5921e-02
 59 | 15 26 1.5459e-04
 60 | 15 29 5.8125e-03
 61 | 15 30 2.9255e-02
 62 | 15 34 1.5186e-03
 63 | 15 41 4.7121e-05
 64 | 16 21 3.4426e-03
 65 | 16 23 2.3995e-04
 66 | 16 24 3.4999e-02
 67 | 16 27 1.3027e-01
 68 | 16 28 2.1053e-03
 69 | 16 31 1.0100e-02
 70 | 16 32 1.1925e-03
 71 | 16 33 1.4888e-03
 72 | 16 40 6.5565e-05
 73 | 17 26 1.2737e-01
 74 | 17 29 4.0552e-02
 75 | 17 30 1.0248e-02
 76 | 18 23 8.8966e-04
 77 | 18 24 2.7892e-03
 78 | 18 27 1.9342e-01
 79 | 18 28 3.8211e-02
 80 | 18 31 7.2033e-03
 81 | 18 32 3.3442e-03
 82 | 18 37 8.8863e-05
 83 | 18 38 1.8289e-04
 84 | 18 39 3.4985e-04
 85 | 18 40 5.0928e-04
 86 | 18 42 5.8368e-05
 87 | 19 26 1.7940e-01
 88 | 19 29 3.9621e-04
 89 | 19 30 8.6332e-03
 90 | 20 26 1.0309e-02
 91 | 20 29 5.0768e-03
 92 | 20 30 2.9850e-02
 93 | 21 26 2.6013e-03
 94 | 21 29 3.8919e-02
 95 | 22 27 1.2742e-02
 96 | 22 28 3.9957e-02
 97 | 22 40 1.4106e-03
 98 | 22 42 4.3392e-03
 99 | 22 61 1.7007e-04
100 | 23 29 1.2407e-04
101 | 23 30 1.3139e-04
102 | 23 35 2.1023e-01
103 | 23 57 2.0713e-04
104 | 24 29 8.2146e-05
105 | 24 34 2.4004e-01
106 | 24 41 2.7964e-02
107 | 24 56 2.2795e-04
108 | 25 33 2.4021e-01
109 | 25 40 2.8015e-02
110 | 25 55 2.2719e-04
111 | 26 31 1.3356e-04
112 | 26 32 2.3811e-01
113 | 26 37 2.0999e-03
114 | 26 38 1.5837e-04
115 | 26 39 2.5461e-02
116 | 26 42 1.5538e-04
117 | 27 34 4.0980e-04
118 | 27 35 8.8677e-03
119 | 27 36 4.1981e-03
120 | 27 41 1.8599e-04
121 | 27 53 2.1974e-04
122 | 28 34 1.0404e-02
123 | 28 35 1.9487e-03
124 | 28 36 2.1486e-03
125 | 28 41 4.1504e-04
126 | 28 52 2.2328e-04
127 | 29 33 9.0913e-03
128 | 29 37 6.7256e-05
129 | 29 38 1.8006e-04
130 | 29 39 1.7555e-04
131 | 29 40 4.1642e-04
132 | 29 51 2.2158e-04
133 | 30 37 1.8760e-04
134 | 30 38 1.7507e-04
135 | 30 39 3.9742e-04
136 | 30 50 2.1004e-04
137 | 33 41 6.4184e-03
138 | 33 48 2.2673e-04
139 | 34 40 6.4329e-03
140 | 34 47 2.2717e-04
141 | 35 39 5.2554e-03
142 | 35 46 2.2709e-04
143 | 36 40 1.2025e-04
144 | 36 42 2.8760e-03
145 | 36 45 2.2703e-04
146 | 39 57 2.1183e-04
147 | 40 56 2.3156e-04
148 | 41 55 2.3165e-04
149 | 41 61 5.8108e-03
150 | 42 54 2.3119e-04
151 | 42 60 3.9038e-03
152 | 43 53 2.3140e-04
153 | 43 59 7.1042e-04
154 | 43 60 9.8885e-01
155 | 44 52 2.2817e-04
156 | 44 59 9.9764e-01
157 | 45 58 9.9940e-01
158 | 46 57 9.9913e-01
159 | 47 56 9.9941e-01
160 | 48 55 9.9908e-01
161 | 49 54 8.1076e-01
162 | 50 54 1.6715e-04
163 | 
164 | 


--------------------------------------------------------------------------------
/utils/getpssm.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | use strict;
  3 | 
  4 | 
  5 | my $ecut=0.001;
  6 | my @AA=qw(A  U  G  C -);
  7 | my %AA2index = ('A'=>'1', 'U'=>'2', 'G'=>'3', 'C'=>'4', '-'=>'5');
  8 | 
  9 | my $seq=$ARGV[0];
 10 | my $aln=$ARGV[1];
 11 | my $outfile=$ARGV[2];
 12 | 
 13 | my @seq=`cat $seq`; chomp(@seq);	
 14 | my $len=length $seq[1];
 15 | 
 16 | #print "parse ...\n";
 17 | my %freq=&wfreq($len, $aln);
 18 | 	
 19 | my @nn=split(//, $seq[1]);
 20 | open(PRO, ">$outfile");
 21 | for(my $i=1; $i<=$len; $i++)
 22 | {
 23 | 	print PRO "$nn[$i-1] ";
 24 | 	foreach my $A(@AA)
 25 | 	{
 26 | 		printf PRO "%6d ", $freq{$i, $A};
 27 | 	}
 28 | 	    
 29 | 	printf PRO "\n";
 30 | }
 31 | close(PRO);
 32 | 	
 33 | 
 34 | 
 35 | sub wfreq
 36 | {
 37 |     my ($len, $file)=@_;
 38 | 
 39 |     my %ALN=();
 40 |     my $Pcount=0;
 41 |     open(ALN,"$file") || die "Cant open $file";
 42 |     while(my $line=<ALN>)
 43 |     {
 44 |         chomp($line);
 45 |         if($line =~ /^>(\S+)/)
 46 |         {
 47 |             my $Pname=$1;            
 48 | #            my $Evalue= $1 if($line =~ /E=(\S+)/);
 49 | #	    last if($Evalue>$ecut);
 50 | 	    $Pcount++;
 51 |             $ALN{$Pcount, 0}=$Pname;
 52 | #            $ALN{$Pcount, 1}=$Evalue;
 53 |         }
 54 |         else
 55 |         {
 56 | 	    $line =~ s/T/U/g;  ###replace T by U	    
 57 |             $ALN{$Pcount, 2}=$line;
 58 |         }
 59 |     }
 60 |     close(ALN);
 61 | 
 62 |     my %freq=();
 63 | 	$Pcount=50000 if($Pcount>50000);
 64 |     printf "%d sequences\n", $Pcount;
 65 |     if($Pcount >= 1)
 66 |     {
 67 |         %freq = &frquency(\%ALN, $Pcount, \%AA2index);
 68 |     }
 69 |     else
 70 |     {
 71 | 	my @Qres   = split(//, $ALN{1, 2});
 72 | 	for(my $j=0; $j<@Qres; $j++)
 73 | 	{
 74 |              foreach my $key (@AA)
 75 |              {
 76 |                  $freq{$j+1, $key}=0;
 77 |              }
 78 | 	}
 79 |     }
 80 | 
 81 |     return %freq;
 82 | }
 83 | 
 84 | 
 85 | sub frquency
 86 | {
 87 |     my ($ALN_ref, $Nseq, $AA_ref)=@_;
 88 |     my %align   = %$ALN_ref;
 89 |     my %AA2in   = %$AA_ref;
 90 | 
 91 |     my @Qres   = split(//, $align{1, 2});
 92 |     my $Ncol   = $#Qres;
 93 |     my %res_count=();
 94 | 
 95 | 
 96 |     my $Qresno=0;
 97 |     my %Qmapping=();
 98 |     for(my $j=0; $j<=$#Qres; $j++)
 99 |     {
100 |         $res_count{$j}=0;
101 |         if($Qres[$j] ne '-')
102 |         {
103 |             $Qresno++;
104 |             $Qmapping{$Qresno}=$j;
105 |         }
106 |     }
107 | 
108 | 
109 |     my @ARR=();
110 |     for(my $i=1; $i<=$Nseq; $i++)
111 |     {
112 |         my @res=split(//, $align{$i, 2});
113 |         for(my $j=0; $j<=$#res; $j++)
114 |         {
115 |             $ARR[$i][$j]=$res[$j];
116 |         }
117 |     }
118 |     my $AAcount = keys %AA2in;
119 |     my %AA_freq=();
120 |     my %sum_seq_weights=();
121 |     my $k=0;
122 | 
123 |     for(my $j=0; $j<=$Ncol; $j++)
124 |     {
125 |         if($Qres[$j] eq '-')
126 |         {
127 |             next;
128 |         }
129 |         $k++;
130 |         foreach my $key (@AA)
131 |         {
132 |             $AA_freq{$k, $key}=0;
133 |         }
134 |         my $w=0;
135 |         for(my $i=1; $i<=$Nseq; $i++)
136 |         {
137 |             my $AAN="";
138 | 	    
139 |             if(!exists $AA2in{$ARR[$i][$j]})
140 |             {
141 | 		print "replace $ARR[$i][$j] by $ARR[1][$j]\n";
142 |                 $AAN=$ARR[1][$j]; #replace nonstandard base in templates by query base
143 |             }
144 |             else
145 |             {
146 |                 $AAN=$ARR[$i][$j];
147 |             }
148 | 
149 | #	    print "$AAN ";
150 |             $AA_freq{$k, $AAN} += 1; ##weighted frequency in clolumn $j
151 |         }
152 | 	#print "\n";
153 | 	
154 |     }
155 |     return %AA_freq;
156 | }
157 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/ChordModel.pm:
--------------------------------------------------------------------------------
  1 | #Chord model of the circle graph representing a primitive pseudoknot. Each chord denotes a unique
  2 | #crossing base pair stem in the primitive pseudoknot. If two stems cross, then their corresponding
  3 | #chords also cross. Each chord is associated with its underlying base pairs.
  4 | 
  5 | package ChordModel;
  6 | 
  7 | use strict;
  8 | 
  9 | sub new {
 10 |     my (undef, $primitive_pseudoknot) = @_;
 11 | 
 12 |     my $prim_pseudoknot_stems = $primitive_pseudoknot->[0];
 13 |     my $chord_end_point_num_map = _get_chord_end_point_num_map($prim_pseudoknot_stems);
 14 | 
 15 |     my ($chord_edges, $all_chord_base_pairs) = ({}, {});
 16 |     my ($chord_end_point_nums, $end_point_to_edge_map, $is_left_end_points) = ([], [], []);
 17 | 
 18 |     foreach (@{$prim_pseudoknot_stems}) {
 19 | 	my $chord_left_end_point_num = $chord_end_point_num_map->{$_->[0][0]};
 20 | 	my $chord_right_end_point_num = $chord_end_point_num_map->{$_->[0][1]};
 21 | 	push @{$chord_end_point_nums}, ($chord_left_end_point_num, $chord_right_end_point_num);
 22 | 	$is_left_end_points->[$chord_left_end_point_num] = 1;
 23 | 	$is_left_end_points->[$chord_right_end_point_num] = 0;
 24 | 	$all_chord_base_pairs->{$chord_left_end_point_num . '-' . $chord_right_end_point_num} = $_;
 25 | 
 26 | 	my $chord_edge = [$chord_left_end_point_num, $chord_right_end_point_num];
 27 | 	$chord_edges->{$chord_left_end_point_num . '-' . $chord_right_end_point_num} = $chord_edge;
 28 | 	$end_point_to_edge_map->[$chord_left_end_point_num] = $chord_edge;
 29 | 	$end_point_to_edge_map->[$chord_right_end_point_num] = $chord_edge;
 30 |     }
 31 | 
 32 |     my @sorted_chord_end_point_nums = sort {$b <=> $a} @{$chord_end_point_nums};
 33 | 
 34 |     my $self = {};
 35 |     $self->{chord_end_point_nums} = \@sorted_chord_end_point_nums;
 36 |     $self->{chord_edges} = $chord_edges;
 37 |     $self->{end_point_to_edge_map} = $end_point_to_edge_map;
 38 |     $self->{is_left_end_points} = $is_left_end_points;
 39 |     $self->{all_chord_base_pairs} = $all_chord_base_pairs;
 40 | 
 41 |     bless $self;
 42 | 
 43 |     return $self;
 44 | }
 45 | 
 46 | sub _get_chord_end_point_num_map {
 47 |     my $prim_pseudoknot_stems = shift;
 48 | 
 49 |     my $stem_end_points = [];
 50 | 
 51 |     foreach (@{$prim_pseudoknot_stems}) {
 52 | 	push @{$stem_end_points}, $_->[0][0];
 53 | 	push @{$stem_end_points}, $_->[0][1];
 54 |     }
 55 | 
 56 |     my @sorted_stem_end_points = sort {$a <=> $b} @{$stem_end_points};
 57 | 
 58 |     my $chord_end_point_num_map = {};
 59 |     for (my $i = 0; $i < @sorted_stem_end_points; $i++) {
 60 | 	$chord_end_point_num_map->{$sorted_stem_end_points[$i]} = $i + 1;
 61 |     }
 62 | 
 63 |     return $chord_end_point_num_map;
 64 | }
 65 | 
 66 | sub get_chord_end_point_nums {
 67 |     my $self = shift;
 68 | 
 69 |     return $self->{chord_end_point_nums};
 70 | }
 71 | 
 72 | sub get_chord_edges {
 73 |     my $self = shift;
 74 | 
 75 |     return $self->{chord_edges};
 76 | }
 77 | 
 78 | sub get_chord_edge_count {
 79 |     my $self = shift;
 80 | 
 81 |     return scalar(keys %{$self->{chord_edges}});
 82 | }
 83 | 
 84 | sub get_chord_edge_by_end_point {
 85 |     my ($self, $end_point_num) = @_;
 86 | 
 87 |     my $end_point_to_edge_map = $self->{end_point_to_edge_map};
 88 | 
 89 |     return $end_point_to_edge_map->[$end_point_num];
 90 | }
 91 | 
 92 | sub is_left_end_point {
 93 |     my ($self, $end_point_num) = @_;
 94 | 
 95 |     my $is_left_end_points = $self->{is_left_end_points};
 96 | 
 97 |     return $is_left_end_points->[$end_point_num];
 98 | }
 99 | 
# Return the stem (array ref of base pairs) behind the chord identified by its
# left and right end point numbers.
sub get_chord_base_pairs {
    my ($self, $chord_left_end_point, $chord_right_end_point) = @_;
    my $chord_key = $chord_left_end_point . '-' . $chord_right_end_point;
    return $self->{all_chord_base_pairs}->{$chord_key};
}
107 | 
108 | 1;
109 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/README.txt:
--------------------------------------------------------------------------------
  1 | -------------------------------------------------------------------------
  2 | 
  3 | FreeKnot
  4 | 
  5 | -------------------------------------------------------------------------
  6 | 
  7 | Authors: Jimmy Ka Ho Chiu and Yi-Ping Phoebe Chen
  8 | 
  9 | Last updated on 15 Apr 2014
 10 | 
 11 | 
 12 | 
 13 | -------------------------------------------------------------------------
 14 | 
 15 | Purpose
 16 | 
 17 | 
 18 | 
 19 | FreeKnot is a tool for RNA pseudoknot removal. It converts any pseudoknot
 20 | 
 21 | into nested substructures in RNA secondary structures. It removes some
 22 | 
 23 | crossing stems to eliminate crossings based on certain scoring functions
 24 | 
 25 | (details will be provided later in this README file) and reports one or
 26 | 
 27 | more optimized pseudoknot-free structures.
 28 | 
 29 | 
 30 | 
 31 | -------------------------------------------------------------------------
 32 | 
 33 | Platform and pre-requisites
 34 | 
 35 | 
 36 | 
 37 | FreeKnot has been tested on various platforms including Linux (Ubuntu),
 38 | 
 39 | Mac OS X and Windows. Perl (v5.14 or later) is recommended. Earlier
 40 | 
 41 | versions might work but without guarantee. Windows users can download
 42 | 
 43 | various Perl distributions for Windows. ViennaRNA package 2.1 is required
 44 | 
 45 | for the free energy scoring function.
 46 | 
 47 | 
 48 | 
 49 | -------------------------------------------------------------------------
 50 | 
 51 | Program/Module Description
 52 | 
 53 | 
 54 | 
 55 | BpseqParser.pm, DPParser.pm     - parser to accept bpseq or
 56 | 
 57 |                                   dot-parentheses formats as input
 58 | 
 59 | BpseqWriter.pm, DPWriter.pm     - writer to output converted results in
 60 | 
 61 |                                   bpseq or dot-parentheses formats
 62 | 
 63 | ChordModel.pm, CircleGraph.pm   - graphical object for primitive
 64 | 
 65 |                                   pseudoknot representation
 66 | 
 67 | MIS.pm				- MIS algorithm (for free energy scoring
 68 | 
 69 | 				  function) 
 70 | MWIS.pm                         - MWIS algorithm
 71 | 
 72 | ScoringFunctions.pm             - scoring functions
 73 | 
 74 | remove_pseudoknot.pl	        - main program for pseudoknot removal
 75 | 
 76 | PrimitivePseudoknotExtractor.pm - primitive pseudoknot extraction from
 77 | 
 78 |                                   the input secondary structure
 79 | 
 80 | BracketPairs.pm                 - processing brackets in input secondary
 81 | 
 82 |                                   structure
 83 | 
 84 | 
 85 | VertexSubset.pm			- subset objects for storing graph
 86 | 				  vertices in the MIS algorithm
 87 | 
 88 | -------------------------------------------------------------------------
 89 | 
 90 | Usage
 91 | 
 92 | 
 93 | 
 94 | FreeKnot is executed in console. The command is:
 95 | 
 96 | 
 97 | 
 98 | perl remove_pseudoknot.pl -i <secondary structure format of input file>
 99 | 
100 |     -s <scoring function option> <input file path>
101 | 
102 | 
103 | 
104 | Secondary structure format available: dp (dot-parentheses) / bpseq
105 | 
106 | The secondary structure format for the output file follows that of the
107 | 
108 | input file. So, if the input file is in bpseq format then the output
109 | 
110 | file is also in bpseq format. Note that every line of data must end with
111 | 
112 | a newline character (i.e. \n).
113 | 
114 | 
115 | 
116 | Scoring function options: bp (# of base pairs) / stem (# of base pair
117 | 
118 | stems) / hb (# of hydrogen bonds) / fe (structure overall free energy)
119 | 
120 | 
121 | 
122 | The results are outputted to the console (stdout) by default. They can be
123 | 
124 | directed to a file. For example,
125 | 
126 | 
127 | 
128 | perl remove_pseudoknot.pl -i bpseq -s bp input.bpseq > output.bpseq
129 | 
130 | 
131 | 
132 | -------------------------------------------------------------------------
133 | 


--------------------------------------------------------------------------------
/utils/SPOT-RNA2.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | import os
 4 | from tqdm import tqdm
 5 | import argparse
 6 | from utils import create_tfr_files, prob_to_secondary_structure
 7 | import time
 8 | start = time.time()
 9 | from argparse import RawTextHelpFormatter
10 | from pathlib import Path
11 | 
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--inputs', default='inputs/single_seq.fasta', type=str, help='Path to input file in fasta format, accept multiple sequences as well in fasta format; default = ''inputs/2zzm-1-B.fasta''\n', metavar='')
14 | parser.add_argument('--outputs',default='outputs/', type=str, help='Path to output files; SPOT-RNA outputs at least three files .ct, .bpseq, and .prob files; default = ''outputs/\n', metavar='')
15 | parser.add_argument('--gpu', default=1, type=int, help='To run on GPU, specifiy GPU number. If only one GPU in computer specifiy 0; default = -1 (no GPU)\n', metavar='')
16 | parser.add_argument('--plots',default=False, type=bool, help='Set this to "True" to get the 2D plots of predicted secondary structure by SPOT-RNA; default = False\n', metavar='')
17 | parser.add_argument('--motifs',default=False, type=bool, help='Set this to "True" to get the motifs of predicted secondary structure by SPOT-RNA; default = False\n', metavar='')
18 | #parser.add_argument('--NC',default=True, type=bool, help='Set this to "False" to predict only canonical pairs; default = True\n', metavar='')
19 | args = parser.parse_args()
20 | 
# Keep TensorFlow quiet: only errors are reported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Directory of this script; the model checkpoints are looked up relative to it.
base_path = os.path.dirname(os.path.realpath(__file__))

create_tfr_files(args)

# Non-empty, stripped lines of the FASTA input. The file is read as
# alternating header / sequence lines (one line per sequence).
with open(args.inputs) as fasta_handle:
    stripped_lines = (raw.strip() for raw in fasta_handle.read().splitlines())
    input_data = [text for text in stripped_lines if text]

count = len(input_data) // 2

ids = [header.replace(">", "") for header in input_data[0:2 * count:2]]
sequences = {
    rna_id: residues.replace(" ", "").replace("T", "U").upper()
    for rna_id, residues in zip(ids, input_data[1:2 * count:2])
}

os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(args.gpu)
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
NUM_MODELS = 4

# The features are read from the .tfrecords file that sits next to the input.
test_loc = ["{}.tfrecords".format(os.path.splitext(args.inputs)[0])]

outputs = dict()
mask = dict()
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    ``x`` may be a scalar or anything ``numpy`` can turn into an array. The
    computation runs in NumPy's extended precision type. ``np.longdouble`` is
    used instead of ``np.float128`` because the latter name is only defined on
    some platforms, while ``np.longdouble`` always exists and is the same type
    wherever ``np.float128`` is available.
    """
    values = np.array(x, dtype=np.longdouble)
    return 1/(1+np.exp(-values))
48 | 
# Run every model of the ensemble over all input sequences. For each model
# the checkpoint is restored into a fresh session, the input pipeline is
# pointed at the .tfrecords file and one prediction is fetched per sequence
# id. The sigmoid of every model's raw output is collected in `outputs`
# (one list per sequence id); `mask` keeps the label mask fetched last.
#for MODEL in range(NUM_MODELS):
for MODEL in [0, 1, 2, 3]:
#for MODEL in [0, 1, 2, 3]:
    print(MODEL)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.allow_soft_placement=True
    config.log_device_placement=False
    print('\nPredicting for SPOT-RNA2 model '+str(MODEL))
    with tf.Session(config=config) as sess:
        # Rebuild the graph from models_ckps/model_<MODEL>.meta and load its weights
        saver = tf.train.import_meta_graph(os.path.join(base_path, 'models_ckps'+'/model_'+str(MODEL)+'.meta'))
        saver.restore(sess, os.path.join(base_path, 'models_ckps'+'/model_'+str(MODEL)))
        graph = tf.get_default_graph()
        # Operations/tensors are looked up by the names stored in the checkpoint graph
        init_test =  graph.get_operation_by_name('make_initializer_1')
        tmp_out = graph.get_tensor_by_name('output_FC/fully_connected/BiasAdd:0')
        name_tensor = graph.get_tensor_by_name('tensors_1/component_0:0')
        RNA_name = graph.get_tensor_by_name('IteratorGetNext:0')
        label_mask = graph.get_tensor_by_name('IteratorGetNext:4')
        # Initialise the dataset iterator with the tfrecords path(s) in test_loc
        sess.run([init_test], feed_dict={name_tensor:test_loc})
        
        pbar = tqdm(total = count)
        # One fetch per id; this relies on the records being stored in the
        # same order as the ids read from the FASTA file.
        for rna in ids:
            # NOTE(review): 'dropout:0' fed with 1 presumably disables dropout - confirm against the training graph
            out = sess.run([tmp_out,RNA_name,label_mask],feed_dict={'dropout:0':1})
            # The name fetched from the record is replaced by the FASTA id
            out[1] = rna

            mask[out[1]] = out[2]
            
            # The first model starts the per-sequence list, later models extend it
            if MODEL == 0:
                outputs[out[1]] = [sigmoid(out[0])]
            else:
                outputs[out[1]].append(sigmoid(out[0]))
            pbar.update(1)
        pbar.close()
    # Drop the imported graph so that the next checkpoint starts from an empty one
    tf.reset_default_graph()
83 | 
84 | 
# Average the per-model probabilities of every sequence and hand the result
# to prob_to_secondary_structure together with its mask, sequence and id.
RNA_ids = list(outputs)
ensemble_outputs = {}

print('\nPost Processing and Saving Output')
for rna_id in RNA_ids:
    #print(rna_id, mask[rna_id].shape, len(sequences[rna_id]))
    averaged = np.mean(outputs[rna_id], axis=0)
    ensemble_outputs[rna_id] = averaged
    prob_to_secondary_structure(averaged, mask[rna_id], sequences[rna_id], rna_id, args)

print('\nFinished!')
end = time.time()
elapsed = end - start
print('\nProcesssing Time {} seconds'.format(elapsed))
97 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/MIS.pm:
--------------------------------------------------------------------------------
 1 | #The MIS algorithm module. It is an extension of the k-MIS algorithm proposed by Byskov (Byskov, J., 2004)
 2 | 
 3 | package MIS;
 4 | 
 5 | use strict;
 6 | 
 7 | use constant D => 3;
 8 | 
 9 | my $miss;
10 | my $checked_sets;
11 | 
# Entry point of the module (called as a class method; the class name is
# ignored). Resets the module-level result list and the cache of already
# accepted sets, runs the branching search on a VertexSubset built from the
# circle graph and returns the array ref of independent sets that were found.
sub get_mis {
    my (undef, $circle_graph) = @_;

    ($miss, $checked_sets) = ([], {});

    #Start the branching search from the vertex subset built for the graph;
    #every accepted set is appended to $miss by _search_mis
    my $start_subset = VertexSubset->new($circle_graph);
    _search_mis($start_subset, [], $circle_graph);

    #The cache is only needed while searching
    undef $checked_sets;

    return $miss;
}
27 | 
# Recursive branching step of the search.
#
# Arguments:
#   $vertex_subset - VertexSubset object holding the vertices still undecided
#   $candidate_set - array ref of the vertices chosen so far
#   $circle_graph  - the circle graph, used for the independence check
#
# Nothing useful is returned; accepted sets are pushed onto the module-level
# $miss and remembered in $checked_sets.
sub _search_mis {
    my ($vertex_subset, $candidate_set, $circle_graph) = @_;

    #No undecided vertex left: the candidate set is complete. It is sorted in
    #place so that equal sets get the same identifier, and it is accepted when
    #it was not accepted before and passes the independence check.
    if ($vertex_subset->get_size() == 0) {
        @{$candidate_set} = sort {$a <=> $b} @{$candidate_set};
        my $set_id = join('-', @{$candidate_set});

        return if exists($checked_sets->{$set_id});
        return unless _is_independent_set($candidate_set, $circle_graph);

        push @{$miss}, $candidate_set;
        $checked_sets->{$set_id} = $candidate_set;
        return;
    }

    my ($top_vertices, $top_degree) = $vertex_subset->get_highest_degree_vertex_info();

    #A vertex of degree D or more exists: branch on the first such vertex
    if ($top_degree >= D) {
        my $pivot = $top_vertices->[0];
        my @pivot_and_neighbours = (@{$vertex_subset->get_adjacent_vertices_at($pivot)}, $pivot);
        my @set_with_pivot = (@{$candidate_set}, $pivot);

        #Branch 1: the pivot joins the candidate set
        #NOTE(review): get_subset presumably drops the listed vertices - confirm in VertexSubset
        _search_mis($vertex_subset->get_subset(\@pivot_and_neighbours), \@set_with_pivot, $circle_graph);

        #Branch 2: the pivot alone is handed to get_subset, the candidate set stays as it is
        _search_mis($vertex_subset->get_subset([$pivot]), $candidate_set, $circle_graph);
        return;
    }

    #All degrees are below D: branch on the first vertex of lowest degree
    my ($bottom_vertices) = $vertex_subset->get_lowest_degree_vertex_info();
    my $pivot = $bottom_vertices->[0];
    my $neighbours = $vertex_subset->get_adjacent_vertices_at($pivot);

    my @pivot_and_neighbours = (@{$neighbours}, $pivot);
    my @set_with_pivot = (@{$candidate_set}, $pivot);

    #Branch 1: the pivot joins the candidate set
    _search_mis($vertex_subset->get_subset(\@pivot_and_neighbours), \@set_with_pivot, $circle_graph);

    #Further branches: each neighbour of the pivot joins the candidate set in turn
    foreach my $neighbour (@{$neighbours}) {
        my @set_with_neighbour = (@{$candidate_set}, $neighbour);
        my @neighbour_and_adjacent = (@{$vertex_subset->get_adjacent_vertices_at($neighbour)}, $neighbour);
        _search_mis($vertex_subset->get_subset(\@neighbour_and_adjacent), \@set_with_neighbour, $circle_graph);
    }

    return;
}
73 | 
#Check whether the vertices of $candidate_set are pairwise non-adjacent in $circle_graph.
#
#The set is walked from its last vertex to its first. The bitstrings of the vertices visited
#so far are accumulated per bitstring segment, and each newly visited vertex must keep every
#accumulated bit inside its own non-adjacent vertex mask.
#
#Returns 1 when no visited vertex rejects the vertices collected before it, 0 otherwise.
sub _is_independent_set {
    my ($candidate_set, $circle_graph) = @_;

    #Per-segment union of the bitstrings of the vertices visited so far
    my $collected_segments = [];

    foreach my $vertex (reverse @{$candidate_set}) {
        my $mask_segments = $circle_graph->get_non_adj_vertex_mask_at($vertex);

        #A collected bit that does not survive the mask belongs to an adjacent vertex
        foreach my $segment (0 .. $#{$collected_segments}) {
            my $collected = $collected_segments->[$segment];
            return 0 if ($collected & $mask_segments->[$segment]) != $collected;
        }

        my ($segment_num, $vertex_bitstring) = @{$circle_graph->get_vertex_bitstring_segment_at($vertex)};
        $collected_segments->[$segment_num] |= $vertex_bitstring;
    }

    return 1;
}
93 | 
94 | 1;
95 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/temp.a2m:
--------------------------------------------------------------------------------
 1 | >6UFJ_A/1-51 Chain A, RNA (50-MER)
 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGA----------
 3 | >6UEY_A/1-50 Chain A, RNA (50-MER)
 4 | ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCGGAGAGCAG-----------
 5 | >HE577054.1/3246821-3246757 Paenibacillus polymyxa M1 main chromosome, complete genome
 6 | ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCGUUCACCG----CGGUGAGG
 7 | >MF288922.1/150528-150592 Bacillus phage Janet, complete genome
 8 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGUAGAGAGG--CCUCUCUAG
 9 | >CP033464.1/4485719-4485655 Brevibacillus laterosporus strain 1821L chromosome, complete genome
10 | ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCGUUGAGAAG--CUUCUCAGG
11 | >KT307976.1/157679-157741 Bacillus phage AvesoBmore, complete genome
12 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGCGGG
13 | >CP032410.1/870062-870126 Brevibacillus laterosporus strain E7593-50 chromosome, complete genome
14 | ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCGUUGAGGAG--CUUCUCAGG
15 | >MK892513.1/27480-27550 Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome
16 | AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGGCGUAUAG--CUAUACGCG
17 | >MF288921.1/151458-151522 Bacillus phage OTooleKemple52, complete genome
18 | ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGUAGAGAGG--CCUCUCUAG
19 | >KJ489397.1/151758-151822 Bacillus phage CAM003, complete genome
20 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGUAGGGAGG--CCUCUCUAG
21 | >KF669647.1/155754-155816 Bacillus phage BigBertha, complete genome
22 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGUGGG
23 | >CP009278.1/2800251-2800310 Sphingobacterium sp. ML3W, complete genome
24 | AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCGCCCCGAUAAUAAUCGGGAG
25 | >CP045298.1/5377890-5377826 Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome
26 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACCGGAUCCGGUGAGG
27 | >KF669662.1/155100-155162 Bacillus phage Spock, complete genome
28 | ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCGUCGCGGAGAUCUCCGCGGG
29 | >KR063281.1/60079-60028 Gordonia phage GMA2, complete genome
30 | ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCGGCC------------GGCG
31 | >KJ489402.1/153758-153819 Bacillus phage Riley, complete genome
32 | ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCGUCGCGGG----C--CCGCG
33 | >CP000154.2/3364238-3364174 Paenibacillus polymyxa E681, complete genome
34 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACUGGA-CCAGUGAGA
35 | >LN852800.1/7754-7693 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110
36 | GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCGUUCCGGUUAUGAUCGGAGG
37 | >CP019039.1/7984-8046 Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence
38 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCGUCCUUCC----GGAAGGGG
39 | >LN852940.1/1904-1844 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268
40 | GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGUCCAGGAUG-AUCCUGGAG
41 | >JN790865.1/35681-35620 Bacillus phage B4, complete genome
42 | AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--UCCCCCGUG
43 | >KY888882.1/156410-156472 Bacillus phage Flapjack, complete genome
44 | ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--CCCUGCG-G
45 | >CP014843.1/29638-29697 Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence
46 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCGUUUCCCUUCUAGGGGAGGU
47 | >CP045906.1/14639513-14639571 Caligus rogercresseyi isolate FCH chromosome 17
48 | UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAGUCUUGCCU--GAGCAAGAG
49 | >HG916826.1/843085-843030 Pseudomonas pseudoalcaligenes CECT 5344 complete genome
50 | CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCGAAAUUCGGGGCCG-----G
51 | >XM_028713395.1/30-87 PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA
52 | ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCACAGCGGCAGCACUGCCGCAG
53 | >AC100771.2/133706-133648 Homo sapiens chromosome 11, clone RP11-159H22, complete sequence
54 | GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCUUUAAAAAAAAUUUUUUUAA
55 | >CP022654.2/63818-63880 Bacillus velezensis strain SCDB 291 chromosome, complete genome
56 | AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCGUCCUUCC----GGAAGGGG
57 | >CP045899.1/5107513-5107456 Caligus rogercresseyi isolate FCH chromosome 10
58 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGCUG---AGCAAGAG
59 | >CP010557.1/4528803-4528858 Raoultella ornithinolytica strain S12, complete genome
60 | CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------UCAGGCAGAUCUGCCAGAG
61 | >MH153801.1/58164-58217 Microbacterium phage Count, complete genome
62 | AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCGGAUAGAUG------UAUUG
63 | >CP045896.1/486401-486459 Caligus rogercresseyi isolate FCH chromosome 7
64 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGC--AUGAGCAAGAG
65 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.a2m:
--------------------------------------------------------------------------------
 1 | >sample_seq
 2 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
 3 | >6UFJ_A/1-51 Chain A, RNA (50-MER)
 4 | ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGA----------
 5 | >6UEY_A/1-50 Chain A, RNA (50-MER)
 6 | ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCGGAGAGCAG-----------
 7 | >HE577054.1/3246821-3246757 Paenibacillus polymyxa M1 main chromosome, complete genome
 8 | ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCGUUCACCG----CGGUGAGG
 9 | >MF288922.1/150528-150592 Bacillus phage Janet, complete genome
10 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGUAGAGAGG--CCUCUCUAG
11 | >CP033464.1/4485719-4485655 Brevibacillus laterosporus strain 1821L chromosome, complete genome
12 | ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCGUUGAGAAG--CUUCUCAGG
13 | >KT307976.1/157679-157741 Bacillus phage AvesoBmore, complete genome
14 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGCGGG
15 | >CP032410.1/870062-870126 Brevibacillus laterosporus strain E7593-50 chromosome, complete genome
16 | ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCGUUGAGGAG--CUUCUCAGG
17 | >MK892513.1/27480-27550 Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome
18 | AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGGCGUAUAG--CUAUACGCG
19 | >MF288921.1/151458-151522 Bacillus phage OTooleKemple52, complete genome
20 | ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGUAGAGAGG--CCUCUCUAG
21 | >KJ489397.1/151758-151822 Bacillus phage CAM003, complete genome
22 | ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGUAGGGAGG--CCUCUCUAG
23 | >KF669647.1/155754-155816 Bacillus phage BigBertha, complete genome
24 | ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGGAUCCCCGUGGG
25 | >CP009278.1/2800251-2800310 Sphingobacterium sp. ML3W, complete genome
26 | AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCGCCCCGAUAAUAAUCGGGAG
27 | >CP045298.1/5377890-5377826 Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome
28 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACCGGAUCCGGUGAGG
29 | >KF669662.1/155100-155162 Bacillus phage Spock, complete genome
30 | ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCGUCGCGGAGAUCUCCGCGGG
31 | >KR063281.1/60079-60028 Gordonia phage GMA2, complete genome
32 | ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCGGCC------------GGCG
33 | >KJ489402.1/153758-153819 Bacillus phage Riley, complete genome
34 | ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCGUCGCGGG----C--CCGCG
35 | >CP000154.2/3364238-3364174 Paenibacillus polymyxa E681, complete genome
36 | GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCGUUCACUGGA-CCAGUGAGA
37 | >LN852800.1/7754-7693 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110
38 | GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCGUUCCGGUUAUGAUCGGAGG
39 | >CP019039.1/7984-8046 Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence
40 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCGUCCUUCC----GGAAGGGG
41 | >LN852940.1/1904-1844 Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268
42 | GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGUCCAGGAUG-AUCCUGGAG
43 | >JN790865.1/35681-35620 Bacillus phage B4, complete genome
44 | AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--UCCCCCGUG
45 | >KY888882.1/156410-156472 Bacillus phage Flapjack, complete genome
46 | ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCGUCGCGGGG--CCCUGCG-G
47 | >CP014843.1/29638-29697 Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence
48 | AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCGUUUCCCUUCUAGGGGAGGU
49 | >CP045906.1/14639513-14639571 Caligus rogercresseyi isolate FCH chromosome 17
50 | UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAGUCUUGCCU--GAGCAAGAG
51 | >HG916826.1/843085-843030 Pseudomonas pseudoalcaligenes CECT 5344 complete genome
52 | CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCGAAAUUCGGGGCCG-----G
53 | >XM_028713395.1/30-87 PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA
54 | ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCACAGCGGCAGCACUGCCGCAG
55 | >AC100771.2/133706-133648 Homo sapiens chromosome 11, clone RP11-159H22, complete sequence
56 | GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCUUUAAAAAAAAUUUUUUUAA
57 | >CP022654.2/63818-63880 Bacillus velezensis strain SCDB 291 chromosome, complete genome
58 | AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCGUCCUUCC----GGAAGGGG
59 | >CP045899.1/5107513-5107456 Caligus rogercresseyi isolate FCH chromosome 10
60 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGCUG---AGCAAGAG
61 | >CP010557.1/4528803-4528858 Raoultella ornithinolytica strain S12, complete genome
62 | CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------UCAGGCAGAUCUGCCAGAG
63 | >MH153801.1/58164-58217 Microbacterium phage Count, complete genome
64 | AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCGGAUAGAUG------UAUUG
65 | >CP045896.1/486401-486459 Caligus rogercresseyi isolate FCH chromosome 7
66 | UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAGUCUUGC--AUGAGCAAGAG
67 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/CircleGraph.pm:
--------------------------------------------------------------------------------
  1 | #Circle graph is the graphical model for a primitive pseudoknot. Each vertex represents a crossing stem of
  2 | #the pseudoknot, and each edge represents a crossing between two stems. The vertex attributes store information
  3 | #such as number of base pairs, paired positions.
  4 | #
  5 | #Every vertex is represented by a unique bitstring and its adjacent vertices are represented by a bitstring
  6 | #mask. The least significant bit (LSB) represents the most preceding vertex and the most significant bit (MSB)
  7 | #represents the least preceding vertex of the knot-stem graph. However, the no. of vertices may exceed the
  8 | #length of one bitstring. To solve this problem, multiple bitstrings are required to form a bitstring long
  9 | #enough for each bit position to uniquely identify a vertex. This 'long' bitstring is disassembled into an
 10 | #array of bitstrings and each array element is called a bitstring segment. Every bit position of the 'long'
 11 | #bitstring is then transformed by a (segment no., segment bitstring) pair.
 12 | 
 13 | package CircleGraph;
 14 | 
 15 | use strict;
 16 | 
 17 | sub new {
 18 |     my (undef, $primitive_pseudoknot, $os_bit) = @_;
 19 | 
 20 |     my $vertex_attrs = [];
 21 | #    my ($stem_pair_counts, $gains) = ([], []);
 22 |     my ($vertex_bitstring_segments, $non_adj_vertex_masks) = ([], []);
 23 | 
 24 |     my ($prim_pseudoknot_stems, $prim_pseudoknot_stem_crossings) = @{$primitive_pseudoknot};
 25 |     my $vertex_count = @{$prim_pseudoknot_stems};
 26 | 
 27 |     my ($bitstring_segment_num, $vertex_bit) = (0, 0);
 28 | 
 29 |     for (my $i = $vertex_count - 1; $i >= 0; $i--) {
 30 | 	my $prim_pseudoknot_stem = $prim_pseudoknot_stems->[$i];
 31 | #	$stem_pair_counts->[$i] = @{$prim_pseudoknot_stem};
 32 | #	$gains->[$i] = $stem_pair_counts->[$i];
 33 | 
 34 | 	$vertex_bitstring_segments->[$i] = [$bitstring_segment_num, 1 << $vertex_bit];
 35 | 	my $non_adj_vertex_mask_bitstrings = [];
 36 | 
 37 | 	my $stem_crossings = $prim_pseudoknot_stem_crossings->[$i];
 38 | 	my $next_crossing_index = @{$stem_crossings} - 1;
 39 | 	my $next_crossing_stem_id;
 40 | 	if ($next_crossing_index >= 0) {
 41 | 	    $next_crossing_stem_id = $stem_crossings->[$next_crossing_index];
 42 | 	}
 43 | 
 44 | 	for (my $j = $vertex_count - 1; $j > $i; $j--) {
 45 | 	    if ($next_crossing_index >= 0 && $j == $next_crossing_stem_id) {
 46 | #		$gains->[$i] -= $stem_pair_counts->[$j];
 47 | #		$gains->[$j] -= $stem_pair_counts->[$i];
 48 | 		if (--$next_crossing_index >= 0) {
 49 | 		    $next_crossing_stem_id = $stem_crossings->[$next_crossing_index];
 50 | 		}
 51 | 	    }
 52 | 	    else {
 53 | 		my $non_adj_vertex_bitstring_segment_num = $vertex_bitstring_segments->[$j][0];
 54 | 		$non_adj_vertex_mask_bitstrings->[$non_adj_vertex_bitstring_segment_num] = $non_adj_vertex_mask_bitstrings->[$non_adj_vertex_bitstring_segment_num] | $vertex_bitstring_segments->[$j][1];
 55 | 	    }
 56 | 	}
 57 | 	
 58 | 	$non_adj_vertex_masks->[$i] = $non_adj_vertex_mask_bitstrings;
 59 | 
 60 | 	if (++$vertex_bit == $os_bit) {
 61 | 	    $bitstring_segment_num++;
 62 | 	    $vertex_bit = 0;
 63 | 	}
 64 |     }
 65 | 
 66 |     for (my $i = 0; $i < $vertex_count; $i++) {
 67 | 	my $attrs = {};
 68 | #	$attrs->{pair_count} = $stem_pair_counts->[$i];
 69 | #	$attrs->{gain} = $gains->[$i];
 70 | 	$attrs->{stem_pairs} = $prim_pseudoknot_stems->[$i];
 71 | 	$vertex_attrs->[$i] = $attrs;
 72 |     }
 73 | 
 74 |     my $self = {};
 75 |     $self->{vertex_count} = $vertex_count;
 76 |     $self->{vertex_attrs} = $vertex_attrs;
 77 |     $self->{edges} = $prim_pseudoknot_stem_crossings;
 78 |     $self->{vertex_bitstring_segments} = $vertex_bitstring_segments;
 79 |     $self->{non_adj_vertex_masks} = $non_adj_vertex_masks;
 80 | 
 81 |     bless $self;
 82 | 
 83 |     return $self;
 84 | }
 85 | 
 86 | sub get_vertex_count {
 87 |     my $self = shift;
 88 | 
 89 |     return $self->{vertex_count};
 90 | }
 91 | 
 92 | sub get_vertex_attrs_at {
 93 |     my ($self, $vertex_num) = @_;
 94 | 
 95 |     if ($vertex_num >= $self->{vertex_count}) {
 96 | 	return [];
 97 |     }
 98 | 
 99 |     my $vertex_attrs = $self->{vertex_attrs};
100 | 
101 |     return $vertex_attrs->[$vertex_num];
102 | }
103 | 
104 | sub get_edges_at {
105 |     my ($self, $vertex_num) = @_;
106 | 
107 |     if ($vertex_num >= $self->{vertex_count}) {
108 | 	return [];
109 |     }
110 | 
111 |     my $edges = $self->{edges};
112 | 
113 |     return $edges->[$vertex_num];
114 | }
115 | 
116 | #Return the bitstring segment of the vertex. Each bitstring segment is a (segment no.,
117 | #segment bitstring) pair.
118 | sub get_vertex_bitstring_segment_at {
119 |     my ($self, $vertex_num) = @_;
120 | 
121 |     if ($vertex_num >= $self->{vertex_count}) {
122 | 	return [];
123 |     }
124 | 
125 |     my $bitstring_segments = $self->{vertex_bitstring_segments};
126 | 
127 |     return $bitstring_segments->[$vertex_num];
128 | }
129 | 
130 | #Returns bitstring segments that filter all the subsequent adjacent vertices
131 | sub get_non_adj_vertex_mask_at {
132 |     my ($self, $vertex_num) = @_;
133 | 
134 |     if ($vertex_num >= $self->{vertex_count}) {
135 | 	return 0;
136 |     }
137 | 
138 |     my $non_adj_vertex_mask_bitstrings = $self->{non_adj_vertex_masks};
139 | 
140 |     return $non_adj_vertex_mask_bitstrings->[$vertex_num];
141 | }
142 | 
143 | 1;
144 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/DPParser.pm:
--------------------------------------------------------------------------------
  1 | #Parser for dot-parentheses format
  2 | #It returns primitive pseudoknot objects, base sequence and dot-parentheses array
  3 | 
  4 | package DPParser;
  5 | 
  6 | use strict;
  7 | 
  8 | use constant DOT => '.';
  9 | 
 10 | sub parse{
 11 |     my (undef, $dp_file_path) = @_;
 12 | 
 13 |     my $primitive_pseudoknots = [];
 14 |     my ($base_seq_str, $secondary_structure) = ('', '');
 15 | 
 16 |     open (DP, "<$dp_file_path") or die "Cannot open file at $dp_file_path";
 17 |     while (<DP>) {
 18 | 	if ($_ =~ /^([A-Za-z]+)[\r\n]*$/) {
 19 | 	    $base_seq_str = $base_seq_str . $1;
 20 | 	}
 21 | 	elsif ($_ =~ /^([\.\(\)\[\]\{\}<>A-Za-z]+)[\r\n]*$/) {
 22 | 	    $secondary_structure = $secondary_structure . $1;
 23 | 	}
 24 | 	elsif ($_ !~ /^#.*/ && $_ !~ /^\s+/) {
 25 | 	    die "Unknown input: $_";
 26 | 	}
 27 |     }
 28 | 
 29 |     close DP or die "Cannot close file at $dp_file_path";
 30 | 
 31 |     if ($base_seq_str eq '') {
 32 | 	die 'Base sequence is missing';
 33 |     }
 34 | 
 35 |     if ($secondary_structure eq '') {
 36 | 	die 'Secondary structure is missing';
 37 |     }
 38 | 
 39 |     if (length($base_seq_str) != length($secondary_structure)) {
 40 | 	die 'Base sequence length not equal to secondary structure length';
 41 |     }
 42 | 
 43 |     #Group the base pairs into base pair stems
 44 |     my ($stem_outermost_pairs, $stems, $paired_pos_ptrs, $structure_symbols) = _group_to_stems($secondary_structure);
 45 |     #Extract primitive pseudoknots from the base pair stems
 46 |     my $primitive_pseudoknots = PrimitivePseudoknotExtractor->extract($stem_outermost_pairs, $stems, $paired_pos_ptrs);
 47 |     my @base_seq = split(//, $base_seq_str);
 48 | 
 49 |     return $primitive_pseudoknots, \@base_seq, $structure_symbols, $base_seq_str;
 50 | }
 51 | 
 52 | sub _group_to_stems {
 53 |     my $secondary_structure = shift;
 54 |     my $stems = {};
 55 |     my ($stem_outermost_pairs, $stem, $outermost_base_pair) = ([], [], []);
 56 |     my $paired_pos_ptrs = [];
 57 |     my $unsettled_bracket_upstream_pos = {};
 58 |     my $next_paired_pos = {};
 59 |     my $last_paired_pos = 0;
 60 | 
 61 |     my @structure_symbols = split(//, $secondary_structure);
 62 |     my $structure_length = scalar @structure_symbols;
 63 | 
 64 |     for (my $i = 0; $i < $structure_length; $i++) {
 65 | 	my $symbol = $structure_symbols[$i];
 66 | 	if ($symbol eq DOT) {
 67 | 	    next;
 68 | 	}
 69 | 	elsif (BracketPairs->is_open_bracket($symbol)) {
 70 | 	    my $unsettled_upstream_pos = $unsettled_bracket_upstream_pos->{$symbol};
 71 | 	    if (!defined($unsettled_upstream_pos)) {
 72 | 		$unsettled_upstream_pos = [];
 73 | 		$unsettled_bracket_upstream_pos->{$symbol} = $unsettled_upstream_pos;
 74 | 	    }
 75 | 
 76 | 	    my $curr_upstream_pos = $i + 1;
 77 | 	    push @{$unsettled_upstream_pos}, $curr_upstream_pos;
 78 | 
 79 | 	    if (defined($outermost_base_pair->[0])) {
 80 | 		($stem_outermost_pairs, $stems, $outermost_base_pair, $stem) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
 81 | 	    }
 82 | 
 83 | 	    $next_paired_pos->{$last_paired_pos} = $curr_upstream_pos;
 84 | 	    $last_paired_pos = $curr_upstream_pos;
 85 | 	}
 86 | 	else {
 87 | 	    my $pair_open_bracket = BracketPairs->get_open_bracket($symbol);
 88 | 	    my $unsettled_upstream_pos = $unsettled_bracket_upstream_pos->{$pair_open_bracket};
 89 | 	    if (defined($unsettled_upstream_pos) && defined($unsettled_upstream_pos->[0])) {
 90 | 		my $paired_upstream_pos = pop @{$unsettled_upstream_pos};
 91 | 		my $curr_downstream_pos = $i + 1;
 92 | 
 93 | 		if (defined($outermost_base_pair->[0])) {
 94 | 		    if ($next_paired_pos->{$paired_upstream_pos} != $outermost_base_pair->[0]) {
 95 | 			($stem_outermost_pairs, $stems, $outermost_base_pair, $stem) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
 96 | 		    }
 97 | 
 98 | 		    $outermost_base_pair = [$paired_upstream_pos, $curr_downstream_pos];
 99 | 		    unshift @{$stem}, $outermost_base_pair;
100 | 		}
101 | 		else {
102 | 		    $outermost_base_pair = [$paired_upstream_pos, $curr_downstream_pos];
103 | 		    $stem = [$outermost_base_pair];
104 | 		}
105 | 
106 | 		$paired_pos_ptrs->[$paired_upstream_pos] = $curr_downstream_pos;
107 | 		$paired_pos_ptrs->[$curr_downstream_pos] = $paired_upstream_pos;
108 | 
109 | 		$next_paired_pos->{$last_paired_pos} = $curr_downstream_pos;
110 | 		$last_paired_pos = $curr_downstream_pos;
111 | 	    }
112 | 	    else {
113 | 		die "Closing bracket $symbol not paired\n";
114 | 	    }
115 | 	}
116 |     }
117 | 
118 |     if (!_is_all_open_bracket_settled($unsettled_bracket_upstream_pos)) {
119 | 	die "Unpaired open bracket remains\n";
120 |     }
121 | 
122 |     if (defined($outermost_base_pair->[0])) {
123 | 	($stem_outermost_pairs, $stems, undef, undef) = _add_to_stems($stem_outermost_pairs, $stems, $outermost_base_pair, $stem);
124 |     }
125 | 
126 |     my @sorted_outermost_pairs = sort {$a->[0] <=> $b->[0]} @{$stem_outermost_pairs};
127 | 
128 |     return (\@sorted_outermost_pairs, $stems, $paired_pos_ptrs, \@structure_symbols);
129 | }
130 | 
131 | sub _add_to_stems {
132 |     my ($stem_outermost_pairs, $stems, $stem_outermost_pair, $stem) = @_;
133 | 
134 |     $stems->{$stem_outermost_pair->[0]} = $stem;
135 |     push @{$stem_outermost_pairs}, $stem_outermost_pair;
136 | 
137 |     return ($stem_outermost_pairs, $stems, [], []);
138 | }
139 | 
140 | sub _is_all_open_bracket_settled {
141 |     my $unsettled_open_bracket_pos = shift;
142 | 
143 |     foreach (values %{$unsettled_open_bracket_pos}) {
144 | 	if (defined($_->[0])) {
145 | 	    return 0;
146 | 	}
147 |     }
148 | 
149 |     return 1;
150 | }
151 | 
152 | 1;
153 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/PrimitivePseudoknotExtractor.pm:
--------------------------------------------------------------------------------
  1 | #Module that extracts primitive pseudoknots from all the base pair stems of the RNA secondary structure
  2 | 
  3 | package PrimitivePseudoknotExtractor;
  4 | 
  5 | use strict;
  6 | 
  7 | sub extract {
  8 |     my (undef, $stem_outermost_pairs, $stems, $paired_pos_ptrs) = @_;
  9 | 
 10 |     #Group together the crossing stems of a pseudoknot
 11 |     my ($knotted_pair_pos_groups, $outermost_pair_crossings) = _group_knotted_outermost_pairs($stem_outermost_pairs);
 12 |     #Create the pseudoknot objects
 13 |     my $primitive_pseudoknots = _get_prim_pseudoknots($stems, $knotted_pair_pos_groups, $outermost_pair_crossings, $paired_pos_ptrs);
 14 | 
 15 |     return $primitive_pseudoknots;
 16 | }
 17 | 
 18 | sub _group_knotted_outermost_pairs {
 19 |     my $stem_outermost_pairs = shift;
 20 | 
 21 |     my $knotted_pair_pos_groups = [];
 22 |     my $outermost_pair_crossings = {};
 23 |     my $paired_pos_to_group_id = {};
 24 |     my $max_group_id;
 25 | 
 26 |     my $outermost_pair_count = @{$stem_outermost_pairs};
 27 | 
 28 |     for (my $i = 0; $i < $outermost_pair_count; $i++) {
 29 | 	my ($curr_pair_upstream_pos, $curr_pair_downstream_pos) = @{$stem_outermost_pairs->[$i]};
 30 | 	my $curr_pair_group_id = $paired_pos_to_group_id->{$curr_pair_upstream_pos};
 31 | 
 32 | 	my $succ_pair_crossings = [];
 33 | 
 34 | 	for (my $j = $i + 1; $j < $outermost_pair_count; $j++) {
 35 | 	    my ($candidate_pair_upstream_pos, $candidate_pair_downstream_pos) = @{$stem_outermost_pairs->[$j]};
 36 | 	    if ($candidate_pair_upstream_pos > $curr_pair_downstream_pos) {
 37 | 		last;
 38 | 	    }
 39 | 
 40 | 	    if ($candidate_pair_downstream_pos > $curr_pair_downstream_pos) {
 41 | 		my $crossing_pair_group_id = $paired_pos_to_group_id->{$candidate_pair_upstream_pos};
 42 | 		if (defined($curr_pair_group_id)) {
 43 | 		    if (!defined($crossing_pair_group_id)) {
 44 | 			push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $candidate_pair_upstream_pos;
 45 | 			push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $candidate_pair_downstream_pos;
 46 | 			$paired_pos_to_group_id->{$candidate_pair_upstream_pos} = $curr_pair_group_id;
 47 | 		    }
 48 | 		    elsif ($crossing_pair_group_id != $curr_pair_group_id) {
 49 | 			my @merged_pos_group = (@{$knotted_pair_pos_groups->[$curr_pair_group_id]}, @{$knotted_pair_pos_groups->[$crossing_pair_group_id]});
 50 | 			$knotted_pair_pos_groups->[$curr_pair_group_id] = \@merged_pos_group;
 51 | 
 52 | 			foreach (@{$knotted_pair_pos_groups->[$crossing_pair_group_id]}) {
 53 | 			    if (exists($paired_pos_to_group_id->{$_})) {
 54 | 				$paired_pos_to_group_id->{$_} = $curr_pair_group_id;
 55 | 			    }
 56 | 			}
 57 | 
 58 | 			delete $knotted_pair_pos_groups->[$crossing_pair_group_id];
 59 | 		    }
 60 | 		}
 61 | 		else {
 62 | 		    if (defined($crossing_pair_group_id)) {
 63 | 			$curr_pair_group_id = $crossing_pair_group_id;
 64 | 			push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $curr_pair_upstream_pos;
 65 | 			push @{$knotted_pair_pos_groups->[$curr_pair_group_id]}, $curr_pair_downstream_pos;
 66 | 		    }
 67 | 		    else {
 68 | 			$curr_pair_group_id = $max_group_id++;
 69 | 			$knotted_pair_pos_groups->[$curr_pair_group_id] = [$curr_pair_upstream_pos, $curr_pair_downstream_pos, $candidate_pair_upstream_pos, $candidate_pair_downstream_pos];
 70 | 			$paired_pos_to_group_id->{$candidate_pair_upstream_pos} = $curr_pair_group_id;
 71 | 		    }
 72 | 		}
 73 | 
 74 | 		push @{$succ_pair_crossings}, $candidate_pair_upstream_pos;
 75 | 	    }
 76 | 	}
 77 | 
 78 | 	$outermost_pair_crossings->{$curr_pair_upstream_pos} = $succ_pair_crossings;
 79 |     }
 80 | 
 81 |     return ($knotted_pair_pos_groups, $outermost_pair_crossings);
 82 | }
 83 | 
 84 | sub _get_prim_pseudoknots {
 85 |     my ($stems, $knotted_pair_pos_groups, $outermost_pair_crossings, $paired_pos_ptrs) = @_;
 86 | 
 87 |     my $primitive_pseudoknots = [];
 88 | 
 89 |     for (my $i = 0; $i < @{$knotted_pair_pos_groups}; $i++) {
 90 | 	if (!defined($knotted_pair_pos_groups->[$i])) {
 91 | 	    next;
 92 | 	}
 93 | 
 94 | 	my @sorted_knot_pair_pos = sort {$a <=> $b} @{$knotted_pair_pos_groups->[$i]};
 95 | 	my $prev_knot_pair_pos = {};
 96 | 	for (my $j = 1; $j < @sorted_knot_pair_pos; $j++) {
 97 | 	    $prev_knot_pair_pos->{$sorted_knot_pair_pos[$j]} = $sorted_knot_pair_pos[$j - 1];
 98 | 	}
 99 | 
100 | 	my ($prim_pseudoknot_stems, $prim_pseudoknot_stem) = ([], []);
101 | 	my $knot_pair_pos_to_stem_id = {};
102 | 	my $max_stem_id = 0;
103 | 
104 | 	for (my $j = 0; $j < (@sorted_knot_pair_pos - 1); $j++) {
105 | 	    my $curr_pos = $sorted_knot_pair_pos[$j];
106 | 	    my $curr_paired_pos = $paired_pos_ptrs->[$curr_pos];
107 | 	    if ($curr_pos > $curr_paired_pos) {
108 | 		next;
109 | 	    }
110 | 
111 | 	    my @merged_stem = (@{$prim_pseudoknot_stem}, @{$stems->{$curr_pos}});
112 | 	    $prim_pseudoknot_stem = \@merged_stem;
113 | 
114 | 	    my $next_pos = $sorted_knot_pair_pos[$j + 1];
115 | 	    my $next_paired_pos = $paired_pos_ptrs->[$next_pos];
116 | 	    if ($prev_knot_pair_pos->{$curr_paired_pos} != $next_paired_pos) {
117 | 		push @{$prim_pseudoknot_stems}, $prim_pseudoknot_stem;
118 | 		$knot_pair_pos_to_stem_id->{$curr_pos} = $max_stem_id++;
119 | 		$prim_pseudoknot_stem = [];
120 | 	    }
121 | 	}
122 | 
123 | 	my $prim_pseudoknot_stem_crossings = [];
124 | 	while (my ($knot_pair_upstream_pos, $stem_id) = each %{$knot_pair_pos_to_stem_id}) {
125 | 	    my $stem_crossings = [];
126 | 	    my $knot_pair_crossings = $outermost_pair_crossings->{$knot_pair_upstream_pos};
127 | 	    foreach (@{$knot_pair_crossings}) {
128 | 		if (exists($knot_pair_pos_to_stem_id->{$_})) {
129 | 		    push @{$stem_crossings}, $knot_pair_pos_to_stem_id->{$_};
130 | 		}
131 | 	    }
132 | 
133 | 	    $prim_pseudoknot_stem_crossings->[$stem_id] = $stem_crossings;
134 | 	}
135 | 
136 | 	push @{$primitive_pseudoknots}, [$prim_pseudoknot_stems, $prim_pseudoknot_stem_crossings];
137 |     }
138 | 
139 |     return $primitive_pseudoknots;
140 | }
141 | 
142 | 1;
143 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/VertexSubset.pm:
--------------------------------------------------------------------------------
  1 | #Module that represents the vertex subset in the MWIS algorithm. All the vertices of the knot-stem
  2 | #graph are added to this subset (with the goal opposing vertices filtered) at initialization. When
  3 | #the MWIS algorithm proceeds, vertices are gradually removed from this subset and the algorithm
  4 | #stops when this subset is empty.
  5 | #
  6 | #This subset also keeps the adjacent vertices for each vertex in it, as well as the vertex degrees.
  7 | #It enables the MWIS algorithm to select the highest degree and lowest degree vertices, and to
  8 | #further generate a new subset of it while updating the adjacent vertices and vertex degrees.
  9 | 
 10 | package VertexSubset;
 11 | 
 12 | use strict;
 13 | 
 14 | sub new {
 15 | #    my (undef, $circle_graph, $stem_scores, $criteria) = @_;
 16 |     my (undef, $circle_graph) = @_;
 17 | 
 18 |     my ($vertex_degrees, $adj_vertex_sets) = ({}, {});
 19 | 
 20 |     my $subset_size = 0;
 21 | 
 22 |     for (my $i = $circle_graph->get_vertex_count() - 1; $i >= 0; $i--) {
 23 | 	$vertex_degrees->{$i} = 0;
 24 | 
 25 | 	foreach (@{$circle_graph->get_edges_at($i)}) {
 26 | 	    $vertex_degrees->{$i}++;
 27 | 	    $vertex_degrees->{$_}++;
 28 | 	    $adj_vertex_sets->{$i}{$_} = 1;
 29 | 	    $adj_vertex_sets->{$_}{$i} = 1;
 30 | 	}
 31 | 
 32 | 	$subset_size++;
 33 |     }
 34 | 
 35 |     my ($highest_degree_vertices, $lowest_degree_vertices, $highest_vertex_degree, $lowest_vertex_degree) = _get_highest_and_lowest_degree_vertices($vertex_degrees);
 36 | 
 37 |     my $self = {};
 38 |     $self->{subset_size} = $subset_size;
 39 |     $self->{vertex_degrees} = $vertex_degrees;
 40 |     $self->{adj_vertex_sets} = $adj_vertex_sets;
 41 |     $self->{highest_degree_vertices} = $highest_degree_vertices;
 42 |     $self->{lowest_degree_vertices} = $lowest_degree_vertices;
 43 |     $self->{highest_vertex_degree} = $highest_vertex_degree;
 44 |     $self->{lowest_vertex_degree} = $lowest_vertex_degree;
 45 | 
 46 |     bless $self;
 47 | 
 48 |     return $self;
 49 | }
 50 | 
 51 | #Generate a new subset instance by removing the vertices specified in the input
 52 | sub get_subset {
 53 |     my ($self, $vertices_to_remove) = @_;
 54 | 
 55 |     my $subset_size = 0;
 56 |     my ($subset_vertex_degrees, $subset_adj_vertex_sets) = ({}, {});
 57 | 
 58 |     my %delete_vertices = map {$_ => 1} @{$vertices_to_remove};
 59 |     my $vertex_degrees = $self->{vertex_degrees};
 60 |     foreach (keys %{$vertex_degrees}) {
 61 | 	if (!exists($delete_vertices{$_})) {
 62 | 	    $subset_vertex_degrees->{$_} = 0;
 63 | 	    $subset_adj_vertex_sets->{$_} = {};
 64 | 	    $subset_size++;
 65 | 	}
 66 |     }
 67 | 
 68 |     my $adj_vertex_sets = $self->{adj_vertex_sets};
 69 |     while (my ($vertex, $adj_vertices) = each %{$adj_vertex_sets}) {
 70 | 	if (!exists($delete_vertices{$vertex})) {
 71 | 	    foreach (keys %{$adj_vertices}) {
 72 | 		if ($vertex < $_ && !exists($delete_vertices{$_})) {
 73 | 		    $subset_adj_vertex_sets->{$vertex}{$_} = 1;
 74 | 		    $subset_adj_vertex_sets->{$_}{$vertex} = 1;
 75 | 		    $subset_vertex_degrees->{$vertex}++;
 76 | 		    $subset_vertex_degrees->{$_}++;
 77 | 		}
 78 | 	    }
 79 | 	}
 80 |     }
 81 | 
 82 |     my ($highest_degree_vertices, $lowest_degree_vertices, $highest_vertex_degree, $lowest_vertex_degree) = _get_highest_and_lowest_degree_vertices($subset_vertex_degrees);
 83 | 
 84 |     my $subset_self = {};
 85 |     $subset_self->{subset_size} = $subset_size;
 86 |     $subset_self->{vertex_degrees} = $subset_vertex_degrees;
 87 |     $subset_self->{adj_vertex_sets} = $subset_adj_vertex_sets;
 88 |     $subset_self->{highest_degree_vertices} = $highest_degree_vertices;
 89 |     $subset_self->{lowest_degree_vertices} = $lowest_degree_vertices;
 90 |     $subset_self->{highest_vertex_degree} = $highest_vertex_degree;
 91 |     $subset_self->{lowest_vertex_degree} = $lowest_vertex_degree;
 92 | 
 93 |     bless $subset_self;
 94 | 
 95 |     return $subset_self;
 96 | }
 97 | 
 98 | sub _get_highest_and_lowest_degree_vertices {
 99 |     my $vertex_degrees = shift;
100 | 
101 |     my ($highest_degree_vertices, $lowest_degree_vertices) = ([], []);
102 |     my ($highest_vertex_degree, $lowest_vertex_degree) = (-1, -1);
103 | 
104 |     while (my ($vertex, $vertex_degree) = each %{$vertex_degrees}) {
105 | 	if ($vertex_degree > $highest_vertex_degree) {
106 | 	    $highest_degree_vertices = [$vertex];
107 | 	    $highest_vertex_degree = $vertex_degree;
108 | 	}
109 | 	elsif ($vertex_degree == $highest_vertex_degree) {
110 | 	    push @{$highest_degree_vertices}, $vertex;
111 | 	}
112 | 
113 | 	if ($vertex_degree < $lowest_vertex_degree || $lowest_vertex_degree < 0) {
114 | 	    $lowest_degree_vertices = [$vertex];
115 | 	    $lowest_vertex_degree = $vertex_degree;
116 | 	}
117 | 	elsif ($vertex_degree == $lowest_vertex_degree) {
118 | 	    push @{$lowest_degree_vertices}, $vertex;
119 | 	}
120 |     }
121 | 
122 |     my @sorted_highest_degree_vertices = sort {$a <=> $b} @{$highest_degree_vertices};
123 |     my @sorted_lowest_degree_vertices = sort {$a <=> $b} @{$lowest_degree_vertices};
124 | 
125 |     return \@sorted_highest_degree_vertices, \@sorted_lowest_degree_vertices, $highest_vertex_degree, $lowest_vertex_degree;
126 | }
127 | 
#Number of vertices currently held by this subset
sub get_size {
    my ($self) = @_;

    return $self->{subset_size};
}
133 | 
#Array ref of all vertex numbers in this subset, in ascending numeric order
sub get_vertices {
    my ($self) = @_;

    my $degree_of = $self->{vertex_degrees};

    return [sort {$a <=> $b} keys %{$degree_of}];
}
141 | 
#Array ref of the vertices adjacent to $vertex within this subset, in ascending numeric order.
#A vertex without an adjacency entry yields an empty array ref.
sub get_adjacent_vertices_at {
    my ($self, $vertex) = @_;

    my $adjacency = $self->{adj_vertex_sets};

    return [] unless exists($adjacency->{$vertex});

    return [sort {$a <=> $b} keys %{$adjacency->{$vertex}}];
}
153 | 
#Two element list: array ref of the highest degree vertices, followed by that degree
sub get_highest_degree_vertex_info {
    my ($self) = @_;

    return @{$self}{qw(highest_degree_vertices highest_vertex_degree)};
}
159 | 
#Two element list: array ref of the lowest degree vertices, followed by that degree
sub get_lowest_degree_vertex_info {
    my ($self) = @_;

    return @{$self}{qw(lowest_degree_vertices lowest_vertex_degree)};
}
165 | 
166 | 1;
167 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/MWIS.pm:
--------------------------------------------------------------------------------
  1 | #Modified circle graph MWIS algorithm based on that proposed by Valiente (Valiente, G., 2003), with
  2 | #enhancement suggested by Nash et al. (Nash, N., Lelait, S., and Gregg, D., 2009). It operates with
  3 | #the chord model and reports either single solution or all solutions according to the user option.
  4 | 
  5 | package MWIS;
  6 | 
  7 | use strict;
  8 | 
#Compute the optimally weighted independent set(s) of chords of a chord model.
#
#Parameters (the first, the invocant, is ignored):
#  $chord_model      - object queried through get_chord_edge_count(), is_left_end_point(),
#                      get_chord_edge_by_end_point(), get_chord_end_point_nums(),
#                      get_chord_edges() and get_chord_base_pairs()
#  $base_seq         - handed unchanged to the scoring function
#  $scoring_function - code ref used by _get_chord_weights to weigh every chord
#  $criteria         - 'max' to prefer larger total weights, 'min' to prefer smaller ones
#  $is_report_all    - when true, equally weighted alternatives are kept as well
#
#Returns an array ref of solutions; each solution is an array ref of chord edges as supplied by
#the chord model (array refs read here as [left end point, right end point]).
sub get_mwis {
    my (undef, $chord_model, $base_seq, $scoring_function, $criteria, $is_report_all) = @_;

    my $chord_weights = _get_chord_weights($chord_model, $base_seq, $scoring_function);

    #Every chord edge contributes two end points
    my $end_point_count = $chord_model->get_chord_edge_count() * 2;

    #Enhancement by Nash et al. to get MWISs (in variable c) and the scores (in variable cmis) in
    #every region bounded by the endpoints of each chord.
    my ($m, $p) = ([], []);
    my ($cmis, $c) = ({}, {});

    #End points are addressed from 1. Slot $end_point_count + 1 exists because position $j + 1
    #is read below. A p entry of [0] stands for "no chord" (_add_front tests for > 0).
    for (my $i = 1; $i <= $end_point_count + 1; $i++) {
	$m->[$i] = 0;
	$p->[$i] = [0];
    }

    #Start position of the next downward sweep; set to the left end point of the chord that
    #was processed last
    my $last = 1;

    for (my $i = 1; $i <= $end_point_count; $i++) {
	#Chords are handled when their right end point is reached
	if ($chord_model->is_left_end_point($i)) {
	    next;
	}

	my ($left_end_point, $right_end_point) = @{$chord_model->get_chord_edge_by_end_point($i)};

	for (my $j = $last; $j > $left_end_point; $j--) {
	    #Default: inherit the result of the position to the right (p shares the array ref)
	    $m->[$j] = $m->[$j + 1];
	    $p->[$j] = $p->[$j + 1];

	    if ($chord_model->is_left_end_point($j)) {
		#Candidate: take the chord starting at $j together with the best result found
		#behind its right end point
		my (undef, $inner_right_end_point) = @{$chord_model->get_chord_edge_by_end_point($j)};
		my $candidate_m = $m->[$inner_right_end_point + 1] + $cmis->{$j . '-' . $inner_right_end_point};

		if (($criteria eq 'max' && $candidate_m > $m->[$j]) ||
		    ($criteria eq 'min' && $candidate_m < $m->[$j])) {
		    $m->[$j] = $candidate_m;
		    $p->[$j] = [$inner_right_end_point];
		}
		elsif ($is_report_all && $candidate_m == $m->[$j]) {
		    #Tie: keep the new chord and the inherited alternatives in a fresh array so
		    #that the array shared with $p->[$j + 1] is not modified
		    my @arr_clone = @{$p->[$j + 1]};
		    $p->[$j] = [$inner_right_end_point];
		    push @{$p->[$j]}, @arr_clone;
		}
	    }
	}

	#Score and chord sets of the region enclosed by the current chord, keyed "left-right"
	$cmis->{$left_end_point . '-' . $right_end_point} = $m->[$left_end_point + 1] + $chord_weights->{$left_end_point . '-' . $right_end_point};
	$c->{$left_end_point . '-' . $right_end_point} = _add_front($p, $left_end_point + 1, $chord_model, []);
	$last = $left_end_point;
    }

    #Algorithm proposed by Valiente to obtain MWISs starting at each endpoint. Only those chords
    #in the MWIS that are not bounded by other chords in the same MWIS set are stored.
    my ($t_structures, $t_struct_weights) = ([], []);

    #NOTE(review): entry $_ + 1 is read while entry $_ is computed, so this loop presumably
    #relies on get_chord_end_point_nums() listing the end points in descending order - confirm
    #against ChordModel
    foreach (@{$chord_model->get_chord_end_point_nums()}) {
	$t_structures->[$_] = [[]];

	if (!$chord_model->is_left_end_point($_)) {
	    #A right end point starts no chord: copy the result of the next position, or use
	    #weight 0 at the very last end point
	    if ($_ < $end_point_count) {
		@{$t_structures->[$_]} = @{$t_structures->[$_ + 1]};
		$t_struct_weights->[$_] = $t_struct_weights->[$_ + 1];
	    }
	    else {
		$t_struct_weights->[$_] = 0;
	    }
	}
	else {
	    #Weight obtained when the chord starting here is selected: its region score plus the
	    #best weight behind its right end point (if there is anything behind it)
	    my $chord_edge = $chord_model->get_chord_edge_by_end_point($_);
	    my $candidate_total_chord_weight = $cmis->{$chord_edge->[0] . '-' . $chord_edge->[1]};

	    if ($chord_edge->[1] < $end_point_count) {
		$candidate_total_chord_weight += $t_struct_weights->[$chord_edge->[1] + 1];
	    }

	    if (($criteria eq 'max' && $candidate_total_chord_weight > $t_struct_weights->[$_ + 1]) ||
		($criteria eq 'min' && $candidate_total_chord_weight < $t_struct_weights->[$_ + 1]) ||
		($candidate_total_chord_weight == $t_struct_weights->[$_ + 1] && $is_report_all)) {
		my $generated_new_t_structures;

		#On a tie the structures that skip this chord are retained too
		if ($candidate_total_chord_weight == $t_struct_weights->[$_ + 1]) {
		    @{$generated_new_t_structures} = @{$t_structures->[$_ + 1]};
		}
		else {
		    $generated_new_t_structures = [];
		}

		#Put the chord in front of a copy of every structure found behind its right end point
		if ($chord_edge->[1] < $end_point_count) {
		    foreach my $t_structure (@{$t_structures->[$chord_edge->[1] + 1]}) {
			my @new_t_structure = @{$t_structure};
			unshift @new_t_structure, $chord_edge;
			push @{$generated_new_t_structures}, \@new_t_structure;
		    }
		}
		else {
		    push @{$generated_new_t_structures}, [$chord_edge];
		}

		$t_structures->[$_] = $generated_new_t_structures;
		$t_struct_weights->[$_] = $candidate_total_chord_weight;
	    }
	    else {
		#Selecting the chord is not better: reuse the result of the next position
		$t_structures->[$_] = $t_structures->[$_ + 1];
		$t_struct_weights->[$_] = $t_struct_weights->[$_ + 1];
	    }
	}
    }

    #Expand the structures found at end point 1 with the chord sets stored per region in c
    my $mwiss = _restore_chord_mwiss($t_structures->[1], $c);

    return $mwiss;
}
122 | 
#Enumerate all the MWISs in the region bounded by the endpoints of a single chord.
#
#Starting at position $start_pos of $p, every end point recorded there is followed recursively
#and the matching chord edge is appended to a copy of each chord set collected so far
#($org_c_element, an array ref of chord edge lists). A leading entry that is not greater than 0
#ends the walk, in which case $org_c_element itself is returned.
sub _add_front {
    my ($p, $start_pos, $chord_model, $org_c_element) = @_;

    my $end_points = $p->[$start_pos];

    return $org_c_element unless $end_points->[0] > 0;

    my @collected_sets;

    foreach my $end_point (@{$end_points}) {
        my $chord_edge = $chord_model->get_chord_edge_by_end_point($end_point);

        #Either start a fresh chord set or extend a copy of every existing one
        my $extended_sets = defined($org_c_element->[0])
            ? [map { [@{$_}, $chord_edge] } @{$org_c_element}]
            : [[$chord_edge]];

        push @collected_sets, @{_add_front($p, $end_point, $chord_model, $extended_sets)};
    }

    return \@collected_sets;
}
156 | 
#Weigh every chord of the chord model with the scoring function.
#
#The scoring function receives a hash ref with the keys base_pairs (the chord's base pairs) and
#pair_count (their number), followed by $base_seq. Returns a hash ref that maps
#"<left end point>-<right end point>" to the chord's weight.
sub _get_chord_weights {
    my ($chord_model, $base_seq, $scoring_function) = @_;

    my %weight_of;

    foreach my $chord_edge (values %{$chord_model->get_chord_edges()}) {
        my ($left_end_point, $right_end_point) = ($chord_edge->[0], $chord_edge->[1]);

        my $base_pairs = $chord_model->get_chord_base_pairs($left_end_point, $right_end_point);
        my %chord_attrs = (
            base_pairs => $base_pairs,
            pair_count => scalar @{$base_pairs},
        );

        my $weight = $scoring_function->(\%chord_attrs, $base_seq);
        $weight_of{join('-', $left_end_point, $right_end_point)} = $weight;
    }

    return \%weight_of;
}
172 | 
#Rebuild complete MWISs from outer chord sets and the per-chord inner sets stored in variable c.
#
#$chord_edge_sets is an array ref of chord edge lists; $c maps "<left>-<right>" to the chord
#sets found inside that chord. For every chord that has inner sets, each partial solution is
#combined with each recursively restored inner solution. Returns an array ref of chord edge
#lists.
sub _restore_chord_mwiss {
    my ($chord_edge_sets, $c) = @_;

    my @restored_mwiss;

    foreach my $outer_set (@{$chord_edge_sets}) {
        my @partial_mwiss = ($outer_set);

        foreach my $chord_edge (@{$outer_set}) {
            my $inner_sets = $c->{join('-', $chord_edge->[0], $chord_edge->[1])};
            next unless defined($inner_sets->[0]);

            my $inner_mwiss = _restore_chord_mwiss($inner_sets, $c);

            #Cross product: every partial solution with every inner solution
            @partial_mwiss = map {
                my $partial = $_;
                map { [@{$partial}, @{$_}] } @{$inner_mwiss};
            } @partial_mwiss;
        }

        push @restored_mwiss, @partial_mwiss;
    }

    return \@restored_mwiss;
}
206 | 
207 | 1;
208 | 


--------------------------------------------------------------------------------
/utils/FreeKnot/remove_pseudoknot.pl:
--------------------------------------------------------------------------------
  1 | #Main program for pseudoknot removal
  2 | #It accepts input RNA secondary structure as BPSEQ format or dot-parentheses format
  3 | #There are four choices of scoring functions: No. of base pairs, no. of stems, no. of hydrogen
  4 | #bonds, and Turner free energy (Turner, D. H. & Mathews, D. H., NAR 2009). The optimization goal
  5 | #for the first three options is to maximize the score as all the choices only give positive values.
  6 | #For the last option, the goal is to minimize the score (i.e. free energy).
  7 | 
  8 | #!/usr/bin/perl
  9 | 
 10 | use BpseqParser;
 11 | use BpseqWriter;
 12 | use BracketPairs;
 13 | use ChordModel;
 14 | use CircleGraph;
 15 | use DPParser;
 16 | use DPWriter;
 17 | use MIS;
 18 | use MWIS;
 19 | use PrimitivePseudoknotExtractor;
 20 | use ScoringFunctions;
 21 | use VertexSubset;
 22 | use strict;
 23 | 
 24 | #OS_BIT specifies the length of a bitstring used in the circle graph
 25 | use constant OS_BIT => 32;
 26 | 
 27 | if (@ARGV < 5) {
 28 |     print "Usage: perl $0 -i <input file format> -s <scoring function> <input file path> [-a : report all optimal solutions]\n";
 29 |     exit;
 30 | }
 31 | 
 32 | my ($input_file_path, $input_file_format, $scoring_fx_option);
 33 | my $is_report_all = 0;
 34 | 
 35 | for (my $i = 0; $i < @ARGV; $i++) {
 36 |     if ($ARGV[$i] eq '-i') {
 37 | 	if (defined($input_file_format)) {
 38 | 	    print "Duplicated input file format specification\n";
 39 | 	    exit;
 40 | 	}
 41 | 	else {
 42 | 	    $input_file_format = $ARGV[++$i];
 43 | 	}
 44 |     }
 45 |     elsif ($ARGV[$i] eq '-s') {
 46 | 	if (defined($scoring_fx_option)) {
 47 | 	    print "Duplicated scoring function specification\n";
 48 | 	    exit;
 49 | 	}
 50 | 	else {
 51 | 	    $scoring_fx_option = $ARGV[++$i];
 52 | 	}
 53 |     }
 54 |     elsif ($ARGV[$i] eq '-a') {
 55 | 	$is_report_all = 1;
 56 |     }
 57 |     elsif (substr($ARGV[$i], 0, 1) eq '-') {
 58 | 	print "Unknown parameter $ARGV[$i]\n";
 59 | 	exit;
 60 |     }
 61 |     elsif (!defined($input_file_path)) {
 62 | 	$input_file_path = $ARGV[$i];
 63 |     }
 64 | }
 65 | 
 66 | if (!defined($input_file_path)) {
 67 |     print "No input file path specified\n";
 68 |     exit;
 69 | }
 70 | 
 71 | #Select the scoring function according to the user option. It will be used to calculate the score of
 72 | #each stem in the MWIS algorithm
 73 | my ($scoring_function, $criteria, $is_fe) = ScoringFunctions->get_scoring_function($scoring_fx_option);
 74 | if (!defined($scoring_function)) {
 75 |     print "Unknown scoring function specified: $scoring_fx_option\n";
 76 |     exit;
 77 | }
 78 | 
 79 | my ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count, $structure_symbols, $base_seq_str);
 80 | 
 81 | #Parse the input structure file to generate pseudoknot objects
 82 | if ($input_file_format eq 'bpseq') {
 83 |     ($primitive_pseudoknots, $base_seq, $paired_pos_ptrs, $base_count) = BpseqParser->parse($input_file_path);
 84 | }
 85 | elsif ($input_file_format eq 'dp') {
 86 |     ($primitive_pseudoknots, $base_seq, $structure_symbols, $base_seq_str) = DPParser->parse($input_file_path);
 87 | }
 88 | else {
 89 |     print "Unknown input file format: $input_file_format\n";
 90 |     exit;
 91 | }
 92 | 
 93 | my $pseudoknot_base_pair_removal_pos = [];
 94 | my $prim_pseudoknot_count = 0;
 95 | 
 96 | #If free energy is selected as the scoring function, then MIS algorithm is applied to generate
 97 | #all MISs of the circle graph, and evaluated the free energy for each of them
 98 | if ($is_fe) {
 99 |     foreach (@{$primitive_pseudoknots}) {
100 | 	my $circle_graph = CircleGraph->new($_, OS_BIT);
101 | 	my $miss = MIS->get_mis($circle_graph, $criteria);
102 | 	my $base_pair_removal_pos = convert_to_base_pair_removal_pos_circle_graph($circle_graph, $miss);
103 | 	push @{$pseudoknot_base_pair_removal_pos}, $base_pair_removal_pos;
104 | 	$prim_pseudoknot_count++;
105 |     }
106 | }
107 | #For other scoring function options, MWIS algorithm is applied to generate one/all MWISs from
108 | #the chord model of the circle graph
109 | else{
110 |     foreach (@{$primitive_pseudoknots}) {
111 | 	my $chord_model = ChordModel->new($_);
112 | 	my $mwiss = MWIS->get_mwis($chord_model, $base_seq, $scoring_function, $criteria, $is_report_all);
113 | 	my $base_pair_removal_pos = convert_to_base_pair_removal_pos($chord_model, $mwiss);
114 | 	push @{$pseudoknot_base_pair_removal_pos}, $base_pair_removal_pos;
115 | 	$prim_pseudoknot_count++;
116 |     }
117 | }
118 | 
119 | #Combine the possible removal positions sets for all primitive pseudoknots
120 | my $combined_base_pair_removal_pos = combine_base_pair_removal_pos($pseudoknot_base_pair_removal_pos, []);
121 | 
122 | #Determine the free energy of every structure converted from the MISs combinations of different
123 | #primitive pseudoknots in the structure. It writes the structure to a temporary file and call
124 | #RNAeval in ViennaRNA package to calculate its free energy
125 | if ($is_fe) {
126 |     my $mfe;
127 |     my $mfe_base_pair_models = [];
128 | 
129 |     if (!defined($base_seq_str)) {
130 | 	$base_seq_str = join('', @{$base_seq});
131 |     }
132 | 
133 |     foreach (@{$combined_base_pair_removal_pos}) {
134 | 	DPWriter->output_mfe_candidate($_, $paired_pos_ptrs, $structure_symbols, $base_seq_str);
135 | 	my $rna_eval_output = `RNAeval < MWIS_temp.dp`;
136 | 	$rna_eval_output =~ /(-?\d+\.\d+)/;
137 | 	if ($1 < $mfe || !defined($mfe)) {
138 | 	    $mfe_base_pair_models = [$_];
139 | 	    $mfe = $1;
140 | 	}
141 | 	elsif ($1 == $mfe) {
142 | 	    push @{$mfe_base_pair_models}, $_;
143 | 	}
144 |     }
145 | 
146 |     $combined_base_pair_removal_pos = $mfe_base_pair_models;
147 | }
148 | 
149 | if ($input_file_format eq 'bpseq') {
150 |     BpseqWriter->output_results($combined_base_pair_removal_pos, $base_seq, $paired_pos_ptrs, $base_count);
151 | }
152 | elsif ($input_file_format eq 'dp') {
153 |     DPWriter->output_results($combined_base_pair_removal_pos, $structure_symbols, $base_seq_str);
154 | }
155 | 
#Translate every MIS of a circle graph into the set of base positions that must be unpaired.
#
#Parameters:
#  $circle_graph - object providing get_vertex_count() and get_vertex_attrs_at($vertex); the
#                  attributes are read through their stem_pairs entry, an array ref of
#                  [upstream position, downstream position] pairs
#  $miss         - array ref of MISs, each an array ref of the vertex numbers that are kept
#
#Returns an array ref with one hash ref per MIS whose keys are the positions of all base pairs
#belonging to vertices that are NOT part of that MIS.
sub convert_to_base_pair_removal_pos_circle_graph {
    my ($circle_graph, $miss) = @_;

    my $base_pair_removal_pos = [];

    foreach my $mis (@{$miss}) {
        #Membership lookup instead of walking the gaps between consecutive entries: the result
        #no longer depends on the MIS being sorted, and an empty MIS removes every vertex
        my %is_kept = map { $_ => 1 } @{$mis || []};

        my $removal_pos = {};
        my $vertex_count = $circle_graph->get_vertex_count();

        for (my $vertex = 0; $vertex < $vertex_count; $vertex++) {
            next if $is_kept{$vertex};

            my $vertex_attrs = $circle_graph->get_vertex_attrs_at($vertex);
            my $stem_pairs = $vertex_attrs->{stem_pairs};
            foreach my $stem_pair (@{$stem_pairs}) {
                my ($pair_upstream_pos, $pair_downstream_pos) = @{$stem_pair};
                $removal_pos->{$pair_upstream_pos} = 1;
                $removal_pos->{$pair_downstream_pos} = 1;
            }
        }

        push @{$base_pair_removal_pos}, $removal_pos;
    }

    return $base_pair_removal_pos;
}
193 | 
#Translate every MWIS of a chord model into the set of base positions that must be unpaired.
#
#For each MWIS (array ref of chord edges) the chords of the model that are absent from it are
#collected, and both positions of all their base pairs become keys of a hash. Returns an array
#ref holding one such hash ref per MWIS.
sub convert_to_base_pair_removal_pos {
    my ($chord_model, $mwiss) = @_;

    my @removal_pos_sets;

    foreach my $mwis (@{$mwiss}) {
        #Start from a copy of all chords and drop the ones selected by the MWIS
        my %dropped_chord_edges = %{$chord_model->get_chord_edges()};
        delete $dropped_chord_edges{join('-', $_->[0], $_->[1])} foreach @{$mwis};

        my %removal_pos;
        foreach my $dropped_edge (values %dropped_chord_edges) {
            my ($left_end_point, $right_end_point) = ($dropped_edge->[0], $dropped_edge->[1]);
            my $base_pairs = $chord_model->get_chord_base_pairs($left_end_point, $right_end_point);

            foreach my $base_pair (@{$base_pairs}) {
                $removal_pos{$base_pair->[0]} = 1;
                $removal_pos{$base_pair->[1]} = 1;
            }
        }

        push @removal_pos_sets, \%removal_pos;
    }

    return \@removal_pos_sets;
}
219 | 
#Merge the alternative removal position sets of all primitive pseudoknots into complete
#alternatives.
#
#$pseudoknot_base_pair_removal_pos is an array ref with one list of alternatives (hash refs)
#per pseudoknot; it is consumed from the back while it is processed. Each alternative of a
#pseudoknot is merged with each combination built so far, starting from
#$combined_base_pair_removal_pos. Returns an array ref of the merged hash refs.
sub combine_base_pair_removal_pos {
    my ($pseudoknot_base_pair_removal_pos, $combined_base_pair_removal_pos) = @_;

    my $combinations = $combined_base_pair_removal_pos;

    while (1) {
        my $alternatives = pop @{$pseudoknot_base_pair_removal_pos};
        my $expanded = [];

        foreach my $removal_pos (@{$alternatives}) {
            if (!defined($combinations->[0])) {
                #Nothing to merge with yet: the alternative is taken over as it is
                push @{$expanded}, $removal_pos;
                next;
            }

            foreach my $combination (@{$combinations}) {
                push @{$expanded}, {%{$removal_pos}, %{$combination}};
            }
        }

        $combinations = $expanded;

        last unless defined($pseudoknot_base_pair_removal_pos->[0]);
    }

    return $combinations;
}
243 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.log_gremlin:
--------------------------------------------------------------------------------
  1 | # ---------------------------------------------------------------------------------------------
  2 | #                                GREMLIN_CPP v1.0                                              
  3 | # ---------------------------------------------------------------------------------------------
  4 | #   -i           /home/jaswinder/github/SPOT-RNA2/sample_run/sample_seq_features/sample_seq.a2m
  5 | #   -o           /home/jaswinder/github/SPOT-RNA2/sample_run/sample_seq_features/sample_seq.dca
  6 | # ---------------------------------------------------------------------------------------------
  7 | #   -only_neff   0
  8 | #   -only_v      0
  9 | #   -gap_cutoff  0.5
 10 | #   -alphabet    rna
 11 | #   -eff_cutoff  0.8
 12 | #   -lambda      0.01
 13 | # ---------------------------------------------------------------------------------------------
 14 | #   -min_type    lbfgs
 15 | #   -max_iter    100
 16 | # ---------------------------------------------------------------------------------------------
 17 | # removing 3 out of 61 positions with >= 50% gaps!
 18 | # SEQ ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCGGAGAGCAGAUCUGCUCUCG
 19 | # CUT ACUCGUUUGAGCGAGUAUAAACAGCU-GUUAAGCUCAAAGCGGAGAGCAG--CUGCUCUCG
 20 | # NC 58
 21 | # NEFF 16.9
 22 | # learning MRF ...
 23 | # lbfgs::iter S_S fx: 1577.57 gnorm: 79.1061
 24 | # lbfgs::iter 0_1 fx: 1569.68 gnorm: 78.7645
 25 | # lbfgs::iter 1_1 fx: 867.926 gnorm: 14.9869
 26 | # lbfgs::iter 2_1 fx: 835.903 gnorm: 7.66255
 27 | # lbfgs::iter 3_1 fx: 823.585 gnorm: 6.19229
 28 | # lbfgs::iter 4_1 fx: 814.088 gnorm: 5.88803
 29 | # lbfgs::iter 5_1 fx: 808.359 gnorm: 3.65714
 30 | # lbfgs::iter 6_1 fx: 804.819 gnorm: 2.33437
 31 | # lbfgs::iter 7_1 fx: 801.784 gnorm: 2.29087
 32 | # lbfgs::iter 8_1 fx: 801.007 gnorm: 3.56033
 33 | # lbfgs::iter 9_1 fx: 800.087 gnorm: 0.639645
 34 | # lbfgs::iter 10_1 fx: 800.041 gnorm: 0.287188
 35 | # lbfgs::iter 11_1 fx: 800.021 gnorm: 0.235963
 36 | # lbfgs::iter 12_1 fx: 799.996 gnorm: 0.228575
 37 | # lbfgs::iter 13_1 fx: 799.98 gnorm: 0.446223
 38 | # lbfgs::iter 14_1 fx: 799.957 gnorm: 0.200702
 39 | # lbfgs::iter 15_1 fx: 799.942 gnorm: 0.176329
 40 | # lbfgs::iter 16_1 fx: 799.93 gnorm: 0.171215
 41 | # lbfgs::iter 17_1 fx: 799.93 gnorm: 0.42462
 42 | # lbfgs::iter 18_1 fx: 799.916 gnorm: 0.110312
 43 | # lbfgs::iter 19_1 fx: 799.914 gnorm: 0.0797637
 44 | # lbfgs::iter 20_1 fx: 799.91 gnorm: 0.0900997
 45 | # lbfgs::iter 21_1 fx: 799.907 gnorm: 0.112634
 46 | # lbfgs::iter 22_1 fx: 799.906 gnorm: 0.106627
 47 | # lbfgs::iter 23_1 fx: 799.905 gnorm: 0.0325551
 48 | # lbfgs::iter 24_1 fx: 799.905 gnorm: 0.0234513
 49 | # lbfgs::iter 25_1 fx: 799.905 gnorm: 0.0211741
 50 | # lbfgs::iter 26_1 fx: 799.905 gnorm: 0.0347908
 51 | # lbfgs::iter 27_1 fx: 799.905 gnorm: 0.00985535
 52 | # lbfgs::iter 28_1 fx: 799.905 gnorm: 0.00879822
 53 | # lbfgs::iter 29_1 fx: 799.904 gnorm: 0.00837304
 54 | # lbfgs::iter 30_1 fx: 799.904 gnorm: 0.0161944
 55 | # lbfgs::iter 31_1 fx: 799.904 gnorm: 0.00567908
 56 | # lbfgs::iter 32_1 fx: 799.904 gnorm: 0.00464024
 57 | # lbfgs::iter 33_1 fx: 799.904 gnorm: 0.00515186
 58 | # lbfgs::iter 34_1 fx: 799.904 gnorm: 0.00769589
 59 | # lbfgs::iter 35_1 fx: 799.904 gnorm: 0.00417999
 60 | # lbfgs::iter 36_1 fx: 799.904 gnorm: 0.00196562
 61 | # lbfgs::iter 37_1 fx: 799.904 gnorm: 0.00138669
 62 | # lbfgs::iter 38_1 fx: 799.904 gnorm: 0.00217301
 63 | # lbfgs::iter 39_1 fx: 799.904 gnorm: 0.00122576
 64 | # lbfgs::iter 40_1 fx: 799.904 gnorm: 0.000992969
 65 | # lbfgs::iter 41_1 fx: 799.904 gnorm: 0.00110079
 66 | # lbfgs::iter 42_1 fx: 799.904 gnorm: 0.00177581
 67 | # lbfgs::iter 43_1 fx: 799.904 gnorm: 0.000827648
 68 | # lbfgs::iter 44_1 fx: 799.904 gnorm: 0.000548524
 69 | # lbfgs::iter 45_1 fx: 799.904 gnorm: 0.000489395
 70 | # lbfgs::iter 46_1 fx: 799.904 gnorm: 0.000693037
 71 | # lbfgs::iter 47_1 fx: 799.904 gnorm: 0.00017789
 72 | # lbfgs::iter 48_1 fx: 799.904 gnorm: 0.000143922
 73 | # lbfgs::iter 49_1 fx: 799.904 gnorm: 0.000167766
 74 | # lbfgs::iter 50_1 fx: 799.904 gnorm: 0.000338155
 75 | # lbfgs::iter 51_1 fx: 799.904 gnorm: 0.000158961
 76 | # lbfgs::iter 52_1 fx: 799.904 gnorm: 0.000106748
 77 | # lbfgs::iter 53_1 fx: 799.904 gnorm: 0.000105824
 78 | # lbfgs::iter 54_1 fx: 799.904 gnorm: 0.000298927
 79 | # lbfgs::iter 55_1 fx: 799.904 gnorm: 7.52617e-05
 80 | # lbfgs::iter 56_1 fx: 799.904 gnorm: 5.87916e-05
 81 | # lbfgs::iter 57_1 fx: 799.904 gnorm: 6.59898e-05
 82 | # lbfgs::iter 58_1 fx: 799.904 gnorm: 0.000150251
 83 | # lbfgs::iter 59_1 fx: 799.904 gnorm: 4.73333e-05
 84 | # lbfgs::iter 60_1 fx: 799.904 gnorm: 3.42972e-05
 85 | # lbfgs::iter 61_1 fx: 799.904 gnorm: 3.77292e-05
 86 | # lbfgs::iter 62_1 fx: 799.904 gnorm: 3.80927e-05
 87 | # lbfgs::iter 63_1 fx: 799.904 gnorm: 9.36524e-05
 88 | # lbfgs::iter 64_1 fx: 799.904 gnorm: 1.61026e-05
 89 | # lbfgs::iter 65_1 fx: 799.904 gnorm: 1.07761e-05
 90 | # lbfgs::iter 66_1 fx: 799.904 gnorm: 8.87304e-06
 91 | # lbfgs::iter 67_1 fx: 799.904 gnorm: 1.4495e-05
 92 | # lbfgs::iter 68_1 fx: 799.904 gnorm: 7.46466e-06
 93 | # lbfgs::iter 69_1 fx: 799.904 gnorm: 6.47399e-06
 94 | # lbfgs::iter 70_1 fx: 799.904 gnorm: 7.70032e-06
 95 | # lbfgs::iter 71_1 fx: 799.904 gnorm: 1.43926e-05
 96 | # lbfgs::iter 72_1 fx: 799.904 gnorm: 3.22099e-06
 97 | # lbfgs::iter 73_1 fx: 799.904 gnorm: 2.48525e-06
 98 | # lbfgs::iter 74_1 fx: 799.904 gnorm: 2.60497e-06
 99 | # lbfgs::iter 75_1 fx: 799.904 gnorm: 3.34597e-06
100 | # lbfgs::iter 76_1 fx: 799.904 gnorm: 7.18017e-06
101 | # lbfgs::iter 77_1 fx: 799.904 gnorm: 1.53372e-06
102 | # lbfgs::iter 78_1 fx: 799.904 gnorm: 1.04848e-06
103 | # lbfgs::iter 79_1 fx: 799.904 gnorm: 9.07634e-07
104 | # lbfgs::iter 80_1 fx: 799.904 gnorm: 1.613e-06
105 | # lbfgs::iter 81_1 fx: 799.904 gnorm: 7.12023e-07
106 | # lbfgs::iter 82_1 fx: 799.904 gnorm: 5.6185e-07
107 | # lbfgs::iter 83_1 fx: 799.904 gnorm: 5.75372e-07
108 | # lbfgs::iter 84_1 fx: 799.904 gnorm: 1.19944e-06
109 | # lbfgs::iter 85_1 fx: 799.904 gnorm: 4.0767e-07
110 | # lbfgs::iter 86_1 fx: 799.904 gnorm: 2.73253e-07
111 | # lbfgs::iter 87_1 fx: 799.904 gnorm: 2.46659e-07
112 | # lbfgs::iter 88_1 fx: 799.904 gnorm: 4.38814e-07
113 | # lbfgs::iter 89_1 fx: 799.904 gnorm: 2.32778e-07
114 | # lbfgs::iter 90_1 fx: 799.904 gnorm: 1.52613e-07
115 | # lbfgs::iter 91_1 fx: 799.904 gnorm: 1.24444e-07
116 | # lbfgs::iter 92_1 fx: 799.904 gnorm: 1.97241e-07
117 | # lbfgs::iter 93_1 fx: 799.904 gnorm: 8.6033e-08
118 | # lbfgs::iter 94_1 fx: 799.904 gnorm: 7.09053e-08
119 | # lbfgs::iter 95_1 fx: 799.904 gnorm: 6.83742e-08
120 | # lbfgs::iter 96_1 fx: 799.904 gnorm: 1.44261e-07
121 | # lbfgs::iter 97_1 fx: 799.904 gnorm: 3.08042e-08
122 | # lbfgs::iter 98_1 fx: 799.904 gnorm: 2.62992e-08
123 | # lbfgs::iter 99_1 fx: 799.904 gnorm: 2.98886e-08
124 | # lbfgs::iter S_S fx: 799.904 gnorm: 152.499
125 | # lbfgs::iter 0_1 fx: 784.797 gnorm: 149.682
126 | # lbfgs::iter 1_1 fx: 395.893 gnorm: 133.288
127 | # lbfgs::iter 2_2 fx: 364.519 gnorm: 99.8387
128 | # lbfgs::iter 3_1 fx: 298.457 gnorm: 29.6813
129 | # lbfgs::iter 4_1 fx: 287.945 gnorm: 19.0757
130 | # lbfgs::iter 5_1 fx: 282.982 gnorm: 11.4838
131 | # lbfgs::iter 6_1 fx: 280.631 gnorm: 10.3144
132 | # lbfgs::iter 7_1 fx: 279.448 gnorm: 6.3234
133 | # lbfgs::iter 8_1 fx: 279.008 gnorm: 3.7695
134 | # lbfgs::iter 9_1 fx: 278.802 gnorm: 2.4342
135 | # lbfgs::iter 10_1 fx: 278.677 gnorm: 2.46252
136 | # lbfgs::iter 11_1 fx: 278.606 gnorm: 2.49599
137 | # lbfgs::iter 12_1 fx: 278.556 gnorm: 1.34479
138 | # lbfgs::iter 13_1 fx: 278.511 gnorm: 1.29845
139 | # lbfgs::iter 14_1 fx: 278.466 gnorm: 1.43844
140 | # lbfgs::iter 15_1 fx: 278.428 gnorm: 3.48674
141 | # lbfgs::iter 16_1 fx: 278.349 gnorm: 1.66639
142 | # lbfgs::iter 17_1 fx: 278.274 gnorm: 1.79046
143 | # lbfgs::iter 18_1 fx: 278.147 gnorm: 2.82733
144 | # lbfgs::iter 19_1 fx: 277.925 gnorm: 4.0464
145 | # lbfgs::iter 20_1 fx: 277.712 gnorm: 7.464
146 | # lbfgs::iter 21_1 fx: 277.334 gnorm: 3.37071
147 | # lbfgs::iter 22_1 fx: 277.034 gnorm: 3.46683
148 | # lbfgs::iter 23_1 fx: 276.762 gnorm: 3.62724
149 | # lbfgs::iter 24_2 fx: 276.697 gnorm: 3.55826
150 | # lbfgs::iter 25_1 fx: 276.443 gnorm: 2.13056
151 | # lbfgs::iter 26_2 fx: 276.428 gnorm: 1.77109
152 | # lbfgs::iter 27_1 fx: 276.376 gnorm: 1.13688
153 | # lbfgs::iter 28_1 fx: 276.358 gnorm: 0.928894
154 | # lbfgs::iter 29_1 fx: 276.357 gnorm: 1.96084
155 | # lbfgs::iter 30_1 fx: 276.342 gnorm: 0.534038
156 | # lbfgs::iter 31_1 fx: 276.338 gnorm: 0.423256
157 | # lbfgs::iter 32_1 fx: 276.33 gnorm: 0.588926
158 | # lbfgs::iter 33_1 fx: 276.323 gnorm: 0.630135
159 | # lbfgs::iter 34_1 fx: 276.312 gnorm: 1.41765
160 | # lbfgs::iter 35_1 fx: 276.293 gnorm: 0.710723
161 | # lbfgs::iter 36_1 fx: 276.277 gnorm: 0.761555
162 | # lbfgs::iter 37_1 fx: 276.25 gnorm: 1.13114
163 | # lbfgs::iter 38_1 fx: 276.24 gnorm: 2.68521
164 | # lbfgs::iter 39_1 fx: 276.209 gnorm: 1.1845
165 | # lbfgs::iter 40_1 fx: 276.183 gnorm: 0.777926
166 | # lbfgs::iter 41_1 fx: 276.163 gnorm: 0.93568
167 | # lbfgs::iter 42_1 fx: 276.145 gnorm: 1.4975
168 | # lbfgs::iter 43_1 fx: 276.134 gnorm: 0.83518
169 | # lbfgs::iter 44_1 fx: 276.129 gnorm: 0.427227
170 | # lbfgs::iter 45_1 fx: 276.125 gnorm: 0.378219
171 | # lbfgs::iter 46_1 fx: 276.119 gnorm: 0.444098
172 | # lbfgs::iter 47_1 fx: 276.115 gnorm: 0.874621
173 | # lbfgs::iter 48_1 fx: 276.109 gnorm: 0.501375
174 | # lbfgs::iter 49_1 fx: 276.102 gnorm: 0.515069
175 | # lbfgs::iter 50_1 fx: 276.094 gnorm: 0.662954
176 | # lbfgs::iter 51_1 fx: 276.079 gnorm: 1.44839
177 | # lbfgs::iter 52_1 fx: 276.055 gnorm: 1.06933
178 | # lbfgs::iter 53_1 fx: 276.023 gnorm: 0.856106
179 | # lbfgs::iter 54_1 fx: 275.997 gnorm: 1.6488
180 | # lbfgs::iter 55_1 fx: 275.976 gnorm: 0.904621
181 | # lbfgs::iter 56_1 fx: 275.967 gnorm: 0.634115
182 | # lbfgs::iter 57_1 fx: 275.956 gnorm: 0.598754
183 | # lbfgs::iter 58_1 fx: 275.949 gnorm: 0.767746
184 | # lbfgs::iter 59_1 fx: 275.942 gnorm: 0.506698
185 | # lbfgs::iter 60_1 fx: 275.935 gnorm: 0.478146
186 | # lbfgs::iter 61_1 fx: 275.93 gnorm: 0.839587
187 | # lbfgs::iter 62_1 fx: 275.922 gnorm: 0.640863
188 | # lbfgs::iter 63_1 fx: 275.895 gnorm: 0.832216
189 | # lbfgs::iter 64_1 fx: 275.881 gnorm: 1.28579
190 | # lbfgs::iter 65_1 fx: 275.865 gnorm: 0.934848
191 | # lbfgs::iter 66_1 fx: 275.833 gnorm: 0.930706
192 | # lbfgs::iter 67_1 fx: 275.815 gnorm: 1.23511
193 | # lbfgs::iter 68_1 fx: 275.793 gnorm: 0.828008
194 | # lbfgs::iter 69_1 fx: 275.777 gnorm: 0.945493
195 | # lbfgs::iter 70_1 fx: 275.769 gnorm: 0.724122
196 | # lbfgs::iter 71_1 fx: 275.765 gnorm: 0.422678
197 | # lbfgs::iter 72_1 fx: 275.761 gnorm: 0.263507
198 | # lbfgs::iter 73_1 fx: 275.759 gnorm: 0.253831
199 | # lbfgs::iter 74_1 fx: 275.758 gnorm: 0.341821
200 | # lbfgs::iter 75_1 fx: 275.758 gnorm: 0.102256
201 | # lbfgs::iter 76_1 fx: 275.758 gnorm: 0.0740558
202 | # lbfgs::iter 77_1 fx: 275.758 gnorm: 0.0552766
203 | # lbfgs::iter 78_1 fx: 275.758 gnorm: 0.170435
204 | # lbfgs::iter 79_1 fx: 275.757 gnorm: 0.0399481
205 | # lbfgs::iter 80_1 fx: 275.757 gnorm: 0.0298459
206 | # lbfgs::iter 81_1 fx: 275.757 gnorm: 0.0316006
207 | # lbfgs::iter 82_1 fx: 275.757 gnorm: 0.0479131
208 | # lbfgs::iter 83_1 fx: 275.757 gnorm: 0.0287621
209 | # lbfgs::iter 84_1 fx: 275.757 gnorm: 0.0368384
210 | # lbfgs::iter 85_1 fx: 275.757 gnorm: 0.0438442
211 | # lbfgs::iter 86_1 fx: 275.757 gnorm: 0.146457
212 | # lbfgs::iter 87_1 fx: 275.757 gnorm: 0.05263
213 | # lbfgs::iter 88_1 fx: 275.757 gnorm: 0.0397041
214 | # lbfgs::iter 89_1 fx: 275.757 gnorm: 0.0548509
215 | # lbfgs::iter 90_1 fx: 275.757 gnorm: 0.0667815
216 | # lbfgs::iter 91_1 fx: 275.757 gnorm: 0.182811
217 | # lbfgs::iter 92_1 fx: 275.757 gnorm: 0.0391792
218 | # lbfgs::iter 93_1 fx: 275.757 gnorm: 0.0289307
219 | # lbfgs::iter 94_1 fx: 275.757 gnorm: 0.0297646
220 | # lbfgs::iter 95_1 fx: 275.757 gnorm: 0.0807598
221 | # lbfgs::iter 96_1 fx: 275.757 gnorm: 0.0366649
222 | # lbfgs::iter 97_1 fx: 275.757 gnorm: 0.0289475
223 | # lbfgs::iter 98_1 fx: 275.757 gnorm: 0.0366467
224 | # lbfgs::iter 99_1 fx: 275.757 gnorm: 0.0515997
225 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SPOT-RNA2
  2 | Improved RNA Secondary Structure and Tertiary Base-pairing Prediction using Evolutionary Profile, Mutational Coupling and Two-dimensional Transfer Learning.
  3 | 
  4 | ## Contents
  5 | 
  6 |   * [Introduction](#introduction)
  7 |   * [Results](#results)
  8 |   * [System Requirments](#system-requirments)
  9 |   * [Installation](#installation)
 10 |   * [Usage](#usage)
 11 |   * [Datasets](#datasets)
 12 |   * [Citation guide](#citation-guide)
 13 |   * [Licence](#licence)
 14 |   * [Contact](#contact)
 15 | 
 16 | ## Introduction
 17 | 
 18 | The recent discovery of numerous non-coding RNAs (long non-coding RNAs, in particular) has transformed our perception about the roles of RNAs in living organisms. Our ability to understand them, however, is hampered by our inability to solve their secondary and tertiary structures in high resolution efficiently by existing experimental techniques. Computational prediction of RNA secondary structure, on the other hand, has received much-needed improvement, recently, through deep learning of a large approximate data, followed by transfer learning with gold-standard base-pairing structures from high-resolution 3-D structures. Here, we expand this single-sequence-based learning to the use of evolutionary profiles and mutational coupling.
 19 | 
 20 | |![](./docs/SPOTRNA2_pipeline.png)
 21 | |----|
 22 | | <p align="center"> <b>Figure 1:</b> (A) Inputted one dimensional (1-D) and two dimensional (2-D) features employed in SPOT-RNA2 (L is the RNA sequence length; BP is base-pair; CSS is consensus secondary structure). (B) An example of the model architecture of SPOT-RNA2. (C) The schematic diagram for model pre-training by the bpRNA data set (TR0) and transfer learning by PDB data set (TR1).|
 23 | 
 24 | Results
 25 | ----
 26 | The new method allows large improvement not only in canonical base-pairs (RNA secondary structures) but more so in base-pairing associated with tertiary interactions such as pseudoknots, noncanonical and lone base-pairs. In particular, it is highly accurate for those RNAs of more than 1000 homologous sequences by achieving > 0.8 F1-score (harmonic mean of sensitivity and precision) for 14/16 RNAs tested. The method can also significantly improve base-pairing prediction by incorporating artificial but functional homologous sequences generated from deep mutational scanning without any modification. The fully automatic method should provide the scientific community a new powerful tool to capture not only the secondary structure but also tertiary base-pairing information for building three-dimensional models. It also highlights the future of accurately solving the base-pairing structure by using a large number of natural and/or artificial homologous sequences.
 27 | 
 28 | 
 29 | |![](./docs/benchmark_results.png)
 30 | |----|
 31 | | <p align="center"> <b>Figure 2:</b> Distribution of F1-scores for individual RNAs on the combined test sets TS1, TS2, and TS3 given by various methods as labeled. On each box, the central mark indicates the median, and the bottom and top edges of the box indicate the 25th and 75th percentiles, respectively. The outliers are plotted individually by using the “+” symbol.|
 32 | 
 33 | 
 34 | ## System Requirments
 35 | 
 36 | **Hardware Requirements:**
 37 | It is recommended that your system has 32 GB of RAM and 500 GB of disk space to support the in-memory operations for RNA sequences shorter than 500 nucleotides. Multiple CPU threads are also recommended, as the MSA generation process is computationally expensive.
 38 | 
 39 | **Software Requirements:**
 40 | * [Python3.6](https://docs.python-guide.org/starting/install3/linux/)
 41 | * [Perl-5.4 or later](https://www.perl.org/get.html)
 42 | * [virtualenv](https://virtualenv.pypa.io/en/latest/installation/) or [Anaconda](https://anaconda.org/anaconda/virtualenv)
 43 | * [CUDA 10.0](https://developer.nvidia.com/cuda-10.0-download-archive) (Optional if using GPU)
 44 | * [cuDNN (>= 7.4.1)](https://developer.nvidia.com/cudnn) (Optional if using GPU)
 45 | * [Docker](https://docs.docker.com/engine/install/) (Optional if running SPOT-RNA2 through docker image)
 46 | 
 47 | SPOT-RNA2 has been tested on Ubuntu 14.04, 16.04, and 18.04 operating systems.
 48 | 
 49 | 
 50 | ## Installation
 51 | 
 52 | ### Installation using Docker image:
 53 | 
 54 | The following command can be used to install SPOT-RNA2 and its dependencies:
 55 | 
 56 | 1. `git clone https://github.com/jaswindersingh2/SPOT-RNA2.git && cd SPOT-RNA2`
 57 | 
 58 | 2. `docker image build -t spot_rna2 .`
 59 | 
 60 | ### Manual installation:
 61 | 
 62 | To install SPOT-RNA2 and its dependencies, the following commands can be used in the terminal:
 63 | 
 64 | 
 65 | 1. `git clone https://github.com/jaswindersingh2/SPOT-RNA2.git && cd SPOT-RNA2`
 66 | 2. `wget -O utils/models_ckps.tar.xz 'https://www.dropbox.com/s/udzcsva76lh5wvq/models_ckps.tar.xz' || wget -O utils/models_ckps.tar.xz 'https://app.nihaocloud.com/f/586acb2658d74ccb92b8/?dl=1'`
 67 | 3. `tar -xvf utils/models_ckps.tar.xz -C utils/ && rm utils/models_ckps.tar.xz`
 68 | 4. `sudo apt install cpanminus && sudo cpanm Graph && sudo apt install gawk`
 69 | 
 70 | Based on the virtual environment package manager (**virtualenv** or **conda**) you have, follow the steps below:<br />
 71 | 
 72 | |  | &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; virtualenv | &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; conda |
 73 | | :- | :-------- | :--- |
 74 | | 5. | `virtualenv -p python3.6 venv` | `conda create -n venv python=3.6` |
 75 | | 6. | `source ./venv/bin/activate` | `conda activate venv` | 
 76 | | 7. | `pip install -r requirements.txt && deactivate` | `while read p; do conda install --yes $p; done < requirements.txt && conda deactivate` | 
 77 | 
 78 | If you have **Infernal** already installed, please set the path to the `binaries/` directory of your **Infernal** installation in line 12 of `run_spotrna2.sh`. Otherwise, follow the command below to install the **Infernal** tool. If you run into any issues, please follow the [link](http://eddylab.org/infernal/) for more info.
 79 | 
 80 | 8. `wget 'eddylab.org/infernal/infernal-1.1.3-linux-intel-gcc.tar.gz' && tar -xvzf infernal-*.tar.gz && rm infernal-*.tar.gz`
 81 | 
 82 | If you have **BLASTN** already installed, please set the path to the `bin/` directory of your **BLASTN** installation in line 10 of `run_spotrna2.sh`. Otherwise, follow the command below to install the **BLASTN** tool. If you run into any issues, please follow the [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download) for more info.
 83 | 
 84 | 9. `wget 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*+-x64-linux.tar.gz' && tar -xvzf ncbi-blast-*+-x64-linux.tar.gz && rm ncbi-blast-*+-x64-linux.tar.gz`
 85 | 
 86 | To install the **SPOT-RNA** predictor, follow the commands below:<br />
 87 | 
 88 | 10. `git clone https://github.com/jaswindersingh2/SPOT-RNA.git && cd SPOT-RNA`
 89 | 11. `wget 'https://www.dropbox.com/s/dsrcf460nbjqpxa/SPOT-RNA-models.tar.gz' || wget -O SPOT-RNA-models.tar.gz 'https://app.nihaocloud.com/f/fbf3315a91d542c0bdc2/?dl=1'`
 90 | 12. `tar -xvzf SPOT-RNA-models.tar.gz && rm SPOT-RNA-models.tar.gz && cd ../`
 91 | 
 92 | To install the DCA predictor, follow the commands below:<br />
 93 | 
 94 | 13. `git clone "https://github.com/sokrypton/GREMLIN_CPP" && cd GREMLIN_CPP && g++ -O3 -std=c++0x -o gremlin_cpp gremlin_cpp.cpp -fopenmp && cd ../`
 95 | 
 96 | To install the LinearPartition, follow the commands below:<br />
 97 | 
 98 | 14. `git clone 'https://github.com/LinearFold/LinearPartition.git' && cd LinearPartition/ && make && cd ../`
 99 | 
100 | If NCBI's nt database is already available on your system, please set the path to the database directory in lines 11 and 13 of the `run_spotrna2.sh` file. Otherwise, use the following command to download it. It can take a few hours for the download to finish, depending on your internet speed. If you run into any issues, please follow the [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download) for more info.
101 | 
102 | 15. `wget -c "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" -O ./nt_database/nt.gz && gunzip ./nt_database/nt.gz`
103 | 
104 | The database needs to be formatted before it can be used with **BLASTN**. Please use the command below to format the database.<br />
105 | 
106 | 16. `./ncbi-blast-*+/bin/makeblastdb -in ./nt_database/nt -dbtype nucl`
107 | 
108 | 
109 | ## Usage
110 | 
111 | ### Run SPOT-RNA2 using docker:
112 | 
113 | `docker run --rm -ti -v $(pwd)/sample_run:/SPOT-RNA2/sample_run -v $(pwd)/nt_database/:/SPOT-RNA2/nt_database spot_rna2:latest ./run_spotrna2.sh sample_run/6ufj.fasta`
114 | 
115 | ### Run SPOT-RNA2 using Manual installation:
116 | ```
117 | ./run_spotrna2.sh sample_run/6ufj.fasta 
118 | ```
119 | 
120 | Both commands create two folders, `6ufj_features` and `6ufj_outputs`, in the input file directory (`sample_run/` in this case). `6ufj_features/` contains all the alignments (MSA-1, MSA-2) and features (PSSM, DCA, base-pair probability) generated by the SPOT-RNA2 pipeline. `6ufj_outputs/` contains the predicted secondary structure in bpseq format (`6ufj.bpseq`), ct format (`6ufj.ct`), dbn format (`6ufj.st`) with secondary structure motifs, and the base-pair probabilities (`6ufj.prob`). To verify the results, compare them with the files in the `sample_seq_features` and `sample_seq_outputs` folders; they should be the same because both sequences (`sample_seq.fasta` and `6ufj.fasta`) are identical.
121 | 
122 | ## Datasets
123 | 
124 | The following datasets were used for Initial Training:
125 | * bpRNA[6]: Initial Learning (Training TR0, validation VL0, and test TS0)  
126 | [Dropbox](https://www.dropbox.com/s/sg1p1otsqnaqev8/bpRNA_dataset.tar.xz) or [Nihao Cloud](https://app.nihaocloud.com/f/6f7b456d874c4842b8ac/)
127 | 
128 | 
129 | The following datasets were used for Transfer Learning:
130 | * PDB[5]: Transfer Learning (Training TR1, validation VL1, and test sets TS1, TS2, TS3, and TS_hard)  
131 | [Dropbox](https://www.dropbox.com/s/apqrsl7hm1091ie/PDB_dataset.tar.xz) or [Nihao Cloud](https://app.nihaocloud.com/f/f301baed4dac4474a185/)
132 | 
133 | ## Citation guide
134 | 
135 | **If you use SPOT-RNA2 for your research please cite the following papers:**
136 | 
137 | Jaswinder Singh, Kuldip Paliwal, Tongchuan Zhang, Jaspreet Singh, Thomas Litfin, Yaoqi Zhou, Improved RNA Secondary Structure and Tertiary Base-pairing Prediction Using Evolutionary Profile, Mutational Coupling and Two-dimensional Transfer Learning, Bioinformatics, 2021, btab165, https://doi.org/10.1093/bioinformatics/btab165
138 | 
139 | **If you use SPOT-RNA2 data sets and/or input feature pipeline, please consider citing the following papers:**
140 | 
141 | [1] Zhang, T., Singh, J., Litfin, T., Zhan, J., Paliwal, K. and Zhou, Y., 2020. RNAcmap: A Fully Automatic Method for Predicting Contact Maps of RNAs by Evolutionary Coupling Analysis. bioRxiv.
142 | 
143 | [2] Zhang, H., Zhang, L., Mathews, D.H. and Huang, L., 2020. LinearPartition: linear-time approximation of RNA folding partition function and base-pairing probabilities. Bioinformatics, 36(Supplement_1), pp.i258-i267.
144 | 
145 | [3] Singh, J., Hanson, J., Paliwal, K. and Zhou, Y., 2019. RNA secondary structure prediction using an ensemble of two-dimensional deep neural networks and transfer learning. Nature communications, 10(1), pp.1-13.
146 | 
147 | [4] Nawrocki, E.P. and Eddy, S.R., 2013. Infernal 1.1: 100-fold faster RNA homology searches. Bioinformatics, 29(22), pp.2933-2935.
148 | 
149 | [5] H.M. Berman, J. Westbrook, Z. Feng, G. Gilliland, T.N. Bhat, H. Weissig, I.N. Shindyalov, P.E. Bourne. (2000) The Protein Data Bank Nucleic Acids Research, 28: 235-242.
150 | 
151 | [6] Padideh Danaee, Mason Rouches, Michelle Wiley, Dezhong Deng, Liang Huang, David Hendrix, bpRNA: large-scale automated annotation and analysis of RNA secondary structure, Nucleic Acids Research, Volume 46, Issue 11, 20 June 2018, Pages 5381–5394, https://doi.org/10.1093/nar/gky285
152 | 
153 | [7] Kamisetty, H., Ovchinnikov, S. and Baker, D., 2013. Assessing the utility of coevolution-based residue–residue contact predictions in a sequence-and structure-rich era. Proceedings of the National Academy of Sciences, 110(39), pp.15674-15679.
154 | 
155 | [8] Chiu, J.K.H. and Chen, Y.P.P., 2014. Efficient conversion of RNA pseudoknots to knot-free structures using a graphical model. IEEE Transactions on Biomedical Engineering, 62(5), pp.1265-1271.
156 | 
157 | Licence
158 | -----
159 | Mozilla Public License 2.0
160 | 
161 | 
162 | Contact
163 | -----
164 | jaswinder.singh3@griffithuni.edu.au, yaoqi.zhou@griffith.edu.au
165 | 


--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os, six, sys, subprocess
  3 | import tensorflow as tf
  4 | import random
  5 | from tqdm import tqdm
  6 | import pandas as pd
  7 | from pathlib import Path
  8 | 
  9 | # ------------- one hot encoding of RNA sequences -----------------#
 10 | def one_hot(seq):
 11 |     RNN_seq = seq
 12 |     BASES = 'AUCG'
 13 |     bases = np.array([base for base in BASES])
 14 |     feat = np.concatenate(
 15 |         [[(bases == base.upper()).astype(int)] if str(base).upper() in BASES else np.array([[-1] * len(BASES)]) for base
 16 |          in RNN_seq])
 17 | 
 18 |     return feat
 19 | 
 20 | 
 21 | def z_mask(seq_len):
 22 |     mask = np.ones((seq_len, seq_len))
 23 |     return np.triu(mask, 2)
 24 | 
 25 | def l_mask(inp, seq_len):
 26 |     mask = np.ones((seq_len, seq_len))
 27 |     return np.triu(mask, 1)
 28 | 
 29 | def get_data(seq, rna_id, args):
 30 |     seq_len = len(seq)
 31 |     one_hot_feat = one_hot(seq)
 32 | 
 33 |     with open(os.path.splitext(args.inputs)[0] + '.pssm') as f:
 34 |         temp = pd.read_csv(f, comment='#', delim_whitespace=True, header=None).values
 35 |     seq = ['U' if k == 'T' else k for k in temp[:, 0]]
 36 |     profile = temp[:, 1:5].astype(float)
 37 |     off_set = np.zeros((len(seq), profile.shape[1])) + 0.3
 38 |     for k, K in enumerate(seq):
 39 |         try:
 40 |             off_set[k, BASES.index(K)] = 8.7
 41 |         except:
 42 |             pass
 43 |     profile += off_set
 44 |     profile /= np.sum(profile, axis=1, keepdims=True)
 45 |     profile = -np.log(profile)
 46 | 
 47 |     profile_one_hot = np.concatenate([profile, one_hot_feat], axis=1)
 48 | 
 49 | ############ load base-pair prob form linearpartition ##############################
 50 |     try:
 51 |         with open(os.path.splitext(args.inputs)[0] + '.prob', 'r') as f:
 52 |             prob = pd.read_csv(f, delimiter=None, delim_whitespace=True, header=None, skiprows=[0]).values
 53 |         bp_prob_lp =  np.zeros((len(seq), len(seq)))
 54 |         for i in prob:
 55 |             bp_prob_lp[int(i[0])-1, int(i[1])-1] = i[2]
 56 |         bp_prob_lp = bp_prob_lp + np.transpose(bp_prob_lp)
 57 |     except:
 58 |         print("linearpartition output missing",rna_id)
 59 |         bp_prob_lp =  np.zeros((len(seq), len(seq)))
 60 | 
 61 | ############ load dca obtained from gremlin ##############################
 62 |     try:
 63 |         with open(os.path.splitext(args.inputs)[0] + '.dca') as f:
 64 |             temp4 = pd.read_csv(f, comment='#', delim_whitespace=True, header=None, skiprows=[0], usecols=[0,1,2]).values
 65 |         #print(temp4.shape)
 66 |         dca_output = np.zeros((len(seq), len(seq)))
 67 |         for k in temp4:
 68 |             if abs(int(k[0]) - int(k[1])) < 4:
 69 |                 dca_output[int(k[0]), int(k[1])] = 0.01*k[2]
 70 |             else:
 71 |                 dca_output[int(k[0]), int(k[1])] = k[2]
 72 |         dca_output = dca_output + np.transpose(dca_output)
 73 |     except:
 74 |         print("dca missing", rna_id)
 75 |         dca_output = np.zeros((len(seq), len(seq)))
 76 | 
 77 |     zero_mask = z_mask(seq_len)[None, :, :, None]
 78 |     label_mask = l_mask(profile_one_hot, seq_len)
 79 |     temp = profile_one_hot[None, :, :]
 80 |     temp = np.tile(temp, (temp.shape[1], 1, 1))
 81 |     feature = np.concatenate([temp, np.transpose(temp, [1, 0, 2])], 2)
 82 |     feature = np.concatenate([feature, np.expand_dims(dca_output, axis=2)], axis=2)
 83 |     feature = np.concatenate([feature, np.expand_dims(bp_prob_lp, axis=2)], axis=2)
 84 | 
 85 |     assert feature.shape==(seq_len,seq_len, 18)
 86 | 
 87 |     return seq_len, [i for i in (feature.astype(float)).flatten()], [i for i in zero_mask.flatten()], [i for i in label_mask.flatten()], [i for i in label_mask.flatten()]
 88 | 
 89 | def _int64_feature(value):
 90 |     if not isinstance(value, list) and not isinstance(value, np.ndarray):
 91 |         value = [value]
 92 | 
 93 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
 94 | 
 95 | 
 96 | def _float_feature(value):
 97 |     if not isinstance(value, list) and not isinstance(value, np.ndarray):
 98 |         value = [value]
 99 | 
100 |     return tf.train.Feature(float_list=tf.train.FloatList(value=value))
101 | 
102 | 
def _bytes_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a one-element ``BytesList``.

    Text strings are encoded to UTF-8 bytes first; other values are used unchanged.
    """
    if isinstance(value, six.string_types):
        payload = six.binary_type(value, encoding='utf-8')
    else:
        payload = value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload]))
108 | 
def create_tfr_files(args):
    """Convert the FASTA-like file ``args.inputs`` into a TFRecord file.

    The input is read as alternating non-empty lines: a header line followed by
    a single sequence line. If the number of non-empty lines is odd, the
    trailing unpaired line is ignored. For every record the features returned
    by ``get_data`` are serialised as one ``tf.train.Example`` and written to
    ``<stem of args.inputs>.tfrecords``.

    Parameters
    ----------
    args : object
        Must provide ``inputs`` (path of the sequence file); it is also
        forwarded to ``get_data``.
    """
    print('\nPreparing tfr records file for SPOT-RNA2:')
    path_tfrecords = os.path.splitext(args.inputs)[0] + ".tfrecords"
    with open(args.inputs) as file:
        input_data = [line.strip() for line in file.read().splitlines() if line.strip()]

    # Records are (header, sequence) line pairs.
    count = len(input_data) // 2

    # The context manager closes the writer; no explicit close() is needed afterwards.
    with tf.io.TFRecordWriter(path_tfrecords) as writer:
        for i in tqdm(range(count)):
            name     = input_data[2*i].replace(">", "")
            # Normalise the sequence: drop spaces, DNA 'T' -> RNA 'U', upper case.
            sequence = input_data[2*i+1].replace(" ", "").replace("T", "U").upper()

            seq_len, feature, zero_mask, label_mask, true_label = get_data(sequence, name, args)

            example = tf.train.Example(features=tf.train.Features(feature={
                'rna_name': _bytes_feature(name),
                'seq_len': _int64_feature(seq_len),
                'feature': _float_feature(feature),
                'zero_mask': _float_feature(zero_mask),
                'label_mask': _float_feature(label_mask),
                'true_label': _float_feature(true_label)}))

            writer.write(example.SerializeToString())
139 | 
# ----------------------- hair pin loop assumption: flag pairs with |i - j| < 3 --------------------------------#
def hair_pin_assumption(pred_pairs):
    """Return the first two entries of every predicted pair whose positions are fewer than 3 apart."""
    return [pair[:2] for pair in pred_pairs if abs(pair[0] - pair[1]) < 3]
148 | 
def flatten(x):
    """Recursively flatten nested iterables into a single list.

    Strings are treated as atoms and are not split into characters.
    """
    flat = []
    for item in x:
        is_nested = hasattr(item, "__iter__") and not isinstance(item, str)
        if is_nested:
            flat += flatten(item)
        else:
            flat.append(item)
    return flat
157 | 
def type_pairs(pairs, sequence):
    """Split base pairs by the identity of their two bases.

    Bases are compared case-insensitively. Returns three lists:
    Watson-Crick pairs (all A-U pairs followed by all G-C pairs), G-U wobble
    pairs, and every remaining pair. Within each group the input order is kept.
    """
    bases = [nt.upper() for nt in sequence]

    au_pairs, gc_pairs, gu_pairs, remaining = [], [], [], []
    # Both orientations of a pair map to the same bucket.
    bucket_of = {
        ("A", "U"): au_pairs, ("U", "A"): au_pairs,
        ("G", "C"): gc_pairs, ("C", "G"): gc_pairs,
        ("G", "U"): gu_pairs, ("U", "G"): gu_pairs,
    }
    for pair in pairs:
        key = (bases[pair[0]], bases[pair[1]])
        bucket_of.get(key, remaining).append(pair)

    return au_pairs + gc_pairs, gu_pairs, remaining
180 | 
181 | # ----------------------- find multiplets pairs--------------------------------#
def multiplets_pairs(pred_pairs):
    """Group the predicted pairs that share a position with another pair.

    Only the first two entries of each pair are considered. For every position
    that occurs in more than one pair (visited in ascending order), the result
    holds the list of pairs containing that position, in input order.
    """
    pairs = [p[:2] for p in pred_pairs]
    positions = flatten(pairs)
    positions.sort()

    # Positions that take part in more than one pair.
    shared = [pos for pos in sorted(set(positions)) if positions.count(pos) > 1]

    # Pairs touching at least one shared position.
    involved = [p for p in pairs if p[0] in shared or p[1] in shared]

    return [[p for p in involved if pos in p] for pos in shared]
209 | 
def multiplets_free_bp(pred_pairs, y_pred):
    """Iteratively drop pairs until no position is shared between two pairs.

    In every round, for each group returned by ``multiplets_pairs`` the pair
    with the smallest ``y_pred[i, j]`` value is removed. Returns the remaining
    pairs together with the de-duplicated list of removed pairs.
    """
    n_initial = len(pred_pairs)
    removed = []
    groups = multiplets_pairs(pred_pairs)
    while groups:
        weakest = []
        for group in groups:
            scores = [y_pred[p[0], p[1]] for p in group]
            weakest.append(group[scores.index(min(scores))])
        removed.extend(weakest)
        pred_pairs = [p for p in pred_pairs if p not in weakest]
        groups = multiplets_pairs(pred_pairs)
    # The same pair can be selected by several groups; keep one copy of each.
    removed = [list(t) for t in set(tuple(p) for p in removed)]
    assert n_initial == len(pred_pairs)+len(removed)
    return pred_pairs, removed
228 |         
def output_mask(seq, NC=True):
    """Return a square 0/1 matrix marking which position pairs form an allowed base pair.

    Entry (i, j) is 1 when the two-letter string ``seq[i] + seq[j]`` is in the
    allowed set: all 16 combinations of A, U, C, G when ``NC`` is true, otherwise
    only the pairs AU, UA, GC, CG, GU and UG.
    """
    allowed = ['AU', 'UA', 'GC', 'CG', 'GU', 'UG']
    if NC:
        allowed = allowed + ['CC', 'GG', 'AG', 'CA', 'AC', 'UU', 'AA', 'CU', 'GA', 'UC']

    size = len(seq)
    mask = np.zeros((size, size))
    for row, first in enumerate(seq):
        hits = [col for col, second in enumerate(seq) if str(first) + str(second) in allowed]
        for col in hits:
            mask[row, col] = 1
    return mask
240 | 
def ct_file_output(pairs, seq, id, save_result_path):
    """Write the predicted pairs to ``<save_result_path>/<id>.ct``.

    The file starts with a header line (sequence length, id, 'SPOT-RNA2 output')
    followed by one row per base with six columns: 1-based index, base,
    index - 1, index + 1 (0 for the last base), 1-based partner index (0 when
    unpaired) and the 1-based index again. ``pairs`` holds 0-based positions.
    """
    n_bases = len(seq)
    index = np.arange(1, n_bases + 1, 1)
    bases = np.array(list(seq))
    previous = np.arange(0, n_bases, 1)
    following = np.append(np.delete(index, 0), [0])
    partner = np.zeros(n_bases, dtype=int)

    for pair in pairs:
        partner[pair[0]] = int(pair[1]) + 1
        partner[pair[1]] = int(pair[0]) + 1

    as_text = lambda column: np.char.mod('%d', column)
    table = np.column_stack((as_text(index), bases, as_text(previous), as_text(following),
                             as_text(partner), as_text(index)))

    target = os.path.join(save_result_path, str(id)) + '.ct'
    header = str(n_bases) + '\t\t' + str(id) + '\t\t' + 'SPOT-RNA2 output\n'
    np.savetxt(target, table, delimiter='\t\t', fmt="%s", header=header, comments='')

    return
259 | 
def bpseq_file_output(pairs, seq, id, save_result_path):
    """Write the predicted pairs to ``<save_result_path>/<id>.bpseq``.

    The file starts with a ``#<id>`` header line followed by one
    space-separated row per base: 1-based index, base, and 1-based partner
    index (0 when unpaired). ``pairs`` holds 0-based positions.
    """
    n_bases = len(seq)
    index = np.arange(1, n_bases + 1, 1)
    bases = np.array(list(seq))
    partner = np.zeros(n_bases, dtype=int)

    for pair in pairs:
        partner[pair[0]] = int(pair[1]) + 1
        partner[pair[1]] = int(pair[0]) + 1

    table = np.column_stack((np.char.mod('%d', index), bases, np.char.mod('%d', partner)))

    target = os.path.join(save_result_path, str(id)) + '.bpseq'
    np.savetxt(target, table, delimiter=' ', fmt="%s", header='#' + str(id), comments='')

    return
278 | 
def lone_pair(pairs):
    """Return the pairs that have no stacking neighbour.

    A pair [i, j] is kept when neither [i - 1, j + 1] nor [i + 1, j - 1] is
    present in ``pairs``. Note that ``pairs`` is sorted in place.
    """
    pairs.sort()
    return [
        pair for pair in pairs
        if [pair[0] - 1, pair[1] + 1] not in pairs
        and [pair[0] + 1, pair[1] - 1] not in pairs
    ]
287 | 
def prob_to_secondary_structure(ensemble_outputs, label_mask, seq, name, args):
    """Turn predicted pair probabilities into structure files for one RNA.

    The values of ``ensemble_outputs`` are placed, in row-major order, on the
    strict upper triangle of an ``L x L`` matrix (``L = len(seq)``). Pairs with
    probability >= 0.795 are kept, pairs sharing a position are resolved with
    ``multiplets_free_bp``, and the result is written as ``<name>.ct`` and
    ``<name>.bpseq``; the full probability matrix is written as ``<name>.prob``.
    When ``args.motifs`` is set, the ``bpRNA.pl`` script is launched on the
    bpseq file (the process is started but not waited for).

    Parameters
    ----------
    ensemble_outputs : array-like
        Predicted probabilities, indexable along their first axis.
    label_mask :
        Ignored; kept for interface compatibility (it is recomputed from ``seq``).
    seq : str
        RNA sequence.
    name : str
        Base name of the output files.
    args : object
        Must provide ``outputs`` (output directory) and ``motifs`` (flag).
    """
    Threshold = 0.795
    seq_len = len(seq)
    label_mask = np.triu(np.ones((seq_len, seq_len)), 1)
    test_output = ensemble_outputs
    inds = np.where(label_mask == 1)
    y_pred = np.zeros(label_mask.shape)
    for i in range(test_output.shape[0]):
        y_pred[inds[0][i], inds[1][i]] = test_output[i]

    # Candidate pairs are the strict upper-triangle positions at or above the threshold.
    tri_inds = np.triu_indices(seq_len, k=1)
    keep = np.nonzero(y_pred[tri_inds] >= Threshold)[0]
    pred_pairs = [[tri_inds[0][j], tri_inds[1][j]] for j in keep]
    pred_pairs, save_multiplets = multiplets_free_bp(pred_pairs, y_pred)

    repo_root = Path(os.path.dirname(os.path.realpath(__file__))).parent
    if args.outputs=='outputs/':
        # The default output folder lives next to the utils/ directory.
        output_path = os.path.join(repo_root, args.outputs)
    else:
        output_path = args.outputs

    ct_file_output(pred_pairs, seq, name, output_path)
    bpseq_file_output(pred_pairs, seq, name, output_path)
    np.savetxt(os.path.join(output_path, name + '.prob'), y_pred, delimiter='\t')

    if args.motifs:
        try:
            # Run the script inside the output folder via `cwd` rather than
            # os.chdir(): chdir changed the working directory of the whole
            # process, so a relative output path then resolved against the new
            # directory (both for the bpseq argument and for any later call).
            motif_dir = os.path.abspath(output_path)
            subprocess.Popen(
                ['perl', os.path.join(repo_root, 'utils/bpRNA.pl'),
                 os.path.join(motif_dir, name + '.bpseq')],
                cwd=motif_dir)
        except Exception:
            print('\nUnable to run bpRNA script;\nplease refer to "https://github.com/hendrixlab/bpRNA/" for system requirments to use bpRNA')

    return
330 | 


--------------------------------------------------------------------------------
/sample_run/sample_seq_features/sample_seq.msa:
--------------------------------------------------------------------------------
  1 | # STOCKHOLM 1.0
  2 | #=GF AU Infernal 1.1.3
  3 | 
  4 | #=GS 6UFJ_A/1-51                  DE Chain A, RNA (50-MER)
  5 | #=GS 6UEY_A/1-50                  DE Chain A, RNA (50-MER)
  6 | #=GS HE577054.1/3246821-3246757   DE Paenibacillus polymyxa M1 main chromosome, complete genome
  7 | #=GS MF288922.1/150528-150592     DE Bacillus phage Janet, complete genome
  8 | #=GS CP033464.1/4485719-4485655   DE Brevibacillus laterosporus strain 1821L chromosome, complete genome
  9 | #=GS KT307976.1/157679-157741     DE Bacillus phage AvesoBmore, complete genome
 10 | #=GS CP032410.1/870062-870126     DE Brevibacillus laterosporus strain E7593-50 chromosome, complete genome
 11 | #=GS MK892513.1/27480-27550       DE Prokaryotic dsDNA virus sp. isolate Unbinned_2716_contig-100_1, complete genome
 12 | #=GS MK892777.1/32264-32334       DE Prokaryotic dsDNA virus sp. isolate Tp1_39_SUR_34326_1, partial genome
 13 | #=GS MF288921.1/151458-151522     DE Bacillus phage OTooleKemple52, complete genome
 14 | #=GS MH638310.1/151443-151507     DE Bacillus phage Kamfam, complete genome
 15 | #=GS KJ489397.1/151758-151822     DE Bacillus phage CAM003, complete genome
 16 | #=GS KJ489398.1/150857-150921     DE Bacillus phage Evoli, complete genome
 17 | #=GS KJ489400.1/150952-151016     DE Bacillus phage Hoody T, complete genome
 18 | #=GS KU737346.1/152020-152084     DE Bacillus phage Vinny, complete genome
 19 | #=GS KF669647.1/155754-155816     DE Bacillus phage BigBertha, complete genome
 20 | #=GS KU737345.1/154884-154946     DE Bacillus phage Juglone, complete genome
 21 | #=GS KU737347.1/155734-155796     DE Bacillus phage Phrodo, complete genome
 22 | #=GS MN038178.1/155190-155252     DE Bacillus phage Beyonphe, complete genome
 23 | #=GS KF208639.2/156075-156137     DE Bacillus phage Troll, complete genome
 24 | #=GS CP009278.1/2800251-2800310   DE Sphingobacterium sp. ML3W, complete genome
 25 | #=GS CP045298.1/5377890-5377826   DE Paenibacillus brasilensis strain KACC 13842 chromosome, complete genome
 26 | #=GS KF669662.1/155100-155162     DE Bacillus phage Spock, complete genome
 27 | #=GS KR063281.1/60079-60028       DE Gordonia phage GMA2, complete genome
 28 | #=GS KJ489402.1/153758-153819     DE Bacillus phage Riley, complete genome
 29 | #=GS MF765814.1/155980-156041     DE Bacillus phage Taffo16, complete genome
 30 | #=GS CP000154.2/3364238-3364174   DE Paenibacillus polymyxa E681, complete genome
 31 | #=GS LN852800.1/7754-7693         DE Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0110
 32 | #=GS CP019039.1/7984-8046         DE Bacillus velezensis strain GH1-13 plasmid unnamed, complete sequence
 33 | #=GS LN852940.1/1904-1844         DE Uncultured prokaryote from Rat gut metagenome metamobilome, plasmid pRGRH0268
 34 | #=GS JN790865.1/35681-35620       DE Bacillus phage B4, complete genome
 35 | #=GS JN797796.1/35736-35675       DE Bacillus phage B5S, complete genome
 36 | #=GS KY888882.1/156410-156472     DE Bacillus phage Flapjack, complete genome
 37 | #=GS CP014843.1/29638-29697       DE Bacillus licheniformis strain SCDB 14 plasmid pSCDB14, complete sequence
 38 | #=GS CP021670.1/37922-37863       DE Bacillus licheniformis strain SRCM100141 plasmid pBL141-2 sequence
 39 | #=GS CP035189.1/167253-167194     DE Bacillus licheniformis strain SRCM103914 plasmid unnamed1, complete sequence
 40 | #=GS CP045906.1/14639513-14639571 DE Caligus rogercresseyi isolate FCH chromosome 17
 41 | #=GS HG916826.1/843085-843030     DE Pseudomonas pseudoalcaligenes CECT 5344 complete genome
 42 | #=GS LK391695.1/845304-845249     DE Pseudomonas pseudoalcaligenes genome assembly Ppseudo_Pac, chromosome : I
 43 | #=GS XM_028713395.1/30-87         DE PREDICTED: Podarcis muralis solute carrier family 16 member 6 (SLC16A6), mRNA
 44 | #=GS AC100771.2/133706-133648     DE Homo sapiens chromosome 11, clone RP11-159H22, complete sequence
 45 | #=GS CP022654.2/63818-63880       DE Bacillus velezensis strain SCDB 291 chromosome, complete genome
 46 | #=GS CP023320.1/44833-44771       DE Bacillus velezensis strain SCGB 1 chromosome, complete genome
 47 | #=GS CP045899.1/5107513-5107456   DE Caligus rogercresseyi isolate FCH chromosome 10
 48 | #=GS CP045890.1/2686952-2687009   DE Caligus rogercresseyi isolate FCH chromosome 1
 49 | #=GS CP010557.1/4528803-4528858   DE Raoultella ornithinolytica strain S12, complete genome
 50 | #=GS LR134253.1/1479651-1479596   DE Klebsiella aerogenes strain NCTC9997 genome assembly, chromosome: 3
 51 | #=GS MH153801.1/58164-58217       DE Microbacterium phage Count, complete genome
 52 | #=GS CP045896.1/486401-486459     DE Caligus rogercresseyi isolate FCH chromosome 7
 53 | #=GS CP045901.1/8022709-8022767   DE Caligus rogercresseyi isolate FCH chromosome 12
 54 | 
 55 | 6UFJ_A/1-51                          ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCG.GAGAGCAG...A-...............---------
 56 | #=GR 6UFJ_A/1-51                  PP ******************************************.********...8.........................
 57 | 6UEY_A/1-50                          ACUCGUUUGAGCGAGUAUAAACAGUUGGUUAGGCUCAAAGCG.GAGAGCAG...--...............---------
 58 | #=GR 6UEY_A/1-50                  PP ******************************************.********.............................
 59 | HE577054.1/3246821-3246757           ACUCGUCUGAGCGAGUAUAAACAGGUCAUUAAGCUCAGAGCG.UUCACCG-...--ggau....caug...-CGGUGAGG
 60 | #=GR HE577054.1/3246821-3246757   PP ******************************************.******8......5666....6665....8*******
 61 | MF288922.1/150528-150592             ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAAGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG
 62 | #=GR MF288922.1/150528-150592     PP *************************7.59*************88******9.....577......777...9********
 63 | CP033464.1/4485719-4485655           ACUCGAUUGAGCGAGUAUAAACAGAC-CUUAGGCUCAAAGCG.UUGAGAAG...--caa.....aaag...CUUCUCAGG
 64 | #=GR CP033464.1/4485719-4485655   PP ************************76.59*************.*******9.....677.....7777...9********
 65 | KT307976.1/157679-157741             ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGCGGG
 66 | #=GR KT307976.1/157679-157741     PP ***********************854.499************.********76666...............*********
 67 | CP032410.1/870062-870126             ACUCGAUUGAGCGAGUAUAAAUAGAC-CUUAAGCUCAAAGCG.UUGAGGAG...--cga.....ucag...CUUCUCAGG
 68 | #=GR CP032410.1/870062-870126     PP ************************76.59*************.*******9.....677.....7777...9********
 69 | MK892513.1/27480-27550               AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGuGCGUAUAG...--cuaggucaagug...CUAUACGCG
 70 | #=GR MK892513.1/27480-27550       PP ************************9.89**********************9.....8***********...9********
 71 | MK892777.1/32264-32334               AGUCGUUUGAGCGACUUAAAAUAGC-GUUUAAGCUCAAAGCGuGCGUAUAG...--cuaggucaagug...CUAUACGCG
 72 | #=GR MK892777.1/32264-32334       PP ************************9.89**********************9.....8***********...9********
 73 | MF288921.1/151458-151522             ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG
 74 | #=GR MF288921.1/151458-151522     PP ************************75.59*************88******9.....577......777...9********
 75 | MH638310.1/151443-151507             ACUCGUGUGAGCGAGUAUAAACAGAC-UUUAGGCUCACAGCGuUAGAGAGG...--guu......ucu...CCUCUCUAG
 76 | #=GR MH638310.1/151443-151507     PP ************************75.59*************88******9.....577......777...9********
 77 | KJ489397.1/151758-151822             ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
 78 | #=GR KJ489397.1/151758-151822     PP *************************7.59*************889*99999.....577......777...999****9*
 79 | KJ489398.1/150857-150921             ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
 80 | #=GR KJ489398.1/150857-150921     PP *************************7.59*************889*99999.....577......777...999****9*
 81 | KJ489400.1/150952-151016             ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
 82 | #=GR KJ489400.1/150952-151016     PP *************************7.59*************889*99999.....577......777...999****9*
 83 | KU737346.1/152020-152084             ACUCGUGUGAGCGAGUAUAAACAGCC-UUUAGGCUCACAGCGuUAGGGAGG...--guu......ucu...CCUCUCUAG
 84 | #=GR KU737346.1/152020-152084     PP *************************7.59*************889*99999.....577......777...999****9*
 85 | KF669647.1/155754-155816             ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
 86 | #=GR KF669647.1/155754-155816     PP ***********************854.499************.********76666...............*********
 87 | KU737345.1/154884-154946             ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
 88 | #=GR KU737345.1/154884-154946     PP ***********************854.499************.********76666...............*********
 89 | KU737347.1/155734-155796             ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
 90 | #=GR KU737347.1/155734-155796     PP ***********************854.499************.********76666...............*********
 91 | MN038178.1/155190-155252             ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGuuuAU...............CCCCGUGGG
 92 | #=GR MN038178.1/155190-155252     PP ***********************854.499************.********76666...............*********
 93 | KF208639.2/156075-156137             ACUCGUGUGAGCGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGGcuuAU...............CCCCGUGGG
 94 | #=GR KF208639.2/156075-156137     PP ***********************854.499************.********76666...............*********
 95 | CP009278.1/2800251-2800310           AGUCGUUUGAGCGACUUAAAAUAGGU-UUUAAGCUCAAAGCG.CCCCGAUA...AU...............AAUCGGGAG
 96 | #=GR CP009278.1/2800251-2800310   PP ************************98.499************.********...**...............*********
 97 | CP045298.1/5377890-5377826           GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCG.UUCACCGG..gAU............cauCCGGUGAGG
 98 | #=GR CP045298.1/5377890-5377826   PP ******************************************.*******9..643............334*********
 99 | KF669662.1/155100-155162             ACUCGUGUAAGCGAGUAUAAAAAGGC-UUUAGGCUUACAGCG.UCGCGGAGuuuAU...............CUCCGCGGG
100 | #=GR KF669662.1/155100-155162     PP *********************99843.499************.********76666...............*********
101 | KR063281.1/60079-60028               ACUCGACUGAGCGAGUAUAAACAGUU-CUUAAGCUCAGAGCG.GCC-----...--ga........ga...-----GGCG
102 | #=GR KR063281.1/60079-60028       PP ************************88.59*************.985..........67........76........589*
103 | KJ489402.1/153758-153819             ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCG.UCGCGGG-...--guuu....aucu...-C--CCGCG
104 | #=GR KJ489402.1/153758-153819     PP ***********************854.49*************.6665555......4566....6654....4..5555*
105 | MF765814.1/155980-156041             ACUCGUGUGAGCGAGUAUAAAUAGGC-UUUAAGCUCACAGCG.UCGCGGG-...--guuu....aucu...-C--CCGCG
106 | #=GR MF765814.1/155980-156041     PP ***********************854.49*************.6665555......4566....6654....4..5555*
107 | CP000154.2/3364238-3364174           GUUCGUCUGAGCGAACGCAAACAGGCCAUUAAGCUCAGAGCG.UUCACUGG...A-uu.......cgu...CCAGUGAGA
108 | #=GR CP000154.2/3364238-3364174   PP ******************************************.********...8.55.......555...*********
109 | LN852800.1/7754-7693                 GCUCGUCUGGGCGAGGAUAAACAGCUA-UUAAGCCCAGAGCG.UUCCGGUU...AU............a.uGAUCGGAGG
110 | #=GR LN852800.1/7754-7693         PP **************************5.9*************.*****998...64............3.3789******
111 | CP019039.1/7984-8046                 AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCUCAGAGCG.UCCUUCC-...--ugc.....uucg...-GGAAGGGG
112 | #=GR CP019039.1/7984-8046         PP ***********************975.69*************.***9997......688.....8886....7999****
113 | LN852940.1/1904-1844                 GCUCGUCUGGGCGAGGGUAAAUAGCUAAUUAGGCCCAGAGCGuUCCAGGAU...G-...............AUCCUGGAG
114 | #=GR LN852940.1/1904-1844         PP ******************************************889******...9................*********
115 | JN790865.1/35681-35620               AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uu........ua...UCCCCCGUG
116 | #=GR JN790865.1/35681-35620       PP ***********************854.499************.99977665.....33........33...34555888*
117 | JN797796.1/35736-35675               AGUCGUGUGAGCGACUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uu........ua...UCCCCCGUG
118 | #=GR JN797796.1/35736-35675       PP ***********************854.499************.99977665.....33........33...34555888*
119 | KY888882.1/156410-156472             ACUCGUGUGAGUGAGUAUAAACAGGC-UUUAGGCUCACAGCG.UCGCGGGG...--uuu......auc...CCCUGCG-G
120 | #=GR KY888882.1/156410-156472     PP ***********************854.499************.99999999.....455......555...8899999.*
121 | CP014843.1/29638-29697               AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU
122 | #=GR CP014843.1/29638-29697       PP ***********************975.69*************.********...**...............*********
123 | CP021670.1/37922-37863               AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU
124 | #=GR CP021670.1/37922-37863       PP ***********************975.69*************.********...**...............*********
125 | CP035189.1/167253-167194             AGUCGUCUGGGCGACUAUAAACAGGC-AUUAAGCCCAGAGCG.UUUCCCUU...CU...............AGGGGAGGU
126 | #=GR CP035189.1/167253-167194     PP ***********************975.69*************.********...**...............*********
127 | CP045906.1/14639513-14639571         UCUUGCUUGAGCAAGAAUAAAGAGCUGUACAUAAGCAAAGAG.UCUUGCCU...--...............GAGCAAGAG
128 | #=GR CP045906.1/14639513-14639571 PP ***************************999999999******.*****943....................569******
129 | HG916826.1/843085-843030             CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCG.AAAUUCGG...GG...............CCG-----G
130 | #=GR HG916826.1/843085-843030     PP ******************************************.********...**...............***.....*
131 | LK391695.1/845304-845249             CCCCGCUGGCGCGGGGAACACCACCUUGUCAAGCUCAAAGCG.AAAUUCGG...GG...............CCG-----G
132 | #=GR LK391695.1/845304-845249     PP ******************************************.********...**...............***.....*
133 | XM_028713395.1/30-87                 ACCGGCUCGAGCCGGUAUAAAAAGCU---UGAGCUCGAGCAC.AGCGGCAG...CA...............CUGCCGCAG
134 | #=GR XM_028713395.1/30-87         PP *************************7...669****998888.9*******...99...............*********
135 | AC100771.2/133706-133648             GUUCAUUUGGGUGAAUAUAAAAAGGAGAUUA--CUCAAAGCU.UUAAAAAA...AA...............UUUUUUUAA
136 | #=GR AC100771.2/133706-133648     PP ******************************9..9********.98888888...88...............*********
137 | CP022654.2/63818-63880               AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCG.UCCUUCC-...--ugc.....uacg...-GGAAGGGG
138 | #=GR CP022654.2/63818-63880       PP ************************86.69*************.****997......678.....8886....899*****
139 | CP023320.1/44833-44771               AGUCGUCUGGGCGACUAUAAACAGAC-AUUAAGCCCAGAGCG.UCCUUCC-...--ugc.....uacg...-GGAAGGGG
140 | #=GR CP023320.1/44833-44771       PP ************************86.69*************.****997......678.....8886....899*****
141 | CP045899.1/5107513-5107456           UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGCUG...--...............-AGCAAGAG
142 | #=GR CP045899.1/5107513-5107456   PP ***************************999999999******.******85.....................59******
143 | CP045890.1/2686952-2687009           UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGCUG...--...............-AGCAAGAG
144 | #=GR CP045890.1/2686952-2687009   PP ***************************999999999******.******85.....................59******
145 | CP010557.1/4528803-4528858           CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------.UCAGGCAG...AU..............uCUGCCAGAG
146 | #=GR CP010557.1/4528803-4528858   PP **********************************96.......8889****...96..............6*********
147 | LR134253.1/1479651-1479596           CGUCGCCUGAACGACGAUAAACUGAAGGUUAAGCUA------.UCAGGCAG...AU..............uCUGCCAGAG
148 | #=GR LR134253.1/1479651-1479596   PP **********************************96.......8889****...96..............6*********
149 | MH153801.1/58164-58217               AGUCGUCUGAGCGACUUUAAAUAGGU-CUUAGGCUCAGAGCG.GAUAGAUG...--...............----UAUUG
150 | #=GR MH153801.1/58164-58217       PP ************************98.49*************.*9985433........................4566*
151 | CP045896.1/486401-486459             UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGC--...AU...............GAGCAAGAG
152 | #=GR CP045896.1/486401-486459     PP ***************************999999999******.*****9.....77...............78*******
153 | CP045901.1/8022709-8022767           UCUUGCUUGAGCAAGAAUAAAGAGAUGUACAUAAGCAAAGAG.UCUUGC--...AU...............GAGCAAGAG
154 | #=GR CP045901.1/8022709-8022767   PP ***************************999999999******.*****9.....77...............78*******
155 | #=GC SS_cons                         <<<<<<____>>>>>>--------------------------.<<<<<<<<...__~~~~~~~~~~~~...>>>>>>>>:
156 | #=GC RF                              ACUCGUUUGAGCGAGUAUAAACAGCUGGUUAAGCUCAAAGCG.GAGAGCAG...AU~~~~~~~~~~~~...CUGCUCUCG
157 | //
158 | 


--------------------------------------------------------------------------------
/utils/parse_blastn_local.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | use strict;
  3 | 
  4 | 
  5 | 
# Default options
# NOTE: the visible code takes three positional command-line arguments and does not
# parse any option switches, so the values below are effectively fixed. Option names
# mentioned in the comments (-q, -p, -b, -B, -P) are not handled in the visible code.
my $E_max=10;
my $E_min=-1.0;
my $P_max=1;
my $cov_thrshd=0;
my $sc_thrshd=-10;
my $qid_thrshd=0;
my $bl=-10;            # minimum per-residue bit score with query at ends of HSP for loose end pruning
my $bs=-10;            # minimum per-residue bit score with query at ends of HSP for strict end pruning
my $bg=30;             # below this number of end gaps the loose HSP pruning score is used
my $outformat="fas";
my $append=0;
my $query_file="";
my $infile;
my $outfile;
my $v=2;
# Meaning of the undocumented options above, as far as the visible code uses them:
#   $E_max, $E_min   HSPs with an E-value above $E_max or below $E_min are rejected
#   $cov_thrshd      hits with a coverage (in percent) below this value are rejected
#   $qid_thrshd      hits with a sequence identity to the query (in percent) below this value are rejected
#   $sc_thrshd       the score-per-column check is only run if this is > -9 (or if $score_min > 0)
#   $outformat       "psi": residues aligned to query gaps are dropped;
#                    "fas": inserts are padded with '-'; any other value: inserts are padded with '.'
#   $infile, $query_file, $outfile   first, second and third command-line argument
#   $v               values >= 3 print one summary line per accepted hit

# Variable declarations
my $i;                 # residue index
my $j;                 # residue index
my $k;                 # sequence index
my $options="";
my $line;              # line read in from file
my $query_length=0;    # number of residues in query sequence (first "Length=" line of the BLAST output)
my $query_match=0;     # number of upper-case residues (=match states) in query sequence
my $capitalize=0;      # capitalize query
my $nameline;          # >template_name
my $Evalue;            # e-value of hit
my $score;             # bit score of hit
my $hit_length;        # number of residues in HSP
my $coverage;          # coverage in percent: 100*$len/$query_length, or 100*$len/$query_match if a query file is given
my $score_col;         # score per column
my $score_min=0;       # $score_min=-3*log($P_max)/log(2);

my $query_name;        # text of the header line (after '>') of the query file
my $queryseq;          # residues of query read from $query_file (gaps removed, case as in the file)
my $qfirst;            # index of first query residue in pairwise alignment
my $qlast;             # index of last  query residue in pairwise alignment
my $tfirst;            # index of first template residue in pairwise alignment
my $tlast;             # index of last  template residue in pairwise alignment
my $tlen=0;            # length of template sequence (from the "Length=" line following its name line)
my @query_res;         # query residues from current pairwise alignment (as character codes)
my @template_res;      # template residues from current pairwise alignment (as character codes)
my $query_res;         # query residues from current pairwise alignment (as string)
my $template_res;      # template residues from current pairwise alignment (as string)
my $line_number=0;
my $new_hit="";        # new sequence record; is only written if coverage threshold is exceeded
my $nhit=0;            # counts the number of sequences already in alignment
my @hitnames;          # $hitnames[$nhit] is the nameline of the $nhit'th hit
my @hitseqs;           # $hitseqs[$nhit] contains the residues of the $nhit'th hit
my @match;             # $match[$i]=1 if $i'th query residue is capital letter in query file, else 0
my $qid;               # $qid is sequence identity with query (with a query file: CONSIDER ONLY MATCH STATES)
my $len;               # $len is number of residues of seq k aligned with a match state in query
my $b;                 # minimum per-residue bit score with query at ends of HSP
my $pfile="";          # alignment file used to calculate PSSM for -p and s/c options
my $bfile="";          # alignment file used to calculate PSSM for -b option
my $GAP=11.0/3.0;        # gap opening penalty in bits (for BLOSUM62: 11 bits/3)
my $EXTEND=1.0/3.0;      # gap extension penalty in bits (for BLOSUM62: 1 bits/3)
my @queryseq;
my $skip=0;            # skip HSPs of the current template; in the visible code only set when $best is on
my $best=0;            # extract only the best HSP per sequence
my $rescaled_Gonnet=0; # Gonnet matrix not yet rescaled to bits
my @qp=();             # $qp[$i][$a] is PSSM from alignment read in with -P option
my @qb=();             # $qb[$i][$a] is PSSM from alignment read in with -B option
 70 | 
 71 | 
 72 | $infile=$ARGV[0];
 73 | $query_file= $ARGV[1];
 74 | $outfile=$ARGV[2];
 75 | 
 76 | #Include query sequence as first sequence in alignment?
 77 | if ($query_file) {
 78 |     open(QUERYFILE,"<$query_file") or die ("ERROR: Cannot open $query_file: $!\n");
 79 |     while($line=<QUERYFILE>) # Read name line
 80 |     {
 81 |         if ($line=~/^>(.*)/)
 82 |         {
 83 |             $query_name=$1;
 84 |             last;
 85 |         }
 86 |     }
 87 |     $hitseqs[0]="";
 88 |     while($line=<QUERYFILE>)  # Read residues
 89 |     {
 90 |         if ($line=~/^>/) {last;}
 91 |         chomp($line);
 92 |         $line=~s/\s+//g;      # remove white space
 93 |         $hitseqs[0].=$line;
 94 |     }
 95 |     close(QUERYFILE);
 96 | 
 97 |     # Prepare name line of hit
 98 |     if ($outformat eq "psi") {
 99 |         $query_name=~/^(\S{1,20})\S*\s*(.*)/;       # delete everything after first block
100 |         $line=sprintf("%s",$1);
101 |         $line=~ tr/ /_/;
102 |         $hitnames[0] = sprintf("%-31.31s ",$line);
103 |     } else {
104 |         $hitnames[0] = sprintf(">%s  E=0.0",$query_name);
105 |     }
106 |     $hitseqs[0] =~ tr/-.//d;      # delete all gaps from query
107 |     $queryseq = $hitseqs[0];
108 |     $hitseqs[0] =~ tr/a-z/A-Z/d;  # capitalize hitseq[0] and delete gaps
109 | #    $hitseqs[0] =~ tr/Uu/Cc/;  # nicht mehr noetig in blast. Kann aber alignhits.pl zum abschmieren bringen.
110 |     $nhit=1;
111 | 
112 |     # Capitalize query?
113 |     if ($capitalize) {$queryseq =~ tr/a-z/A-Z/;}
114 |     $query_match = ($queryseq=~tr/A-Z/A-Z/);  # count number of match states in query
115 | 
116 |     # Determine match columns as those with upper case residue in query
117 |     @queryseq=unpack("C*",$queryseq);
118 |     for ($j=0; $j<@queryseq; $j++) {
119 |         if ($queryseq[$j]>=65 && $queryseq[$j]<=90) {$match[$j]=1;} else {$match[$j]=0;}
120 |     }
121 | }
122 | 
123 | 
124 | 
125 | 
# Scan Blast output file for query length (needed for coverage)
# The first "Length=" line of the report is taken as the length of the query.
# INFILE stays open: the parsing loop below continues reading from this position.
# Three-argument open: the mode is passed separately, so the file name is always taken literally.
open(INFILE,"<",$infile) or die ("Error: cannot open $infile: $!\n");
$line_number++;
while ($line=<INFILE>)
{
    if ($line=~/^Length\s*=\s*(\d+)/) {$query_length = $1; last;}
    $line_number++;
}
# Without a query length the coverage cannot be computed and hits cannot be padded at the end
if ($query_length==0) {
    warn("WARNING: no query length (\"Length=...\" line) found in $infile\n");
}
#print("Query length = $query_length\n");
135 | 
# Scan through the BLAST output line by line. Two kinds of lines trigger work:
#   ">..."          name line of a new database sequence (template); sets $nameline and $tlen
#   " Score = ..."  start of a new HSP of the current template; the pairwise alignment is read,
#                   filtered, and stored in $hitnames[$nhit] / $hitseqs[$nhit]
# All other lines are skipped.
while ($line = <INFILE>)
{
    # New nameline found?
    if ($line=~s/^>//)
    {
        $line=~s/\s+/ /g;
        chomp($line);
        $nameline=$line;

        # The description may continue over several lines; it ends with the "Length=" line
        # that gives the length of the template sequence.
        # $tlen is 0 if the file ends before such a line is found.
        $tlen=0;
        while ($line=<INFILE>)
        {
            if ($line=~/^Length\s*=\s*(\d+)/) {$tlen=$1; last;}
            chomp($line);
            $nameline.=$line;
        }
        $nameline=~s/\s+/ /g;
        $nameline=~s/\s+gi\|/   gi\|/g;
        # Is sequence a synthetic fusion protein ?
        #if ($nameline=~/(\[synthetic| synthetic|construct|cloning|vector|chimeric|fusion)/i) {$skip=1;} else {$skip=0;}
    }

    # New HSP found?
    elsif (!$skip && $line=~/^ Score =/)
    {
        if($best) {$skip=1;} # skip all following hits with same sequence?

        # First check whether E-value is small enough
        if($line =~ /^ Score =\s*(\S+)\s*bits\s*\S*\s*Expect =\s*(\S+)/)
        {
            $score=$1;
            $Evalue=$2;
        }
        else
        {
            # $score and $Evalue keep the values of the previous HSP in this case
            print("\nWARNING: wrong format in blast output. Expecting Score = ... Expect = ..\n$line\n");
        }
        $Evalue=~s/^(e|E)/1$1/;   # Expect = e-123 -> 1e-123
        $Evalue=~tr/,//d;
        if ($Evalue>$E_max || $Evalue<$E_min) {$new_hit=""; next;} # reject hit

        # Record sequence identity
        # (not needed, qid calculated afterwards WITHOUT counting template residues aligned to gaps in query)
        $line=<INFILE>;
        if (defined($line) && $line =~ /^ Identities =\s*\S+\/(\S+)\s+\((\S+)%\)/)
        {
            $qid=$2;
            $line=<INFILE>;
        }
        else
        {
            $qid=0.0; # if match is too poor then no identities are given
        }

        # Skip another line and read following line
        $line=<INFILE>;
        $line=<INFILE>;

        # Read pairwise alignment
        $qfirst="";
        $tfirst="";
        $query_res="";
        $template_res="";
        # Cycle in this loop until no new "Query" lines are found.
        # The defined() tests end the loop quietly when the file ends inside an alignment.
        while (defined($line) && $line=~/^Query\s+\d+\s+\S+\s+\d*/)
        {
            if ($line!~/^Query\s+(\d+)\s+(\S+)\s+(\d*)/)
            {
                print("WARNING 1: wrong format of blast output in $infile, line $.\n");
                last;
            }
            if ($3 eq "") {
                <INFILE>; <INFILE>; <INFILE>; $line=<INFILE>;
                print("WARNING 2: wrong format of blast output in $infile, line $. Skipping alignment block.\n");
                next;
            }
            if ($qfirst eq "") {$qfirst=$1;}
            $query_res .= $2;
            $qlast=$3;
            <INFILE>; $line=<INFILE>;   # skip the line between "Query" and "Sbjct"
            if (!defined($line) || $line!~/^Sbjct\s+(\d+)\s+(\S+)\s+(\d+)/)
            {
                print("WARNING 3: wrong format of blast output in $infile, line $.\n");
                last;
            }
            if ($tfirst eq "") {$tfirst=$1;}
            $template_res .= $2;
            $tlast=$3;
            <INFILE>; $line=<INFILE>;   # skip the line after "Sbjct" and read the next "Query" line
        }

        # Check lengths
        $query_res = uc($query_res);
        $template_res = uc($template_res);
        if (length($template_res)!=length($query_res)) {
            print("WARNING: Query and template lines do not have the same length in $infile, line $.\n");
            print("Q: $query_res\n");
            print("T: $template_res\n");
            next;
        }

        # Check whether hit has sufficient score per column
        $hit_length=($template_res=~tr/a-zA-Z/a-zA-Z/);   # number of letters in the template row
        if ($hit_length==0) {next;}                # Reject hit?
        $score_col=$score/$hit_length;

        # From here on residues are compared as character codes; 45 is the code of the gap character '-'
        @query_res =unpack("C*",$query_res);
        @template_res=unpack("C*",$template_res);

        # Prune ends of HSP which are not reliably homologous
        #if (($bs>-9 || $bl>-9) && !&PruneHSP()) {next;}   # if entire HSP is pruned away, goto next alignment

        # Check whether hit has sufficient sequence identity and coverage with query
        if (!$query_file)
        {
            $len=0; $qid=0;
            for ($i=0; $i<scalar(@query_res); $i++)
            {
                if ($template_res[$i]!=45 && $query_res[$i]!=45) {  # count only non-gap template residues in match columns!
                    $len++;
                    if ($query_res[$i]==$template_res[$i]) {$qid++;}
                }
            }
            # no query length found in the report: report zero coverage instead of dividing by zero
            $coverage = ($query_length>0) ? 100*$len/$query_length : 0;
        }
        else
        {
            # NOTE(review): $len starts at 1 here but at 0 in the branch above, which looks like an
            # off-by-one in the reported id/cov values -- kept as is, confirm the intent before changing
            $len=1; $qid=0; $j=$qfirst-1; # if first_res=1 then $j=0
            for ($i=0; $i<scalar(@query_res); $i++)
            {
                if ($query_res[$i]!=45)
                {
                    if ($template_res[$i]!=45 && $match[$j]) {      # count only non-gap template residues in match columns!
                        $len++;
                        if ($query_res[$i]==$template_res[$i]) {$qid++;}
                    }
                    $j++;                                         # $j = next position in query
                }
            }
            # a query without upper-case residues has no match states:
            # report zero coverage instead of dividing by zero
            $coverage = ($query_match>0) ? 100*$len/$query_match : 0;
        }
        if ($len==0) {next;}                              # Reject hit?
        if (100*$qid/$len<$qid_thrshd) {next;}            # Reject hit?
        if ($coverage<$cov_thrshd) {next;}                # Reject hit?

        # Check score per column
        if ($sc_thrshd>-9 || $score_min>0) {
            if (!&CheckScorePerColumn()) {next;}
        }

        if ($v>=3) {printf("nhit=%-2i  qid=%-3i  qlen=%-3i  qid=%-3i%% s/c=%-6.3f\n",$nhit,$qid,$len,100*$qid/$len,$score_col);}

        # Record residues
        $new_hit = "-"x($qfirst-1);                       # Print gaps at beginning of sequence
        if ($outformat eq "psi") {
            for ($i=0; $i<scalar(@query_res); $i++) {
                if ($query_res[$i]!=45) {                 # residues aligned to gaps are ignored
                    $new_hit .= uc(chr($template_res[$i])); # UPPER case if aligned with a query residue (match state)
                }
            }
        } else {
            for ($i=0; $i<scalar(@query_res); $i++) {
                if ($query_res[$i]!=45) {
                    $new_hit .= uc(chr($template_res[$i])); # UPPER case if aligned with a query residue (match state)
                } else {
                    if($template_res[$i]!=45) {
                        $new_hit.=lc(chr($template_res[$i])); # lower case if aligned with a gap in the query (insert state)
                    }
                }
            }
        }
        $new_hit .= "-" x ($query_length-$qlast);      # Print gaps at end of sequence
#       $new_hit =~ tr/Uu/Cc/;   # no longer needed in blast. Can however make alignhits.pl crash.
        $hitseqs[$nhit] = $new_hit;

        # Prepare name line of hit
        if ($outformat eq "psi") {
            $nameline=~/^(\S{1,20})\S*\s*(.*)/;           # delete everything after first block
            $line=sprintf("%s(%i-%i:%i)",$1,$tfirst,$tlast,$tlen);
            $line=~ tr/ /_/;
            $hitnames[$nhit] = sprintf("%-31.31s ",$line);
        } else {
            $nameline=~/^(\S*)\s*(.*)/;                   # split into first block and remaining description
            $hitnames[$nhit] = sprintf(">%s(%i-%i:%i) %s  E=%g s/c=%4.2f id=%.0f%% cov=%.0f%%",
                                       $1,$tfirst,$tlast,$tlen,$2,$Evalue,$score_col,100*$qid/$len,$coverage);
        }

        $nhit++;
    } # end elseif new HSP found
} # end while ($line)

close(INFILE);
345 | 
346 | 
347 | 
# FASTA and a2m output: pad the inserts with gap characters so that the inserts
# following the same match state take up the same number of columns in every sequence
if ($outformat ne "psi")
{
    my @len_ins;  # $len_ins[$j] is the length of the longest piece found at split position $j
    my $fill = ($outformat eq "fas") ? "-" : ".";  # FASTA is padded with '-', everything else with '.'

    # Pass 1: find the longest insert after each match state over all sequences.
    # Each sequence is split into single match states and variable-length inserts: the
    # parentheses in the pattern ([A-Z]|-) make split return the match states as list elements
    # as well. A '#' is added at both ends as a workaround for a problem with split at the
    # ends of the string; it is removed again in pass 2.
    for ($k=0; $k<$nhit; $k++) {
        my @parts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
        for ($j=0; $j<@parts; $j++) {
            my $n = length($parts[$j]);
            if (!defined $len_ins[$j] || $n>$len_ins[$j]) {$len_ins[$j]=$n;}
        }
    }

    # Pass 2: fill every piece up to the longest length recorded for its position
    for ($k=0; $k<$nhit; $k++) {
        my @parts = split(/([A-Z]|-)/,"#".$hitseqs[$k]."#");
        for ($j=0; $j<@parts; $j++) {
            my $missing = $len_ins[$j]-length($parts[$j]);
            if ($missing>0) {$parts[$j] .= $fill x $missing;}
        }
        $hitseqs[$k] = join("",@parts);
        $hitseqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
    }
}
396 | 
397 | 
398 | if ($query_file) {   # only when a query file was given: re-case every hit according to the query's match flags
399 |     # Build @matchali: one flag per alignment column. @match is filled outside this excerpt (presumably one flag per query residue - confirm)
400 |     my @qa2m = unpack("C*",$hitseqs[0]); # $hitseqs[0] is the query sequence WITH INSERTS, unpacked to character codes
401 |     my @matchali=();
402 |     my $L=scalar(@qa2m);
403 |     $j=0;
404 |     for ($i=0; $i<@match; $i++) {
405 |         while ($j<$L && !($qa2m[$j]>=65 && $qa2m[$j]<=90)) {$matchali[$j++]=0;}  # skip (flag 0) columns until the next 'A'..'Z' (codes 65..90) query residue
406 |         $matchali[$j++]=$match[$i];                                           # that column inherits the flag of query residue $i
407 |     }
408 | 
409 |     # Set all match states to upper case, non-match states to lower case
410 |     my @res;
411 |     for ($k=0; $k<$nhit; $k++) {
412 |         @res = unpack("C*",$hitseqs[$k]);
413 | #       printf("Q: %s\n",$hitseqs[0]);
414 | #       printf("T: %s\n",$hitseqs[$k]);
415 |         for ($i=0; $i<@res; $i++) {
416 |             if ($matchali[$i]) {
417 |                 if ($res[$i]>=97 && $res[$i]<=122) {$res[$i]-=32;}  # 'a'..'z' -> upper case
418 |             } else {
419 |                 if ($res[$i]>=65 && $res[$i]<=90) {$res[$i]+=32;}   # 'A'..'Z' -> lower case
420 |                 elsif ($res[$i]==45) {$res[$i]=46;}     # convert '-' (45) to '.' (46)
421 |             }
422 | #           printf("%3i  Q:%s T:%s  match=%i len=%i\n",$i,chr($qa2m[$i]),chr($res[$i]),$qid[$k],$len);
423 |         }
424 |         $hitseqs[$k] =  pack("C*",@res);
425 |     }
426 | }
427 | 
428 | 
429 | # Remove gaps? Capitalize? Final per-format clean-up of every hit sequence (other formats are left untouched here).
430 | if ($outformat eq "ufas") {
431 |     for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z.-/A-Z/d;} # ufas: upper-case everything and delete '.' and '-' (unaligned output)
432 | } elsif ($outformat eq "fas") {
433 |     for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/a-z./A-Z-/;}  # fas: upper-case everything and turn '.' into '-'
434 | } elsif ($outformat eq "a3m") {
435 |     for ($k=0; $k<$nhit; $k++) {$hitseqs[$k]=~tr/.//d;}        # a3m: delete the '.' fillers (gaps aligned to inserts)
436 | }
437 | 
438 | # Write all hits to $outfile: "name sequence" on one line for psi, otherwise a name line followed by a sequence line.
439 | open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
440 | if ($outformat eq "psi") {
441 |     for ($k=0; $k<$nhit; $k++) {
442 |         $hitseqs[$k] =~ tr/./-/;   # psi output uses '-' for every gap
443 |         printf(OUTFILE "%s %s\n",$hitnames[$k],$hitseqs[$k]);
444 |     }
445 | } 
446 | else {
447 |     for ($k=0; $k<$nhit; $k++) {
448 |         printf(OUTFILE "%s\n%s\n",$hitnames[$k],$hitseqs[$k]);
449 |     }
450 | }
451 | close(OUTFILE) or die ("cannot write $outfile:$!\n");   # buffered write errors (e.g. disk full) only show up at close
452 | # print instead of printf: the message embeds file names, and a '%' in them must not be parsed as a format directive
453 | if ($v>=1) {print("$nhit sequences extracted from $infile and written to $outfile\n");}
454 | exit(0);
455 | 
456 | 
457 | 
458 | 
459 | 


--------------------------------------------------------------------------------
/run_spotrna2.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # Usage: run_spotrna2.sh <query.fasta>   (feature and output folders are created next to the query file)
  3 | start=`date +%s`
  4 | [ -f "$1" ] || { echo "Usage: $0 <query.fasta>" >&2; exit 1; }   # stop early when no query file is given
  5 | input="$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
  6 | input_dir=$(dirname "$input")
  7 | seq_id=$(basename "$input" | cut -d. -f1)
  8 | program_dir=$(dirname "$(readlink -f "$0")")
  9 | # $path_blastn and $path_infernal contain a wildcard: they must stay unquoted where they are used so the shell expands it
 10 | path_blastn=$program_dir/ncbi-blast-*+/bin                      # set path to the folder contains executable binary files of Blast package
 11 | path_blastn_database=$program_dir/nt_database/nt                # set path to the formatted NCBI's database file without extension
 12 | path_infernal=$program_dir/infernal-*-linux-intel-gcc/binaries  # set path to the folder contains executable binary files Infernal package
 13 | path_infernal_database=$program_dir/nt_database/nt              # set path to the NCBI's database file
 14 | 
 15 | mkdir -p "$input_dir/${seq_id}_features" && mkdir -p "$input_dir/${seq_id}_outputs"
 16 | echo ">$seq_id" > "$input_dir/${seq_id}_features/$seq_id.fasta"
 17 | # Join multi-line sequences in a pipe (no 'awk -i inplace': it is gawk-only and rewrote the user's file); keep the last record
 18 | awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' "$input" | tail -n1 >> "$input_dir/${seq_id}_features/$seq_id.fasta"
 19 | 
 20 | feature_dir=$input_dir/${seq_id}_features
 21 | output_dir=$input_dir/${seq_id}_outputs
 22 | 
 23 | #exit 1
 24 | # If the raw nt FASTA file is missing, offer to download and unpack it; any refusal or failure exits with status 1.
 25 | if [ ! -f $path_blastn_database ]; then
 26 |     echo ""
 27 |     echo "========================================================================================"
 28 |     echo "            Looks like nt database doesn't exists in the path $path_blastn_database.    "
 29 |     echo "            If you want to download the database now, please make sure you have enough  "
 30 |     echo "            space in mounted directory and internet connection have enough bandwidth as "
 31 |     echo "            file is of size 270 GBs after unzip. It may take forever to download if     "
 32 |     echo "                                internet is slow!                                       "
 33 |     echo "========================================================================================"
 34 |     echo ""
 35 |     # ask for confirmation; the answer is lower-cased before the comparison
 36 |     echo -n "Type 'y' for download or any other key to exit: "
 37 |     read userinput
 38 |     answer=$(echo $userinput | tr '[A-Z]' '[a-z]')
 39 |     if [[ $answer == 'y' ]]; then
 40 |         # ---- step 1: download ----
 41 |         echo ""
 42 |         echo "=============================================================================================="
 43 |         echo "       Downloading NCBI's database form ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz link. "
 44 |         echo "                                 May take few hours to download.                              "
 45 |         echo "=============================================================================================="
 46 |         echo ""
 47 |         # wget runs directly in the condition; its exit status selects the banner
 48 | 
 49 | 
 50 |         if wget -c "ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" -O $program_dir/nt_database/nt.gz; then
 51 |             echo ""
 52 |             echo "======================================================================="
 53 |             echo "            nt database is completed successfully.                     "
 54 |             echo "======================================================================="
 55 |             echo ""
 56 |         else
 57 |             echo ""
 58 |             echo "======================================================================="
 59 |             echo "            Error! Unable to download database sucessfully.            "
 60 |             echo "            Check wget command or internet connection.            "
 61 |             echo "======================================================================="
 62 |             echo ""
 63 |             exit 1
 64 |         fi
 65 | 
 66 |         echo ""
 67 |         echo "======================================================================"
 68 |         echo "            Unziping the downloaded nt database.                      "
 69 |         echo "       May take few hours as size of unzipped file is around 270 GBs. "
 70 |         echo "======================================================================"
 71 |         echo ""
 72 | 
 73 |         ############ unzip the nt data base file ############
 74 |         # ---- step 2: unpack; gunzip also runs directly in the condition ----
 75 | 
 76 |         if gunzip $program_dir/nt_database/nt.gz; then
 77 |             echo ""
 78 |             echo "======================================================================="
 79 |             echo "            nt database unzip completed successfully.                  "
 80 |             echo "======================================================================="
 81 |             echo ""
 82 |         else
 83 |             echo ""
 84 |             echo "======================================================================="
 85 |             echo "            Error! unable to unzip database sucessfully.               "
 86 |             echo "            Please check if gunzip program exists!                     "
 87 |             echo "======================================================================="
 88 |             echo ""
 89 |             exit 1
 90 |         fi
 91 | 
 92 |     else
 93 |         echo ""
 94 |         echo "==========================================================="
 95 |         echo "      Exiting the program because nt database is missing! "
 96 |         echo "==========================================================="
 97 |         echo ""
 98 |         exit 1
 99 |     fi
100 | 
101 | fi
102 | 
103 | 
104 | ###### check if aligned homologous sequences file already exists ############
105 | if [ -f $feature_dir/$seq_id.a2m ];	then
106 |         echo ""
107 |         echo "======================================================================"
108 |         echo "    MSA file $feature_dir/$seq_id.a2m from Infernal Pipeline already  "
109 |         echo "    exists for query sequence $feature_dir/$seq_id.fasta.             "
110 |         echo "                                                                      "
111 |         echo "    Delete existing $feature_dir/$seq_id.a2m if want to generate new  "
112 |         echo "    alignment file                                                    "
113 |         echo "======================================================================"
114 |     	echo ""
115 | else
116 | 
117 |     #### format the nt database unless an index exists (.nal = multi-volume alias, .nin = single-volume index) #####
118 |     if [[ ! -f "$path_blastn_database.nal" && ! -f "$path_blastn_database.nin" ]]; then
119 |         echo ""
120 |         echo "====================================================================="
121 |         echo "    Nucleotide database file $path_blastn_database needs to be       "
122 |         echo "    formatted with the 'makeblastdb' program of the BLAST package.   "
123 |         echo ""
124 |         echo "    Formatting may take 2-3 hours as size of file is around 270 GBs. "
125 |         echo "====================================================================="
126 |         echo ""
127 |         $path_blastn/makeblastdb -in $path_blastn_database -dbtype nucl
128 |         # the input was '$path_database/nt' before; that variable is never set, so makeblastdb was pointed at '/nt'
129 |         if [[ $? -eq 0 ]]; then
130 |                 echo ""
131 |                 echo "======================================================="
132 |                 echo "          nt database formatted successfully.          "
133 |                 echo "======================================================="
134 |                 echo ""
135 |         else
136 |                 echo ""
137 |                 echo "=================================================================="
138 |                 echo "        Error occured while formatting the nt database.           "
139 |                 echo ""
140 |                 echo "  Check for '$path_blastn/makeblastdb' program in BLAST package   "
141 |                 echo "=================================================================="
142 |                 echo ""
143 |                 exit 1
144 |         fi
145 |     fi
146 | 
147 | 
148 |     #################### first homology search: reuse an existing BLASTN result or run blastn ######################
149 |     if [ -f $feature_dir/$seq_id.bla ]; then
150 |         echo ""
151 |         echo "======================================================================="
152 |         echo "    MSA-1 file $feature_dir/$seq_id.bla from BLASTN search already     "
153 |         echo "    exists for query sequence $feature_dir/$seq_id.fasta.              "
154 |         echo "                                                                       "
155 |         echo "    Delete existing $feature_dir/$seq_id.bla if want to generate new   "
156 |         echo "    alignment file.                                                    "
157 |         echo "======================================================================="
158 |         echo ""
159 |     else
160 |         echo ""
161 |         echo "==========================================================================================================================="
162 |         echo "      Running BLASTN for first round of homologous sequence search for query sequence $feature_dir/$seq_id.fasta.          "
163 |         echo "      May take 5 mins to few hours depending on sequence length and no. of homologous sequences in database.               "
164 |         echo "==========================================================================================================================="
165 |         echo ""
166 |         $path_blastn/blastn -db $path_blastn_database -query $feature_dir/$seq_id.fasta -out $feature_dir/$seq_id.bla -evalue 0.001 -num_descriptions 1 -num_threads 8 -line_length 1000 -num_alignments 50000
167 |     fi
168 |     # $? is the status of the branch taken above: the last echo (always 0) or blastn itself
169 |     if [ $? -eq 0 ]; then
170 |         echo ""
171 |         echo "==========================================================="
172 |         echo "      First round of MSA-1 search completed successfully.  "
173 |         echo "==========================================================="
174 |         echo ""
175 |     else
176 |         echo ""
177 |         echo "=================================================================="
178 |         echo "        Error occured while running the BLASTN search.            "
179 |         echo ""
180 |         echo "  Check for '$path_blastn/blastn' program in BLAST package        "
181 |         echo "=================================================================="
182 |         echo ""
183 |         rm -f $feature_dir/$seq_id.bla; exit 1   # drop the partial result so the next run does not reuse it
184 |     fi
185 | 
186 |     ######## reformat the BLASTN output: .bla -> .aln (parse_blastn_local.pl) -> .sto (reformat.pl) ################
187 |     echo ""
188 |     echo "========================================================================================"
189 |     echo "         Converting $feature_dir/$seq_id.bla from BLASTN to $feature_dir/$seq_id.sto.   "
190 |     echo "========================================================================================"
191 |     echo ""
192 |     $program_dir/utils/parse_blastn_local.pl $feature_dir/$seq_id.bla $feature_dir/$seq_id.fasta $feature_dir/$seq_id.aln && \
193 |     $program_dir/utils/reformat.pl fas sto $feature_dir/$seq_id.aln $feature_dir/$seq_id.sto
194 |     # the two converters are chained with '&&' so that a failure of the first one also reaches the check below
195 | 
196 |     if [ $? -eq 0 ]; then
197 |         echo ""
198 |         echo "=========================================="
199 |         echo "      Converison completed successfully.  "
200 |         echo "=========================================="
201 |         echo ""
202 |     else
203 |         echo ""
204 |         echo "============================================================================================="
205 |         echo "   Error occured while Converting $feature_dir/$seq_id.bla to $feature_dir/$seq_id.sto       "
206 |         echo " "
207 |         echo "  Check for $program_dir/utils/parse_blastn_local.pl and $program_dir/utils/reformat.pl file."
208 |         echo "============================================================================================="
209 |         echo ""
210 |         exit 1
211 |     fi
212 | 
213 |     ######## predict secondary structure with SPOT-RNA and attach it to the BLASTN .sto file as SS_cons ################
214 |     echo ""
215 |     echo "==============================================================================================================================="
216 |     echo "       Predicting Consensus Secondary Structure (CSS) of query sequence $feature_dir/$seq_id.fasta using SPOT-RNA predictor.   "
217 |     echo "==============================================================================================================================="
218 |     echo ""
219 |     source $program_dir/venv/bin/activate || conda activate venv
220 |     cd $program_dir/SPOT-RNA
221 |     python3 SPOT-RNA.py --inputs $feature_dir/$seq_id.fasta --outputs $feature_dir; css_status=$?
222 |     cd -
223 |     # css_status remembers whether any step failed; before, only the very last 'echo' was checked
224 |     export PERL5LIB=$program_dir/utils/FreeKnot
225 |     perl $program_dir/utils/FreeKnot/remove_pseudoknot.pl -i bpseq -s bp $feature_dir/$seq_id.bpseq > $feature_dir/$seq_id.bpseq.unknotted
226 |     python3 $program_dir/utils/bpseq2dbn.py --inputs $feature_dir --outputs $feature_dir --rna_id $seq_id || css_status=1
227 |     tail -n +3 $feature_dir/$seq_id.dbn > $feature_dir/$seq_id.db || css_status=1
228 | 
229 |     deactivate || conda deactivate
230 | 
231 |     ################ reformat ss with according to gaps in reference sequence of .sto file from blastn ################
232 |     for i in `awk '{print $2}' $feature_dir/$seq_id.sto | head -n5 | tail -n1 | grep -b -o - | sed 's/..$//'`; do sed -i "s/./&-/$i" $feature_dir/$seq_id.db; done
233 | 
234 |     #########  add reformated ss from last step to .sto file of blastn ##############
235 |     head -n -1 $feature_dir/$seq_id.sto > $feature_dir/temp.sto
236 |     echo "#=GC SS_cons                     "`cat $feature_dir/$seq_id.db` > $feature_dir/temp.txt
237 |     cat $feature_dir/temp.sto $feature_dir/temp.txt > $feature_dir/$seq_id.sto
238 |     echo "//" >> $feature_dir/$seq_id.sto || css_status=1
239 |     # succeed only if no step failed and a non-empty dot-bracket string was produced
240 |     if [ $css_status -eq 0 ] && [ -s $feature_dir/$seq_id.db ]; then
241 |         echo ""
242 |         echo "=================================================================="
243 |         echo "      Consensus Secondary Structure (CSS) generated successfully. "
244 |         echo "=================================================================="
245 |         echo ""
246 |     else
247 |         echo ""
248 |         echo "=============================================================================="
249 |         echo "             Error occured while generating structure from SPOT-RNA.          "
250 |         echo " "
251 |         echo "  Please raise issue at 'https://github.com/jaswindersingh2/SPOT-RNA2/issues'."
252 |         echo "=============================================================================="
253 |         echo ""
254 |         exit 1
255 |     fi
256 | 
257 |     ######## run infernal: build a covariance model from the annotated .sto file ################
258 |     echo ""
259 |     echo "=============================================================================================================="
260 |     echo "      Building Covariance Model from BLASTN alignment (with SS from SPOT-RNA) from $feature_dir/$seq_id.sto file.         "
261 |     echo "=============================================================================================================="
262 |     echo ""
263 |     # cmbuild is run inside the condition; its exit status picks the banner
264 | 
265 |     if $path_infernal/cmbuild --hand -F $feature_dir/$seq_id.cm $feature_dir/$seq_id.sto; then
266 |         echo ""
267 |         echo "============================================================================"
268 |         echo "    Covariance Model (CM) built successfully from $feature_dir/$seq_id.sto. "
269 |         echo "============================================================================"
270 |         echo ""
271 |     else
272 |         echo ""
273 |         echo "==============================================================================================="
274 |         echo "     Error occured while building Covariance Model (CM) from $path_infernal/cmbuild.           "
275 |         echo " "
276 |         echo "     Please check for $path_infernal/cmbuild program.      "
277 |         echo "==============================================================================================="
278 |         echo ""
279 |         exit 1
280 |     fi
281 | 
282 |     echo ""
283 |     echo "===================================================================="
284 |     echo "       Calibrating the Covariance Model $feature_dir/$seq_id.cm.    "
285 |     echo "===================================================================="
286 |     echo ""
287 |     # calibrate the model in place; cmcalibrate is run inside the condition
288 | 
289 |     if $path_infernal/cmcalibrate $feature_dir/$seq_id.cm; then
290 |         echo ""
291 |         echo "==========================================================="
292 |         echo "    CM calibrated $feature_dir/$seq_id.cm successfully.    "
293 |         echo "==========================================================="
294 |         echo ""
295 |     else
296 |         echo ""
297 |         echo "==============================================================="
298 |         echo "     Error occured while calibrating $feature_dir/$seq_id.cm.  "
299 |         echo " "
300 |         echo "     Please check for $path_infernal/cmcalibrate program.      "
301 |         echo "==============================================================="
302 |         echo ""
303 |         exit 1
304 |     fi
305 | 
306 |     echo ""
307 |     echo "======================================================================================================================"
308 |     echo "        Second round of homologous sequences search using the calibrated covariance model $feature_dir/$seq_id.cm.    "
309 |     echo "                 May take 15 mins to few hours for this step.                                                         "
310 |     echo "======================================================================================================================"
311 |     echo ""
312 |     # search the database with the model: report goes to .out, the alignment of hits to .msa
313 | 
314 |     if $path_infernal/cmsearch -o $feature_dir/$seq_id.out -A $feature_dir/$seq_id.msa --cpu 24 --incE 10.0 $feature_dir/$seq_id.cm $path_infernal_database; then
315 |         echo ""
316 |         echo "==========================================================="
317 |         echo "      Second round of MSA-2 search completed successfully.  "
318 |         echo "==========================================================="
319 |         echo ""
320 |     else
321 |         echo ""
322 |         echo "===================================================================================="
323 |         echo "     Error occured during the second round search using CM $feature_dir/$seq_id.cm. "
324 |         echo " "
325 |         echo "     Please check for $path_infernal/cmsearch program.                              "
326 |         echo "===================================================================================="
327 |         echo ""
328 |         exit 1
329 |     fi
330 | 
331 | 	######### reformat the alignment without gaps and dashes  ###############
332 |     echo ""
333 |     echo "======================================================================="
334 |     echo "          Reformatting the output alignment $feature_dir/$seq_id.msa   "
335 |     echo "          for PSSM and DCA features by removing the gaps and dashes.   "
336 |     echo "======================================================================="
337 |     echo ""
338 | 
339 | 	##### check if .msa	is not empty  #########
340 | 	if [[ -s $feature_dir/$seq_id.msa ]] 
341 | 	  then 
342 | 		$path_infernal/esl-reformat --replace acgturyswkmbdhvn:................ a2m $feature_dir/$seq_id.msa > $feature_dir/temp.a2m
343 | 	else 
344 | 	  cat $feature_dir/$seq_id.fasta > $feature_dir/temp.a2m
345 | 	  cat $feature_dir/$seq_id.fasta >> $feature_dir/temp.a2m
346 | 	  sed -i '$ s/.$/./' $feature_dir/temp.a2m
347 | 	fi
348 | 
349 | #	$path_infernal/esl-reformat --replace acgturyswkmbdhvn:................ a2m $feature_dir/$seq_id.msa > $feature_dir/temp.a2m
350 | 
351 | 	if [ $? -eq 0 ]; then
352 | 	    echo ""
353 | 	    echo "==========================================================="
354 |         echo "   Reformatted the $feature_dir/$seq_id.msa successfully.  "
355 | 	    echo "==========================================================="
356 | 	    echo ""
357 | 	else
358 |         echo ""
359 |         echo "========================================================================================"
360 |         echo "     Error occured during the refomatting the alignment file $feature_dir/$seq_id.msa.  "
361 |         echo " "
362 |         echo "     Please check for $path_infernal/esl-reformat program.                              "
363 |         echo "========================================================================================"
364 |         echo ""
365 |         exit 1
366 |     fi
367 | 
368 |     ######### drop duplicated sequences (seqkit rmdup -s) from the alignment ###############
369 |     echo ""
370 |     echo "======================================================================="
371 |     echo "          Removing duplicates from the alignment.                      "
372 |     echo "======================================================================="
373 |     echo ""
374 |     # seqkit reads temp.a2m and writes the de-duplicated records to <seq_id>.a2m; it runs inside the condition
375 | 
376 |     if $program_dir/utils/seqkit rmdup -s $feature_dir/temp.a2m > $feature_dir/$seq_id.a2m; then
377 |         echo ""
378 |         echo "==============================================="
379 |         echo "   Duplicate sequences removed successfully.   "
380 |         echo "==============================================="
381 |         echo ""
382 |     else
383 |         echo ""
384 |         echo "========================================================================================"
385 |         echo "     Error occured during the removel of duplicates from MSA-2.  "
386 |         echo " "
387 |         echo "     Please check for $program_dir/utils/seqkit program.                              "
388 |         echo "========================================================================================"
389 |         echo ""
390 |         exit 1
391 |     fi
392 | 
393 | 	############# multi-line fasta to single-line fasta: awk joins each record's lines, sed drops the empty lines; result goes to temp.a2m #############
394 | 	awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < $feature_dir/$seq_id.a2m | sed '/^$/d' > $feature_dir/temp.a2m 
395 | 	############# put the query sequence on top: the final .a2m is the query fasta followed by temp.a2m  #############
396 |     cat $feature_dir/$seq_id.fasta $feature_dir/temp.a2m > $feature_dir/$seq_id.a2m 
397 | 
398 | fi
399 | 
400 | ############# PSSM features: keep an existing .pssm file, otherwise derive it from the .a2m alignment #############
401 | if [ -f $feature_dir/$seq_id.pssm ]; then
402 |     echo ""
403 |     echo "=============================================================================================================================================="
404 |     echo "    PSSM feature file $feature_dir/$seq_id.pssm already exists for query sequence $feature_dir/$seq_id.fasta.  "
405 |     echo "=============================================================================================================================================="
406 |     echo ""
407 | else
408 |     echo ""
409 |     echo "======================================================================================"
410 |     echo "          Extracting PSSM features from the alignment $feature_dir/$seq_id.a2m.       "
411 |     echo "======================================================================================"
412 |     echo ""
413 |     # getpssm.pl gets the query fasta, the alignment and the output file; it runs inside the condition
414 | 
415 |     if $program_dir/utils/getpssm.pl $feature_dir/$seq_id.fasta $feature_dir/$seq_id.a2m $feature_dir/$seq_id.pssm; then
416 |         echo ""
417 |         echo "==============================================================="
418 |         echo "   PSSM extracted successfully from $feature_dir/$seq_id.a2m.  "
419 |         echo "==============================================================="
420 |         echo ""
421 |     else
422 |         echo ""
423 |         echo "========================================================================="
424 |         echo "     Error occured while extracting PSSM from $feature_dir/$seq_id.a2m.  "
425 |         echo " "
426 |         echo "     Please check for $program_dir/utils/getpssm.pl program.             "
427 |         echo "========================================================================="
428 |         echo ""
429 |         exit 1
430 |     fi
431 | fi
432 | 
433 | ######### base-pair probabilities: feed the sequence line of the query fasta to LinearPartition ###############
434 | echo ""
435 | echo "============================================================================"
436 | echo "          Running LinearPartition-V for base-pair probabilty features.      "
437 | echo "============================================================================"
438 | echo ""
439 | # the header line is skipped with tail; the condition sees the status of linearpartition (last command of the pipe)
440 | 
441 | if tail -n +2 $feature_dir/$seq_id.fasta | $program_dir/LinearPartition/linearpartition -V -r $feature_dir/$seq_id.prob; then
442 |     echo ""
443 |     echo "===================================================================="
444 |     echo "   Base-pair probabilty successfully obtained from LinearPartition. "
445 |     echo "===================================================================="
446 |     echo ""
447 | else
448 |     echo ""
449 |     echo "============================================================================="
450 |     echo "                Error occured while running LinearPartition.  "
451 |     echo " "
452 |     echo "     Please check for $program_dir/LinearPartition/linearpartition program.  "
453 |     echo "============================================================================="
454 |     echo ""
455 |     exit 1
456 | fi
457 | 
458 | ############# DCA features: keep an existing .dca file, otherwise run GREMLIN on the .a2m alignment #############
459 | if [ -f $feature_dir/$seq_id.dca ]; then
460 |     echo ""
461 |     echo "==============================================================="
462 |     echo "    GRELMLIN feature file $feature_dir/$seq_id.dca already     "
463 |     echo "    exists for query sequence $feature_dir/$seq_id.fasta.      "
464 |     echo " "
465 |     echo "    Delete the existing file if want to generate new dca file. "
466 |     echo "==============================================================="
467 |     echo ""
468 | else
469 |     echo ""
470 |     echo "============================================================================"
471 |     echo "          Running GREMLIN for DCA features.                                 "
472 |     echo "============================================================================"
473 |     echo ""
474 |     # gremlin_cpp runs inside the condition; its standard output is kept in <seq_id>.log_gremlin
475 |     if $program_dir/GREMLIN_CPP/gremlin_cpp -alphabet rna -i $feature_dir/$seq_id.a2m -o $feature_dir/$seq_id.dca > $feature_dir/$seq_id.log_gremlin; then
476 |         echo ""
477 |         echo "===================================================="
478 |         echo "   DCA features successfully obtained from GREMLIN. "
479 |         echo "===================================================="
480 |         echo ""
481 |     else
482 |         echo ""
483 |         echo "============================================================================="
484 |         echo "                Error occured while running GREMLIN.  "
485 |         echo " "
486 |         echo "     Please check for $program_dir/GREMLIN_CPP/gremlin_cpp program.  "
487 |         echo "============================================================================="
488 |         echo ""
489 |         exit 1
490 |     fi
491 | fi
492 | 
493 | # Final step: run the SPOT-RNA2 predictor on the prepared features, report the run time, exit with the predictor's status.
494 | echo ""
495 | echo "============================================================================"
496 | echo "          Running SPOT-RNA2 for RNA secondary structure prediction.                                 "
497 | echo "============================================================================"
498 | echo ""
499 | source $program_dir/venv/bin/activate || conda activate venv
500 | python3 $program_dir/utils/SPOT-RNA2.py --inputs $feature_dir/$seq_id.fasta --outputs $output_dir --motifs True; predict_status=$?
501 | deactivate || conda deactivate
502 | [ $predict_status -eq 0 ] || echo "Error occured while running $program_dir/utils/SPOT-RNA2.py." >&2
503 | end=`date +%s`
504 | 
505 | runtime=$((end-start))
506 | 
507 | echo -e "\ncomputation time = "$runtime" seconds"
508 | exit $predict_status   # like every earlier step, a failed prediction must not end with exit status 0
509 | 


--------------------------------------------------------------------------------