├── blat
├── link_block
├── delete_linker
├── exon_length
├── find_end_node
├── select_nodes
├── convert_linker
├── find_start_node
├── delete_same_fragment
├── find_reliable_connection
├── count_connection_frequency
├── filter-out.pl
├── generate_unscaffold.pl
├── sam2fa.pl
├── count-match.pl
├── UNIQUE_psl.pl
├── generate_scaffold.pl
├── guider.pl
├── UNIQUE_sam_intron.pl
├── form_path.pl
├── README.md
└── P_RNA_scaffolder.sh


/blat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/blat


--------------------------------------------------------------------------------
/link_block:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/link_block


--------------------------------------------------------------------------------
/delete_linker:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/delete_linker


--------------------------------------------------------------------------------
/exon_length:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/exon_length


--------------------------------------------------------------------------------
/find_end_node:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/find_end_node


--------------------------------------------------------------------------------
/select_nodes:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/select_nodes


--------------------------------------------------------------------------------
/convert_linker:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/convert_linker


--------------------------------------------------------------------------------
/find_start_node:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/find_start_node


--------------------------------------------------------------------------------
/delete_same_fragment:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/delete_same_fragment


--------------------------------------------------------------------------------
/find_reliable_connection:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/find_reliable_connection


--------------------------------------------------------------------------------
/count_connection_frequency:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/count_connection_frequency


--------------------------------------------------------------------------------
/filter-out.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w 
 2 | use strict;
 3 | if( @ARGV != 3 ) {
 4 |     print "Usage: $0  SAM-file filter-ID-file ID-in-SAM-file \n";
 5 |     exit 0;
 6 | }
 7 | my $fh1=shift @ARGV;
 8 | my $fh2=shift @ARGV;
 9 | my $fh3=shift @ARGV;
10 | open FH2,"<$fh2";
11 | open FH1,"<$fh1";
12 | my $minus;
13 | while (<FH2>)
14 | {
15 |   chomp($_);
16 |   $_ =~ s# ##g;  
17 |   $minus->{$_}=1;
18 | }
19 | close FH2;
20 | while (<FH1>)
21 | {
22 |   chomp($_);
23 |   my @rec=split(/[\s]+/,$_);
24 |   $rec[$fh3-1] =~ s# ##g;
25 |   if (! exists($minus->{$rec[$fh3-1]}))
26 |    {
27 |    print $_."\n";
28 |    }
29 |    
30 | }
31 | close FH1;
32 | 
33 | 


--------------------------------------------------------------------------------
/generate_unscaffold.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | #mm_10008_m.0.17 32306003,32304536,32304385,32304049    32306103,32304612,32304439,32304090
 3 | #-1  18/0            AA033453
 4 | use strict;
 5 | if( @ARGV != 2 ) {
 6 |     print "Usage: $0 contig.fasta redundant.id\n";
 7 |     print "Destination: find the common dataset from the plus-file and minus-file\n";
 8 |     print "Note:This script can be used to identify cis- and trans-\n";
 9 |     exit 0;
10 | }
11 | my $fh1=shift @ARGV;
12 | 
13 | my $fh2=shift @ARGV;
14 | use Bio::Seq;
15 | use Bio::SeqIO;
16 | use Bio::PrimarySeq;
17 | my $in=Bio::SeqIO->new(-file=>"$fh1",'-format'=>'fasta');
18 | my $seq=$in->next_seq();
19 | my $id;
20 | my $temp;
21 | my $print;
22 | open FH2, "<$fh2";
23 | while (<FH2>)
24 | {
25 |  chomp($_);
26 |  my @rec=split(/\t/,$_);
27 |  $id->{$rec[0]}=1;
28 | }
29 | close FH2;
30 | my $disp=$seq->display_id();
31 | while ($seq)
32 | {
33 |  $disp=$seq->display_id();
34 |  chomp($disp);
35 |  if (!exists($id->{$disp}))
36 |  {  
37 |      print ">".$disp."\n";
38 |      print $seq->seq()."\n";
39 |  }
40 |  $seq=$in->next_seq();
41 | }
42 | 
43 | 


--------------------------------------------------------------------------------
/sam2fa.pl:
--------------------------------------------------------------------------------
 1 | #!usr/local/bin/perl -w
 2 | use strict;
 3 | if( @ARGV != 3 )
 4 | {
 5 |     print "Usage: perl $0  sam-file fastq-file fasta-output-file\n";
 6 |     exit 0;
 7 | }
 8 | my $fh1 = shift @ARGV;
 9 | my $fh2 = shift @ARGV;
10 | my $fh3 = shift @ARGV;
11 | open OUT1,">$fh3";
12 | open FH1,"<$fh1";
13 | my $id ="";
14 | my %seq;
15 | open FH1,"<$fh1";
16 | while (my $line = <FH1>)
17 | {
18 |   chomp $line;
19 |   my @a = split / /,$line;
20 |   if($a[0]ne$id)
21 |   {
22 |     $seq{$a[0]}=1;
23 |   }
24 |   $id=$a[0];
25 | }
26 | close FH1;
27 | 
28 | open FH2,"<$fh2";
29 | while (<FH2>)
30 | {
31 |   my @temp;
32 |   chomp($temp[0] = $_);		# First line is an id.
33 |   chomp($temp[1] = <FH2>);	# Second line is a sequence.
34 |   chomp($temp[2] = <FH2>);	# Third line is an id.
35 |   chomp($temp[3] = <FH2>);	# Fourth line is quality.
36 |   my @aa = split / /,$temp[0];
37 |   my @a = split /\t/,$aa[0];
38 |   $id =$a[0];
39 |   $id =~ s/\@//g;
40 |   $id =~ s/\/1$//g;
41 |   $id =~ s/\/2$//g;
42 |   $id =~ s/\/F$//g;
43 |   $id =~ s/\/R$//g;
44 |   if (exists $seq{$id})
45 |   {
46 |     print OUT1 ">".$id."\n".$temp[1]."\n";
47 |   }
48 |   #print"$id\n$seq{$id}\n";
49 | }
50 | close FH2;
51 | close OUT1;
52 | 


--------------------------------------------------------------------------------
/count-match.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl
 2 | #use strict;
 3 | if( @ARGV != 2 ) {
 4 |     print "Usage: $0 F.sam R.sam \n";
 5 |     exit 0;
 6 | }
 7 | my $fh1=shift @ARGV;
 8 | my $fh2=shift @ARGV;
 9 | my $INFILE;
10 | open $INFILE, "< $fh1";
11 | while(<$INFILE>)
12 | {
13 |    chomp($_);
14 |     my @line = split(/\s/,$_);
15 |     my $sum_soft=0;
16 |     my @soft= ($line[5] =~ /(\d+)S/g);
17 |     foreach my $soft (@soft)
18 |     {
19 |       $sum_soft+=$soft;
20 |     }
21 |     my $sum_match=0;
22 |     my @match= ($line[5] =~ /(\d+)M/g);
23 |     foreach my $match (@match)
24 |     {
25 |       $sum_match+=$match;
26 |     } 
27 |     if ($sum_soft/($sum_soft+$sum_match) > 0.2)
28 |     {
29 |       print $line[0]."\n";
30 |     }
31 | }
32 | 
33 | open $INFILE, "< $fh2";
34 | while(<$INFILE>)
35 | {
36 |    chomp($_);
37 |     my @line = split(/\s/,$_);
38 |     my $sum_soft=0;
39 |     my @soft= ($line[5] =~ /(\d+)S/g);
40 |     foreach my $soft (@soft)
41 |     {
42 |       $sum_soft+=$soft;
43 |     }
44 |     my $sum_match=0;
45 |     my @match= ($line[5] =~ /(\d+)M/g);
46 |     foreach my $match (@match)
47 |     {
48 |       $sum_match+=$match;
49 |     }
50 |     if ($sum_soft/($sum_soft+$sum_match) > 0.2)
51 |     {
52 |       print $line[0]."\n";
53 |     }
54 | }
55 | 


--------------------------------------------------------------------------------
/UNIQUE_psl.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/local/bin/perl -w
 2 | if( @ARGV != 6 ) {
 3 |     print "Usage: perl $0 -f1 read_1.psl -f2 read_2.psl -n 0.90 \n";
 4 |     exit 0;
 5 | }
 6 | 
 7 | use Getopt::Long;
 8 | Getopt::Long::GetOptions('f1=s'  => \$fh1,'f2=s'  => \$fh2,'n=s'  => \$n);
 9 | my $hash1;
10 | my $hash2;
11 | open FH1,"<$fh1";
12 | while (<FH1>)
13 | {
14 |   chomp $_;
15 |   my @a = split(/\t/,$_);
16 |   if($a[0]/$a[10] >= $n && $a[10] != 0)
17 |   {
18 |     if(exists$hash1{$a[9]})
19 |     {
20 |       $hash1{$a[9]}="multiple";
21 |     }
22 |     else
23 |     { 
24 |       $hash1{$a[9]}=$a[13];
25 |     }
26 |   }
27 | }
28 | close FH1;
29 | open FH2,"<$fh2";
30 | while (<FH2>)
31 | {
32 |   chomp $_;
33 |   my @a = split(/\t/,$_);
34 |   if($a[0]/$a[10] >= $n && $a[10] != 0)
35 |   {
36 |    if(exists$hash2{$a[9]})
37 |    {
38 |      $hash2{$a[9]}="multiple";
39 |    }
40 |    else
41 |    {
42 |      $hash2{$a[9]}=$a[13];
43 |    }
44 |   }
45 | }
46 | close FH2;
47 | foreach my $key ( keys(%hash1) )
48 | {
49 |   if($hash1{$key} eq "multiple") 
50 |   {
51 |      print $key."\n";
52 |   }
53 |   if (defined($hash2{$key}) && $hash1{$key} eq  $hash2{$key} )
54 |   {
55 |      print $key."\n";
56 |   }
57 | }
58 | foreach my $key ( keys(%hash2) )
59 | {
60 |   if($hash2{$key} eq "multiple") 
61 |   {
62 |      print $key."\n";
63 |   }
64 |   if (defined($hash1{$key}) && $hash1{$key} eq  $hash2{$key} )
65 |   {
66 |      print $key."\n";
67 |   }
68 | }
69 | 
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/generate_scaffold.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | if( @ARGV != 3 ) {   
 4 |     print "Usage: perl $0 sequence.fasta find-linker.result name\n";
 5 |     exit 0;
 6 | }
 7 | use Bio::Seq;
 8 | use Bio::SeqIO;
 9 | use Bio::PrimarySeq;
10 | 
11 | my $fh1=shift @ARGV;
12 | my $fh2=shift @ARGV;
13 | my $name=shift @ARGV;
14 | 
15 | my $in=Bio::SeqIO->new(-file=>"$fh1",'-format'=>'fasta');
16 | my $seq=$in->next_seq();
17 | my $string=$seq->seq();
18 | my $SEQ;
19 | my $disp=$seq->id();
20 | while ($seq)
21 | {
22 |   $disp=$seq->id();
23 |  # print $disp."\n";
24 |   $string=$seq->seq();
25 |   $SEQ->{$disp}=$string;
26 |   $seq=$in->next_seq();
27 | }
28 | open FH2,"<$fh2";
29 | my $j=1;
30 | while (<FH2>)
31 | {
32 |   chomp($_);
33 |  # r.contig1034758|utg71800031128571F_1_F->N(212)->r.contig1034757|utg71800031128561F_1_F->N(214)->r.contig1034756|utg71800031128551F_1_F
34 |   my @rec=split(/\-\>/,$_);
35 |   print ">".$name.$j. "\n";
36 |   for (my $i=0;$i<=$#rec;$i++)
37 |   {
38 |     if ($rec[$i] =~ /^N\(([0-9]*)\)/)
39 |     {
40 |       my $null=$1;
41 |       if ($null > 0)
42 |       {
43 |          for (my $n=1;$n<=$null;$n++)
44 | 	 {
45 |              print "N";
46 | 	 }
47 |       }
48 |     }
49 |     elsif ($rec[$i] =~/^([\S]+)\/r$/)
50 |     {
51 |         my  $tmp_string = reverse $SEQ->{$1};
52 |         $tmp_string =~ tr/AaCcTtGg/TtGgAaCc/;
53 | 	print $tmp_string;
54 |     }
55 |     elsif ($rec[$i] =~/^([\S]+)$/)
56 |     {
57 |         print $SEQ->{$rec[$i]};
58 |     }
59 |   }
60 |   print "\n";
61 |   $j=$j + 1;
62 | }  
63 | close FH2;
64 | 


--------------------------------------------------------------------------------
/guider.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/local/bin/perl -w
 2 | if( @ARGV != 3 )
 3 | {
 4 |     print "Usage: perl $0  contigs.fa filter_F.sam filter_R.sam\n";
 5 |     exit 0;
 6 | }
 7 | my $fh1 = shift @ARGV;
 8 | my $fh2 = shift @ARGV;
 9 | my $fh3 = shift @ARGV;
10 | open FH1,"<$fh1";
11 | my $len;
12 | while ($line = <FH1>)
13 | {
14 |   chomp $line;
15 |   my @a = split / /, $line;
16 |   if($a[0] =~ />/)
17 |   {
18 |     $a[0] =~s/\>//;
19 |     $id = $a[0];
20 |   }
21 |   else
22 |   {
23 |     $len{$id}+=length$line;
24 |   }
25 | }
26 | close FH1;
27 | open FH2,"<$fh2";
28 | while ($line = <FH2>)
29 | {
30 | my $sum=0;
31 | chomp $line;
32 | my @a = split / /, $line;
33 |  $a[1]=~s/0/+/;
34 |  $a[1]=~s/16/-/;
35 |  $a[5]=~ s/[A-Z]$//;
36 |  $a[5]=~ s/[A-Z]/\+/g;
37 |   $length1{$a[0]} =length$a[9];
38 |   my @cigar = split/\+/, $a[5];
39 |   foreach $c (@cigar){
40 |   $sum = $sum + $c;
41 |   }
42 |     $suma{$a[0]}=$a[3]+$sum;
43 |     $a0{$a[0]}=$a[0];$a1{$a[0]}=$a[1];$a2{$a[0]}=$a[2];$a3{$a[0]}=$a[3];
44 |   }
45 |   close FH2;
46 |     open FH3,"<$fh3";
47 |    while ($line = <FH3>)
48 |    {my $sum=0;
49 |     chomp $line;
50 |     my @b = split / /, $line;
51 |     $b[1]=~s/0/+/;
52 |      $b[1]=~s/16/-/;
53 |       $b[5]=~ s/[A-Z]$//;
54 |        $b[5]=~ s/[A-Z]/\+/g;
55 |        $length2=length$b[9];
56 |     $length2_1=$length1{$b[0]}+1;
57 |     $length2_2=$length2+$length1{$b[0]};
58 |     $readlength=$length1{$b[0]}+$length2;
59 |     my @cigar = split/\+/, $b[5];
60 | 	  foreach $c (@cigar){
61 | 	    $sum = $sum + $c;
62 | 	     }
63 | 	       $sumb=$b[3]+$sum;
64 | 	       $p1=$length1{$b[0]}/$readlength;
65 | 	       $p2=$length2/$readlength;
66 | print "$a0{$b[0]}\t1\t$length1{$b[0]}\t$length1{$b[0]}\t1\t$readlength\t$a2{$b[0]}\t$len{$a2{$b[0]}}\t$a3{$b[0]}\t$suma{$b[0]}\t$p1\t100\t$a1{$b[0]}\n";
67 | print "$b[0]\t$length2_1\t$length2_2\t$length2\t1\t$readlength\t$b[2]\t$len{$b[2]}\t$b[3]\t$sumb\t$p2\t100\t$b[1]\n";
68 | }
69 | close FH3;
70 | 


--------------------------------------------------------------------------------
/UNIQUE_sam_intron.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/perl -w
  2 | if (@ARGV!=4)
  3 | {
  4 |   print "Usage: perl $0 samfile read_1.sam read_2.sam intron.txt\n";
  5 |   exit 0;
  6 | }
  7 | my $read="";
  8 | my $n=0;
  9 | my $fh1 = shift @ARGV;
 10 | my $fh2 = shift @ARGV;
 11 | my $fh3 = shift @ARGV;
 12 | my $fh4 = shift @ARGV;
 13 | open OUT1,">$fh2";
 14 | open OUT2,">$fh3";
 15 | open OUT3,">$fh4";
 16 | open FH1,"<$fh1";
 17 | while ($line = <FH1>)
 18 | {  
 19 |   if($line!~/\t\=\t/ && $line!~/\t\*\t/)
 20 |   {
 21 |   chomp $line;
 22 |   my @a = split/\t/, $line;
 23 |   if ($a[0]ne$read)
 24 |   {
 25 |     if($n==2)
 26 |     {
 27 |       my @b = split/\n/, $hash{$read};
 28 |       my @c = split/\t/, $b[0];
 29 |       my @d = split/\t/, $b[1];
 30 |       if ($c[2]ne$d[2])
 31 |       {
 32 |         if ($c[1]==65||$c[1]==97)
 33 | 	{
 34 | 	  $c[1]=0;
 35 | 	  print OUT1 "@c\n";  
 36 | 	  if ($c[5]=~ /N/)
 37 | 	  {
 38 | 	    print OUT3 "$c[0]\t$c[5]\n";
 39 | 	  }
 40 | 	}
 41 |         elsif ($c[1]==129||$c[1]==161)
 42 | 	{
 43 |           $c[1]=16;
 44 |           print OUT2 "@c\n";
 45 | 	  if ($c[5]=~ /N/)
 46 | 	  {
 47 | 	    print OUT3 "$c[0]\t$c[5]\n";
 48 | 	  }
 49 | 	}
 50 |         elsif ($c[1]==113||$c[1]==81)
 51 | 	{
 52 | 	  $c[1]=16;
 53 | 	  print OUT1 "@c\n";
 54 | 	  if ($c[5]=~ /N/)
 55 | 	  {
 56 | 	    print OUT3 "$c[0]\t$c[5]\n";
 57 | 	  }
 58 | 	}
 59 | 	elsif ($c[1]==145||$c[1]==177)
 60 | 	{
 61 | 	  $c[1]=0;
 62 | 	  print OUT2 "@c\n";
 63 | 	  if ($c[5]=~ /N/)
 64 | 	  {
 65 | 	    print OUT3 "$c[0]\t$c[5]\n";
 66 | 	  }
 67 | 	}
 68 |         if ($d[1]==65||$d[1]==97)
 69 | 	{
 70 | 	  $d[1]=0;
 71 | 	  print OUT1 "@d\n";  
 72 | 	  if ($d[5]=~ /N/)
 73 | 	  {
 74 | 	    print OUT3 "$d[0]\t$d[5]\n";
 75 | 	  }
 76 | 	}
 77 |         elsif ($d[1]==129||$d[1]==161)
 78 | 	{
 79 |           $d[1]=16;
 80 |           print OUT2 "@d\n";
 81 | 	  if ($d[5]=~ /N/)
 82 | 	  {
 83 | 	    print OUT3 "$d[0]\t$d[5]\n";
 84 | 	  }
 85 | 	}
 86 |         elsif ($d[1]==113||$d[1]==81)
 87 | 	{
 88 | 	  $d[1]=16;
 89 | 	  print OUT1 "@d\n";
 90 | 	  if ($d[5]=~ /N/)
 91 | 	  {
 92 | 	    print OUT3 "$d[0]\t$d[5]\n";
 93 | 	  }
 94 | 	}
 95 | 	elsif ($d[1]==145||$d[1]==177)
 96 | 	{
 97 | 	  $d[1]=0;
 98 | 	  print OUT2 "@d\n";
 99 | 	  if ($d[5]=~ /N/)
100 | 	  {
101 | 	    print OUT3 "$d[0]\t$d[5]\n";
102 | 	  }
103 | 	}
104 |       }
105 |       undef %hash;
106 |     }
107 |     $n=1;
108 |     $read=$a[0];
109 |     $hash{$a[0]}=$line;
110 |   }
111 |   else
112 |   {
113 |     $n=$n+1;
114 |     $hash{$a[0]}.="\n".$line;
115 |   }
116 |   }
117 | };
118 | close FH1;
119 | close OUT2;
120 | close OUT3;
121 | 


--------------------------------------------------------------------------------
/form_path.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w 
  2 | use strict;
  3 | if( @ARGV != 3) {
  4 |     print "Usage:  $0 both.nodes unique_map.intron.file default_gap_size\n";
  5 |     exit 0;
  6 | }
  7 | 
  8 | #CR848821.12_R	f	1	95891	CABZ01041119.1_F	f	2525	11178	0	0	0.00039619651347067	1.04285073676547e-05	0.000406625020838325	One
  9 | #	One
 10 | 
 11 | my $fh1=shift @ARGV;
 12 | my $fh2=shift @ARGV;
 13 | my $gap=shift @ARGV;
 14 | 
 15 | my @intron;
 16 | my $m=0;
 17 | open FH2,"<$fh2";
 18 | while (<FH2>)
 19 | {
 20 |   chomp($_);
 21 |   #SRR324684.17630829/1	3S33M9266N15M
 22 |   my @rec=split(/\s+/,$_);
 23 |   if ($rec[1] =~/([\d]+)N/)
 24 |   {
 25 |     $intron[$m++]=$1;
 26 |   }
 27 | }
 28 | close FH2;
 29 | 
 30 | 
 31 |   my @list = sort{$a<=>$b} @intron;
 32 |   my $count = @list;
 33 |   my $lower;
 34 |   if(($count%2)==1)
 35 |   {
 36 | 	if (!exists $list[int(($count-1)/2)])
 37 |         {
 38 |                $list[int(($count-1)/2)]=0;
 39 |         }
 40 |         $lower= $list[int(($count-1)/2)];
 41 |   }
 42 |   elsif(($count%2)==0)
 43 |   {
 44 |         if (!exists $list[int(($count-1)/2)])
 45 |         {
 46 |                $list[int(($count-1)/2)]=0;
 47 |         }
 48 |         if (!exists $list[int(($count)/2)])
 49 |         {
 50 |                $list[int(($count)/2)]=0;
 51 |         }
 52 | 
 53 |         $lower= ($list[int(($count-1)/2)]+$list[int(($count)/2)])/2;
 54 |   }
 55 | 
 56 | #print "Start finding header.......\n";
 57 | my $next;
 58 | my $length;
 59 | my $next_info;
 60 | my $before;
 61 | my $read;
 62 | 
 63 | open FH1, "< $fh1";
 64 | while(<FH1>)
 65 | {
 66 |   chomp($_);
 67 |   my @rec=split(/\s+/,$_);
 68 |   $next->{$rec[0]}=$rec[1];
 69 |   $before->{$rec[1]}=$rec[0];
 70 |   $length->{$rec[0]}{$rec[1]}=$rec[3];
 71 |   $length->{$rec[1]}{$rec[0]}=$rec[3];
 72 | }
 73 | close FH1;
 74 | #my $real_header;
 75 | foreach my $key (keys %$next)
 76 | {
 77 |    if (!exists($before->{$key}) && !exists($read->{$key}))
 78 |    {
 79 |    print $key; 
 80 |    #   $real_header->{$key}=1;
 81 |     printnode ($key)
 82 |    }
 83 | }
 84 | #print "Finishing finding header.......\n";
 85 | my $temp;
 86 | sub printnode
 87 | { 
 88 |   my ($key1)=@_; 
 89 | #  print $key1."(".$next_info->{$key1}{$next->{$key1}}.")->";
 90 | #  $mark->{$key1}=1;
 91 |   if (exists ($next->{$key1}) )
 92 |   {
 93 |      print "->";
 94 |      
 95 |      if (int($length->{$key1}{$next->{$key1}}) < $lower)
 96 |      {
 97 |        print "N(".($lower - int($length->{$key1}{$next->{$key1}})).")->";
 98 |      }
 99 |      else
100 |      {
101 |         print "N(".$gap.")->";
102 |      }
103 |      print $next->{$key1};
104 |      return (printnode($next->{$key1})); 
105 |   }
106 |   elsif (!exists ($next->{$key1}))
107 |   {
108 |     if ($key1=~/([\S]+)\/r$/)
109 |     {
110 |     
111 |       $read->{$1}=1;
112 |     }
113 |     else 
114 |     {
115 |       $temp=$key1."/r";
116 |       $read->{$temp}=1;
117 |     }
118 |    #  print $key1."(".$next_info->{$key1}{$next->{$key1}}.")->";
119 |      print "\n";
120 |   }
121 | } 
122 | 
123 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <b>DESCRIPTION</b><p>
 2 |    P_RNA_scaffolder is a genome scaffolding tool that uses paired-end RNA-seq reads from the species being studied. Because nucleotide sequences are not conserved across species, P_RNA_scaffolder does not support scaffolding a genome with paired-end RNA-seq reads from closely related species. The paired-end RNA-seq reads can be downloaded from a public read archive (for instance, the NCBI SRA database) or be your own data. The reads are first aligned to the contigs using HISAT2 or BWA, and then with BLAT. The SAM alignment file produced by HISAT2 or BWA is the input file of P_RNA_scaffolder. P_RNA_scaffolder searches for "guide" pairs, i.e. read pairs whose two reads map to two different contigs. The "guide" pairs are then used to orient and order the contigs into longer scaffolds.<p> 
 3 | <b>SYSTEM REQUIREMENTS</b><p>
 4 | (1)The software is driven by a shell script and consists of precompiled C++ programs and Perl programs. The precompiled programs can be executed directly. To run the Perl programs, Perl and the BioPerl modules must be installed on the system.<p>
 5 | (2)The program requires SAM files as input file. HISAT2 or BWA should be installed on the system.<p>
 6 | (3)P_RNA_scaffolder has been tested and is supported on Linux.<p>
 7 | <b>INPUT FILES</b><p>
 8 | (1)The SAM files are necessary for scaffolding. In eukaryotes, the SAM file was generated using HISAT2 program. In prokaryotes, the paired-end RNA-seq reads were aligned to the contigs using BWA program.<p>
 9 | (i)Take human contigs and RNA-seq reads as an eukaryote example. The alignment of RNA-seq reads could be performed as follows: <p>
10 | <I>hisat2-build contigs.fa human_hisat</I><p>
11 | <I>hisat2 -x human_hisat -1 read_1.fq -2 read_2.fq -k 3 -p 10 --pen-noncansplice 1000000 -S input.sam </I><p>
12 | where read_1.fq and read_2.fq are the fastq files of two ends of RNA-seq reads. <p>
13 | -k 3 means report up to 3 alignments per read. <p>
14 | -p 10 means using 10 threads to align reads. <p>
15 | --pen-noncansplice 1000000 means high penalty for a non-canonical splice site. <p>
16 | -S input.sam means that the alignments of all reads were stored in the file of 'input.sam'.<p> 
17 | (ii)Take E.coli contigs and RNA-seq reads as a prokaryote example. The alignment of RNA-seq reads could be performed as follows: <p>
18 | <I>bwa index -a is contigs.fa</I><p>
19 | <I>bwa mem -t 10 contigs.fa read_1.fq read_2.fq >input.sam </I><p>
20 | 
21 | where read_1.fq and read_2.fq are the fastq files of two ends of RNA-seq reads. <p>
22 | -t 10 means using 10 threads to align reads. <p>
23 | input.sam means that the alignments of all reads were stored in the file of 'input.sam'. <p>
24 | (2)The contig file is also required and should be fasta format, consistent with the subject sequences when alignment. <p>
25 | <b>COMMANDS AND OPTIONS</b><p>
26 |    P_RNA_scaffolder is run via the shell script: P_RNA_scaffolder.sh found in the base installation directory.<p>
27 |    Usage info is as follows:<p>
28 | <b>sh P_RNA_scaffolder.sh -d Program_dir -i input.sam -j contig.fa -F read_1.fq -R read_2.fq </b><p>
29 | <b>Input options</b><p>
30 |      -d           the installation directory of P_RNA_scaffolder         <b>[        mandatory ]</b> <p>
31 |      -i           SAM file of RNA-seq alignments to contigs with hisat   <b>[        mandatory ]</b> <p>
32 |      -j           Pre-assembled contig FASTA file                        <b>[        mandatory ]</b> <p>
33 |      -F           FASTQ file of left reads                               <b>[        mandatory ]</b> <p>
34 |      -R           FASTQ file of right reads                              <b>[        mandatory ]</b> <p>
35 | 
36 | <b>Output options</b><p>
37 |      -o            write all output files to this directory              [ default:      ./ ] <p>
38 | 
39 | <b>Species options</b><p>
40 |      -s           the target species is Eukaryote or Prokaryote          [default:      yes ] <p>
41 |                   (1) yes represents that the target species is Eukaryote. <p>
42 |                   (2) no represents that the target species is Prokaryote. <p>
43 | 
44 | <b>Two modes selection options</b><p>
45 |      -b            re-align filtered RNA-seq reads to contigs with BLAT  [ default:     yes ] <p>
46 |                    (1) If yes, perform the 'accurate' mode using BLAT to further filter out reads. The 'accurate' scaffolding has higher accuracy and longer running time than the 'fast' mode. <p>
47 |                    (2) If no, perform the 'fast' mode without BLAT re-alignment and this mode is faster than the 'accurate' mode with less accuracy.<p>
48 |      -p            BLAT alignment identity cutoff                        [ default:    0.90 ] <p>
49 |      -t            number of threads used in BLAT re-alignment           [ default:       5 ] <p>
50 | 
51 | <b>Scaffolding options</b><p>
52 |      -e            the maximal allowed intron length                     [ default:  100000 ] <p>
53 |                    For genomes of different size, the maximal allowed intron length is different. For instance, in human, the maximal allowed intron length is set as 100000 while in C.elegans, it is set as 15000. <p> 
54 |      -f            the minimal supporting RNA-seq pair number            [ default:       2 ] <p>
55 |      -n            the number of inserted N to indicate a gap            [ default:  100 bp ] <p>
56 | <b>OUTPUT FILES</b><p>
57 |    When P_RNA_scaffolder completes, it will have created a P_RNA_scaffold.fasta output file in the output directory (the directory given with -o).  <p>
58 | <b>SPEED</b><p>
59 |    P_RNA_scaffolder took about 195 minutes to scaffold human genome contigs with a SAM file generated from the alignment of 113.8 million RNA-seq read pairs. <p>
60 | 


--------------------------------------------------------------------------------
/P_RNA_scaffolder.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #!/bin/sh
  3 | output=./
  4 | intron=100000
  5 | frequency=2
  6 | N=100
  7 | pid=0.90
  8 | threads=5
  9 | blat=yes
 10 | species=yes
 11 | while getopts ":d:l:p:n:e:f:i:j:o:F:R:t:b:s" opt; do
 12 |   case $opt in
 13 |     d)
 14 |       directory=$OPTARG
 15 |       vardir=1
 16 |       ;;
 17 |     o)
 18 |       output=$OPTARG
 19 |       ;;
 20 |     p)
 21 |       pid=$OPTARG
 22 |       ;;
 23 |     e)
 24 |       intron=$OPTARG
 25 |       ;;
 26 |     f)
 27 |       frequency=$OPTARG
 28 |       ;;
 29 |     n)
 30 |       N=$OPTARG
 31 |       ;;
 32 |     t)
 33 |       threads=$OPTARG
 34 |       ;;
 35 |     b)
 36 |       blat=$OPTARG
 37 |       ;;
 38 |     i)
 39 |       inputfile=$OPTARG
 40 |       varsam=1
 41 |       ;;
 42 |     j)
 43 |       contig=$OPTARG
 44 |       varfasta=1
 45 |       ;;
 46 |     F)
 47 |       fastqF=$OPTARG
 48 |       varfqF=1
 49 |       ;;
 50 |     R)
 51 |       fastqR=$OPTARG
 52 |       varfqR=1
 53 |       ;;
 54 |     s)
 55 |       species=$OPTARG
 56 |       ;; 
 57 |     ?)
 58 | 
 59 |       echo "Usage: sh `basename $0` -d Program_DIR -i inputfile.sam -j contig.fasta -F read_1.fastq -R reads_2.fastq -s yes";
 60 |         echo "";
 61 |         echo "Input options";
 62 |         echo "     -d           the installing direcotry of P_RNA_scaffolder           [        mandatory ]";
 63 |         echo "     -i           SAM file of RNA-seq alignments to contigs with hisat   [        mandatory ]";
 64 |         echo "     -j           Pre-assembled contig FASTA file                        [        mandatory ]";
 65 |         echo "     -F           FASTQ file of left reads                               [        mandatory ]";
 66 |         echo "     -R           FASTQ file of right reads                              [        mandatory ]";
 67 |         echo "";
 68 | 	echo "Output options";
 69 |         echo "     -o           write all output files to this directory               [ default:      ./ ]"; 
 70 |         echo "";
 71 |         echo "Species options"
 72 |         echo "     -s           the target species is Eukaryote or Prokaryote          [default:      yes ]";
 73 |         echo "                  (1) yes represents that the target species is Eukaryote. ";
 74 |         echo "                  (2) no represents that the target species is Prokaryote";
 75 |         echo "";
 76 | 	echo "Two modes selection options";
 77 |         echo "     -b            re-align filtered RNA-seq reads to contigs with BLAT  [ default:     yes ]";
 78 |         echo "                   (1) If yes, perform the 'accurate' mode using BLAT to further filter      ";
 79 | 	echo "                   out reads. The 'accurate' scaffolding has higher accuracy and longer      ";
 80 | 	echo "                   running time than the 'fast' mode.";
 81 |         echo "                   (2) If no, perform the 'fast' mode without BLAT re-alignment and this mode";
 82 | 	echo "                   is faster than the 'accurate' mode with less accuracy. ";
 83 | 	echo "     -p            BLAT alignment identity cutoff                        [ default:    0.90 ]";
 84 |         echo "     -t            number of threads used in BLAT re-alignment           [ default:       5 ]";	
 85 |         echo "";
 86 |         echo "Scaffolding options";
 87 |         echo "     -e            the maximal allowed intron length                     [ default:  100000 ]";
 88 |         echo "     -f            the minimal supporting RNA-seq pair number            [ default:       2 ]";
 89 |         echo "     -n            the number of inserted N to indicate a gap            [ default:  100 bp ]";
 90 |         echo "";        
 91 |       
 92 |       exit 1
 93 |       ;;
 94 |       :)
 95 |       echo "Option -$OPTARG requires an argument." >&2
 96 |       exit 1
 97 |       ;;
 98 |   esac
 99 | done
100 | 
# ---------------------------------------------------------------------------
# Mode dispatch: run the 'fast' pipeline (-b no), the 'accurate' pipeline
# (-b yes, BLAT re-alignment), or print the usage text when a mandatory
# option is missing.
#
# Commands are invoked directly rather than inside backticks: wrapping a
# command in a command substitution makes the shell execute whatever the
# command prints on stdout, and hides background jobs from the outer `wait`.
# ---------------------------------------------------------------------------

# Run the scaffolding stages that both modes share, from guider.pl through to
# the final $output/P_RNA_scaffold.fasta.
#
# Arguments:
#   $1  forward-read SAM file handed to guider.pl
#   $2  reverse-read SAM file handed to guider.pl
#   $3  "yes": sort the find_end_node output and run find_start_node on it to
#              produce start.node (what the fast mode does);
#       other: write the find_end_node output directly to start.node
#              (what the accurate mode does).
# Globals read: directory, contig, output, intron, frequency, N
# Returns: exit status of the final concatenation step.
#
# The numeric options ($intron, $frequency, $N) are intentionally left
# unquoted so that an empty value is dropped from the argument list exactly as
# it was before this rewrite.
prna_build_scaffolds() {
    local forward_sam=$1
    local reverse_sam=$2
    local refine_start=$3

    perl "$directory/guider.pl" "$contig" "$forward_sam" "$reverse_sam" > "$output/guider"
    "$directory/link_block" "$output/guider" "$output/linker" $intron
    sort -k1,1 -k2,2n -k27,27n -k16,16nr "$output/linker" > "$output/sort.linker"
    "$directory/delete_linker" "$output/sort.linker" "$output/retained.linker"
    "$directory/delete_same_fragment" "$output/retained.linker" "$output/linker.dif"
    "$directory/exon_length" "$output/linker.dif" "$output/linker.length"
    "$directory/convert_linker" "$output/linker.length" "$output/linker.convert"
    sort -k2,2 -k3,3 -k4,4nr "$output/linker.convert" > "$output/linker.select"
    cut -f 2-4 "$output/linker.select" | sort -k1,1 -k2,2 > "$output/connections"
    "$directory/count_connection_frequency" "$output/connections" "$output/connections.frequency"
    "$directory/find_reliable_connection" "$output/connections.frequency" "$output/reliable.connections" $frequency
    sort -k1,1 -k3,3nr "$output/reliable.connections" > "$output/sort.reliable.connection"

    if [[ $refine_start = yes ]]; then
        "$directory/find_end_node" "$output/sort.reliable.connection" "$output/end.node"
        sort -k2,2 -k3,3nr "$output/end.node" > "$output/sort.end.node"
        "$directory/find_start_node" "$output/sort.end.node" "$output/start.node"
    else
        # NOTE(review): the accurate mode has always skipped find_start_node and
        # used the find_end_node output as start.node. Preserved as-is; confirm
        # whether this difference from the fast mode is intentional.
        "$directory/find_end_node" "$output/sort.reliable.connection" "$output/start.node"
    fi

    "$directory/select_nodes" "$output/start.node" "$output/both.nodes"
    perl "$directory/form_path.pl" "$output/both.nodes" "$output/intron.txt" $N > "$output/both.path"
    sed 's/->/\n/g' "$output/both.path" | sed 's/\/r//g' | grep -v "N(" | sort -u > "$output/scaffolded.fragment.id"

    # The two FASTA writers are independent, so run them side by side.
    perl "$directory/generate_scaffold.pl" "$contig" "$output/both.path" P_RNA_scaffold_ > "$output/scaffold.fasta" &
    perl "$directory/generate_unscaffold.pl" "$contig" "$output/scaffolded.fragment.id" > "$output/unscaffold.fasta"
    wait

    cat "$output/scaffold.fasta" "$output/unscaffold.fasta" > "$output/P_RNA_scaffold.fasta"
}

# Split a FASTA file into chunks and BLAT every chunk against $contig in
# parallel, waiting until all BLAT jobs have finished.
#
# Arguments:
#   $1  FASTA file to split
#   $2  chunk path prefix (chunks are named <prefix>00, <prefix>01, ...)
#   $3  tag used in the log names: $output/blat<tag>_log and blat<tag>_error
# Globals read: directory, contig, output, threads
prna_blat_in_chunks() {
    local fasta=$1
    local prefix=$2
    local log_tag=$3
    local total_lines chunk_lines part

    total_lines=$(wc -l < "$fasta")
    # Same formula as before: roughly total/threads lines per chunk, forced to
    # an even number (presumably so two-line FASTA records are never split
    # across chunks -- verify against sam2fa.pl output).
    chunk_lines=$(( total_lines / 2 / threads * 2 + 2 ))
    split -d -l "$chunk_lines" "$fasta" "$prefix"

    for part in "$prefix"*; do
        "$directory/blat" "$contig" "$part" "$part.psl" -noHead 1>>"$output/blat${log_tag}_log" 2>>"$output/blat${log_tag}_error" &
    done
    wait
}

if [[ $vardir -eq 1 ]] && [[ $varsam -eq 1 ]] && [[ $varfasta -eq 1 ]] && [[ $varfqF -eq 1 ]] && [[ $varfqR -eq 1 ]] && [[ $blat = no ]]; then

    # ---- 'fast' mode: scaffold straight from the filtered SAM records ----
    mkdir -p "$output"
    perl "$directory/UNIQUE_sam_intron.pl" "$inputfile" "$output/F.sam" "$output/R.sam" "$output/intron.txt"

    prna_build_scaffolds "$output/F.sam" "$output/R.sam" yes
    # Report the outcome of the final step instead of an unconditional failure.
    exit $?

elif [[ $vardir -eq 1 ]] && [[ $varsam -eq 1 ]] && [[ $varfasta -eq 1 ]] && [[ $varfqF -eq 1 ]] && [[ $varfqR -eq 1 ]] && [[ $blat = yes ]]; then

    # ---- 'accurate' mode: re-align reads with BLAT and filter before scaffolding ----
    mkdir -p "$output"
    perl "$directory/UNIQUE_sam_intron.pl" "$inputfile" "$output/F.sam" "$output/R.sam" "$output/intron.txt"

    perl "$directory/sam2fa.pl" "$output/F.sam" "$fastqF" "$output/F.fa" &
    perl "$directory/sam2fa.pl" "$output/R.sam" "$fastqR" "$output/R.fa"
    wait

    # NOTE(review): the F_* / R_* globs used below (and removed afterwards) match
    # ANY file with those prefixes in $output, including unrelated user files
    # when $output is the current directory.
    prna_blat_in_chunks "$output/F.fa" "$output/F_" F
    prna_blat_in_chunks "$output/R.fa" "$output/R_" R

    # Pair the i-th forward and reverse PSL chunks and filter them in parallel.
    forward_psl=("$output"/F_*.psl)
    reverse_psl=("$output"/R_*.psl)
    for (( i = 0; i < ${#reverse_psl[@]}; i++ )); do
        # An unmatched glob stays literal; skip it so no job runs without input.
        [[ -e ${reverse_psl[i]} ]] || continue
        # $pid is left unquoted so an empty value is dropped, as before.
        perl "$directory/UNIQUE_psl.pl" -f1 "${forward_psl[i]}" -f2 "${reverse_psl[i]}" -n $pid > "$output/blat-$i.filter.id" &
    done
    wait

    cat "$output"/blat-*.filter.id | sort -u > "$output/blat_filter.id"
    if [[ $species = yes ]]; then
        perl "$directory/count-match.pl" "$output/F.sam" "$output/R.sam" | sort -u >> "$output/blat_filter.id"
    fi
    rm "$output"/blat-*.filter.id "$output"/F_* "$output"/R_*

    # Both filters must be children of this shell so that `wait` really blocks
    # until filter_F.sam is complete before guider.pl reads it.
    perl "$directory/filter-out.pl" "$output/F.sam" "$output/blat_filter.id" 1 > "$output/filter_F.sam" &
    perl "$directory/filter-out.pl" "$output/R.sam" "$output/blat_filter.id" 1 > "$output/filter_R.sam"
    wait

    prna_build_scaffolds "$output/filter_F.sam" "$output/filter_R.sam" no
    # Report the outcome of the final step instead of an unconditional failure.
    exit $?

else
    # ---- a mandatory option is missing or -b is invalid: print usage and fail ----
    echo "Usage: sh $(basename "$0") -d Program_DIR -i inputfile.sam -j contig.fasta -F read_1.fastq -R read_2.fastq -s yes"
    echo ""
    echo "Input options"
    echo "     -d           the installing direcotry of P_RNA_scaffolder           [        mandatory ]"
    echo "     -i           SAM file of RNA-seq alignments to contigs with hisat   [        mandatory ]"
    echo "     -j           Pre-assembled contig FASTA file                        [        mandatory ]"
    echo "     -F           FASTQ file of left reads                               [        mandatory ]"
    echo "     -R           FASTQ file of right reads                              [        mandatory ]"
    echo ""
    echo "Output options"
    echo "     -o            write all output files to this directory              [ default:      ./ ]"
    echo ""
    echo "Species options"
    echo "     -s           the target species is Eukaryote or Prokaryote          [default:      yes ]"
    echo "                  (1) yes represents that the target species is Eukaryote. "
    echo "                  (2) no represents that the target species is Prokaryote"
    echo ""
    echo "Two modes selection options"
    echo "     -b            re-align filtered RNA-seq reads to contigs with BLAT  [ default:     yes ]"
    echo "                   (1) If yes, perform the 'accurate' mode using BLAT to further filter      "
    echo "                   out reads. The 'accurate' scaffolding has higher accuracy and longer      "
    echo "                   running time than the 'fast' mode."
    echo "                   (2) If no, perform the 'fast' mode without BLAT re-alignment and this mode"
    echo "                   is faster than the 'accurate' mode with less accuracy. "
    echo "     -p            BLAT alignment identity cutoff                        [ default:    0.90 ]"
    echo "     -t            number of threads used in BLAT re-alignment           [ default:       5 ]"
    echo ""
    echo "Scaffolding options"
    echo "     -e            the maximal allowed intron length                     [ default:  100000 ]"
    echo "     -f            the minimal supporting RNA-seq pair number            [ default:       2 ]"
    echo "     -n            the number of inserted N to indicate a gap            [ default:  100 bp ]"
    echo ""

    exit 1

fi
229 | 
230 | 


--------------------------------------------------------------------------------