├── blat ├── link_block ├── delete_linker ├── exon_length ├── find_end_node ├── select_nodes ├── convert_linker ├── find_start_node ├── delete_same_fragment ├── find_reliable_connection ├── count_connection_frequency ├── filter-out.pl ├── generate_unscaffold.pl ├── sam2fa.pl ├── count-match.pl ├── UNIQUE_psl.pl ├── generate_scaffold.pl ├── guider.pl ├── UNIQUE_sam_intron.pl ├── form_path.pl ├── README.md └── P_RNA_scaffolder.sh /blat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/blat -------------------------------------------------------------------------------- /link_block: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/link_block -------------------------------------------------------------------------------- /delete_linker: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/delete_linker -------------------------------------------------------------------------------- /exon_length: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/exon_length -------------------------------------------------------------------------------- /find_end_node: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/find_end_node -------------------------------------------------------------------------------- /select_nodes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/select_nodes 
--------------------------------------------------------------------------------
/convert_linker:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/convert_linker
--------------------------------------------------------------------------------
/find_start_node:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/find_start_node
--------------------------------------------------------------------------------
/delete_same_fragment:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/delete_same_fragment
--------------------------------------------------------------------------------
/find_reliable_connection:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/find_reliable_connection
--------------------------------------------------------------------------------
/count_connection_frequency:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CAFS-bioinformatics/P_RNA_scaffolder/HEAD/count_connection_frequency
--------------------------------------------------------------------------------
/filter-out.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
# filter-out.pl
#
# Print every row of a whitespace-delimited file (a SAM file in this
# pipeline) whose ID column is NOT listed in a filter-ID file.
#
# Arguments:
#   SAM-file         rows to filter
#   filter-ID-file   one ID per line; rows carrying one of these IDs are dropped
#   ID-in-SAM-file   1-based number of the ID column inside SAM-file
use strict;

if (@ARGV != 3) {
    print "Usage: $0 SAM-file filter-ID-file ID-in-SAM-file \n";
    exit 0;
}
my ($sam_file, $filter_file, $id_column) = @ARGV;

# Collect the IDs to drop; blanks inside an ID line are removed first.
my %is_filtered;
open(my $filter_fh, '<', $filter_file)
    or die "Cannot open $filter_file: $!\n";
while (defined(my $id = readline($filter_fh))) {
    chomp $id;
    $id =~ s/ //g;
    $is_filtered{$id} = 1;
}
close $filter_fh;

open(my $sam_fh, '<', $sam_file)
    or die "Cannot open $sam_file: $!\n";
while (defined(my $row = readline($sam_fh))) {
    chomp $row;
    my @rec = split /\s+/, $row;
    my $id  = $rec[$id_column - 1];
    $id = '' if !defined $id;    # short or blank row: compare as an empty ID
    print $row, "\n" if !exists $is_filtered{$id};
}
close $sam_fh;

--------------------------------------------------------------------------------
/generate_unscaffold.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# generate_unscaffold.pl
#
# Print, in FASTA format, every sequence of contig.fasta whose display ID is
# not listed in the first tab-separated column of redundant.id.
use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Bio::PrimarySeq;

if (@ARGV != 2) {
    print "Usage: $0 contig.fasta redundant.id\n";
    print "Destination: find the common dataset from the plus-file and minus-file\n";
    print "Note:This script can be used to identify cis- and trans-\n";
    exit 0;
}
my ($fasta_file, $id_file) = @ARGV;

# IDs that must not be printed.
my %is_listed;
open(my $id_fh, '<', $id_file)
    or die "Cannot open $id_file: $!\n";
while (defined(my $row = readline($id_fh))) {
    chomp $row;
    my @rec = split /\t/, $row;
    next if !@rec;               # blank line
    $is_listed{ $rec[0] } = 1;
}
close $id_fh;

# The loop condition also covers an empty FASTA file, where next_seq()
# returns nothing on the very first call.
my $in = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta');
while (my $seq = $in->next_seq()) {
    my $disp = $seq->display_id();
    chomp $disp;
    next if exists $is_listed{$disp};
    print '>', $disp, "\n";
    print $seq->seq(), "\n";
}

--------------------------------------------------------------------------------
/sam2fa.pl:
--------------------------------------------------------------------------------
#!/usr/local/bin/perl -w
# sam2fa.pl
#
# Write, in FASTA format, the FASTQ records whose read name occurs in the
# first blank-separated column of a SAM-like file (as written by
# UNIQUE_sam_intron.pl, which joins the SAM fields with single blanks).
use strict;

if (@ARGV != 3) {
    print "Usage: perl $0 sam-file fastq-file fasta-output-file\n";
    exit 0;
}
my ($sam_file, $fastq_file, $fasta_file) = @ARGV;

# Read names present in the SAM file.
my %is_wanted;
open(my $sam_fh, '<', $sam_file)
    or die "Cannot open $sam_file: $!\n";
while (defined(my $row = readline($sam_fh))) {
    chomp $row;
    my ($name) = split / /, $row;
    $is_wanted{$name} = 1 if defined $name;
}
close $sam_fh;

open(my $fastq_fh, '<', $fastq_file)
    or die "Cannot open $fastq_file: $!\n";
open(my $out_fh, '>', $fasta_file)
    or die "Cannot write $fasta_file: $!\n";

# A FASTQ record is four lines: header, sequence, separator, quality.
while (defined(my $header = readline($fastq_fh))) {
    my $sequence  = readline($fastq_fh);
    my $separator = readline($fastq_fh);
    my $quality   = readline($fastq_fh);
    last if !defined $sequence;  # truncated final record
    chomp($header, $sequence);

    # The read name is the header up to the first blank or tab.
    my ($token) = split / /, $header;
    next if !defined $token;
    my ($id) = split /\t/, $token;
    next if !defined $id;

    # Only the leading '@' is the FASTQ record marker; an '@' inside the
    # name belongs to the name as it appears in the SAM file.
    $id =~ s/^\@//;
    # Mate suffixes are stripped one after another, as before.
    $id =~ s/\/1$//;
    $id =~ s/\/2$//;
    $id =~ s/\/F$//;
    $id =~ s/\/R$//;

    print {$out_fh} '>', $id, "\n", $sequence, "\n" if exists $is_wanted{$id};
}
close $fastq_fh;
close($out_fh) or die "Cannot close $fasta_file: $!\n";

--------------------------------------------------------------------------------
/count-match.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# count-match.pl
#
# For each alignment row of F.sam and then R.sam, print the read name when
# soft-clipped bases make up more than 20% of (soft-clipped + matched) bases
# according to the CIGAR string in column 6.
use strict;
use warnings;

if (@ARGV != 2) {
    print "Usage: $0 F.sam R.sam \n";
    exit 0;
}

# Sum the lengths of all CIGAR operations of one type (e.g. 'S' or 'M').
sub cigar_total {
    my ($cigar, $operation) = @_;
    my $total = 0;
    $total += $_ for $cigar =~ /(\d+)$operation/g;
    return $total;
}

# Print the names of the heavily soft-clipped reads of one SAM file.
sub print_soft_clipped_reads {
    my ($sam_file) = @_;
    open(my $in_fh, '<', $sam_file)
        or die "Cannot open $sam_file: $!\n";
    while (defined(my $row = readline($in_fh))) {
        chomp $row;
        my @field = split /\s/, $row;
        next if !defined $field[5];
        my $soft    = cigar_total($field[5], 'S');
        my $matched = cigar_total($field[5], 'M');
        # A CIGAR without S and M operations (e.g. '*') has no ratio.
        next if $soft + $matched == 0;
        print $field[0], "\n" if $soft / ($soft + $matched) > 0.2;
    }
    close $in_fh;
    return;
}

print_soft_clipped_reads($_) for @ARGV;

--------------------------------------------------------------------------------
/UNIQUE_psl.pl:
--------------------------------------------------------------------------------
#!/usr/local/bin/perl -w
# UNIQUE_psl.pl
#
# Read two header-less PSL files and print the read names that either have
# more than one hit passing the identity cutoff in a file, or whose retained
# hit is on the same target in both files.  A name may be printed more than
# once.
use strict;
use Getopt::Long;

if (@ARGV != 6) {
    print "Usage: perl $0 -f1 read_1.psl -f2 read_2.psl -n 0.90 \n";
    exit 0;
}

my ($fh1, $fh2, $n);
Getopt::Long::GetOptions('f1=s' => \$fh1, 'f2=s' => \$fh2, 'n=s' => \$n);
if (!defined $fh1 || !defined $fh2 || !defined $n) {
    print "Usage: perl $0 -f1 read_1.psl -f2 read_2.psl -n 0.90 \n";
    exit 0;
}

# Return a hash ref: query name (column 10) => target name (column 14) of the
# single hit whose column 1 / column 11 ratio reaches $min_ratio, or the
# string "multiple" when several hits reach it.
sub load_hits {
    my ($psl_file, $min_ratio) = @_;
    my %target_of;
    open(my $psl_fh, '<', $psl_file)
        or die "Cannot open $psl_file: $!\n";
    while (defined(my $row = readline($psl_fh))) {
        chomp $row;
        my @col = split /\t/, $row;
        next if @col < 14;                       # not a complete PSL row
        next if $col[10] == 0;                   # test BEFORE dividing
        next if $col[0] / $col[10] < $min_ratio;
        $target_of{ $col[9] } =
            exists $target_of{ $col[9] } ? 'multiple' : $col[13];
    }
    close $psl_fh;
    return \%target_of;
}

# Print the names in $own that are "multiple" and the names whose value is
# identical in $own and $mate.
sub print_rejected {
    my ($own, $mate) = @_;
    for my $read (keys %{$own}) {
        print $read, "\n" if $own->{$read} eq 'multiple';
        print $read, "\n"
            if defined $mate->{$read} && $own->{$read} eq $mate->{$read};
    }
    return;
}

my $hits1 = load_hits($fh1, $n);
my $hits2 = load_hits($fh2, $n);
print_rejected($hits1, $hits2);
print_rejected($hits2, $hits1);

--------------------------------------------------------------------------------
/generate_scaffold.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# generate_scaffold.pl
#
# Build scaffold sequences from contig sequences and a path file.  Each line
# of the path file is a list of nodes joined by "->", e.g.
#   contigA->N(212)->contigB/r->N(214)->contigC
# where "N(k)" is a gap of k N characters and a trailing "/r" requests the
# reverse complement of the contig.  Scaffolds are named <name>1, <name>2, ...
use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Bio::PrimarySeq;

if (@ARGV != 3) {
    print "Usage: perl $0 sequence.fasta find-linker.result name\n";
    exit 0;
}
my ($fasta_file, $path_file, $name) = @ARGV;

# Contig ID => sequence.  The loop condition also covers an empty FASTA file.
my %sequence_of;
my $in = Bio::SeqIO->new(-file => $fasta_file, -format => 'fasta');
while (my $seq = $in->next_seq()) {
    my $string = $seq->seq();
    $sequence_of{ $seq->id() } = defined $string ? $string : '';
}

# Sequence of a contig; an unknown ID contributes nothing to the scaffold,
# but is reported instead of being dropped silently.
sub contig_sequence {
    my ($id) = @_;
    if (!exists $sequence_of{$id}) {
        warn "Contig '$id' is not present in $fasta_file\n";
        return '';
    }
    return $sequence_of{$id};
}

# Reverse complement, including the IUPAC ambiguity codes.
sub reverse_complement {
    my ($forward) = @_;
    my $reversed = scalar reverse $forward;
    $reversed =~ tr/ACGTUMRWSYKVHDBNacgtumrwsykvhdbn/TGCAAKYWSRMBDHVNtgcaakywsrmbdhvn/;
    return $reversed;
}

open(my $path_fh, '<', $path_file)
    or die "Cannot open $path_file: $!\n";
my $scaffold_number = 1;
while (defined(my $path = readline($path_fh))) {
    chomp $path;
    next if $path eq '';         # a blank line is not a scaffold
    print '>', $name, $scaffold_number, "\n";
    for my $node (split /->/, $path) {
        if ($node =~ /^N\(([0-9]*)\)/) {
            print 'N' x ($1 || 0);
        }
        elsif ($node =~ m{^(\S+)/r$}) {
            print reverse_complement(contig_sequence($1));
        }
        elsif ($node =~ /^\S+$/) {
            print contig_sequence($node);
        }
    }
    print "\n";
    $scaffold_number++;
}
close $path_fh;

--------------------------------------------------------------------------------
/guider.pl:
--------------------------------------------------------------------------------
#!/usr/local/bin/perl -w
# guider.pl
#
# Combine the forward and reverse alignment files written by
# UNIQUE_sam_intron.pl (fields joined by single blanks) into the
# tab-separated "guider" table: for every row of the reverse file, one row
# for the forward read of the same name followed by one row for the reverse
# read.
use strict;

if (@ARGV != 3) {
    print "Usage: perl $0 contigs.fa filter_F.sam filter_R.sam\n";
    exit 0;
}
my ($contig_file, $forward_file, $reverse_file) = @ARGV;

# Contig name (first blank-separated token of the FASTA header) => length.
my %length_of;
my $current_contig;
open(my $contig_fh, '<', $contig_file)
    or die "Cannot open $contig_file: $!\n";
while (defined(my $row = readline($contig_fh))) {
    chomp $row;
    my ($first) = split / /, $row;
    if (defined $first && $first =~ />/) {
        $first =~ s/>//;
        $current_contig = $first;
    }
    elsif (defined $current_contig) {
        $length_of{$current_contig} += length $row;
    }
}
close $contig_fh;

# Parse one alignment row; returns nothing for rows with fewer than 10 fields.
sub parse_alignment {
    my ($row) = @_;
    my @field = split / /, $row;
    return if @field < 10;

    # Flag 0 becomes '+' and flag 16 becomes '-'.
    my $strand = $field[1];
    $strand =~ s/0/+/;
    $strand =~ s/16/-/;

    # The end coordinate is the start plus the sum of EVERY CIGAR operation
    # length (soft clips and insertions included), as in the original script.
    my $span = 0;
    $span += $_ for $field[5] =~ /(\d+)/g;

    return {
        name        => $field[0],
        strand      => $strand,
        contig      => $field[2],
        start       => $field[3],
        end         => $field[3] + $span,
        read_length => length $field[9],
    };
}

# Length of a contig, or an empty column when the contig is unknown.
sub contig_length {
    my ($contig) = @_;
    return defined $length_of{$contig} ? $length_of{$contig} : '';
}

# Read name => forward alignment (the last one wins when a name repeats).
my %forward_of;
open(my $forward_fh, '<', $forward_file)
    or die "Cannot open $forward_file: $!\n";
while (defined(my $row = readline($forward_fh))) {
    chomp $row;
    my $alignment = parse_alignment($row) or next;
    $forward_of{ $alignment->{name} } = $alignment;
}
close $forward_fh;

open(my $reverse_fh, '<', $reverse_file)
    or die "Cannot open $reverse_file: $!\n";
while (defined(my $row = readline($reverse_fh))) {
    chomp $row;
    my $rev = parse_alignment($row) or next;
    # Without a forward mate the first output row would consist of empty
    # columns, so such a pair is skipped.
    my $fwd = $forward_of{ $rev->{name} } or next;

    my $length1     = $fwd->{read_length};
    my $length2     = $rev->{read_length};
    my $read_length = $length1 + $length2;
    next if $read_length == 0;

    print join("\t",
        $fwd->{name}, 1, $length1, $length1, 1, $read_length,
        $fwd->{contig}, contig_length($fwd->{contig}),
        $fwd->{start}, $fwd->{end},
        $length1 / $read_length, 100, $fwd->{strand}), "\n";
    print join("\t",
        $rev->{name}, $length1 + 1, $length1 + $length2, $length2, 1, $read_length,
        $rev->{contig}, contig_length($rev->{contig}),
        $rev->{start}, $rev->{end},
        $length2 / $read_length, 100, $rev->{strand}), "\n";
}
close $reverse_fh;

--------------------------------------------------------------------------------
/UNIQUE_sam_intron.pl:
--------------------------------------------------------------------------------
#!/usr/local/bin/perl -w
# UNIQUE_sam_intron.pl
#
# From a SAM file, keep the reads that occur in exactly two consecutive rows
# (after dropping every row that contains "<TAB>=<TAB>" or "<TAB>*<TAB>")
# whose two rows name different references in column 3.  Each kept row is
# written, with its flag rewritten and its fields joined by single blanks,
# to read_1.sam or read_2.sam depending on the original flag.  Rows whose
# CIGAR contains an N operation are also listed in intron.txt as
# "name<TAB>CIGAR".
use strict;

if (@ARGV != 4) {
    print "Usage: perl $0 samfile read_1.sam read_2.sam intron.txt\n";
    exit 0;
}
my ($sam_file, $read1_file, $read2_file, $intron_file) = @ARGV;

open(my $read1_fh, '>', $read1_file)
    or die "Cannot write $read1_file: $!\n";
open(my $read2_fh, '>', $read2_file)
    or die "Cannot write $read2_file: $!\n";
open(my $intron_fh, '>', $intron_file)
    or die "Cannot write $intron_file: $!\n";
open(my $sam_fh, '<', $sam_file)
    or die "Cannot open $sam_file: $!\n";

# Original SAM flag => [ flag written to the output, output handle ].
# Rows with any other flag are not written.
my %route_of = (
    65  => [ 0,  $read1_fh ],
    97  => [ 0,  $read1_fh ],
    129 => [ 16, $read2_fh ],
    161 => [ 16, $read2_fh ],
    113 => [ 16, $read1_fh ],
    81  => [ 16, $read1_fh ],
    145 => [ 0,  $read2_fh ],
    177 => [ 0,  $read2_fh ],
);

# Write one alignment (given as its list of fields) to its output file.
sub emit_alignment {
    my @field = @_;
    return if $field[1] !~ /^\d+$/;
    my $route = $route_of{ $field[1] + 0 } or return;
    my ($new_flag, $out_fh) = @{$route};
    $field[1] = $new_flag;
    print {$out_fh} "@field\n";
    print {$intron_fh} "$field[0]\t$field[5]\n" if $field[5] =~ /N/;
    return;
}

# Process the rows collected for one read name.
sub flush_group {
    my @rows = @_;
    return if @rows != 2;
    my @first  = split /\t/, $rows[0];
    my @second = split /\t/, $rows[1];
    return if @first < 6 || @second < 6;
    return if $first[2] eq $second[2];   # both rows on the same reference
    emit_alignment(@first);
    emit_alignment(@second);
    return;
}

# Only the rows of the current read name are held in memory.
my $current_read = '';
my @group;
while (defined(my $row = readline($sam_fh))) {
    next if $row =~ /\t=\t/ || $row =~ /\t\*\t/;
    chomp $row;
    my ($name) = split /\t/, $row;
    next if !defined $name;
    if ($name ne $current_read) {
        flush_group(@group);
        @group        = ();
        $current_read = $name;
    }
    push @group, $row;
}
# The last read has no following row to trigger its processing.
flush_group(@group);

close $sam_fh;
close($read1_fh)  or die "Cannot close $read1_file: $!\n";
close($read2_fh)  or die "Cannot close $read2_file: $!\n";
close($intron_fh) or die "Cannot close $intron_file: $!\n";

--------------------------------------------------------------------------------
/form_path.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w
# form_path.pl
#
# Turn the node table (both.nodes) into scaffold paths, one per output line:
#   node->N(gap)->node->N(gap)->node
#
# Each row of both.nodes gives a node (column 1), its successor (column 2)
# and a length (column 4).  The gap between two nodes is
#   median - length   when the length is below the median of the N
#                     operation lengths found in column 2 of the intron
#                     file (first N operation per row), and
#   default_gap_size  otherwise.
use strict;

if (@ARGV != 3) {
    print "Usage: $0 both.nodes unique_map.intron.file default_gap_size\n";
    exit 0;
}
my ($node_file, $intron_file, $gap) = @ARGV;

# Median of a list of numbers; 0 for an empty list.
sub median {
    my @sorted = sort { $a <=> $b } @_;
    return 0 if !@sorted;
    my $middle = int(@sorted / 2);
    return @sorted % 2
        ? $sorted[$middle]
        : ($sorted[$middle - 1] + $sorted[$middle]) / 2;
}

# Intron file rows look like: SRR324684.17630829/1 3S33M9266N15M
my @intron;
open(my $intron_fh, '<', $intron_file)
    or die "Cannot open $intron_file: $!\n";
while (defined(my $row = readline($intron_fh))) {
    chomp $row;
    my @rec = split /\s+/, $row;
    push @intron, $1 if defined $rec[1] && $rec[1] =~ /(\d+)N/;
}
close $intron_fh;
my $lower = median(@intron);

my %next_of;            # node => successor
my %has_predecessor;    # node => 1 when some row names it as successor
my %length_between;     # {node}{node} => column 4, stored in both directions
my %is_covered;         # start nodes whose path was already printed in the
                        # opposite orientation

open(my $node_fh, '<', $node_file)
    or die "Cannot open $node_file: $!\n";
while (defined(my $row = readline($node_fh))) {
    chomp $row;
    my @rec = split /\s+/, $row;
    next if @rec < 2;
    $next_of{ $rec[0] }                   = $rec[1];
    $has_predecessor{ $rec[1] }           = 1;
    $length_between{ $rec[0] }{ $rec[1] } = $rec[3];
    $length_between{ $rec[1] }{ $rec[0] } = $rec[3];
}
close $node_fh;

# Print the path that starts at $node and follows %next_of to its end.
sub print_path {
    my ($node) = @_;
    my %seen = ($node => 1);
    print $node;
    while (exists $next_of{$node}) {
        my $following = $next_of{$node};
        if ($seen{$following}++) {
            # Following the successor again would never terminate.
            warn "Cycle at '$following': path truncated\n";
            last;
        }
        my $known = int($length_between{$node}{$following} || 0);
        my $gap_size = $known < $lower ? $lower - $known : $gap;
        print '->N(', $gap_size, ')->', $following;
        $node = $following;
    }
    # The same path in the opposite orientation would start at the twin of
    # this end node ("X" for "X/r", "X/r" for "X"); mark it as covered.
    if ($node =~ m{(\S+)/r$}) {
        $is_covered{$1} = 1;
    }
    else {
        $is_covered{ $node . '/r' } = 1;
    }
    print "\n";
    return;
}

# Start nodes are visited in sorted order so that repeated runs on the same
# input print the same paths in the same order.
for my $start (sort keys %next_of) {
    next if exists $has_predecessor{$start};
    next if exists $is_covered{$start};
    print_path($start);
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DESCRIPTION

2 | P_RNA_scaffolder is a genome scaffolding tool that uses paired-end RNA-seq reads from the studied species. Because nucleotide sequences are not conserved across species, P_RNA_scaffolder does not support scaffolding a genome with paired-end RNA-seq reads from closely related species. The paired-end RNA-seq reads can be downloaded from a public read archive (for instance, the NCBI SRA database) or be your own data. The paired-end reads are first aligned to the contigs using HISAT2 or BWA, and then with BLAT. The SAM alignment files produced by HISAT2 or BWA are used as the input files of P_RNA_scaffolder. P_RNA_scaffolder searches for "guide" pairs, i.e. read pairs whose two reads are mapped to two different contigs. The "guide" pairs are then used to orient and order the contigs into longer scaffolds.

3 | SYSTEM REQUIREMENTS

4 | (1)The software is driven by a shell script and consists of compiled C++ programs and Perl programs. The compiled programs are shipped precompiled and can therefore be executed directly. To run the Perl programs, Perl and the BioPerl modules must be installed on the system. 5 | (2)The program requires SAM files as input. HISAT2 or BWA should be installed on the system.

6 | (3)P_RNA_scaffolder has been tested and is supported on Linux.

7 | INPUT FILES

8 | (1)The SAM files are necessary for scaffolding. In eukaryotes, the SAM file was generated using HISAT2 program. In prokaryotes, the paired-end RNA-seq reads were aligned to the contigs using BWA program.

9 | (i)Take human contigs and RNA-seq reads as a eukaryote example. The alignment of the RNA-seq reads can be performed as follows:

10 | hisat2-build contigs.fa human_hisat

11 | hisat2 -x human_hisat -1 read_1.fq -2 read_2.fq -k 3 -p 10 --pen-noncansplice 1000000 -S input.sam

12 | where read_1.fq and read_2.fq are the fastq files of two ends of RNA-seq reads.

13 | -k 3 means report up to 3 alignments per read.

14 | -p 10 means using 10 threads to align reads.

15 | --pen-noncansplice 1000000 means high penalty for a non-canonical splice site.

16 | -S input.sam means that the alignments of all reads were stored in the file of 'input.sam'.

17 | (ii)Take E.coli contigs and RNA-seq reads as a prokaryote example. The alignment of RNA-seq reads could be performed as follows:

18 | bwa index -a is contigs.fa

19 | bwa mem -t 10 contigs.fa read_1.fq read_2.fq >input.sam

20 | 21 | where read_1.fq and read_2.fq are the fastq files of two ends of RNA-seq reads.

22 | -t 10 means using 10 threads to align reads.

23 | input.sam means that the alignments of all reads were stored in the file of 'input.sam'.

24 | (2)The contig file is also required. It must be in FASTA format and must be the same file that was used as the subject sequences for the alignment.

25 | COMMANDS AND OPTIONS

26 | P_RNA_scaffolder is run via the shell script: P_RNA_scaffolder.sh found in the base installation directory.

27 | Usage info is as follows:

28 | bash P_RNA_scaffolder.sh -d Program_dir -i input.sam -j contig.fa -F read_1.fq -R read_2.fq 29 | Input options

30 | -d the installation directory of P_RNA_scaffolder [ mandatory ]

31 | -i SAM file of RNA-seq alignments to contigs with hisat [ mandatory ]

32 | -j Pre-assembled contig FASTA file [ mandatory ]

33 | -F FASTQ file of left reads [ mandatory ]

34 | -R FASTQ file of right reads [ mandatory ]

35 | 36 | Output options

37 | -o write all output files to this directory [ default: ./ ]

38 | 39 | Species options

40 | -s the target species is Eukaryote or Prokaryote [default: yes ]

41 | (1) yes represents that the target species is Eukaryote.

42 | (2) no represents that the target species is Prokaryote.

43 | 44 | Two modes selection options

45 | -b re-align filtered RNA-seq reads to contigs with BLAT [ default: yes ]

46 | (1) If yes, perform the 'accurate' mode using BLAT to further filter out reads. The 'accurate' scaffolding has higher accuracy and longer running time than the 'fast' mode.

47 | (2) If no, perform the 'fast' mode without BLAT re-alignment and this mode is faster than the 'accurate' mode with less accuracy.

48 | -p BLAT alignment identity cutoff [ default: 0.90 ]

49 | -t number of threads used in BLAT re-alignment [ default: 5 ]

50 | 51 | Scaffolding options

52 | -e the maximal allowed intron length [ default: 100000 ]

53 | For genomes of different size, the maximal allowed intron length is different. For instance, in human, the maximal allowed intron length is set as 100000 while in C.elegans, it is set as 15000.

54 | -f the minimal supporting RNA-seq pair number [ default: 2 ]

55 | -n the number of inserted N to indicate a gap [ default: 100 bp ]

56 | OUTPUT FILES

57 | When P_RNA_scaffolder completes, it creates the output file P_RNA_scaffold.fasta in the output directory (see the -o option).

58 | SPEED

59 | P_RNA_scaffolder took about 195 minutes to scaffold human genome contigs with a SAM file generated from the alignment of 113.8 million RNA-seq read pairs.

60 | 
--------------------------------------------------------------------------------
/P_RNA_scaffolder.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# P_RNA_scaffolder.sh
#
# Scaffold pre-assembled contigs with paired-end RNA-seq alignments.
#
# The script needs bash (it uses [[ ]], arrays and arithmetic expansion).
# With -b no  ("fast" mode) the read pairs taken from the SAM file are used
# directly; with -b yes ("accurate" mode) they are first re-aligned with BLAT
# and filtered.  Exit status: 0 after a completed run, 1 after a usage error.

output=./
intron=100000
frequency=2
N=100
pid=0.90
threads=5
blat=yes
species=yes
vardir=0
varsam=0
varfasta=0
varfqF=0
varfqR=0

# Print the help text.
usage() {
    echo "Usage: bash `basename "$0"` -d Program_DIR -i inputfile.sam -j contig.fasta -F read_1.fastq -R read_2.fastq -s yes";
    echo "";
    echo "Input options";
    echo " -d the installing direcotry of P_RNA_scaffolder [ mandatory ]";
    echo " -i SAM file of RNA-seq alignments to contigs with hisat [ mandatory ]";
    echo " -j Pre-assembled contig FASTA file [ mandatory ]";
    echo " -F FASTQ file of left reads [ mandatory ]";
    echo " -R FASTQ file of right reads [ mandatory ]";
    echo "";
    echo "Output options";
    echo " -o write all output files to this directory [ default: ./ ]";
    echo "";
    echo "Species options"
    echo " -s the target species is Eukaryote or Prokaryote [default: yes ]";
    echo " (1) yes represents that the target species is Eukaryote. ";
    echo " (2) no represents that the target species is Prokaryote";
    echo "";
    echo "Two modes selection options";
    echo " -b re-align filtered RNA-seq reads to contigs with BLAT [ default: yes ]";
    echo " (1) If yes, perform the 'accurate' mode using BLAT to further filter ";
    echo " out reads. The 'accurate' scaffolding has higher accuracy and longer ";
    echo " running time than the 'fast' mode.";
    echo " (2) If no, perform the 'fast' mode without BLAT re-alignment and this mode";
    echo " is faster than the 'accurate' mode with less accuracy. ";
    echo " -p BLAT alignment identity cutoff [ default: 0.90 ]";
    echo " -t number of threads used in BLAT re-alignment [ default: 5 ]";
    echo "";
    echo "Scaffolding options";
    echo " -e the maximal allowed intron length [ default: 100000 ]";
    echo " -f the minimal supporting RNA-seq pair number [ default: 2 ]";
    echo " -n the number of inserted N to indicate a gap [ default: 100 bp ]";
    echo "";
}

# Every option takes an argument, including -s (a missing ':' after 's' used
# to leave $species empty and to stop option parsing at "-s yes").
while getopts ":d:p:n:e:f:i:j:o:F:R:t:b:s:" opt; do
    case $opt in
        d)
            directory=$OPTARG
            vardir=1
            ;;
        o)
            output=$OPTARG
            ;;
        p)
            pid=$OPTARG
            ;;
        e)
            intron=$OPTARG
            ;;
        f)
            frequency=$OPTARG
            ;;
        n)
            N=$OPTARG
            ;;
        t)
            threads=$OPTARG
            ;;
        b)
            blat=$OPTARG
            ;;
        i)
            inputfile=$OPTARG
            varsam=1
            ;;
        j)
            contig=$OPTARG
            varfasta=1
            ;;
        F)
            fastqF=$OPTARG
            varfqF=1
            ;;
        R)
            fastqR=$OPTARG
            varfqR=1
            ;;
        s)
            species=$OPTARG
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            exit 1
            ;;
        \?)
            # The '?' is escaped: an unescaped ? is a glob that matches any
            # single character, including the ':' reported for a missing
            # argument.
            usage
            exit 1
            ;;
    esac
done

# Split $output/<tag>.fa (tag is F or R) into chunks holding whole
# two-line FASTA records and align every chunk with BLAT in parallel.
blat_reads() {
    local tag=$1
    local lines chunk FILE
    lines=$(wc -l < "$output/$tag.fa")
    chunk=$(( lines / 2 / threads * 2 + 2 ))
    split -d -l "$chunk" "$output/$tag.fa" "$output/${tag}_"
    for FILE in "$output/${tag}_"*
    do
        "$directory/blat" "$contig" "$FILE" "$FILE.psl" -noHead 1>>"$output/blat${tag}_log" 2>>"$output/blat${tag}_error" &
    done
    wait
}

# Build the scaffolds from a forward and a reverse alignment file.
#   $1  forward alignment file
#   $2  reverse alignment file
#   $3  yes: pass the end nodes through find_start_node (fast mode)
#       no : use the find_end_node output directly (accurate mode)
# NOTE(review): the two modes have always differed in this step; it is
# unclear whether the accurate mode skips find_start_node on purpose, so the
# behaviour of both modes is kept as it was.
build_scaffolds() {
    local forward=$1
    local reverse=$2
    local filter_start_nodes=$3

    perl "$directory/guider.pl" "$contig" "$forward" "$reverse" > "$output/guider"
    "$directory/link_block" "$output/guider" "$output/linker" "$intron"
    sort -k1,1 -k2,2n -k27,27n -k16,16nr "$output/linker" > "$output/sort.linker"
    "$directory/delete_linker" "$output/sort.linker" "$output/retained.linker"
    "$directory/delete_same_fragment" "$output/retained.linker" "$output/linker.dif"
    "$directory/exon_length" "$output/linker.dif" "$output/linker.length"
    "$directory/convert_linker" "$output/linker.length" "$output/linker.convert"
    sort -k2,2 -k3,3 -k4,4nr "$output/linker.convert" > "$output/linker.select"
    cut -f 2-4 "$output/linker.select" | sort -k1,1 -k2,2 > "$output/connections"
    "$directory/count_connection_frequency" "$output/connections" "$output/connections.frequency"
    "$directory/find_reliable_connection" "$output/connections.frequency" "$output/reliable.connections" "$frequency"
    sort -k1,1 -k3,3nr "$output/reliable.connections" > "$output/sort.reliable.connection"
    if [[ $filter_start_nodes = yes ]]; then
        "$directory/find_end_node" "$output/sort.reliable.connection" "$output/end.node"
        sort -k2,2 -k3,3nr "$output/end.node" > "$output/sort.end.node"
        "$directory/find_start_node" "$output/sort.end.node" "$output/start.node"
    else
        "$directory/find_end_node" "$output/sort.reliable.connection" "$output/start.node"
    fi
    "$directory/select_nodes" "$output/start.node" "$output/both.nodes"
    perl "$directory/form_path.pl" "$output/both.nodes" "$output/intron.txt" "$N" > "$output/both.path"
    sed 's/->/\n/g' "$output/both.path" | sed 's/\/r//g' | grep -v "N(" | sort -u > "$output/scaffolded.fragment.id"

    # The scaffolded and the unscaffolded sequences are written in parallel.
    perl "$directory/generate_scaffold.pl" "$contig" "$output/both.path" P_RNA_scaffold_ > "$output/scaffold.fasta" &
    perl "$directory/generate_unscaffold.pl" "$contig" "$output/scaffolded.fragment.id" > "$output/unscaffold.fasta" &
    wait
    cat "$output/scaffold.fasta" "$output/unscaffold.fasta" > "$output/P_RNA_scaffold.fasta"
}

# All five input options are mandatory and -b must be yes or no.
if [[ $vardir -ne 1 || $varsam -ne 1 || $varfasta -ne 1 || $varfqF -ne 1 || $varfqR -ne 1 ]] ||
   [[ $blat != yes && $blat != no ]]; then
    usage
    exit 1
fi

mkdir -p "$output"

perl "$directory/UNIQUE_sam_intron.pl" "$inputfile" "$output/F.sam" "$output/R.sam" "$output/intron.txt"

if [[ $blat = no ]]; then
    build_scaffolds "$output/F.sam" "$output/R.sam" yes
else
    # Extract the sequences of the candidate reads and re-align them.
    perl "$directory/sam2fa.pl" "$output/F.sam" "$fastqF" "$output/F.fa" &
    perl "$directory/sam2fa.pl" "$output/R.sam" "$fastqR" "$output/R.fa" &
    wait
    blat_reads F
    blat_reads R

    # Compare the i-th forward chunk with the i-th reverse chunk.
    a=( "$output"/F_*.psl )
    b=( "$output"/R_*.psl )
    for (( i = 0; i < ${#b[@]}; i++ ))
    do
        perl "$directory/UNIQUE_psl.pl" -f1 "${a[$i]}" -f2 "${b[$i]}" -n "$pid" > "$output/blat-$i.filter.id" &
    done
    wait
    cat "$output"/blat-*.filter.id | sort -u > "$output/blat_filter.id"
    if [[ $species = yes ]]; then
        perl "$directory/count-match.pl" "$output/F.sam" "$output/R.sam" | sort -u >> "$output/blat_filter.id"
    fi
    rm "$output"/blat-*.filter.id "$output"/F_* "$output"/R_*

    # Both filters run as jobs of THIS shell, so the wait below really waits
    # for them before guider.pl reads their output.
    perl "$directory/filter-out.pl" "$output/F.sam" "$output/blat_filter.id" 1 > "$output/filter_F.sam" &
    perl "$directory/filter-out.pl" "$output/R.sam" "$output/blat_filter.id" 1 > "$output/filter_R.sam" &
    wait

    build_scaffolds "$output/filter_F.sam" "$output/filter_R.sam" no
fi

exit 0

--------------------------------------------------------------------------------